From 891456227881da9c565c455010380a40d385a478 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Wed, 13 Feb 2008 15:34:20 -0600
Subject: jfs: le*_add_cpu conversion

replace all:
little_endian_variable = cpu_to_leX(leX_to_cpu(little_endian_variable) +
                                        expression_in_cpu_byteorder);
with:
        leX_add_cpu(&little_endian_variable, expression_in_cpu_byteorder);
generated with semantic patch

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: jfs-discussion@lists.sourceforge.net
---
 fs/jfs/jfs_dmap.c  | 11 +++++------
 fs/jfs/jfs_imap.c  | 15 ++++++---------
 fs/jfs/jfs_xtree.c | 26 ++++++++------------------
 3 files changed, 19 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index e1985066b1c6..2bc7d8aa5740 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2172,7 +2172,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 	}
 
 	/* update the free count for this dmap */
-	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+	le32_add_cpu(&dp->nfree, -nblocks);
 
 	BMAP_LOCK(bmp);
 
@@ -2316,7 +2316,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 
 	/* update the free count for this dmap.
 	 */
-	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
+	le32_add_cpu(&dp->nfree, nblocks);
 
 	BMAP_LOCK(bmp);
 
@@ -3226,7 +3226,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
 	}
 
 	/* update the free count for this dmap */
-	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+	le32_add_cpu(&dp->nfree, -nblocks);
 
 	/* reconstruct summary tree */
 	dbInitDmapTree(dp);
@@ -3660,9 +3660,8 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
 			goto initTree;
 		}
 	} else {
-		dp->nblocks =
-		    cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks);
-		dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
+		le32_add_cpu(&dp->nblocks, nblocks);
+		le32_add_cpu(&dp->nfree, nblocks);
 	}
 
 	/* word number containing start block number */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 9bf29f771737..734ec916beaf 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1019,8 +1019,7 @@ int diFree(struct inode *ip)
 		/* update the free inode counts at the iag, ag and
 		 * map level.
 		 */
-		iagp->nfreeinos =
-		    cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
+		le32_add_cpu(&iagp->nfreeinos, 1);
 		imap->im_agctl[agno].numfree += 1;
 		atomic_inc(&imap->im_numfree);
 
@@ -1219,9 +1218,8 @@ int diFree(struct inode *ip)
 	/* update the number of free inodes and number of free extents
 	 * for the iag.
 	 */
-	iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) -
-				      (INOSPEREXT - 1));
-	iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1);
+	le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1));
+	le32_add_cpu(&iagp->nfreeexts, 1);
 
 	/* update the number of free inodes and backed inodes
 	 * at the ag and inode map level.
@@ -2124,7 +2122,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
 	/* update the free inode count at the iag, ag, inode
 	 * map levels.
 	 */
-	iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1);
+	le32_add_cpu(&iagp->nfreeinos, -1);
 	imap->im_agctl[agno].numfree -= 1;
 	atomic_dec(&imap->im_numfree);
 
@@ -2378,9 +2376,8 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
 	/* update the free inode and free extent counts for the
 	 * iag.
 	 */
-	iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) +
-				      (INOSPEREXT - 1));
-	iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1);
+	le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1));
+	le32_add_cpu(&iagp->nfreeexts, -1);
 
 	/* update the free and backed inode counts for the ag.
 	 */
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index a000aaa75136..5a61ebf2cbcc 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -905,8 +905,7 @@ int xtInsert(tid_t tid,		/* transaction id */
 	XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
 
 	/* advance next available entry index */
-	p->header.nextindex =
-	    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+	le16_add_cpu(&p->header.nextindex, 1);
 
 	/* Don't log it if there are no links to the file */
 	if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -997,8 +996,7 @@ xtSplitUp(tid_t tid,
 			    split->addr);
 
 		/* advance next available entry index */
-		sp->header.nextindex =
-		    cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1);
+		le16_add_cpu(&sp->header.nextindex, 1);
 
 		/* Don't log it if there are no links to the file */
 		if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1167,9 +1165,7 @@ xtSplitUp(tid_t tid,
 				    JFS_SBI(ip->i_sb)->nbperpage, rcbn);
 
 			/* advance next available entry index. */
-			sp->header.nextindex =
-			    cpu_to_le16(le16_to_cpu(sp->header.nextindex) +
-					1);
+			le16_add_cpu(&sp->header.nextindex, 1);
 
 			/* Don't log it if there are no links to the file */
 			if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1738,8 +1734,7 @@ int xtExtend(tid_t tid,		/* transaction id */
 		XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
 
 		/* advance next available entry index */
-		p->header.nextindex =
-		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+		le16_add_cpu(&p->header.nextindex, 1);
 	}
 
 	/* get back old entry */
@@ -1905,8 +1900,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
 		XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
 
 		/* advance next available entry index */
-		p->header.nextindex =
-		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+		le16_add_cpu(&p->header.nextindex, 1);
 	}
 
 	/* get back old XAD */
@@ -2567,8 +2561,7 @@ int xtAppend(tid_t tid,		/* transaction id */
 	XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
 
 	/* advance next available entry index */
-	p->header.nextindex =
-	    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
+	le16_add_cpu(&p->header.nextindex, 1);
 
 	xtlck->lwm.offset =
 	    (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
@@ -2631,8 +2624,7 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
 	 * delete the entry from the leaf page
 	 */
 	nextindex = le16_to_cpu(p->header.nextindex);
-	p->header.nextindex =
-	    cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1);
+	le16_add_cpu(&p->header.nextindex, -1);
 
 	/*
 	 * if the leaf page bocome empty, free the page
@@ -2795,9 +2787,7 @@ xtDeleteUp(tid_t tid, struct inode *ip,
 					(nextindex - index -
 					 1) << L2XTSLOTSIZE);
 
-			p->header.nextindex =
-			    cpu_to_le16(le16_to_cpu(p->header.nextindex) -
-					1);
+			le16_add_cpu(&p->header.nextindex, -1);
 			jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
 				 (ulong) parent->bn, index);
 		}
-- 
cgit v1.2.3


From 15732a1cb5f9078d460a254449eb59391e531ffc Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 5 Mar 2008 14:38:22 -0600
Subject: jfs: replace __inline with inline

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/jfs_dmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 11e6d471b364..1a6eb41569bc 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -61,7 +61,7 @@
  * determine the maximum free string for four (lower level) nodes
  * of the tree.
  */
-static __inline signed char TREEMAX(signed char *cp)
+static inline signed char TREEMAX(signed char *cp)
 {
 	signed char tmp1, tmp2;
 
-- 
cgit v1.2.3


From 1218854afa6f659be90b748cf1bc7badee954a35 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 26 Mar 2008 02:36:06 +0900
Subject: [NET] NETNS: Omit seq_net_private->net without CONFIG_NET_NS.

Without CONFIG_NET_NS, no namespace other than &init_net exists,
no need to store net in seq_net_private.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 fs/proc/proc_net.c       |  6 +++---
 include/linux/seq_file.h |  7 +++++++
 net/core/neighbour.c     |  8 ++++----
 net/ipv4/fib_hash.c      |  5 ++---
 net/ipv4/fib_trie.c      | 13 ++++++-------
 net/ipv4/raw.c           |  4 ++--
 net/ipv4/route.c         | 29 +++++++++++++++--------------
 net/ipv6/addrconf.c      |  4 ++--
 net/ipv6/mcast.c         |  4 ++--
 net/netfilter/x_tables.c |  4 ++--
 net/netlink/af_netlink.c |  6 +++---
 net/unix/af_unix.c       | 10 +++++-----
 12 files changed, 53 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4caa5f774fb7..13cd7835d0df 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -44,7 +44,9 @@ int seq_open_net(struct inode *ino, struct file *f,
 		put_net(net);
 		return -ENOMEM;
 	}
+#ifdef CONFIG_NET_NS
 	p->net = net;
+#endif
 	return 0;
 }
 EXPORT_SYMBOL_GPL(seq_open_net);
@@ -52,12 +54,10 @@ EXPORT_SYMBOL_GPL(seq_open_net);
 int seq_release_net(struct inode *ino, struct file *f)
 {
 	struct seq_file *seq;
-	struct seq_net_private *p;
 
 	seq = f->private_data;
-	p = seq->private;
 
-	put_net(p->net);
+	put_net(seq_file_net(seq));
 	seq_release_private(ino, f);
 	return 0;
 }
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 67c2563961f3..d870a8253769 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/mutex.h>
+#include <net/net_namespace.h>
 
 struct seq_operations;
 struct file;
@@ -64,7 +65,9 @@ extern struct list_head *seq_list_next(void *v, struct list_head *head,
 
 struct net;
 struct seq_net_private {
+#ifdef CONFIG_NET_NS
 	struct net *net;
+#endif
 };
 
 int seq_open_net(struct inode *, struct file *,
@@ -72,7 +75,11 @@ int seq_open_net(struct inode *, struct file *,
 int seq_release_net(struct inode *, struct file *);
 static inline struct net *seq_file_net(struct seq_file *seq)
 {
+#ifdef CONFIG_NET_NS
 	return ((struct seq_net_private *)seq->private)->net;
+#else
+	return &init_net;
+#endif
 }
 
 #endif
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 065fbac7ecd3..b8d491fb4b42 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2145,7 +2145,7 @@ EXPORT_SYMBOL(__neigh_for_each_release);
 static struct neighbour *neigh_get_first(struct seq_file *seq)
 {
 	struct neigh_seq_state *state = seq->private;
-	struct net *net = state->p.net;
+	struct net *net = seq_file_net(seq);
 	struct neigh_table *tbl = state->tbl;
 	struct neighbour *n = NULL;
 	int bucket = state->bucket;
@@ -2186,7 +2186,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 					loff_t *pos)
 {
 	struct neigh_seq_state *state = seq->private;
-	struct net *net = state->p.net;
+	struct net *net = seq_file_net(seq);
 	struct neigh_table *tbl = state->tbl;
 
 	if (state->neigh_sub_iter) {
@@ -2246,7 +2246,7 @@ static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
 static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
 {
 	struct neigh_seq_state *state = seq->private;
-	struct net * net = state->p.net;
+	struct net *net = seq_file_net(seq);
 	struct neigh_table *tbl = state->tbl;
 	struct pneigh_entry *pn = NULL;
 	int bucket = state->bucket;
@@ -2269,7 +2269,7 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
 					    loff_t *pos)
 {
 	struct neigh_seq_state *state = seq->private;
-	struct net * net = state->p.net;
+	struct net *net = seq_file_net(seq);
 	struct neigh_table *tbl = state->tbl;
 
 	pn = pn->next;
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 8d58d85dfac6..02088deb0461 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -821,7 +821,7 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
 	struct fib_table *main_table;
 	struct fn_hash *table;
 
-	main_table = fib_get_table(iter->p.net, RT_TABLE_MAIN);
+	main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
 	table = (struct fn_hash *)main_table->tb_data;
 
 	iter->bucket    = 0;
@@ -959,11 +959,10 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
 static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(fib_hash_lock)
 {
-	struct fib_iter_state *iter = seq->private;
 	void *v = NULL;
 
 	read_lock(&fib_hash_lock);
-	if (fib_get_table(iter->p.net, RT_TABLE_MAIN))
+	if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
 		v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 	return v;
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ce6cb34e28e1..9e491e70e855 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2279,9 +2279,10 @@ static const struct file_operations fib_triestat_fops = {
 	.release = fib_triestat_seq_release,
 };
 
-static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, loff_t pos)
+static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
 {
-	struct net *net = iter->p.net;
+	struct fib_trie_iter *iter = seq->private;
+	struct net *net = seq_file_net(seq);
 	loff_t idx = 0;
 	unsigned int h;
 
@@ -2309,16 +2310,14 @@ static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, loff_t pos)
 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
 {
-	struct fib_trie_iter *iter = seq->private;
-
 	rcu_read_lock();
-	return fib_trie_get_idx(iter, *pos);
+	return fib_trie_get_idx(seq, *pos);
 }
 
 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct fib_trie_iter *iter = seq->private;
-	struct net *net = iter->p.net;
+	struct net *net = seq_file_net(seq);
 	struct fib_table *tb = iter->tb;
 	struct hlist_node *tb_node;
 	unsigned int h;
@@ -2513,7 +2512,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
 	struct fib_table *tb;
 
 	rcu_read_lock();
-	tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
+	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
 	if (!tb)
 		return NULL;
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index be19a4048d7c..25dc8b38cac3 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -856,7 +856,7 @@ static struct sock *raw_get_first(struct seq_file *seq)
 		struct hlist_node *node;
 
 		sk_for_each(sk, node, &state->h->ht[state->bucket])
-			if (sock_net(sk) == state->p.net)
+			if (sock_net(sk) == seq_file_net(seq))
 				goto found;
 	}
 	sk = NULL;
@@ -872,7 +872,7 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
 		sk = sk_next(sk);
 try_again:
 		;
-	} while (sk && sock_net(sk) != state->p.net);
+	} while (sk && sock_net(sk) != seq_file_net(seq));
 
 	if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
 		sk = sk_head(&state->h->ht[state->bucket]);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 194f5cca3121..eab8d75e5222 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -276,15 +276,16 @@ struct rt_cache_iter_state {
 	int genid;
 };
 
-static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
+static struct rtable *rt_cache_get_first(struct seq_file *seq)
 {
+	struct rt_cache_iter_state *st = seq->private;
 	struct rtable *r = NULL;
 
 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 		rcu_read_lock_bh();
 		r = rcu_dereference(rt_hash_table[st->bucket].chain);
 		while (r) {
-			if (dev_net(r->u.dst.dev) == st->p.net &&
+			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 			    r->rt_genid == st->genid)
 				return r;
 			r = rcu_dereference(r->u.dst.rt_next);
@@ -294,9 +295,10 @@ static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
 	return r;
 }
 
-static struct rtable *__rt_cache_get_next(struct rt_cache_iter_state *st,
+static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 					  struct rtable *r)
 {
+	struct rt_cache_iter_state *st = seq->private;
 	r = r->u.dst.rt_next;
 	while (!r) {
 		rcu_read_unlock_bh();
@@ -308,11 +310,12 @@ static struct rtable *__rt_cache_get_next(struct rt_cache_iter_state *st,
 	return rcu_dereference(r);
 }
 
-static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st,
+static struct rtable *rt_cache_get_next(struct seq_file *seq,
 					struct rtable *r)
 {
-	while ((r = __rt_cache_get_next(st, r)) != NULL) {
-		if (dev_net(r->u.dst.dev) != st->p.net)
+	struct rt_cache_iter_state *st = seq->private;
+	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
+		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 			continue;
 		if (r->rt_genid == st->genid)
 			break;
@@ -320,12 +323,12 @@ static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st,
 	return r;
 }
 
-static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
+static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 {
-	struct rtable *r = rt_cache_get_first(st);
+	struct rtable *r = rt_cache_get_first(seq);
 
 	if (r)
-		while (pos && (r = rt_cache_get_next(st, r)))
+		while (pos && (r = rt_cache_get_next(seq, r)))
 			--pos;
 	return pos ? NULL : r;
 }
@@ -333,9 +336,8 @@ static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t po
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct rt_cache_iter_state *st = seq->private;
-
 	if (*pos)
-		return rt_cache_get_idx(st, *pos - 1);
+		return rt_cache_get_idx(seq, *pos - 1);
 	st->genid = atomic_read(&rt_genid);
 	return SEQ_START_TOKEN;
 }
@@ -343,12 +345,11 @@ static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct rtable *r;
-	struct rt_cache_iter_state *st = seq->private;
 
 	if (v == SEQ_START_TOKEN)
-		r = rt_cache_get_first(st);
+		r = rt_cache_get_first(seq);
 	else
-		r = rt_cache_get_next(st, v);
+		r = rt_cache_get_next(seq, v);
 	++*pos;
 	return r;
 }
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f2c90f145cbb..ac5d4f4b6312 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2766,7 +2766,7 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq)
 {
 	struct inet6_ifaddr *ifa = NULL;
 	struct if6_iter_state *state = seq->private;
-	struct net *net = state->p.net;
+	struct net *net = seq_file_net(seq);
 
 	for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
 		ifa = inet6_addr_lst[state->bucket];
@@ -2782,7 +2782,7 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq)
 static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct inet6_ifaddr *ifa)
 {
 	struct if6_iter_state *state = seq->private;
-	struct net *net = state->p.net;
+	struct net *net = seq_file_net(seq);
 
 	ifa = ifa->lst_next;
 try_again:
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 20a3d8e2f6c6..d810cff818cf 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2355,7 +2355,7 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
 {
 	struct ifmcaddr6 *im = NULL;
 	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
-	struct net *net = state->p.net;
+	struct net *net = seq_file_net(seq);
 
 	state->idev = NULL;
 	for_each_netdev(net, state->dev) {
@@ -2486,7 +2486,7 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
 	struct ip6_sf_list *psf = NULL;
 	struct ifmcaddr6 *im = NULL;
 	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
-	struct net *net = state->p.net;
+	struct net *net = seq_file_net(seq);
 
 	state->idev = NULL;
 	state->im = NULL;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index a6792089fcf9..0bd95680a494 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -727,7 +727,7 @@ struct xt_names_priv {
 static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct xt_names_priv *priv = seq->private;
-	struct net *net = priv->p.net;
+	struct net *net = seq_file_net(seq);
 	int af = priv->af;
 
 	mutex_lock(&xt[af].mutex);
@@ -737,7 +737,7 @@ static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
 static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct xt_names_priv *priv = seq->private;
-	struct net *net = priv->p.net;
+	struct net *net = seq_file_net(seq);
 	int af = priv->af;
 
 	return seq_list_next(v, &net->xt.tables[af], pos);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 712a7bff8560..1d16d95dfaaf 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1758,7 +1758,7 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
 
 		for (j = 0; j <= hash->mask; j++) {
 			sk_for_each(s, node, &hash->table[j]) {
-				if (sock_net(s) != iter->p.net)
+				if (sock_net(s) != seq_file_net(seq))
 					continue;
 				if (off == pos) {
 					iter->link = i;
@@ -1794,7 +1794,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	s = v;
 	do {
 		s = sk_next(s);
-	} while (s && (sock_net(s) != iter->p.net));
+	} while (s && sock_net(s) != seq_file_net(seq));
 	if (s)
 		return s;
 
@@ -1806,7 +1806,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 		for (; j <= hash->mask; j++) {
 			s = sk_head(&hash->table[j]);
-			while (s && sock_net(s) != iter->p.net)
+			while (s && sock_net(s) != seq_file_net(seq))
 				s = sk_next(s);
 			if (s) {
 				iter->link = i;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index cb9d0cb5f270..4a4793051bcb 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2016,13 +2016,14 @@ struct unix_iter_state {
 	struct seq_net_private p;
 	int i;
 };
-static struct sock *unix_seq_idx(struct unix_iter_state *iter, loff_t pos)
+static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos)
 {
+	struct unix_iter_state *iter = seq->private;
 	loff_t off = 0;
 	struct sock *s;
 
 	for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) {
-		if (sock_net(s) != iter->p.net)
+		if (sock_net(s) != seq_file_net(seq))
 			continue;
 		if (off == pos)
 			return s;
@@ -2035,9 +2036,8 @@ static struct sock *unix_seq_idx(struct unix_iter_state *iter, loff_t pos)
 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(unix_table_lock)
 {
-	struct unix_iter_state *iter = seq->private;
 	spin_lock(&unix_table_lock);
-	return *pos ? unix_seq_idx(iter, *pos - 1) : ((void *) 1);
+	return *pos ? unix_seq_idx(seq, *pos - 1) : ((void *) 1);
 }
 
 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2050,7 +2050,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		sk = first_unix_socket(&iter->i);
 	else
 		sk = next_unix_socket(&iter->i, sk);
-	while (sk && (sock_net(sk) != iter->p.net))
+	while (sk && (sock_net(sk) != seq_file_net(seq)))
 		sk = next_unix_socket(&iter->i, sk);
 	return sk;
 }
-- 
cgit v1.2.3


From 0e5f8be1388093edc324a78ebf241170b258eba3 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Thu, 27 Mar 2008 14:25:53 -0700
Subject: [NETNS]: Compile NET /proc support only if CONFIG_NET is set.

This fix broken compilation for 'allnoconfig'. This was introduced by
Introduced by commit 1218854afa6f659be90b748cf1bc7badee954a35 ("[NET]
NETNS: Omit seq_net_private->net without CONFIG_NET_NS.")

Signed-off-by: Denis V. Lunev <den@openvz.org>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c       | 2 ++
 include/linux/seq_file.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 13cd7835d0df..7034facf8b8f 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -51,6 +51,7 @@ int seq_open_net(struct inode *ino, struct file *f,
 }
 EXPORT_SYMBOL_GPL(seq_open_net);
 
+#ifdef CONFIG_NET
 int seq_release_net(struct inode *ino, struct file *f)
 {
 	struct seq_file *seq;
@@ -218,3 +219,4 @@ int __init proc_net_init(void)
 
 	return register_pernet_subsys(&proc_net_ns_ops);
 }
+#endif /* CONFIG_NET */
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index d870a8253769..5da70c3f4417 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -63,6 +63,7 @@ extern struct list_head *seq_list_start_head(struct list_head *head,
 extern struct list_head *seq_list_next(void *v, struct list_head *head,
 		loff_t *ppos);
 
+#ifdef CONFIG_NET
 struct net;
 struct seq_net_private {
 #ifdef CONFIG_NET_NS
@@ -81,6 +82,7 @@ static inline struct net *seq_file_net(struct seq_file *seq)
 	return &init_net;
 #endif
 }
+#endif	/* CONFIG_NET */
 
 #endif
 #endif
-- 
cgit v1.2.3


From 941e6d7d09aaf455c0d7ad383f7f5ae67e4ccf16 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 28 Jan 2008 08:47:38 +0000
Subject: [GFS2] Speed up gfs2_write_alloc_required, deprecate gfs2_extent_map

This patch removes the call to gfs2_extent_map from gfs2_write_alloc_required,
instead we call gfs2_block_map directly. This results in fewer overall calls
to gfs2_block_map in the multi-block case.

Also, gfs2_extent_map is marked as deprecated so that people know that its
going away as soon as all the callers have been converted.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e9456ebd3bb6..a25444ac648b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -548,6 +548,9 @@ out_fail:
 	return error;
 }
 
+/*
+ * Deprecated: do not use in new code
+ */
 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
 {
 	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
@@ -1197,10 +1200,9 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 			      unsigned int len, int *alloc_required)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	u64 lblock, lblock_stop, dblock;
-	u32 extlen;
-	int new = 0;
-	int error = 0;
+	struct buffer_head bh;
+	unsigned int shift;
+	u64 lblock, lblock_stop, size;
 
 	*alloc_required = 0;
 
@@ -1214,6 +1216,8 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 		return 0;
 	}
 
+	*alloc_required = 1;
+	shift = sdp->sd_sb.sb_bsize_shift;
 	if (gfs2_is_dir(ip)) {
 		unsigned int bsize = sdp->sd_jbsize;
 		lblock = offset;
@@ -1221,27 +1225,25 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 		lblock_stop = offset + len + bsize - 1;
 		do_div(lblock_stop, bsize);
 	} else {
-		unsigned int shift = sdp->sd_sb.sb_bsize_shift;
 		u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
 		lblock = offset >> shift;
 		lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
-		if (lblock_stop > end_of_file) {
-			*alloc_required = 1;
+		if (lblock_stop > end_of_file)
 			return 0;
-		}
 	}
 
-	for (; lblock < lblock_stop; lblock += extlen) {
-		error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
-		if (error)
-			return error;
-
-		if (!dblock) {
-			*alloc_required = 1;
+	size = (lblock_stop - lblock) << shift;
+	do {
+		bh.b_state = 0;
+		bh.b_size = size;
+		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
+		if (!buffer_mapped(&bh))
 			return 0;
-		}
-	}
+		size -= bh.b_size;
+		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
+	} while(size > 0);
 
+	*alloc_required = 0;
 	return 0;
 }
 
-- 
cgit v1.2.3


From ecc30c79157103e8bd7492043ee992b763443832 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 28 Jan 2008 10:37:35 +0000
Subject: [GFS2] Streamline indirect pointer tree height calculation

This patch improves the calculation of the tree height in order to reduce
the number of operations which are carried out on each call to gfs2_block_map.
In the common case, we now make a single comparison, rather than calculating
the required tree height from scratch each time. Also in the case that the
tree does need some extra height, we start from the current height rather from
zero when we work out what the new height ought to be.

In addition the di_height field is moved into the inode proper and reduced
in size to a u8 since the value must be between 0 and GFS2_MAX_META_HEIGHT (10).

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c   | 108 +++++++++++++++++++------------------------------------
 fs/gfs2/incore.h |   6 ++--
 fs/gfs2/inode.c  |  20 ++++++-----
 fs/gfs2/inode.h  |   2 +-
 fs/gfs2/super.c  |   2 ++
 5 files changed, 55 insertions(+), 83 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index a25444ac648b..5a3187049dd7 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -166,7 +166,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 		di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
 	}
 
-	ip->i_di.di_height = 1;
+	ip->i_height = 1;
 	di->di_height = cpu_to_be16(1);
 
 out_brelse:
@@ -176,43 +176,6 @@ out:
 	return error;
 }
 
-/**
- * calc_tree_height - Calculate the height of a metadata tree
- * @ip: The GFS2 inode
- * @size: The proposed size of the file
- *
- * Work out how tall a metadata tree needs to be in order to accommodate a
- * file of a particular size. If size is less than the current size of
- * the inode, then the current size of the inode is used instead of the
- * supplied one.
- *
- * Returns: the height the tree should be
- */
-
-static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	u64 *arr;
-	unsigned int max, height;
-
-	if (ip->i_di.di_size > size)
-		size = ip->i_di.di_size;
-
-	if (gfs2_is_dir(ip)) {
-		arr = sdp->sd_jheightsize;
-		max = sdp->sd_max_jheight;
-	} else {
-		arr = sdp->sd_heightsize;
-		max = sdp->sd_max_height;
-	}
-
-	for (height = 0; height < max; height++)
-		if (arr[height] >= size)
-			break;
-
-	return height;
-}
-
 /**
  * build_height - Build a metadata tree of the requested height
  * @ip: The GFS2 inode
@@ -225,7 +188,7 @@ static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
 static int build_height(struct inode *inode, unsigned height)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	unsigned new_height = height - ip->i_di.di_height;
+	unsigned new_height = height - ip->i_height;
 	struct buffer_head *dibh;
 	struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
 	struct gfs2_dinode *di;
@@ -234,7 +197,7 @@ static int build_height(struct inode *inode, unsigned height)
 	u64 bn;
 	unsigned n;
 
-	if (height <= ip->i_di.di_height)
+	if (height <= ip->i_height)
 		return 0;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -270,10 +233,10 @@ static int build_height(struct inode *inode, unsigned height)
 	di = (struct gfs2_dinode *)dibh->b_data;
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 	*(__be64 *)(di + 1) = cpu_to_be64(bn);
-	ip->i_di.di_height += new_height;
+	ip->i_height += new_height;
 	ip->i_di.di_blocks += new_height;
 	gfs2_set_inode_blocks(&ip->i_inode);
-	di->di_height = cpu_to_be16(ip->i_di.di_height);
+	di->di_height = cpu_to_be16(ip->i_height);
 	di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
 	brelse(dibh);
 	return error;
@@ -345,7 +308,7 @@ static void find_metapath(struct gfs2_inode *ip, u64 block,
 	u64 b = block;
 	unsigned int i;
 
-	for (i = ip->i_di.di_height; i--;)
+	for (i = ip->i_height; i--;)
 		mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
 
 }
@@ -407,7 +370,7 @@ static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
 	if (!create)
 		return 0;
 
-	if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
+	if (height == ip->i_height - 1 && !gfs2_is_dir(ip))
 		*block = gfs2_alloc_data(ip);
 	else
 		*block = gfs2_alloc_meta(ip);
@@ -458,8 +421,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *bh;
-	unsigned int bsize;
-	unsigned int height;
+	unsigned int bsize = sdp->sd_sb.sb_bsize;
 	unsigned int end_of_metadata;
 	unsigned int x;
 	int error = 0;
@@ -470,7 +432,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	struct metapath mp;
 	u64 size;
 	struct buffer_head *dibh = NULL;
-
+	const u64 *arr = sdp->sd_heightsize;
 	BUG_ON(maxlen == 0);
 
 	if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
@@ -480,23 +442,25 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	clear_buffer_mapped(bh_map);
 	clear_buffer_new(bh_map);
 	clear_buffer_boundary(bh_map);
-	bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
+	if (gfs2_is_dir(ip)) {
+		bsize = sdp->sd_jbsize;
+		arr = sdp->sd_jheightsize;
+	}
 	size = (lblock + 1) * bsize;
 
-	if (size > ip->i_di.di_size) {
-		height = calc_tree_height(ip, size);
-		if (ip->i_di.di_height < height) {
-			if (!create)
-				goto out_ok;
-	
-			error = build_height(inode, height);
-			if (error)
-				goto out_fail;
-		}
+	if (size > arr[ip->i_height]) {
+		u8 height = ip->i_height;
+		if (!create)
+			goto out_ok;
+		while (size > arr[height])
+			height++;
+		error = build_height(inode, height);
+		if (error)
+			goto out_fail;
 	}
 
 	find_metapath(ip, lblock, &mp);
-	end_of_metadata = ip->i_di.di_height - 1;
+	end_of_metadata = ip->i_height - 1;
 	error = gfs2_meta_inode_buffer(ip, &bh);
 	if (error)
 		goto out_fail;
@@ -624,7 +588,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (error)
 		goto out;
 
-	if (height < ip->i_di.di_height - 1)
+	if (height < ip->i_height - 1)
 		for (; top < bottom; top++, first = 0) {
 			if (!*top)
 				continue;
@@ -682,7 +646,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 		sm->sm_first = 0;
 	}
 
-	metadata = (height != ip->i_di.di_height - 1);
+	metadata = (height != ip->i_height - 1);
 	if (metadata)
 		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
 
@@ -807,7 +771,6 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al;
 	struct buffer_head *dibh;
-	unsigned int h;
 	int error;
 
 	al = gfs2_alloc_get(ip);
@@ -833,20 +796,23 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 		goto out_ipres;
 
 	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+		const u64 *arr = sdp->sd_heightsize;
 		if (gfs2_is_stuffed(ip)) {
 			error = gfs2_unstuff_dinode(ip, NULL);
 			if (error)
 				goto out_end_trans;
 		}
 
-		h = calc_tree_height(ip, size);
-		if (ip->i_di.di_height < h) {
-			down_write(&ip->i_rw_mutex);
-			error = build_height(&ip->i_inode, h);
-			up_write(&ip->i_rw_mutex);
-			if (error)
-				goto out_end_trans;
+		down_write(&ip->i_rw_mutex);
+		if (size > arr[ip->i_height]) {
+			u8 height = ip->i_height;
+			while(size > arr[height])
+				height++;
+			error = build_height(&ip->i_inode, height);
 		}
+		up_write(&ip->i_rw_mutex);
+		if (error)
+			goto out_end_trans;
 	}
 
 	ip->i_di.di_size = size;
@@ -989,7 +955,7 @@ out:
 
 static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
 {
-	unsigned int height = ip->i_di.di_height;
+	unsigned int height = ip->i_height;
 	u64 lblock;
 	struct metapath mp;
 	int error;
@@ -1040,7 +1006,7 @@ static int trunc_end(struct gfs2_inode *ip)
 		goto out;
 
 	if (!ip->i_di.di_size) {
-		ip->i_di.di_height = 0;
+		ip->i_height = 0;
 		ip->i_di.di_goal_meta =
 			ip->i_di.di_goal_data =
 			ip->i_no_addr;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 525dcae352d6..43472baa92e7 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -246,7 +246,6 @@ struct gfs2_dinode_host {
 	u64 di_goal_data;	/* data block goal */
 	u64 di_generation;	/* generation number for NFS */
 	u32 di_flags;		/* GFS2_DIF_... */
-	u16 di_height;		/* height of metadata */
 	/* These only apply to directories  */
 	u16 di_depth;		/* Number of bits in the table */
 	u32 di_entries;		/* The number of entries in the directory */
@@ -268,6 +267,7 @@ struct gfs2_inode {
 	u64 i_last_rg_alloc;
 
 	struct rw_semaphore i_rw_mutex;
+	u8 i_height;
 };
 
 /*
@@ -490,9 +490,9 @@ struct gfs2_sbd {
 	u32 sd_qc_per_block;
 	u32 sd_max_dirres;	/* Max blocks needed to add a directory entry */
 	u32 sd_max_height;	/* Max height of a file's metadata tree */
-	u64 sd_heightsize[GFS2_MAX_META_HEIGHT];
+	u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
 	u32 sd_max_jheight; /* Max height of journaled file's meta tree */
-	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT];
+	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
 
 	struct gfs2_args sd_args;	/* Mount arguments */
 	struct gfs2_tune sd_tune;	/* Filesystem tuning structure */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 37725ade3c51..ff66ab7a17c8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -248,12 +248,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
 	struct gfs2_dinode_host *di = &ip->i_di;
 	const struct gfs2_dinode *str = buf;
+	u16 height;
 
-	if (ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)) {
-		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(ip);
-		return -EIO;
-	}
+	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
+		goto corrupt;
 	ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
 	ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
 	ip->i_inode.i_rdev = 0;
@@ -290,7 +288,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 
 	di->di_flags = be32_to_cpu(str->di_flags);
 	gfs2_set_inode_flags(&ip->i_inode);
-	di->di_height = be16_to_cpu(str->di_height);
+	height = be16_to_cpu(str->di_height);
+	if (unlikely(height > GFS2_MAX_META_HEIGHT))
+		goto corrupt;
+	ip->i_height = (u8)height;
 
 	di->di_depth = be16_to_cpu(str->di_depth);
 	di->di_entries = be32_to_cpu(str->di_entries);
@@ -300,6 +301,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 		gfs2_set_aops(&ip->i_inode);
 
 	return 0;
+corrupt:
+	if (gfs2_consist_inode(ip))
+		gfs2_dinode_print(ip);
+	return -EIO;
 }
 
 /**
@@ -1401,7 +1406,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_generation = cpu_to_be64(di->di_generation);
 
 	str->di_flags = cpu_to_be32(di->di_flags);
-	str->di_height = cpu_to_be16(di->di_height);
+	str->di_height = cpu_to_be16(ip->i_height);
 	str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
 					     !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
 					     GFS2_FORMAT_DE : 0);
@@ -1430,7 +1435,6 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	printk(KERN_INFO "  di_goal_data = %llu\n",
 	       (unsigned long long)di->di_goal_data);
 	printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
-	printk(KERN_INFO "  di_height = %u\n", di->di_height);
 	printk(KERN_INFO "  di_depth = %u\n", di->di_depth);
 	printk(KERN_INFO "  di_entries = %u\n", di->di_entries);
 	printk(KERN_INFO "  di_eattr = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d44650662615..db738686ca1d 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -12,7 +12,7 @@
 
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
 {
-	return !ip->i_di.di_height;
+	return !ip->i_height;
 }
 
 static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ef0562c3bc71..88497b0b7339 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -316,6 +316,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
 		sdp->sd_heightsize[x] = space;
 	}
 	sdp->sd_max_height = x;
+	sdp->sd_heightsize[x] = ~0;
 	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
 
 	sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
@@ -334,6 +335,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
 		sdp->sd_jheightsize[x] = space;
 	}
 	sdp->sd_max_jheight = x;
+	sdp->sd_jheightsize[x] = ~0;
 	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
 
 	return 0;
-- 
cgit v1.2.3


From fe6c991c52a0dd07d4a19d392fd65048226cb1bc Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 28 Jan 2008 11:13:02 -0600
Subject: [GFS2] Get rid of unneeded parameter in gfs2_rlist_alloc

This patch removed the unnecessary parameter from function
gfs2_rlist_alloc.  The parameter was always passed in as 0.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c  | 2 +-
 fs/gfs2/dir.c   | 2 +-
 fs/gfs2/eattr.c | 2 +-
 fs/gfs2/rgrp.c  | 7 +++----
 fs/gfs2/rgrp.h  | 5 ++---
 5 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5a3187049dd7..e8e48b690cef 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -680,7 +680,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	else
 		goto out; /* Nothing to do */
 
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
 
 	for (x = 0; x < rlist.rl_rgrps; x++) {
 		struct gfs2_rgrpd *rgd;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c34709512b19..78c236fb34ec 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1894,7 +1894,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 		l_blocks++;
 	}
 
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
 
 	for (x = 0; x < rlist.rl_rgrps; x++) {
 		struct gfs2_rgrpd *rgd;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index bee99704ea10..04febbc17a16 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -1347,7 +1347,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 	else
 		goto out;
 
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
 
 	for (x = 0; x < rlist.rl_rgrps; x++) {
 		struct gfs2_rgrpd *rgd;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3552110b2e5f..7b9d6f1d1527 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -1699,8 +1699,7 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
  *
  */
 
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
-		      int flags)
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
 {
 	unsigned int x;
 
@@ -1708,7 +1707,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
 				GFP_NOFS | __GFP_NOFAIL);
 	for (x = 0; x < rlist->rl_rgrps; x++)
 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
-				state, flags,
+				state, 0,
 				&rlist->rl_ghs[x]);
 }
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 149bb161f4b6..5683605695fd 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -64,8 +64,7 @@ struct gfs2_rgrp_list {
 
 void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
 		    u64 block);
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
-		      int flags);
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
 u64 gfs2_ri_total(struct gfs2_sbd *sdp);
 
-- 
cgit v1.2.3


From ca390601a8bbb4ab8301a9469d23cdb1cf77e7cb Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 28 Jan 2008 11:15:57 -0600
Subject: [GFS2] Fix debug inode printing

I noticed that the latest change to i_height got rid of the
value from the inode dump.  This patch adds it back.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index ff66ab7a17c8..db5961a9aa59 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -1435,6 +1435,7 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	printk(KERN_INFO "  di_goal_data = %llu\n",
 	       (unsigned long long)di->di_goal_data);
 	printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
+	printk(KERN_INFO "  i_height = %u\n", ip->i_height);
 	printk(KERN_INFO "  di_depth = %u\n", di->di_depth);
 	printk(KERN_INFO "  di_entries = %u\n", di->di_entries);
 	printk(KERN_INFO "  di_eattr = %llu\n",
-- 
cgit v1.2.3


From d0109bfa84d6603becac8c2e87b3716f557f2039 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 28 Jan 2008 11:20:10 -0600
Subject: [GFS2] Only do lo_incore_commit once

This patch is performance related.  When we're doing a log flush,
I noticed we were calling buf_lo_incore_commit twice: once for
data bufs and once for metadata bufs.  Since this is the same
function and does the same thing in both cases, there should be
no reason to call it twice.  Since we only need to call it once,
we can also make it faster by removing it from the generic "lops"
code and making it a stand-along static function.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h |  1 -
 fs/gfs2/log.c    | 17 ++++++++++++++++-
 fs/gfs2/lops.c   | 17 -----------------
 fs/gfs2/lops.h   | 11 +----------
 4 files changed, 17 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 43472baa92e7..7ae12d2b6262 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -44,7 +44,6 @@ struct gfs2_log_header_host {
 
 struct gfs2_log_operations {
 	void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
-	void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
 	void (*lo_before_commit) (struct gfs2_sbd *sdp);
 	void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
 	void (*lo_before_scan) (struct gfs2_jdesc *jd,
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 161ab6f2058e..b335304fc5d6 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -779,6 +779,21 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	gfs2_log_unlock(sdp);
 }
 
+static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	struct list_head *head = &tr->tr_list_buf;
+	struct gfs2_bufdata *bd;
+
+	gfs2_log_lock(sdp);
+	while (!list_empty(head)) {
+		bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
+		list_del_init(&bd->bd_list_tr);
+		tr->tr_num_buf--;
+	}
+	gfs2_log_unlock(sdp);
+	gfs2_assert_warn(sdp, !tr->tr_num_buf);
+}
+
 /**
  * gfs2_log_commit - Commit a transaction to the log
  * @sdp: the filesystem
@@ -790,7 +805,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
 	log_refund(sdp, tr);
-	lops_incore_commit(sdp, tr);
+	buf_lo_incore_commit(sdp, tr);
 
 	sdp->sd_vfs->s_dirt = 1;
 	up_read(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index fae59d69d01a..71387372c883 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -152,21 +152,6 @@ out:
 	unlock_buffer(bd->bd_bh);
 }
 
-static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
-{
-	struct list_head *head = &tr->tr_list_buf;
-	struct gfs2_bufdata *bd;
-
-	gfs2_log_lock(sdp);
-	while (!list_empty(head)) {
-		bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
-		list_del_init(&bd->bd_list_tr);
-		tr->tr_num_buf--;
-	}
-	gfs2_log_unlock(sdp);
-	gfs2_assert_warn(sdp, !tr->tr_num_buf);
-}
-
 static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 {
 	struct buffer_head *bh;
@@ -737,7 +722,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 
 const struct gfs2_log_operations gfs2_buf_lops = {
 	.lo_add = buf_lo_add,
-	.lo_incore_commit = buf_lo_incore_commit,
 	.lo_before_commit = buf_lo_before_commit,
 	.lo_after_commit = buf_lo_after_commit,
 	.lo_before_scan = buf_lo_before_scan,
@@ -763,7 +747,6 @@ const struct gfs2_log_operations gfs2_rg_lops = {
 
 const struct gfs2_log_operations gfs2_databuf_lops = {
 	.lo_add = databuf_lo_add,
-	.lo_incore_commit = buf_lo_incore_commit,
 	.lo_before_commit = databuf_lo_before_commit,
 	.lo_after_commit = databuf_lo_after_commit,
 	.lo_scan_elements = databuf_lo_scan_elements,
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 41a00df75587..3c0b2737658a 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -57,15 +57,6 @@ static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 		le->le_ops->lo_add(sdp, le);
 }
 
-static inline void lops_incore_commit(struct gfs2_sbd *sdp,
-				      struct gfs2_trans *tr)
-{
-	int x;
-	for (x = 0; gfs2_log_ops[x]; x++)
-		if (gfs2_log_ops[x]->lo_incore_commit)
-			gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
-}
-
 static inline void lops_before_commit(struct gfs2_sbd *sdp)
 {
 	int x;
-- 
cgit v1.2.3


From 7eabb77e65c559d9c284da232b9ba5354898028a Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 28 Jan 2008 11:24:35 -0600
Subject: [GFS2] Misc fixups

This patch contains two small fixups that didn't fit elsewhere.
They are: (1) get rid of temp variable in find_metapath.
(2) Remove vestigial "ret" variable from gfs2_writepage_common.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c        | 3 +--
 fs/gfs2/ops_address.c | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e8e48b690cef..e84e3845a394 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -305,11 +305,10 @@ static void find_metapath(struct gfs2_inode *ip, u64 block,
 			  struct metapath *mp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	u64 b = block;
 	unsigned int i;
 
 	for (i = ip->i_height; i--;)
-		mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
+		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 
 }
 
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index ac772b6d9dbb..7523999afc53 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -104,11 +104,9 @@ static int gfs2_writepage_common(struct page *page,
 	loff_t i_size = i_size_read(inode);
 	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset;
-	int ret = -EIO;
 
 	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
 		goto out;
-	ret = 0;
 	if (current->journal_info)
 		goto redirty;
 	/* Is the page fully outside i_size? (truncate in progress) */
-- 
cgit v1.2.3


From ef8c441cb7fece75dbbdb1f59d3f82b6a4be7474 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 28 Jan 2008 14:54:16 -0600
Subject: [GFS2] Only wake the reclaim daemon if we need to

This patch only wakes up the glock reclaim daemon if there is
actually something to be reclaimed.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7175a4d06435..5752dec017c1 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1594,10 +1594,10 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 		gfs2_glock_hold(gl);
 		list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
 		atomic_inc(&sdp->sd_reclaim_count);
-	}
-	spin_unlock(&sdp->sd_reclaim_lock);
-
-	wake_up(&sdp->sd_reclaim_wq);
+		spin_unlock(&sdp->sd_reclaim_lock);
+		wake_up(&sdp->sd_reclaim_wq);
+	} else
+		spin_unlock(&sdp->sd_reclaim_lock);
 }
 
 /**
-- 
cgit v1.2.3


From 048786f1e6042022a8fb2035157a8c8c3a82a4f2 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 29 Jan 2008 00:11:34 +0200
Subject: [GFS2] make gfs2_glock_hold() static

gfs2_glock_hold() can now become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 2 +-
 fs/gfs2/glock.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 5752dec017c1..befcda0e53a1 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -197,7 +197,7 @@ static void glock_free(struct gfs2_glock *gl)
  *
  */
 
-void gfs2_glock_hold(struct gfs2_glock *gl)
+static void gfs2_glock_hold(struct gfs2_glock *gl)
 {
 	atomic_inc(&gl->gl_ref);
 }
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2f9c6d136b37..ace5770760ce 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -79,7 +79,6 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 int gfs2_glock_get(struct gfs2_sbd *sdp,
 		   u64 number, const struct gfs2_glock_operations *glops,
 		   int create, struct gfs2_glock **glp);
-void gfs2_glock_hold(struct gfs2_glock *gl);
 int gfs2_glock_put(struct gfs2_glock *gl);
 void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
 		      struct gfs2_holder *gh);
-- 
cgit v1.2.3


From 3ad62e87cd38817361e165cf4ad496ab76e19e81 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 28 Jan 2008 16:35:13 -0600
Subject: [GFS2] Plug an unlikely leak

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lops.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 71387372c883..4390f6f4047d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -404,8 +404,10 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
 			blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
 
 			error = gfs2_revoke_add(sdp, blkno, start);
-			if (error < 0)
+			if (error < 0) {
+				brelse(bh);
 				return error;
+			}
 			else if (error)
 				sdp->sd_found_revokes++;
 
-- 
cgit v1.2.3


From 6bdd9be628fa5f4dd14eb89ebddc12840d684277 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 28 Jan 2008 17:20:26 -0600
Subject: [GFS2] Allocate gfs2_rgrpd from slab memory

This patch moves the gfs2_rgrpd structure to its own slab
memory.  This makes it easier to control and monitor, and
yields less memory fragmentation.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/main.c | 10 ++++++++++
 fs/gfs2/rgrp.c |  4 ++--
 fs/gfs2/util.c |  1 +
 fs/gfs2/util.h |  1 +
 4 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 9c7765c12d62..053e2ebbbd50 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -89,6 +89,12 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_bufdata_cachep)
 		goto fail;
 
+	gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd",
+					      sizeof(struct gfs2_rgrpd),
+					      0, 0, NULL);
+	if (!gfs2_rgrpd_cachep)
+		goto fail;
+
 	error = register_filesystem(&gfs2_fs_type);
 	if (error)
 		goto fail;
@@ -108,6 +114,9 @@ fail_unregister:
 fail:
 	gfs2_glock_exit();
 
+	if (gfs2_rgrpd_cachep)
+		kmem_cache_destroy(gfs2_rgrpd_cachep);
+
 	if (gfs2_bufdata_cachep)
 		kmem_cache_destroy(gfs2_bufdata_cachep);
 
@@ -133,6 +142,7 @@ static void __exit exit_gfs2_fs(void)
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
 
+	kmem_cache_destroy(gfs2_rgrpd_cachep);
 	kmem_cache_destroy(gfs2_bufdata_cachep);
 	kmem_cache_destroy(gfs2_inode_cachep);
 	kmem_cache_destroy(gfs2_glock_cachep);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7b9d6f1d1527..dc7e83eed32d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -353,7 +353,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
 		}
 
 		kfree(rgd->rd_bits);
-		kfree(rgd);
+		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
 	}
 }
 
@@ -516,7 +516,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
 		return error;
 	}
 
-	rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
+	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
 	error = -ENOMEM;
 	if (!rgd)
 		return error;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 424a0774eda8..fe9c28ef77b0 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@
 struct kmem_cache *gfs2_glock_cachep __read_mostly;
 struct kmem_cache *gfs2_inode_cachep __read_mostly;
 struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
+struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
 
 void gfs2_assert_i(struct gfs2_sbd *sdp)
 {
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 28938a46cf47..ac0c567ebc36 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -147,6 +147,7 @@ gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__);
 extern struct kmem_cache *gfs2_glock_cachep;
 extern struct kmem_cache *gfs2_inode_cachep;
 extern struct kmem_cache *gfs2_bufdata_cachep;
+extern struct kmem_cache *gfs2_rgrpd_cachep;
 
 static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
 					   unsigned int *p)
-- 
cgit v1.2.3


From 42d52e3818751633656fb90df1bd5cb5362fa8cc Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 28 Jan 2008 18:38:07 -0600
Subject: [GFS2] Combine rg_flags and rd_flags

This patch reduces the memory required by GFS2 by combining
the rd_flags and rg_flags (in core only).

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h |  6 +++---
 fs/gfs2/rgrp.c   | 36 +++++++++++++++++++++++-------------
 2 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 7ae12d2b6262..39bab7b213e9 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -69,7 +69,6 @@ struct gfs2_bitmap {
 };
 
 struct gfs2_rgrp_host {
-	u32 rg_flags;
 	u32 rg_free;
 	u32 rg_dinodes;
 	u64 rg_igeneration;
@@ -95,8 +94,9 @@ struct gfs2_rgrpd {
 	u32 rd_last_alloc_data;
 	u32 rd_last_alloc_meta;
 	struct gfs2_sbd *rd_sbd;
-	unsigned long rd_flags;
-#define GFS2_RDF_CHECK        0x0001          /* Need to check for unlinked inodes */
+	unsigned char rd_flags;
+#define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
+#define GFS2_RDF_NOALLOC      0x02      /* rg prohibits allocation */
 };
 
 enum gfs2_state_bits {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index dc7e83eed32d..da60ce8c5d7d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -655,21 +655,31 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
 	return error;
 }
 
-static void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf)
+static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 {
 	const struct gfs2_rgrp *str = buf;
+	struct gfs2_rgrp_host *rg = &rgd->rd_rg;
+	u32 rg_flags;
 
-	rg->rg_flags = be32_to_cpu(str->rg_flags);
+	rg_flags = be32_to_cpu(str->rg_flags);
+	if (rg_flags & GFS2_RGF_NOALLOC)
+		rgd->rd_flags |= GFS2_RDF_NOALLOC;
+	else
+		rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
 	rg->rg_free = be32_to_cpu(str->rg_free);
 	rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
 	rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
 }
 
-static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf)
+static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
 	struct gfs2_rgrp *str = buf;
+	struct gfs2_rgrp_host *rg = &rgd->rd_rg;
+	u32 rg_flags = 0;
 
-	str->rg_flags = cpu_to_be32(rg->rg_flags);
+	if (rgd->rd_flags & GFS2_RDF_NOALLOC)
+		rg_flags |= GFS2_RGF_NOALLOC;
+	str->rg_flags = cpu_to_be32(rg_flags);
 	str->rg_free = cpu_to_be32(rg->rg_free);
 	str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
 	str->__pad = cpu_to_be32(0);
@@ -727,7 +737,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 	}
 
 	if (rgd->rd_rg_vn != gl->gl_vn) {
-		gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
+		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
 		rgd->rd_rg_vn = gl->gl_vn;
 	}
 
@@ -840,7 +850,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	int ret = 0;
 
-	if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC)
+	if (rgd->rd_flags & GFS2_RDF_NOALLOC)
 		return 0;
 
 	spin_lock(&sdp->sd_rindex_spin);
@@ -1431,7 +1441,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
 	rgd->rd_rg.rg_free--;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	al->al_alloced++;
 
@@ -1476,7 +1486,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 	rgd->rd_rg.rg_free--;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	al->al_alloced++;
 
@@ -1519,7 +1529,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 	rgd->rd_rg.rg_dinodes++;
 	*generation = rgd->rd_rg.rg_igeneration++;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	al->al_alloced++;
 
@@ -1553,7 +1563,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	rgd->rd_rg.rg_free += blen;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	gfs2_trans_add_rg(rgd);
 
@@ -1581,7 +1591,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	rgd->rd_rg.rg_free += blen;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	gfs2_trans_add_rg(rgd);
 
@@ -1601,7 +1611,7 @@ void gfs2_unlink_di(struct inode *inode)
 	if (!rgd)
 		return;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 	gfs2_trans_add_rg(rgd);
 }
 
@@ -1621,7 +1631,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
 	rgd->rd_rg.rg_free++;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	gfs2_statfs_change(sdp, 0, +1, -1);
 	gfs2_trans_add_rg(rgd);
-- 
cgit v1.2.3


From 29d38cd16358dcaef4a6c50866ecee28025b481a Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 28 Jan 2008 22:31:39 -0600
Subject: [GFS2] Get rid of gl_waiters2

This patch reduces memory by replacing the int variable
gl_waiters2 by a single bit in the gl_flags.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c  | 7 ++++---
 fs/gfs2/incore.h | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index befcda0e53a1..951cb91e7ddb 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -595,11 +595,12 @@ static void run_queue(struct gfs2_glock *gl)
 			blocked = rq_mutex(gh);
 		} else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
 			blocked = rq_demote(gl);
-			if (gl->gl_waiters2 && !blocked) {
+			if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
+				     !blocked) {
 				set_bit(GLF_DEMOTE, &gl->gl_flags);
 				gl->gl_demote_state = LM_ST_UNLOCKED;
 			}
-			gl->gl_waiters2 = 0;
+			clear_bit(GLF_WAITERS2, &gl->gl_flags);
 		} else if (!list_empty(&gl->gl_waiters3)) {
 			gh = list_entry(gl->gl_waiters3.next,
 					struct gfs2_holder, gh_list);
@@ -710,7 +711,7 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 	} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
 			gl->gl_demote_state != state) {
 		if (test_bit(GLF_DEMOTE_IN_PROGRESS,  &gl->gl_flags)) 
-			gl->gl_waiters2 = 1;
+			set_bit(GLF_WAITERS2, &gl->gl_flags);
 		else 
 			gl->gl_demote_state = LM_ST_UNLOCKED;
 	}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 39bab7b213e9..fe14f6a417b5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -167,6 +167,7 @@ enum {
 	GLF_DIRTY		= 5,
 	GLF_DEMOTE_IN_PROGRESS	= 6,
 	GLF_LFLUSH		= 7,
+	GLF_WAITERS2		= 8,
 };
 
 struct gfs2_glock {
@@ -186,7 +187,6 @@ struct gfs2_glock {
 	struct list_head gl_holders;
 	struct list_head gl_waiters1;	/* HIF_MUTEX */
 	struct list_head gl_waiters3;	/* HIF_PROMOTE */
-	int gl_waiters2;		/* GIF_DEMOTE */
 
 	const struct gfs2_glock_operations *gl_ops;
 
-- 
cgit v1.2.3


From 11707ea05e85290d10c482b87e195c198f5eb3cf Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 28 Jan 2008 15:10:29 +0000
Subject: [GFS2] Move part of gfs2_block_map into a separate function

This is required to enable future changes to the block
mapping code.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 57 ++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e84e3845a394..08d1be492ef7 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -384,6 +384,35 @@ static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
 	return 0;
 }
 
+static int lookup_metapath(struct inode *inode, struct metapath *mp,
+			   int create, int *new, u64 *dblock,
+			   struct buffer_head **dibh, struct buffer_head **bh)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned int end_of_metadata = ip->i_height - 1;
+	unsigned int x;
+	int ret = gfs2_meta_inode_buffer(ip, bh);
+	if (ret)
+		return ret;
+
+	*dibh = *bh;
+	get_bh(*dibh);
+
+	for (x = 0; x < end_of_metadata; x++) {
+		lookup_block(ip, *bh, x, mp, create, new, dblock);
+		brelse(*bh);
+		*bh = NULL;
+		if (!dblock)
+			return 0;
+
+		ret = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, bh);
+		if (ret)
+			return ret;
+	}
+
+	return lookup_block(ip, *bh, end_of_metadata, mp, create, new, dblock);
+}
+
 static inline void bmap_lock(struct inode *inode, int create)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -419,10 +448,8 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct buffer_head *bh;
 	unsigned int bsize = sdp->sd_sb.sb_bsize;
-	unsigned int end_of_metadata;
-	unsigned int x;
+	struct buffer_head *bh = NULL;
 	int error = 0;
 	int new = 0;
 	u64 dblock = 0;
@@ -459,25 +486,11 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	}
 
 	find_metapath(ip, lblock, &mp);
-	end_of_metadata = ip->i_height - 1;
-	error = gfs2_meta_inode_buffer(ip, &bh);
-	if (error)
+	error = lookup_metapath(inode, &mp, create, &new, &dblock, &dibh, &bh);
+	if (error < 0)
 		goto out_fail;
-	dibh = bh;
-	get_bh(dibh);
-
-	for (x = 0; x < end_of_metadata; x++) {
-		lookup_block(ip, bh, x, &mp, create, &new, &dblock);
-		brelse(bh);
-		if (!dblock)
-			goto out_ok;
+	boundary = error;
 
-		error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
-		if (error)
-			goto out_fail;
-	}
-
-	boundary = lookup_block(ip, bh, end_of_metadata, &mp, create, &new, &dblock);
 	if (dblock) {
 		map_bh(bh_map, inode->i_sb, dblock);
 		if (boundary)
@@ -489,6 +502,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 			goto out_brelse;
 		}
 		while(--maxlen && !buffer_boundary(bh_map)) {
+			unsigned int end_of_metadata = ip->i_height - 1;
 			u64 eblock;
 
 			mp.mp_list[end_of_metadata]++;
@@ -501,7 +515,8 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 		}
 	}
 out_brelse:
-	brelse(bh);
+	if (bh)
+		brelse(bh);
 out_ok:
 	error = 0;
 out_fail:
-- 
cgit v1.2.3


From dbac6710a6dfcec7fbe7d9571c183d86a4237623 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 29 Jan 2008 09:12:55 +0000
Subject: [GFS2] Introduce array of buffers to struct metapath

The reason for doing this is to allow all the block mapping code
to share the same array. As a result we can remove two arguments
from lookup_metapath since they are now returned via the array.

We also add a function to drop all refs to buffer heads when we
are done with the metapath. The build_height function shares the
struct metapath, but currently still frees its own buffers, and
this will change in a future patch.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 97 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 48 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 08d1be492ef7..2011dd27f8d6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -33,6 +33,7 @@
  * keep it small.
  */
 struct metapath {
+	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
 	__u16 mp_list[GFS2_MAX_META_HEIGHT];
 };
 
@@ -185,12 +186,11 @@ out:
  * Returns: errno
  */
 
-static int build_height(struct inode *inode, unsigned height)
+static int build_height(struct inode *inode, struct metapath *mp, unsigned height)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned new_height = height - ip->i_height;
 	struct buffer_head *dibh;
-	struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
 	struct gfs2_dinode *di;
 	int error;
 	__be64 *bp;
@@ -206,29 +206,29 @@ static int build_height(struct inode *inode, unsigned height)
 
 	for(n = 0; n < new_height; n++) {
 		bn = gfs2_alloc_meta(ip);
-		blocks[n] = gfs2_meta_new(ip->i_gl, bn);
-		gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
+		mp->mp_bh[n] = gfs2_meta_new(ip->i_gl, bn);
+		gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[n], 1);
 	}
 
 	n = 0;
-	bn = blocks[0]->b_blocknr;
+	bn = mp->mp_bh[0]->b_blocknr;
 	if (new_height > 1) {
 		for(; n < new_height-1; n++) {
-			gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
+			gfs2_metatype_set(mp->mp_bh[n], GFS2_METATYPE_IN,
 					  GFS2_FORMAT_IN);
-			gfs2_buffer_clear_tail(blocks[n],
+			gfs2_buffer_clear_tail(mp->mp_bh[n],
 					       sizeof(struct gfs2_meta_header));
-			bp = (__be64 *)(blocks[n]->b_data +
+			bp = (__be64 *)(mp->mp_bh[n]->b_data +
 				     sizeof(struct gfs2_meta_header));
-			*bp = cpu_to_be64(blocks[n+1]->b_blocknr);
-			brelse(blocks[n]);
-			blocks[n] = NULL;
+			*bp = cpu_to_be64(mp->mp_bh[n+1]->b_blocknr);
+			brelse(mp->mp_bh[n]);
+			mp->mp_bh[n] = NULL;
 		}
 	}
-	gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
-	gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
+	gfs2_metatype_set(mp->mp_bh[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+	gfs2_buffer_copy_tail(mp->mp_bh[n], sizeof(struct gfs2_meta_header),
 			      dibh, sizeof(struct gfs2_dinode));
-	brelse(blocks[n]);
+	brelse(mp->mp_bh[n]);
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	di = (struct gfs2_dinode *)dibh->b_data;
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -314,7 +314,6 @@ static void find_metapath(struct gfs2_inode *ip, u64 block,
 
 /**
  * metapointer - Return pointer to start of metadata in a buffer
- * @bh: The buffer
  * @height: The metadata height (0 = dinode)
  * @mp: The metapath
  *
@@ -323,9 +322,10 @@ static void find_metapath(struct gfs2_inode *ip, u64 block,
  * metadata tree.
  */
 
-static inline __be64 *metapointer(struct buffer_head *bh, int *boundary,
-			       unsigned int height, const struct metapath *mp)
+static inline __be64 *metapointer(int *boundary, unsigned int height,
+				  const struct metapath *mp)
 {
+	struct buffer_head *bh = mp->mp_bh[height];
 	unsigned int head_size = (height > 0) ?
 		sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
 	__be64 *ptr;
@@ -339,7 +339,6 @@ static inline __be64 *metapointer(struct buffer_head *bh, int *boundary,
 /**
  * lookup_block - Get the next metadata block in metadata tree
  * @ip: The GFS2 inode
- * @bh: Buffer containing the pointers to metadata blocks
  * @height: The height of the tree (0 = dinode)
  * @mp: The metapath
  * @create: Non-zero if we may create a new meatdata block
@@ -352,12 +351,12 @@ static inline __be64 *metapointer(struct buffer_head *bh, int *boundary,
  *
  */
 
-static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
-			unsigned int height, struct metapath *mp, int create,
+static int lookup_block(struct gfs2_inode *ip, unsigned int height,
+			struct metapath *mp, int create,
 			int *new, u64 *block)
 {
 	int boundary;
-	__be64 *ptr = metapointer(bh, &boundary, height, mp);
+	__be64 *ptr = metapointer(&boundary, height, mp);
 
 	if (*ptr) {
 		*block = be64_to_cpu(*ptr);
@@ -374,7 +373,7 @@ static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
 	else
 		*block = gfs2_alloc_meta(ip);
 
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[height], 1);
 
 	*ptr = cpu_to_be64(*block);
 	ip->i_di.di_blocks++;
@@ -385,32 +384,38 @@ static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
 }
 
 static int lookup_metapath(struct inode *inode, struct metapath *mp,
-			   int create, int *new, u64 *dblock,
-			   struct buffer_head **dibh, struct buffer_head **bh)
+			   int create, int *new, u64 *dblock)
 {
+	struct buffer_head *bh;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned int end_of_metadata = ip->i_height - 1;
 	unsigned int x;
-	int ret = gfs2_meta_inode_buffer(ip, bh);
+	int ret = gfs2_meta_inode_buffer(ip, &bh);
 	if (ret)
 		return ret;
 
-	*dibh = *bh;
-	get_bh(*dibh);
+	mp->mp_bh[0] = bh;
 
 	for (x = 0; x < end_of_metadata; x++) {
-		lookup_block(ip, *bh, x, mp, create, new, dblock);
-		brelse(*bh);
-		*bh = NULL;
+		lookup_block(ip, x, mp, create, new, dblock);
 		if (!dblock)
 			return 0;
 
-		ret = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, bh);
+		ret = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &mp->mp_bh[x+1]);
 		if (ret)
 			return ret;
 	}
 
-	return lookup_block(ip, *bh, end_of_metadata, mp, create, new, dblock);
+	return lookup_block(ip, end_of_metadata, mp, create, new, dblock);
+}
+
+static void release_metapath(struct metapath *mp)
+{
+	int i;
+
+	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++)
+		if (mp->mp_bh[i])
+			brelse(mp->mp_bh[i]);
 }
 
 static inline void bmap_lock(struct inode *inode, int create)
@@ -449,7 +454,6 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	unsigned int bsize = sdp->sd_sb.sb_bsize;
-	struct buffer_head *bh = NULL;
 	int error = 0;
 	int new = 0;
 	u64 dblock = 0;
@@ -457,13 +461,13 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
 	struct metapath mp;
 	u64 size;
-	struct buffer_head *dibh = NULL;
 	const u64 *arr = sdp->sd_heightsize;
 	BUG_ON(maxlen == 0);
 
 	if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
 		return 0;
 
+	memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
 	bmap_lock(inode, create);
 	clear_buffer_mapped(bh_map);
 	clear_buffer_new(bh_map);
@@ -480,13 +484,13 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 			goto out_ok;
 		while (size > arr[height])
 			height++;
-		error = build_height(inode, height);
+		error = build_height(inode, &mp, height);
 		if (error)
 			goto out_fail;
 	}
 
 	find_metapath(ip, lblock, &mp);
-	error = lookup_metapath(inode, &mp, create, &new, &dblock, &dibh, &bh);
+	error = lookup_metapath(inode, &mp, create, &new, &dblock);
 	if (error < 0)
 		goto out_fail;
 	boundary = error;
@@ -496,17 +500,15 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 		if (boundary)
 			set_buffer_boundary(bh_map);
 		if (new) {
-			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-			gfs2_dinode_out(ip, dibh->b_data);
+			gfs2_trans_add_bh(ip->i_gl, mp.mp_bh[0], 1);
+			gfs2_dinode_out(ip, mp.mp_bh[0]->b_data);
 			set_buffer_new(bh_map);
-			goto out_brelse;
+			goto out_ok;
 		}
 		while(--maxlen && !buffer_boundary(bh_map)) {
-			unsigned int end_of_metadata = ip->i_height - 1;
 			u64 eblock;
-
-			mp.mp_list[end_of_metadata]++;
-			boundary = lookup_block(ip, bh, end_of_metadata, &mp, 0, &new, &eblock);
+			mp.mp_list[ip->i_height - 1]++;
+			boundary = lookup_block(ip, ip->i_height - 1, &mp, 0, &new, &eblock);
 			if (eblock != ++dblock)
 				break;
 			bh_map->b_size += (1 << inode->i_blkbits);
@@ -514,14 +516,10 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 				set_buffer_boundary(bh_map);
 		}
 	}
-out_brelse:
-	if (bh)
-		brelse(bh);
 out_ok:
 	error = 0;
 out_fail:
-	if (dibh)
-		brelse(dibh);
+	release_metapath(&mp);
 	bmap_unlock(inode, create);
 	return error;
 }
@@ -819,10 +817,11 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 
 		down_write(&ip->i_rw_mutex);
 		if (size > arr[ip->i_height]) {
+			struct metapath mp;
 			u8 height = ip->i_height;
 			while(size > arr[height])
 				height++;
-			error = build_height(&ip->i_inode, height);
+			error = build_height(&ip->i_inode, &mp, height);
 		}
 		up_write(&ip->i_rw_mutex);
 		if (error)
-- 
cgit v1.2.3


From 110acf38377b5b341b11644bfe98389eccec627d Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 29 Jan 2008 13:30:20 +0000
Subject: [GFS2] Add consts to various bits of rgrp.c

There are a couple of routines which scan bitmaps where we can
mark the bitmaps const, plus a couple of call sites that can
be updated too.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index da60ce8c5d7d..5fd87104e595 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -126,23 +126,24 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
  * Return: the block number (bitmap buffer scope) that was found
  */
 
-static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
-		       unsigned char old_state)
+static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
+		       u8 old_state)
 {
-	unsigned char *byte;
+	const u8 *byte;
 	u32 blk = goal;
 	unsigned int bit, bitlong;
-	unsigned long *plong, plong55;
+	const unsigned long *plong;
+#if BITS_PER_LONG == 32
+	const unsigned long plong55 = 0x55555555;
+#else
+	const unsigned long plong55 = 0x5555555555555555;
+#endif
 
 	byte = buffer + (goal / GFS2_NBBY);
-	plong = (unsigned long *)(buffer + (goal / GFS2_NBBY));
+	plong = (const unsigned long *)(buffer + (goal / GFS2_NBBY));
 	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
 	bitlong = bit;
-#if BITS_PER_LONG == 32
-	plong55 = 0x55555555;
-#else
-	plong55 = 0x5555555555555555;
-#endif
+
 	while (byte < buffer + buflen) {
 
 		if (bitlong == 0 && old_state == 0 && *plong == plong55) {
@@ -179,14 +180,14 @@ static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
  * Returns: The number of bits
  */
 
-static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
-			      unsigned int buflen, unsigned char state)
+static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
+			 unsigned int buflen, u8 state)
 {
-	unsigned char *byte = buffer;
-	unsigned char *end = buffer + buflen;
-	unsigned char state1 = state << 2;
-	unsigned char state2 = state << 4;
-	unsigned char state3 = state << 6;
+	const u8 *byte = buffer;
+	const u8 *end = buffer + buflen;
+	const u8 state1 = state << 2;
+	const u8 state2 = state << 4;
+	const u8 state3 = state << 6;
 	u32 count = 0;
 
 	for (; byte < end; byte++) {
@@ -1327,12 +1328,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 	for (x = 0; x <= length; x++) {
 		/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
 		   bitmaps, so we must search the originals for that. */
+		const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
 		if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
-			blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset,
-					  bi->bi_len, goal, old_state);
-		else
-			blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset,
-					  bi->bi_len, goal, old_state);
+			buffer = bi->bi_clone + bi->bi_offset;
+
+		blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state);
 		if (blk != BFITNOENT)
 			break;
 
-- 
cgit v1.2.3


From ab0d756681c9502a2ab9e2e4ab3685bc0567f4ee Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 29 Jan 2008 13:56:15 -0600
Subject: [GFS2] Eliminate gl_req_bh

This patch further reduces the memory needs of GFS2 by
eliminating the gl_req_bh variable from struct gfs2_glock.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c  | 107 ++++++++++++++++++++++++++-----------------------------
 fs/gfs2/incore.h |   1 -
 2 files changed, 51 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 951cb91e7ddb..a8387e0b5068 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -338,7 +338,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_ip = 0;
 	gl->gl_ops = glops;
 	gl->gl_req_gh = NULL;
-	gl->gl_req_bh = NULL;
 	gl->gl_vn = 0;
 	gl->gl_stamp = jiffies;
 	gl->gl_tchange = jiffies;
@@ -743,6 +742,50 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 	gl->gl_tchange = jiffies;
 }
 
+/**
+ * drop_bh - Called after a lock module unlock completes
+ * @gl: the glock
+ * @ret: the return status
+ *
+ * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
+ * Doesn't drop the reference on the glock the top half took out
+ *
+ */
+
+static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	struct gfs2_holder *gh = gl->gl_req_gh;
+
+	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+	gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
+	gfs2_assert_warn(sdp, !ret);
+
+	state_change(gl, LM_ST_UNLOCKED);
+
+	if (glops->go_inval)
+		glops->go_inval(gl, DIO_METADATA);
+
+	if (gh) {
+		spin_lock(&gl->gl_spin);
+		list_del_init(&gh->gh_list);
+		gh->gh_error = 0;
+		spin_unlock(&gl->gl_spin);
+	}
+
+	spin_lock(&gl->gl_spin);
+	gfs2_demote_wake(gl);
+	gl->gl_req_gh = NULL;
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	spin_unlock(&gl->gl_spin);
+
+	gfs2_glock_put(gl);
+
+	if (gh)
+		gfs2_holder_wake(gh);
+}
+
 /**
  * xmote_bh - Called after the lock module is done acquiring a lock
  * @gl: The glock in question
@@ -758,6 +801,11 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 	int prev_state = gl->gl_state;
 	int op_done = 1;
 
+	if ((ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
+		drop_bh(gl, ret);
+		return;
+	}
+
 	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
 	gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
 	gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
@@ -783,7 +831,6 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 		} else {
 			spin_lock(&gl->gl_spin);
 			if (gl->gl_state != gl->gl_demote_state) {
-				gl->gl_req_bh = NULL;
 				spin_unlock(&gl->gl_spin);
 				gfs2_glock_drop_th(gl);
 				gfs2_glock_put(gl);
@@ -825,7 +872,6 @@ out:
 	if (op_done) {
 		spin_lock(&gl->gl_spin);
 		gl->gl_req_gh = NULL;
-		gl->gl_req_bh = NULL;
 		clear_bit(GLF_LOCK, &gl->gl_flags);
 		spin_unlock(&gl->gl_spin);
 	}
@@ -864,7 +910,6 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
 	gfs2_assert_warn(sdp, state != gl->gl_state);
 
 	gfs2_glock_hold(gl);
-	gl->gl_req_bh = xmote_bh;
 
 	lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
 
@@ -877,51 +922,6 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
 		xmote_bh(gl, lck_ret);
 }
 
-/**
- * drop_bh - Called after a lock module unlock completes
- * @gl: the glock
- * @ret: the return status
- *
- * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
- * Doesn't drop the reference on the glock the top half took out
- *
- */
-
-static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-	const struct gfs2_glock_operations *glops = gl->gl_ops;
-	struct gfs2_holder *gh = gl->gl_req_gh;
-
-	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-	gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-	gfs2_assert_warn(sdp, !ret);
-
-	state_change(gl, LM_ST_UNLOCKED);
-
-	if (glops->go_inval)
-		glops->go_inval(gl, DIO_METADATA);
-
-	if (gh) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
-		gh->gh_error = 0;
-		spin_unlock(&gl->gl_spin);
-	}
-
-	spin_lock(&gl->gl_spin);
-	gfs2_demote_wake(gl);
-	gl->gl_req_gh = NULL;
-	gl->gl_req_bh = NULL;
-	clear_bit(GLF_LOCK, &gl->gl_flags);
-	spin_unlock(&gl->gl_spin);
-
-	gfs2_glock_put(gl);
-
-	if (gh)
-		gfs2_holder_wake(gh);
-}
-
 /**
  * gfs2_glock_drop_th - call into the lock module to unlock a lock
  * @gl: the glock
@@ -942,7 +942,6 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
 	gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
 
 	gfs2_glock_hold(gl);
-	gl->gl_req_bh = drop_bh;
 
 	ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
 
@@ -971,8 +970,7 @@ static void do_cancels(struct gfs2_holder *gh)
 	while (gl->gl_req_gh != gh &&
 	       !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
 	       !list_empty(&gh->gh_list)) {
-		if (gl->gl_req_bh && !(gl->gl_req_gh &&
-				     (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
+		if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
 			spin_unlock(&gl->gl_spin);
 			gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
 			msleep(100);
@@ -1042,7 +1040,6 @@ static int glock_wait_internal(struct gfs2_holder *gh)
 
 		spin_lock(&gl->gl_spin);
 		gl->gl_req_gh = NULL;
-		gl->gl_req_bh = NULL;
 		clear_bit(GLF_LOCK, &gl->gl_flags);
 		run_queue(gl);
 		spin_unlock(&gl->gl_spin);
@@ -1535,8 +1532,7 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
 		gl = gfs2_glock_find(sdp, &async->lc_name);
 		if (gfs2_assert_warn(sdp, gl))
 			return;
-		if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
-			gl->gl_req_bh(gl, async->lc_ret);
+		xmote_bh(gl, async->lc_ret);
 		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 			gfs2_glock_put(gl);
 		up_read(&gfs2_umount_flush_sem);
@@ -1898,7 +1894,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
 		print_dbg(gi, "  gl_owner = -1\n");
 	print_dbg(gi, "  gl_ip = %lu\n", gl->gl_ip);
 	print_dbg(gi, "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
-	print_dbg(gi, "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
 	print_dbg(gi, "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
 	print_dbg(gi, "  object = %s\n", (gl->gl_object) ? "yes" : "no");
 	print_dbg(gi, "  reclaim = %s\n",
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fe14f6a417b5..65aa46acb082 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -191,7 +191,6 @@ struct gfs2_glock {
 	const struct gfs2_glock_operations *gl_ops;
 
 	struct gfs2_holder *gl_req_gh;
-	gfs2_glop_bh_t gl_req_bh;
 
 	void *gl_lock;
 	char *gl_lvb;
-- 
cgit v1.2.3


From da755fdb414470d6dce3df12ad188de9131cf96c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 30 Jan 2008 15:34:04 +0000
Subject: [GFS2] Remove lm.[ch] and distribute content

The functions in lm.c were just wrappers which were mostly
only used in one other file. By moving the functions to
the files where they are being used, they can be marked
static and also this will usually result in them being inlined
since they are often only used from one point in the code.

A couple of really trivial functions have been inlined by hand
into the function which called them as it makes the code clearer
to do that.

We also gain from one fewer function call in the glock lock and
unlock paths.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Makefile     |   2 +-
 fs/gfs2/glock.c      |  51 ++++++++++++-
 fs/gfs2/incore.h     |   2 +
 fs/gfs2/lm.c         | 210 ---------------------------------------------------
 fs/gfs2/lm.h         |  42 -----------
 fs/gfs2/ops_file.c   |  31 +++++++-
 fs/gfs2/ops_fstype.c |  71 ++++++++++++++++-
 fs/gfs2/ops_super.c  |   1 -
 fs/gfs2/recovery.c   |  11 ++-
 fs/gfs2/super.h      |   1 +
 fs/gfs2/sys.c        |   1 -
 fs/gfs2/util.c       |  23 +++++-
 fs/gfs2/util.h       |   1 +
 13 files changed, 184 insertions(+), 263 deletions(-)
 delete mode 100644 fs/gfs2/lm.c
 delete mode 100644 fs/gfs2/lm.h

(limited to 'fs')

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 8fff11058cee..e2350df02a07 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
-	glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
+	glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
 	mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
 	ops_fstype.o ops_inode.o ops_super.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a8387e0b5068..611f84d22573 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -35,7 +35,6 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "lm.h"
 #include "lops.h"
 #include "meta_io.h"
 #include "quota.h"
@@ -183,7 +182,8 @@ static void glock_free(struct gfs2_glock *gl)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct inode *aspace = gl->gl_aspace;
 
-	gfs2_lm_put_lock(sdp, gl->gl_lock);
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
 
 	if (aspace)
 		gfs2_aspace_put(aspace);
@@ -293,6 +293,16 @@ static void glock_work_func(struct work_struct *work)
 	gfs2_glock_put(gl);
 }
 
+static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		     void **lockp)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
+				sdp->sd_lockstruct.ls_lockspace, name, lockp);
+	return error;
+}
+
 /**
  * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
  * @sdp: The GFS2 superblock
@@ -882,6 +892,17 @@ out:
 		gfs2_holder_wake(gh);
 }
 
+static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+				 unsigned int cur_state, unsigned int req_state,
+				 unsigned int flags)
+{
+	int ret = 0;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+							 req_state, flags);
+	return ret;
+}
+
 /**
  * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
  * @gl: The glock in question
@@ -922,6 +943,15 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
 		xmote_bh(gl, lck_ret);
 }
 
+static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
+				   unsigned int cur_state)
+{
+	int ret = 0;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		ret =  sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
+	return ret;
+}
+
 /**
  * gfs2_glock_drop_th - call into the lock module to unlock a lock
  * @gl: the glock
@@ -964,6 +994,7 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
 static void do_cancels(struct gfs2_holder *gh)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
 
 	spin_lock(&gl->gl_spin);
 
@@ -972,7 +1003,8 @@ static void do_cancels(struct gfs2_holder *gh)
 	       !list_empty(&gh->gh_list)) {
 		if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
 			spin_unlock(&gl->gl_spin);
-			gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
+			if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+				sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
 			msleep(100);
 			spin_lock(&gl->gl_spin);
 		} else {
@@ -1426,6 +1458,14 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
 		gfs2_glock_dq_uninit(&ghs[x]);
 }
 
+static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
+	return error;
+}
+
 /**
  * gfs2_lvb_hold - attach a LVB from a glock
  * @gl: The glock in question
@@ -1461,12 +1501,15 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
 
 void gfs2_lvb_unhold(struct gfs2_glock *gl)
 {
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+
 	gfs2_glock_hold(gl);
 	gfs2_glmutex_lock(gl);
 
 	gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
 	if (atomic_dec_and_test(&gl->gl_lvb_count)) {
-		gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
+		if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+			sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
 		gl->gl_lvb = NULL;
 		gfs2_glock_put(gl);
 	}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 65aa46acb082..8dee4672c3d8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -211,6 +211,8 @@ struct gfs2_glock {
 	struct delayed_work gl_work;
 };
 
+#define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
+
 struct gfs2_alloc {
 	/* Quota stuff */
 
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
deleted file mode 100644
index cfcc39b86a53..000000000000
--- a/fs/gfs2/lm.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "glock.h"
-#include "lm.h"
-#include "super.h"
-#include "util.h"
-
-/**
- * gfs2_lm_mount - mount a locking protocol
- * @sdp: the filesystem
- * @args: mount arguements
- * @silent: if 1, don't complain if the FS isn't a GFS2 fs
- *
- * Returns: errno
- */
-
-int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
-{
-	char *proto = sdp->sd_proto_name;
-	char *table = sdp->sd_table_name;
-	int flags = 0;
-	int error;
-
-	if (sdp->sd_args.ar_spectator)
-		flags |= LM_MFLAG_SPECTATOR;
-
-	fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
-
-	error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
-				     gfs2_glock_cb, sdp,
-				     GFS2_MIN_LVB_SIZE, flags,
-				     &sdp->sd_lockstruct, &sdp->sd_kobj);
-	if (error) {
-		fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
-			proto, table, sdp->sd_args.ar_hostdata);
-		goto out;
-	}
-
-	if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
-	    gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
-	    gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
-				  GFS2_MIN_LVB_SIZE)) {
-		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
-		goto out;
-	}
-
-	if (sdp->sd_args.ar_spectator)
-		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
-	else
-		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
-			 sdp->sd_lockstruct.ls_jid);
-
-	fs_info(sdp, "Joined cluster. Now mounting FS...\n");
-
-	if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
-	    !sdp->sd_args.ar_ignore_local_fs) {
-		sdp->sd_args.ar_localflocks = 1;
-		sdp->sd_args.ar_localcaching = 1;
-	}
-
-out:
-	return error;
-}
-
-void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
-{
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
-					sdp->sd_lockstruct.ls_lockspace);
-}
-
-void gfs2_lm_unmount(struct gfs2_sbd *sdp)
-{
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
-}
-
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
-{
-	va_list args;
-
-	if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-		return 0;
-
-	va_start(args, fmt);
-	vprintk(fmt, args);
-	va_end(args);
-
-	fs_err(sdp, "about to withdraw this file system\n");
-	BUG_ON(sdp->sd_args.ar_debug);
-
-	fs_err(sdp, "telling LM to withdraw\n");
-	gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
-	fs_err(sdp, "withdrawn\n");
-	dump_stack();
-
-	return -1;
-}
-
-int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		     void **lockp)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
-				sdp->sd_lockstruct.ls_lockspace, name, lockp);
-	return error;
-}
-
-void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
-{
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
-}
-
-unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-			  unsigned int cur_state, unsigned int req_state,
-			  unsigned int flags)
-{
-	int ret = 0;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
-							 req_state, flags);
-	return ret;
-}
-
-unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
-			    unsigned int cur_state)
-{
-	int ret = 0;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		ret =  sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
-	return ret;
-}
-
-void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
-{
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
-}
-
-int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
-	return error;
-}
-
-void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
-{
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
-}
-
-int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		      struct file *file, struct file_lock *fl)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
-				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
-	return error;
-}
-
-int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		  struct file *file, int cmd, struct file_lock *fl)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_plock(
-				sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
-	return error;
-}
-
-int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		    struct file *file, struct file_lock *fl)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_punlock(
-				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
-	return error;
-}
-
-void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
-			   unsigned int message)
-{
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		sdp->sd_lockstruct.ls_ops->lm_recovery_done(
-			sdp->sd_lockstruct.ls_lockspace, jid, message);
-}
-
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
deleted file mode 100644
index 21cdc30ee08c..000000000000
--- a/fs/gfs2/lm.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __LM_DOT_H__
-#define __LM_DOT_H__
-
-struct gfs2_sbd;
-
-#define GFS2_MIN_LVB_SIZE 32
-
-int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
-void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
-void gfs2_lm_unmount(struct gfs2_sbd *sdp);
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
-				__attribute__ ((format(printf, 2, 3)));
-int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		     void **lockp);
-void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
-unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-			 unsigned int cur_state, unsigned int req_state,
-			 unsigned int flags);
-unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
-			   unsigned int cur_state);
-void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
-int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
-void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
-int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		      struct file *file, struct file_lock *fl);
-int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		  struct file *file, int cmd, struct file_lock *fl);
-int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		    struct file *file, struct file_lock *fl);
-void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
-			   unsigned int message);
-
-#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index f4842f2548cd..f97a8b86c485 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -30,7 +30,6 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "lm.h"
 #include "log.h"
 #include "meta_io.h"
 #include "quota.h"
@@ -596,6 +595,36 @@ static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
 	return generic_setlease(file, arg, fl);
 }
 
+static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		      struct file *file, struct file_lock *fl)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
+				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+	return error;
+}
+
+static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		  struct file *file, int cmd, struct file_lock *fl)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_plock(
+				sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
+	return error;
+}
+
+static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		    struct file *file, struct file_lock *fl)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_punlock(
+				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+	return error;
+}
+
 /**
  * gfs2_lock - acquire/release a posix lock on a file
  * @file: the file pointer
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4bee6aa845e4..5b6a34517167 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -26,7 +26,6 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "lm.h"
 #include "mount.h"
 #include "ops_fstype.h"
 #include "ops_dentry.h"
@@ -363,6 +362,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 	return rc;
 }
 
+static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
+{
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
+					sdp->sd_lockstruct.ls_lockspace);
+}
+
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
 	struct gfs2_holder ji_gh;
@@ -704,6 +710,69 @@ fail:
 	return error;
 }
 
+/**
+ * gfs2_lm_mount - mount a locking protocol
+ * @sdp: the filesystem
+ * @args: mount arguements
+ * @silent: if 1, don't complain if the FS isn't a GFS2 fs
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
+{
+	char *proto = sdp->sd_proto_name;
+	char *table = sdp->sd_table_name;
+	int flags = 0;
+	int error;
+
+	if (sdp->sd_args.ar_spectator)
+		flags |= LM_MFLAG_SPECTATOR;
+
+	fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
+
+	error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
+				     gfs2_glock_cb, sdp,
+				     GFS2_MIN_LVB_SIZE, flags,
+				     &sdp->sd_lockstruct, &sdp->sd_kobj);
+	if (error) {
+		fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
+			proto, table, sdp->sd_args.ar_hostdata);
+		goto out;
+	}
+
+	if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
+	    gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+	    gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
+				  GFS2_MIN_LVB_SIZE)) {
+		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+		goto out;
+	}
+
+	if (sdp->sd_args.ar_spectator)
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
+	else
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
+			 sdp->sd_lockstruct.ls_jid);
+
+	fs_info(sdp, "Joined cluster. Now mounting FS...\n");
+
+	if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
+	    !sdp->sd_args.ar_ignore_local_fs) {
+		sdp->sd_args.ar_localflocks = 1;
+		sdp->sd_args.ar_localcaching = 1;
+	}
+
+out:
+	return error;
+}
+
+void gfs2_lm_unmount(struct gfs2_sbd *sdp)
+{
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+}
+
 /**
  * fill_super - Read in superblock
  * @sb: The VFS superblock
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 5e524217944a..2278c68b7e35 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -25,7 +25,6 @@
 #include "incore.h"
 #include "glock.h"
 #include "inode.h"
-#include "lm.h"
 #include "log.h"
 #include "mount.h"
 #include "ops_super.h"
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 6fb07d67ca8a..b17d3b8b2321 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -20,7 +20,6 @@
 #include "bmap.h"
 #include "glock.h"
 #include "glops.h"
-#include "lm.h"
 #include "lops.h"
 #include "meta_io.h"
 #include "recovery.h"
@@ -425,6 +424,16 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
 	return error;
 }
 
+
+static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+				  unsigned int message)
+{
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		sdp->sd_lockstruct.ls_ops->lm_recovery_done(
+			sdp->sd_lockstruct.ls_lockspace, jid, message);
+}
+
+
 /**
  * gfs2_recover_journal - recovery a given journal
  * @jd: the struct gfs2_jdesc describing the journal
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 60a870e430be..44361ecc44f7 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -17,6 +17,7 @@ void gfs2_tune_init(struct gfs2_tune *gt);
 int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
 int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
 int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
+void gfs2_lm_unmount(struct gfs2_sbd *sdp);
 
 static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index eaa3b7b2f99e..cc35ec862ee8 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -20,7 +20,6 @@
 
 #include "gfs2.h"
 #include "incore.h"
-#include "lm.h"
 #include "sys.h"
 #include "super.h"
 #include "glock.h"
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index fe9c28ef77b0..d31e355c61fb 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -19,7 +19,6 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "glock.h"
-#include "lm.h"
 #include "util.h"
 
 struct kmem_cache *gfs2_glock_cachep __read_mostly;
@@ -33,6 +32,28 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
 	       sdp->sd_fsname);
 }
 
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+{
+	va_list args;
+
+	if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return 0;
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+
+	fs_err(sdp, "about to withdraw this file system\n");
+	BUG_ON(sdp->sd_args.ar_debug);
+
+	fs_err(sdp, "telling LM to withdraw\n");
+	gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
+	fs_err(sdp, "withdrawn\n");
+	dump_stack();
+
+	return -1;
+}
+
 /**
  * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
  * Returns: -1 if this call withdrew the machine,
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index ac0c567ebc36..509c5d60bd80 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -164,6 +164,7 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
 
 void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
 		      unsigned int bit, int new_value);
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...);
 
 #endif /* __UTIL_DOT_H__ */
 
-- 
cgit v1.2.3


From cf45b752c9f23939e40d823b0600bf876e97b0e0 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 31 Jan 2008 10:31:39 -0600
Subject: [GFS2] Remove rgrp and glock version numbers

This patch further reduces GFS2's memory requirements by
eliminating the 64-bit version number fields in lieu of
a couple bits.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c      |  3 +--
 fs/gfs2/glops.c      | 10 ++++++++--
 fs/gfs2/incore.h     |  5 ++---
 fs/gfs2/ops_fstype.c |  4 ++--
 fs/gfs2/rgrp.c       | 14 +++++++-------
 5 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 611f84d22573..d00dc37e3d51 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -348,7 +348,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_ip = 0;
 	gl->gl_ops = glops;
 	gl->gl_req_gh = NULL;
-	gl->gl_vn = 0;
 	gl->gl_stamp = jiffies;
 	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c663b7a0f410..d31badadef8f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -126,7 +126,13 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
 		return;
 
 	gfs2_meta_inval(gl);
-	gl->gl_vn++;
+	if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex))
+		gl->gl_sbd->sd_rindex_uptodate = 0;
+	else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) {
+		struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
+
+		rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+	}
 }
 
 /**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8dee4672c3d8..e9c58dc76869 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -85,7 +85,6 @@ struct gfs2_rgrpd {
 	u32 rd_data;			/* num of data blocks in rgrp */
 	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
 	struct gfs2_rgrp_host rd_rg;
-	u64 rd_rg_vn;
 	struct gfs2_bitmap *rd_bits;
 	unsigned int rd_bh_count;
 	struct mutex rd_mutex;
@@ -97,6 +96,7 @@ struct gfs2_rgrpd {
 	unsigned char rd_flags;
 #define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
 #define GFS2_RDF_NOALLOC      0x02      /* rg prohibits allocation */
+#define GFS2_RDF_UPTODATE     0x04      /* rg is up to date */
 };
 
 enum gfs2_state_bits {
@@ -196,7 +196,6 @@ struct gfs2_glock {
 	char *gl_lvb;
 	atomic_t gl_lvb_count;
 
-	u64 gl_vn;
 	unsigned long gl_stamp;
 	unsigned long gl_tchange;
 	void *gl_object;
@@ -533,7 +532,7 @@ struct gfs2_sbd {
 
 	/* Resource group stuff */
 
-	u64 sd_rindex_vn;
+	int sd_rindex_uptodate;
 	spinlock_t sd_rindex_spin;
 	struct mutex sd_rindex_mutex;
 	struct list_head sd_rindex_list;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 5b6a34517167..c4b7a210c0c0 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -548,7 +548,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 	}
 	ip = GFS2_I(sdp->sd_rindex);
 	set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
-	sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
+	sdp->sd_rindex_uptodate = 0;
 
 	/* Read in the quota inode */
 	sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 5fd87104e595..3f10b1fafd66 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -540,7 +540,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
 		return error;
 
 	rgd->rd_gl->gl_object = rgd;
-	rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
+	rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
 	rgd->rd_flags |= GFS2_RDF_CHECK;
 	return error;
 }
@@ -576,7 +576,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 		}
 	}
 
-	sdp->sd_rindex_vn = ip->i_gl->gl_vn;
+	sdp->sd_rindex_uptodate = 1;
 	return 0;
 }
 
@@ -610,7 +610,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 		}
 	}
 
-	sdp->sd_rindex_vn = ip->i_gl->gl_vn;
+	sdp->sd_rindex_uptodate = 1;
 	return 0;
 }
 
@@ -643,9 +643,9 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
 		return error;
 
 	/* Read new copy from disk if we don't have the latest */
-	if (sdp->sd_rindex_vn != gl->gl_vn) {
+	if (!sdp->sd_rindex_uptodate) {
 		mutex_lock(&sdp->sd_rindex_mutex);
-		if (sdp->sd_rindex_vn != gl->gl_vn) {
+		if (!sdp->sd_rindex_uptodate) {
 			error = gfs2_ri_update(ip);
 			if (error)
 				gfs2_glock_dq_uninit(ri_gh);
@@ -737,9 +737,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 		}
 	}
 
-	if (rgd->rd_rg_vn != gl->gl_vn) {
+	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
-		rgd->rd_rg_vn = gl->gl_vn;
+		rgd->rd_flags |= GFS2_RDF_UPTODATE;
 	}
 
 	spin_lock(&sdp->sd_rindex_spin);
-- 
cgit v1.2.3


From 9a0045088d888c9c539c8c626a366cb52c0fbdab Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 1 Feb 2008 09:23:44 +0000
Subject: [GFS2] Shrink & rename di_depth

This patch forms a pair with the previous patch which shrunk
di_height. Like that patch di_depth is renamed i_depth and moved
into struct gfs2_inode directly. Also the field goes from 16 bits
to 8 bits since it is also limited to a max value which is rather
small (17 in this case). In addition we also now validate the field
against this maximum value when its read in.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/dir.c    | 34 +++++++++++++++++-----------------
 fs/gfs2/incore.h |  2 +-
 fs/gfs2/inode.c  | 11 +++++++----
 3 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 78c236fb34ec..081daa96a9d9 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -757,7 +757,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 
 	if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
 		struct gfs2_leaf *leaf;
-		unsigned hsize = 1 << ip->i_di.di_depth;
+		unsigned hsize = 1 << ip->i_depth;
 		unsigned index;
 		u64 ln;
 		if (hsize * sizeof(u64) != ip->i_di.di_size) {
@@ -765,7 +765,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 			return ERR_PTR(-EIO);
 		}
 
-		index = name->hash >> (32 - ip->i_di.di_depth);
+		index = name->hash >> (32 - ip->i_depth);
 		error = get_first_leaf(ip, index, &bh);
 		if (error)
 			return ERR_PTR(error);
@@ -910,7 +910,7 @@ static int dir_make_exhash(struct inode *inode)
 	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
 
 	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
-	dip->i_di.di_depth = y;
+	dip->i_depth = y;
 
 	gfs2_dinode_out(dip, dibh->b_data);
 
@@ -941,7 +941,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	int x, moved = 0;
 	int error;
 
-	index = name->hash >> (32 - dip->i_di.di_depth);
+	index = name->hash >> (32 - dip->i_depth);
 	error = get_leaf_nr(dip, index, &leaf_no);
 	if (error)
 		return error;
@@ -952,7 +952,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 		return error;
 
 	oleaf = (struct gfs2_leaf *)obh->b_data;
-	if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
+	if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) {
 		brelse(obh);
 		return 1; /* can't split */
 	}
@@ -967,10 +967,10 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	bn = nbh->b_blocknr;
 
 	/*  Compute the start and len of leaf pointers in the hash table.  */
-	len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
+	len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
 	half_len = len >> 1;
 	if (!half_len) {
-		printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
+		printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
 		gfs2_consist_inode(dip);
 		error = -EIO;
 		goto fail_brelse;
@@ -997,7 +997,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	kfree(lp);
 
 	/*  Compute the divider  */
-	divider = (start + half_len) << (32 - dip->i_di.di_depth);
+	divider = (start + half_len) << (32 - dip->i_depth);
 
 	/*  Copy the entries  */
 	dirent_first(dip, obh, &dent);
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	int x;
 	int error = 0;
 
-	hsize = 1 << dip->i_di.di_depth;
+	hsize = 1 << dip->i_depth;
 	if (hsize * sizeof(u64) != dip->i_di.di_size) {
 		gfs2_consist_inode(dip);
 		return -EIO;
@@ -1125,7 +1125,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (!gfs2_assert_withdraw(sdp, !error)) {
-		dip->i_di.di_depth++;
+		dip->i_depth++;
 		gfs2_dinode_out(dip, dibh->b_data);
 		brelse(dibh);
 	}
@@ -1370,14 +1370,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	int error = 0;
 	unsigned depth = 0;
 
-	hsize = 1 << dip->i_di.di_depth;
+	hsize = 1 << dip->i_depth;
 	if (hsize * sizeof(u64) != dip->i_di.di_size) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
 
 	hash = gfs2_dir_offset2hash(*offset);
-	index = hash >> (32 - dip->i_di.di_depth);
+	index = hash >> (32 - dip->i_depth);
 
 	lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
 	if (!lp)
@@ -1405,7 +1405,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 		if (error)
 			break;
 
-		len = 1 << (dip->i_di.di_depth - depth);
+		len = 1 << (dip->i_depth - depth);
 		index = (index & ~(len - 1)) + len;
 	}
 
@@ -1549,7 +1549,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 	u32 index;
 	u64 bn;
 
-	index = name->hash >> (32 - ip->i_di.di_depth);
+	index = name->hash >> (32 - ip->i_depth);
 	error = get_first_leaf(ip, index, &obh);
 	if (error)
 		return error;
@@ -1641,7 +1641,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			continue;
 		if (error < 0)
 			break;
-		if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
+		if (ip->i_depth < GFS2_DIR_MAX_DEPTH) {
 			error = dir_double_exhash(ip);
 			if (error)
 				break;
@@ -1785,7 +1785,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 	u64 leaf_no;
 	int error = 0;
 
-	hsize = 1 << dip->i_di.di_depth;
+	hsize = 1 << dip->i_depth;
 	if (hsize * sizeof(u64) != dip->i_di.di_size) {
 		gfs2_consist_inode(dip);
 		return -EIO;
@@ -1817,7 +1817,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 			if (error)
 				goto out;
 			leaf = (struct gfs2_leaf *)bh->b_data;
-			len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
+			len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
 			brelse(bh);
 
 			error = lc(dip, index, len, leaf_no, data);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e9c58dc76869..9dfdde3612a4 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -246,7 +246,6 @@ struct gfs2_dinode_host {
 	u64 di_generation;	/* generation number for NFS */
 	u32 di_flags;		/* GFS2_DIF_... */
 	/* These only apply to directories  */
-	u16 di_depth;		/* Number of bits in the table */
 	u32 di_entries;		/* The number of entries in the directory */
 	u64 di_eattr;		/* extended attribute block number */
 };
@@ -267,6 +266,7 @@ struct gfs2_inode {
 
 	struct rw_semaphore i_rw_mutex;
 	u8 i_height;
+	u8 i_depth;
 };
 
 /*
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index db5961a9aa59..65fdfee9ca9b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -248,7 +248,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
 	struct gfs2_dinode_host *di = &ip->i_di;
 	const struct gfs2_dinode *str = buf;
-	u16 height;
+	u16 height, depth;
 
 	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
 		goto corrupt;
@@ -293,7 +293,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 		goto corrupt;
 	ip->i_height = (u8)height;
 
-	di->di_depth = be16_to_cpu(str->di_depth);
+	depth = be16_to_cpu(str->di_depth);
+	if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
+		goto corrupt;
+	ip->i_depth = (u8)depth;
 	di->di_entries = be32_to_cpu(str->di_entries);
 
 	di->di_eattr = be64_to_cpu(str->di_eattr);
@@ -1410,7 +1413,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
 					     !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
 					     GFS2_FORMAT_DE : 0);
-	str->di_depth = cpu_to_be16(di->di_depth);
+	str->di_depth = cpu_to_be16(ip->i_depth);
 	str->di_entries = cpu_to_be32(di->di_entries);
 
 	str->di_eattr = cpu_to_be64(di->di_eattr);
@@ -1436,7 +1439,7 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	       (unsigned long long)di->di_goal_data);
 	printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
 	printk(KERN_INFO "  i_height = %u\n", ip->i_height);
-	printk(KERN_INFO "  di_depth = %u\n", di->di_depth);
+	printk(KERN_INFO "  i_depth = %u\n", ip->i_depth);
 	printk(KERN_INFO "  di_entries = %u\n", di->di_entries);
 	printk(KERN_INFO "  di_eattr = %llu\n",
 	       (unsigned long long)di->di_eattr);
-- 
cgit v1.2.3


From 9feb7c889f2a3b088a7f6583e609bd39997c0f47 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 5 Feb 2008 17:11:40 -0600
Subject: [GFS2] Remove unused counters

This is kind of trivial in the greater scheme of things, but
this removes three counters that AFAICT are never used.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 3 ---
 fs/gfs2/sys.c    | 6 ------
 2 files changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 9dfdde3612a4..b67e44baa73c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -636,9 +636,6 @@ struct gfs2_sbd {
 
 	/* Counters */
 
-	atomic_t sd_glock_count;
-	atomic_t sd_glock_held_count;
-	atomic_t sd_inode_count;
 	atomic_t sd_reclaimed;
 
 	char sd_fsname[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index cc35ec862ee8..9ab9fc85ecd0 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -327,15 +327,9 @@ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
 }                                                                           \
 static struct counters_attr counters_attr_##name = __ATTR_RO(name)
 
-COUNTERS_ATTR(glock_count,      "%u\n");
-COUNTERS_ATTR(glock_held_count, "%u\n");
-COUNTERS_ATTR(inode_count,      "%u\n");
 COUNTERS_ATTR(reclaimed,        "%u\n");
 
 static struct attribute *counters_attrs[] = {
-	&counters_attr_glock_count.attr,
-	&counters_attr_glock_held_count.attr,
-	&counters_attr_inode_count.attr,
 	&counters_attr_reclaimed.attr,
 	NULL,
 };
-- 
cgit v1.2.3


From ce276b06e8b81845926387e93f77bf81e14b5cc2 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 6 Feb 2008 09:25:45 +0000
Subject: [GFS2] Reduce inode size by merging fields

There were three fields being used to keep track of the location
of the most recently allocated block for each inode. These have
been merged into a single field in order to better keep the
data and metadata for an inode close on disk, and also to reduce
the space required for storage.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c   |  4 +---
 fs/gfs2/incore.h |  5 +----
 fs/gfs2/inode.c  | 13 +++++--------
 fs/gfs2/rgrp.c   | 40 +++++++++++++++++-----------------------
 4 files changed, 24 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 2011dd27f8d6..30d718b3438f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1020,9 +1020,7 @@ static int trunc_end(struct gfs2_inode *ip)
 
 	if (!ip->i_di.di_size) {
 		ip->i_height = 0;
-		ip->i_di.di_goal_meta =
-			ip->i_di.di_goal_data =
-			ip->i_no_addr;
+		ip->i_goal = ip->i_no_addr;
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 	}
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b67e44baa73c..c50dcdf79929 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -241,8 +241,6 @@ enum {
 struct gfs2_dinode_host {
 	u64 di_size;		/* number of bytes in file */
 	u64 di_blocks;		/* number of blocks in file */
-	u64 di_goal_meta;	/* rgrp to alloc from next */
-	u64 di_goal_data;	/* data block goal */
 	u64 di_generation;	/* generation number for NFS */
 	u32 di_flags;		/* GFS2_DIF_... */
 	/* These only apply to directories  */
@@ -262,8 +260,7 @@ struct gfs2_inode {
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
 	struct gfs2_alloc *i_alloc;
-	u64 i_last_rg_alloc;
-
+	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
 	u8 i_height;
 	u8 i_depth;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 65fdfee9ca9b..c3fe8aa03c4e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -282,8 +282,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
 	ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
 
-	di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
-	di->di_goal_data = be64_to_cpu(str->di_goal_data);
+	ip->i_goal = be64_to_cpu(str->di_goal_meta);
 	di->di_generation = be64_to_cpu(str->di_generation);
 
 	di->di_flags = be32_to_cpu(str->di_flags);
@@ -1404,8 +1403,8 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
 	str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
 
-	str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
-	str->di_goal_data = cpu_to_be64(di->di_goal_data);
+	str->di_goal_meta = cpu_to_be64(ip->i_goal);
+	str->di_goal_data = cpu_to_be64(ip->i_goal);
 	str->di_generation = cpu_to_be64(di->di_generation);
 
 	str->di_flags = cpu_to_be32(di->di_flags);
@@ -1433,10 +1432,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
 	printk(KERN_INFO "  di_blocks = %llu\n",
 	       (unsigned long long)di->di_blocks);
-	printk(KERN_INFO "  di_goal_meta = %llu\n",
-	       (unsigned long long)di->di_goal_meta);
-	printk(KERN_INFO "  di_goal_data = %llu\n",
-	       (unsigned long long)di->di_goal_data);
+	printk(KERN_INFO "  i_goal = %llu\n",
+	       (unsigned long long)ip->i_goal);
 	printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
 	printk(KERN_INFO "  i_height = %u\n", ip->i_height);
 	printk(KERN_INFO "  i_depth = %u\n", ip->i_depth);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3f10b1fafd66..66193b45e50b 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -915,24 +915,20 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
 					    u64 rglast)
 {
-	struct gfs2_rgrpd *rgd = NULL;
+	struct gfs2_rgrpd *rgd;
 
 	spin_lock(&sdp->sd_rindex_spin);
 
-	if (list_empty(&sdp->sd_rindex_recent_list))
-		goto out;
-
-	if (!rglast)
-		goto first;
-
-	list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
-		if (rgd->rd_addr == rglast)
-			goto out;
+	if (rglast) {
+		list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
+			if (rgrp_contains_block(rgd, rglast))
+				goto out;
+		}
 	}
-
-first:
-	rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
-			 rd_recent);
+	rgd = NULL;
+	if (!list_empty(&sdp->sd_rindex_recent_list))
+		rgd = list_entry(sdp->sd_rindex_recent_list.next,
+				 struct gfs2_rgrpd, rd_recent);
 out:
 	spin_unlock(&sdp->sd_rindex_spin);
 	return rgd;
@@ -1078,7 +1074,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 
 	/* Try recently successful rgrps */
 
-	rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
+	rgd = recent_rgrp_first(sdp, ip->i_goal);
 
 	while (rgd) {
 		rg_locked = 0;
@@ -1162,8 +1158,6 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 	}
 
 out:
-	ip->i_last_rg_alloc = rgd->rd_addr;
-
 	if (begin) {
 		recent_rgrp_add(rgd);
 		rgd = gfs2_rgrpd_get_next(rgd);
@@ -1425,8 +1419,8 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
 	u32 goal, blk;
 	u64 block;
 
-	if (rgrp_contains_block(rgd, ip->i_di.di_goal_data))
-		goal = ip->i_di.di_goal_data - rgd->rd_data0;
+	if (rgrp_contains_block(rgd, ip->i_goal))
+		goal = ip->i_goal - rgd->rd_data0;
 	else
 		goal = rgd->rd_last_alloc_data;
 
@@ -1435,7 +1429,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
 	rgd->rd_last_alloc_data = blk;
 
 	block = rgd->rd_data0 + blk;
-	ip->i_di.di_goal_data = block;
+	ip->i_goal = block;
 
 	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
 	rgd->rd_rg.rg_free--;
@@ -1470,8 +1464,8 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 	u32 goal, blk;
 	u64 block;
 
-	if (rgrp_contains_block(rgd, ip->i_di.di_goal_meta))
-		goal = ip->i_di.di_goal_meta - rgd->rd_data0;
+	if (rgrp_contains_block(rgd, ip->i_goal))
+		goal = ip->i_goal - rgd->rd_data0;
 	else
 		goal = rgd->rd_last_alloc_meta;
 
@@ -1480,7 +1474,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 	rgd->rd_last_alloc_meta = blk;
 
 	block = rgd->rd_data0 + blk;
-	ip->i_di.di_goal_meta = block;
+	ip->i_goal = block;
 
 	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
 	rgd->rd_rg.rg_free--;
-- 
cgit v1.2.3


From ac576cc5bed0dd7759e2b196468c7df93d6aeeee Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 1 Feb 2008 10:34:15 +0000
Subject: [GFS2] Merge the rd_last_alloc_meta and rd_last_alloc_data fields

We don't need to keep track of when we last allocated data
and metadata separately since the only thing thats important
when searching for a free block is whether its free or not,
which is independent from what type of block it is.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h |  3 +--
 fs/gfs2/rgrp.c   | 12 ++++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index c50dcdf79929..898b456b386f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -90,8 +90,7 @@ struct gfs2_rgrpd {
 	struct mutex rd_mutex;
 	u32 rd_free_clone;
 	struct gfs2_log_element rd_le;
-	u32 rd_last_alloc_data;
-	u32 rd_last_alloc_meta;
+	u32 rd_last_alloc;
 	struct gfs2_sbd *rd_sbd;
 	unsigned char rd_flags;
 #define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 66193b45e50b..cc28845ba6fd 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1422,11 +1422,11 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
 	else
-		goal = rgd->rd_last_alloc_data;
+		goal = rgd->rd_last_alloc;
 
 	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
 	BUG_ON(blk == BFITNOENT);
-	rgd->rd_last_alloc_data = blk;
+	rgd->rd_last_alloc = blk;
 
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
@@ -1467,11 +1467,11 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 	if (rgrp_contains_block(rgd, ip->i_goal))
 		goal = ip->i_goal - rgd->rd_data0;
 	else
-		goal = rgd->rd_last_alloc_meta;
+		goal = rgd->rd_last_alloc;
 
 	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
 	BUG_ON(blk == BFITNOENT);
-	rgd->rd_last_alloc_meta = blk;
+	rgd->rd_last_alloc = blk;
 
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
@@ -1510,11 +1510,11 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 	u32 blk;
 	u64 block;
 
-	blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
+	blk = rgblk_search(rgd, rgd->rd_last_alloc,
 			   GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
 	BUG_ON(blk == BFITNOENT);
 
-	rgd->rd_last_alloc_meta = blk;
+	rgd->rd_last_alloc = blk;
 
 	block = rgd->rd_data0 + blk;
 
-- 
cgit v1.2.3


From 5731be53e3d82aedd06e02574f833a57b07a08d2 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 1 Feb 2008 13:16:55 +0000
Subject: [GFS2] Update gfs2_trans_add_unrevoke to accept extents

By adding an extra argument to gfs2_trans_add_unrevoke we can now
specify an extent length of blocks to unrevoke. This means that
we only need to make one pass through the list for each extent
rather than each block. Currently the only extent length which
is used is 1, but that will change in the future.

Also gfs2_trans_add_unrevoke is removed from gfs2_alloc_meta
since its the only difference between this and gfs2_alloc_data
which is left. This will allow a future patch to merge these
two functions into one (i.e. one call to allocate both data
and metadata in a single extent in the future).

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c  |  7 +++++--
 fs/gfs2/dir.c   |  2 +-
 fs/gfs2/eattr.c |  6 +++---
 fs/gfs2/rgrp.c  |  3 +--
 fs/gfs2/trans.c | 25 ++++++++++---------------
 fs/gfs2/trans.h |  2 +-
 6 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 30d718b3438f..651e5320bb4f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -138,7 +138,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 
 		if (isdir) {
 			block = gfs2_alloc_meta(ip);
-
+			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
 			if (error)
 				goto out_brelse;
@@ -206,6 +206,7 @@ static int build_height(struct inode *inode, struct metapath *mp, unsigned heigh
 
 	for(n = 0; n < new_height; n++) {
 		bn = gfs2_alloc_meta(ip);
+		gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
 		mp->mp_bh[n] = gfs2_meta_new(ip->i_gl, bn);
 		gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[n], 1);
 	}
@@ -370,8 +371,10 @@ static int lookup_block(struct gfs2_inode *ip, unsigned int height,
 
 	if (height == ip->i_height - 1 && !gfs2_is_dir(ip))
 		*block = gfs2_alloc_data(ip);
-	else
+	else {
 		*block = gfs2_alloc_meta(ip);
+		gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), *block, 1);
+	}
 
 	gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[height], 1);
 
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 081daa96a9d9..55514ee06dd8 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -810,7 +810,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
 	struct qstr name = { .name = "", .len = 0, .hash = 0 };
 	if (!bh)
 		return NULL;
-
+	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
 	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
 	leaf = (struct gfs2_leaf *)bh->b_data;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 04febbc17a16..c7fa0a8b1648 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -585,7 +585,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 	u64 block;
 
 	block = gfs2_alloc_meta(ip);
-
+	gfs2_trans_add_unrevoke(sdp, block, 1);
 	*bhp = gfs2_meta_new(ip->i_gl, block);
 	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
 	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
@@ -644,7 +644,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 			int mh_size = sizeof(struct gfs2_meta_header);
 
 			block = gfs2_alloc_meta(ip);
-
+			gfs2_trans_add_unrevoke(sdp, block, 1);
 			bh = gfs2_meta_new(ip->i_gl, block);
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
 			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
@@ -968,7 +968,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		u64 blk;
 
 		blk = gfs2_alloc_meta(ip);
-
+		gfs2_trans_add_unrevoke(sdp, blk, 1);
 		indbh = gfs2_meta_new(ip->i_gl, blk);
 		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
 		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index cc28845ba6fd..9f28463e62e5 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1486,7 +1486,6 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 
 	gfs2_statfs_change(sdp, 0, -1, 0);
 	gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
-	gfs2_trans_add_unrevoke(sdp, block);
 
 	spin_lock(&sdp->sd_rindex_spin);
 	rgd->rd_free_clone--;
@@ -1528,7 +1527,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 	al->al_alloced++;
 
 	gfs2_statfs_change(sdp, 0, -1, +1);
-	gfs2_trans_add_unrevoke(sdp, block);
+	gfs2_trans_add_unrevoke(sdp, block, 1);
 
 	spin_lock(&sdp->sd_rindex_spin);
 	rgd->rd_free_clone--;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 73e5d92a657c..f677b8a83f0c 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -146,30 +146,25 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	lops_add(sdp, &bd->bd_le);
 }
 
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
 {
-	struct gfs2_bufdata *bd;
-	int found = 0;
+	struct gfs2_bufdata *bd, *tmp;
+	struct gfs2_trans *tr = current->journal_info;
+	unsigned int n = len;
 
 	gfs2_log_lock(sdp);
-
-	list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) {
-		if (bd->bd_blkno == blkno) {
+	list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) {
+		if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) {
 			list_del_init(&bd->bd_le.le_list);
 			gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
 			sdp->sd_log_num_revoke--;
-			found = 1;
-			break;
+			kmem_cache_free(gfs2_bufdata_cachep, bd);
+			tr->tr_num_revoke_rm++;
+			if (--n == 0)
+				break;
 		}
 	}
-
 	gfs2_log_unlock(sdp);
-
-	if (found) {
-		struct gfs2_trans *tr = current->journal_info;
-		kmem_cache_free(gfs2_bufdata_cachep, bd);
-		tr->tr_num_revoke_rm++;
-	}
 }
 
 void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index e826f0dab80a..edf9d4bd908e 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -32,7 +32,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp);
 
 void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
 void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
 
 #endif /* __TRANS_DOT_H__ */
-- 
cgit v1.2.3


From 1639431a3f57b43da1e15e9268a1d691ac01ba26 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 1 Feb 2008 14:52:30 +0000
Subject: [GFS2] Merge gfs2_alloc_meta and gfs2_alloc_data

Thanks to the preceeding patches, the only difference between
these two functions is their name. We can thus merge them
and call the new function gfs2_alloc_block to reflect the
fact that it can allocate either kind of block.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c  | 13 +++++--------
 fs/gfs2/dir.c   |  2 +-
 fs/gfs2/eattr.c |  6 +++---
 fs/gfs2/rgrp.c  | 51 +++------------------------------------------------
 fs/gfs2/rgrp.h  |  3 +--
 5 files changed, 13 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 651e5320bb4f..e3a75a27cee7 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -137,7 +137,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 		   and write it out to disk */
 
 		if (isdir) {
-			block = gfs2_alloc_meta(ip);
+			block = gfs2_alloc_block(ip);
 			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
 			if (error)
@@ -146,7 +146,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 					      dibh, sizeof(struct gfs2_dinode));
 			brelse(bh);
 		} else {
-			block = gfs2_alloc_data(ip);
+			block = gfs2_alloc_block(ip);
 
 			error = gfs2_unstuffer_page(ip, dibh, block, page);
 			if (error)
@@ -205,7 +205,7 @@ static int build_height(struct inode *inode, struct metapath *mp, unsigned heigh
 		return error;
 
 	for(n = 0; n < new_height; n++) {
-		bn = gfs2_alloc_meta(ip);
+		bn = gfs2_alloc_block(ip);
 		gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
 		mp->mp_bh[n] = gfs2_meta_new(ip->i_gl, bn);
 		gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[n], 1);
@@ -369,12 +369,9 @@ static int lookup_block(struct gfs2_inode *ip, unsigned int height,
 	if (!create)
 		return 0;
 
-	if (height == ip->i_height - 1 && !gfs2_is_dir(ip))
-		*block = gfs2_alloc_data(ip);
-	else {
-		*block = gfs2_alloc_meta(ip);
+	*block = gfs2_alloc_block(ip);
+	if (height != ip->i_height - 1 || gfs2_is_dir(ip))
 		gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), *block, 1);
-	}
 
 	gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[height], 1);
 
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 55514ee06dd8..fbdf31957cb5 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -803,7 +803,7 @@ got_dent:
 static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	u64 bn = gfs2_alloc_meta(ip);
+	u64 bn = gfs2_alloc_block(ip);
 	struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
 	struct gfs2_leaf *leaf;
 	struct gfs2_dirent *dent;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index c7fa0a8b1648..f9f63bc21cd2 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -584,7 +584,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 	struct gfs2_ea_header *ea;
 	u64 block;
 
-	block = gfs2_alloc_meta(ip);
+	block = gfs2_alloc_block(ip);
 	gfs2_trans_add_unrevoke(sdp, block, 1);
 	*bhp = gfs2_meta_new(ip->i_gl, block);
 	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
@@ -643,7 +643,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 			u64 block;
 			int mh_size = sizeof(struct gfs2_meta_header);
 
-			block = gfs2_alloc_meta(ip);
+			block = gfs2_alloc_block(ip);
 			gfs2_trans_add_unrevoke(sdp, block, 1);
 			bh = gfs2_meta_new(ip->i_gl, block);
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
@@ -967,7 +967,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	} else {
 		u64 blk;
 
-		blk = gfs2_alloc_meta(ip);
+		blk = gfs2_alloc_block(ip);
 		gfs2_trans_add_unrevoke(sdp, blk, 1);
 		indbh = gfs2_meta_new(ip->i_gl, blk);
 		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9f28463e62e5..274a2df13f02 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1405,58 +1405,13 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 }
 
 /**
- * gfs2_alloc_data - Allocate a data block
- * @ip: the inode to allocate the data block for
+ * gfs2_alloc_block - Allocate a block
+ * @ip: the inode to allocate the block for
  *
  * Returns: the allocated block
  */
 
-u64 gfs2_alloc_data(struct gfs2_inode *ip)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_rgrpd *rgd = al->al_rgd;
-	u32 goal, blk;
-	u64 block;
-
-	if (rgrp_contains_block(rgd, ip->i_goal))
-		goal = ip->i_goal - rgd->rd_data0;
-	else
-		goal = rgd->rd_last_alloc;
-
-	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
-	BUG_ON(blk == BFITNOENT);
-	rgd->rd_last_alloc = blk;
-
-	block = rgd->rd_data0 + blk;
-	ip->i_goal = block;
-
-	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
-	rgd->rd_rg.rg_free--;
-
-	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
-
-	al->al_alloced++;
-
-	gfs2_statfs_change(sdp, 0, -1, 0);
-	gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
-
-	spin_lock(&sdp->sd_rindex_spin);
-	rgd->rd_free_clone--;
-	spin_unlock(&sdp->sd_rindex_spin);
-
-	return block;
-}
-
-/**
- * gfs2_alloc_meta - Allocate a metadata block
- * @ip: the inode to allocate the metadata block for
- *
- * Returns: the allocated block
- */
-
-u64 gfs2_alloc_meta(struct gfs2_inode *ip)
+u64 gfs2_alloc_block(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5683605695fd..5e66613b33db 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -46,8 +46,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip);
 
 unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
 
-u64 gfs2_alloc_data(struct gfs2_inode *ip);
-u64 gfs2_alloc_meta(struct gfs2_inode *ip);
+u64 gfs2_alloc_block(struct gfs2_inode *ip);
 u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
 
 void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
-- 
cgit v1.2.3


From b45e41d7d56dfef1ae9e02e6c59990066ba82e5c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 6 Feb 2008 10:11:15 +0000
Subject: [GFS2] Add extent allocation to block allocator

Rather than having to allocate a single block at a time, this patch
allows the block allocator to allocate an extent. Since there is
no difference (so far as the block allocator is concerned) between
data blocks and indirect blocks, it is posible to allocate a single
extent and for the caller to unrevoke just the blocks required
for indirect blocks.

Currently the only bit of GFS2 to make use of this feature is the
build height function. The intention is that gfs2_block_map will
be changed to make use of this feature in future patches.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c  |  26 ++++++++-------
 fs/gfs2/dir.c   |   3 +-
 fs/gfs2/eattr.c |  10 +++---
 fs/gfs2/rgrp.c  | 100 +++++++++++++++++++++++++++++++++++---------------------
 fs/gfs2/rgrp.h  |   2 +-
 5 files changed, 87 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e3a75a27cee7..1fda731c074b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -136,8 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 		/* Get a free block, fill it with the stuffed data,
 		   and write it out to disk */
 
+		unsigned int n = 1;
+		block = gfs2_alloc_block(ip, &n);
 		if (isdir) {
-			block = gfs2_alloc_block(ip);
 			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
 			if (error)
@@ -146,8 +147,6 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 					      dibh, sizeof(struct gfs2_dinode));
 			brelse(bh);
 		} else {
-			block = gfs2_alloc_block(ip);
-
 			error = gfs2_unstuffer_page(ip, dibh, block, page);
 			if (error)
 				goto out_brelse;
@@ -195,7 +194,7 @@ static int build_height(struct inode *inode, struct metapath *mp, unsigned heigh
 	int error;
 	__be64 *bp;
 	u64 bn;
-	unsigned n;
+	unsigned n, i = 0;
 
 	if (height <= ip->i_height)
 		return 0;
@@ -204,12 +203,16 @@ static int build_height(struct inode *inode, struct metapath *mp, unsigned heigh
 	if (error)
 		return error;
 
-	for(n = 0; n < new_height; n++) {
-		bn = gfs2_alloc_block(ip);
-		gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
-		mp->mp_bh[n] = gfs2_meta_new(ip->i_gl, bn);
-		gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[n], 1);
-	}
+	do {
+		n = new_height - i;
+		bn = gfs2_alloc_block(ip, &n);
+		gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, n);
+		do {
+			mp->mp_bh[i] = gfs2_meta_new(ip->i_gl, bn++);
+			gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i], 1);
+			i++;
+		} while(i < n);
+	} while(i < new_height);
 
 	n = 0;
 	bn = mp->mp_bh[0]->b_blocknr;
@@ -358,6 +361,7 @@ static int lookup_block(struct gfs2_inode *ip, unsigned int height,
 {
 	int boundary;
 	__be64 *ptr = metapointer(&boundary, height, mp);
+	unsigned int n = 1;
 
 	if (*ptr) {
 		*block = be64_to_cpu(*ptr);
@@ -369,7 +373,7 @@ static int lookup_block(struct gfs2_inode *ip, unsigned int height,
 	if (!create)
 		return 0;
 
-	*block = gfs2_alloc_block(ip);
+	*block = gfs2_alloc_block(ip, &n);
 	if (height != ip->i_height - 1 || gfs2_is_dir(ip))
 		gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), *block, 1);
 
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index fbdf31957cb5..93a2e6afbd81 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -803,7 +803,8 @@ got_dent:
 static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	u64 bn = gfs2_alloc_block(ip);
+	unsigned int n = 1;
+	u64 bn = gfs2_alloc_block(ip, &n);
 	struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
 	struct gfs2_leaf *leaf;
 	struct gfs2_dirent *dent;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index f9f63bc21cd2..0e79cd543496 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -582,9 +582,10 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_ea_header *ea;
+	unsigned int n = 1;
 	u64 block;
 
-	block = gfs2_alloc_block(ip);
+	block = gfs2_alloc_block(ip, &n);
 	gfs2_trans_add_unrevoke(sdp, block, 1);
 	*bhp = gfs2_meta_new(ip->i_gl, block);
 	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
@@ -642,8 +643,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 			struct buffer_head *bh;
 			u64 block;
 			int mh_size = sizeof(struct gfs2_meta_header);
+			unsigned int n = 1;
 
-			block = gfs2_alloc_block(ip);
+			block = gfs2_alloc_block(ip, &n);
 			gfs2_trans_add_unrevoke(sdp, block, 1);
 			bh = gfs2_meta_new(ip->i_gl, block);
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
@@ -966,8 +968,8 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
 	} else {
 		u64 blk;
-
-		blk = gfs2_alloc_block(ip);
+		unsigned int n = 1;
+		blk = gfs2_alloc_block(ip, &n);
 		gfs2_trans_add_unrevoke(sdp, blk, 1);
 		indbh = gfs2_meta_new(ip->i_gl, blk);
 		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 274a2df13f02..77eba0a38040 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -53,7 +53,8 @@ static const char valid_change[16] = {
 };
 
 static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
-                        unsigned char old_state, unsigned char new_state);
+                        unsigned char old_state, unsigned char new_state,
+			unsigned int *n);
 
 /**
  * gfs2_setbit - Set a bit in the bitmaps
@@ -64,26 +65,32 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
  *
  */
 
-static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
-			unsigned int buflen, u32 block,
-			unsigned char new_state)
+static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
+			       unsigned char *buf2, unsigned int offset,
+			       unsigned int buflen, u32 block,
+			       unsigned char new_state)
 {
-	unsigned char *byte, *end, cur_state;
-	unsigned int bit;
+	unsigned char *byte1, *byte2, *end, cur_state;
+	const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
 
-	byte = buffer + (block / GFS2_NBBY);
-	bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
-	end = buffer + buflen;
+	byte1 = buf1 + offset + (block / GFS2_NBBY);
+	end = buf1 + offset + buflen;
 
-	gfs2_assert(rgd->rd_sbd, byte < end);
+	BUG_ON(byte1 >= end);
 
-	cur_state = (*byte >> bit) & GFS2_BIT_MASK;
+	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
 
-	if (valid_change[new_state * 4 + cur_state]) {
-		*byte ^= cur_state << bit;
-		*byte |= new_state << bit;
-	} else
+	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
 		gfs2_consist_rgrpd(rgd);
+		return;
+	}
+	*byte1 ^= (cur_state ^ new_state) << bit;
+
+	if (buf2) {
+		byte2 = buf2 + offset + (block / GFS2_NBBY);
+		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
+		*byte2 ^= (cur_state ^ new_state) << bit;
+	}
 }
 
 /**
@@ -94,10 +101,12 @@ static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
  *
  */
 
-static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
-				  unsigned int buflen, u32 block)
+static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
+					 const unsigned char *buffer,
+					 unsigned int buflen, u32 block)
 {
-	unsigned char *byte, *end, cur_state;
+	const unsigned char *byte, *end;
+	unsigned char cur_state;
 	unsigned int bit;
 
 	byte = buffer + (block / GFS2_NBBY);
@@ -877,13 +886,15 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 	u32 goal = 0, block;
 	u64 no_addr;
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	unsigned int n;
 
 	for(;;) {
 		if (goal >= rgd->rd_data)
 			break;
 		down_write(&sdp->sd_log_flush_lock);
+		n = 1;
 		block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
-				     GFS2_BLKST_UNLINKED);
+				     GFS2_BLKST_UNLINKED, &n);
 		up_write(&sdp->sd_log_flush_lock);
 		if (block == BFITNOENT)
 			break;
@@ -1280,6 +1291,7 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
  * @goal: the goal block within the RG (start here to search for avail block)
  * @old_state: GFS2_BLKST_XXX the before-allocation state to find
  * @new_state: GFS2_BLKST_XXX the after-allocation block state
+ * @n: The extent length
  *
  * Walk rgrp's bitmap to find bits that represent a block in @old_state.
  * Add the found bitmap buffer to the transaction.
@@ -1295,13 +1307,17 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
  */
 
 static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
-			unsigned char old_state, unsigned char new_state)
+			unsigned char old_state, unsigned char new_state,
+			unsigned int *n)
 {
 	struct gfs2_bitmap *bi = NULL;
-	u32 length = rgd->rd_length;
+	const u32 length = rgd->rd_length;
 	u32 blk = 0;
 	unsigned int buf, x;
+	const unsigned int elen = *n;
+	const u8 *buffer;
 
+	*n = 0;
 	/* Find bitmap block that contains bits for goal block */
 	for (buf = 0; buf < length; buf++) {
 		bi = rgd->rd_bits + buf;
@@ -1322,7 +1338,7 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 	for (x = 0; x <= length; x++) {
 		/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
 		   bitmaps, so we must search the originals for that. */
-		const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
+		buffer = bi->bi_bh->b_data + bi->bi_offset;
 		if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
 			buffer = bi->bi_clone + bi->bi_offset;
 
@@ -1337,12 +1353,21 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 	}
 
 	if (blk != BFITNOENT && old_state != new_state) {
+		*n = 1;
 		gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
-		gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+		gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
 			    bi->bi_len, blk, new_state);
-		if (bi->bi_clone)
-			gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
-				    bi->bi_len, blk, new_state);
+		while(*n < elen) {
+			goal++;
+			if (goal >= (bi->bi_len / GFS2_NBBY))
+				break;
+			if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
+			    GFS2_BLKST_FREE)
+				break;
+			(*n)++;
+			gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
+				    bi->bi_offset, bi->bi_len, blk, new_state);
+		}
 	}
 
 	return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk;
@@ -1397,7 +1422,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 			       bi->bi_len);
 		}
 		gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
-		gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+		gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset,
 			    bi->bi_len, buf_blk, new_state);
 	}
 
@@ -1411,7 +1436,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
  * Returns: the allocated block
  */
 
-u64 gfs2_alloc_block(struct gfs2_inode *ip)
+u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1424,26 +1449,26 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip)
 	else
 		goal = rgd->rd_last_alloc;
 
-	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
 	BUG_ON(blk == BFITNOENT);
-	rgd->rd_last_alloc = blk;
 
+	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
 
-	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
-	rgd->rd_rg.rg_free--;
+	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n);
+	rgd->rd_rg.rg_free -= *n;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
-	al->al_alloced++;
+	al->al_alloced += *n;
 
-	gfs2_statfs_change(sdp, 0, -1, 0);
-	gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
+	gfs2_statfs_change(sdp, 0, -*n, 0);
+	gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
 
 	spin_lock(&sdp->sd_rindex_spin);
-	rgd->rd_free_clone--;
+	rgd->rd_free_clone -= *n;
 	spin_unlock(&sdp->sd_rindex_spin);
 
 	return block;
@@ -1463,9 +1488,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 	struct gfs2_rgrpd *rgd = al->al_rgd;
 	u32 blk;
 	u64 block;
+	unsigned int n = 1;
 
 	blk = rgblk_search(rgd, rgd->rd_last_alloc,
-			   GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
+			   GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
 	BUG_ON(blk == BFITNOENT);
 
 	rgd->rd_last_alloc = blk;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5e66613b33db..3181c7e624bf 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -46,7 +46,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip);
 
 unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
 
-u64 gfs2_alloc_block(struct gfs2_inode *ip);
+u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n);
 u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
 
 void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
-- 
cgit v1.2.3


From c85a665f064863cc8a2fe88e5f1eb4def5446e90 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 12 Feb 2008 12:14:59 +0000
Subject: [GFS2] The case of the missing asterisk

A dereference was forgotten. This adds it back correctly.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1fda731c074b..7f72564e0597 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -402,7 +402,7 @@ static int lookup_metapath(struct inode *inode, struct metapath *mp,
 
 	for (x = 0; x < end_of_metadata; x++) {
 		lookup_block(ip, x, mp, create, new, dblock);
-		if (!dblock)
+		if (!*dblock)
 			return 0;
 
 		ret = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &mp->mp_bh[x+1]);
-- 
cgit v1.2.3


From 30cbf189cd2a1ba13ff3c8c8ee2103dbdb18578a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 8 Feb 2008 13:18:11 +0000
Subject: [GFS2] Add a function to interate over an extent

This adds a function (currently the only use is during mapping
of already allocated blocks, but watch this space) which iterates
over a number of pointers in a block and returns the extent length.

If the initial pointer is 0 (i.e. unallocated) it will return the
number of unallocated blocks in the extent. If the initial pointer
is allocated, then it returns the number of contiguously allocated
blocks in the extent.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 68 ++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 52 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 7f72564e0597..6780aa5841b2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -422,6 +422,42 @@ static void release_metapath(struct metapath *mp)
 			brelse(mp->mp_bh[i]);
 }
 
+/**
+ * gfs2_extent_length - Returns length of an extent of blocks
+ * @start: Start of the buffer
+ * @len: Length of the buffer in bytes
+ * @ptr: Current position in the buffer
+ * @limit: Max extent length to return (0 = unlimited)
+ * @eob: Set to 1 if we hit "end of block"
+ *
+ * If the first block is zero (unallocated) it will return the number of
+ * unallocated blocks in the extent, otherwise it will return the number
+ * of contiguous blocks in the extent.
+ *
+ * Returns: The length of the extent (minimum of one block)
+ */
+
+static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
+{
+	const __be64 *end = (start + len);
+	const __be64 *first = ptr;
+	u64 d = be64_to_cpu(*ptr);
+
+	*eob = 0;
+	do {
+		ptr++;
+		if (ptr >= end)
+			break;
+		if (limit && --limit == 0)
+			break;
+		if (d)
+			d++;
+	} while(be64_to_cpu(*ptr) == d);
+	if (ptr >= end)
+		*eob = 1;
+	return (ptr - first);
+}
+
 static inline void bmap_lock(struct inode *inode, int create)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -499,26 +535,26 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 		goto out_fail;
 	boundary = error;
 
+	if (new) {
+		map_bh(bh_map, inode->i_sb, dblock);
+		if (boundary)
+			set_buffer_boundary(bh_map);
+		gfs2_trans_add_bh(ip->i_gl, mp.mp_bh[0], 1);
+		gfs2_dinode_out(ip, mp.mp_bh[0]->b_data);
+		set_buffer_new(bh_map);
+		goto out_ok;
+	}
+
 	if (dblock) {
+		unsigned int len;
+		struct buffer_head *bh = mp.mp_bh[ip->i_height - 1];
+		__be64 *ptr = metapointer(&boundary, ip->i_height - 1, &mp);
 		map_bh(bh_map, inode->i_sb, dblock);
+		len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
+					 &boundary);
+		bh_map->b_size = (len << inode->i_blkbits);
 		if (boundary)
 			set_buffer_boundary(bh_map);
-		if (new) {
-			gfs2_trans_add_bh(ip->i_gl, mp.mp_bh[0], 1);
-			gfs2_dinode_out(ip, mp.mp_bh[0]->b_data);
-			set_buffer_new(bh_map);
-			goto out_ok;
-		}
-		while(--maxlen && !buffer_boundary(bh_map)) {
-			u64 eblock;
-			mp.mp_list[ip->i_height - 1]++;
-			boundary = lookup_block(ip, ip->i_height - 1, &mp, 0, &new, &eblock);
-			if (eblock != ++dblock)
-				break;
-			bh_map->b_size += (1 << inode->i_blkbits);
-			if (boundary)
-				set_buffer_boundary(bh_map);
-		}
 	}
 out_ok:
 	error = 0;
-- 
cgit v1.2.3


From 77658aad226866fb94097236d14d41a88aaab2ec Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 12 Feb 2008 14:17:27 +0000
Subject: [GFS2] Eliminate (almost) duplicate field from gfs2_inode

The blocks counter is almost a duplicate of the i_blocks
field in the VFS inode. The only difference is that i_blocks
can be only 32bits long for 32bit arch without large single file
support. Since GFS2 doesn't handle the non-large single file
case (for 32 bit anyway) this adds a new config dependency on
64BIT || LSF. This has always been the case, however we've never
explicitly said so before.

Even if we do add support for the non-LSF case, we will still
not require this field to be duplicated since we will not be
able to access oversized files anyway.

So the net result of all this is that we shave 8 bytes from a gfs2_inode
and get our config deps correct.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Kconfig     |  2 +-
 fs/gfs2/bmap.c      | 18 ++++++------------
 fs/gfs2/dir.c       | 15 ++++-----------
 fs/gfs2/eattr.c     | 24 ++++++------------------
 fs/gfs2/incore.h    |  1 -
 fs/gfs2/inode.c     | 11 +++++------
 fs/gfs2/inode.h     | 20 +++++++++++++++++---
 fs/gfs2/ops_inode.c |  5 +++--
 8 files changed, 42 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index de8e64c03f73..d147b53ef527 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on EXPERIMENTAL
+	depends on EXPERIMENTAL && (64BIT || LSF)
 	select FS_POSIX_ACL
 	select CRC32
 	help
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6780aa5841b2..e27e66046f0a 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -161,9 +161,8 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 
 	if (ip->i_di.di_size) {
 		*(__be64 *)(di + 1) = cpu_to_be64(block);
-		ip->i_di.di_blocks++;
-		gfs2_set_inode_blocks(&ip->i_inode);
-		di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+		gfs2_add_inode_blocks(&ip->i_inode, 1);
+		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	}
 
 	ip->i_height = 1;
@@ -238,10 +237,9 @@ static int build_height(struct inode *inode, struct metapath *mp, unsigned heigh
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 	*(__be64 *)(di + 1) = cpu_to_be64(bn);
 	ip->i_height += new_height;
-	ip->i_di.di_blocks += new_height;
-	gfs2_set_inode_blocks(&ip->i_inode);
+	gfs2_add_inode_blocks(&ip->i_inode, new_height);
 	di->di_height = cpu_to_be16(ip->i_height);
-	di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+	di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	brelse(dibh);
 	return error;
 }
@@ -380,8 +378,7 @@ static int lookup_block(struct gfs2_inode *ip, unsigned int height,
 	gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[height], 1);
 
 	*ptr = cpu_to_be64(*block);
-	ip->i_di.di_blocks++;
-	gfs2_set_inode_blocks(&ip->i_inode);
+	gfs2_add_inode_blocks(&ip->i_inode, 1);
 
 	*new = 1;
 	return 0;
@@ -779,10 +776,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 		}
 
 		*p = 0;
-		if (!ip->i_di.di_blocks)
-			gfs2_consist_inode(ip);
-		ip->i_di.di_blocks--;
-		gfs2_set_inode_blocks(&ip->i_inode);
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
 	}
 	if (bstart) {
 		if (metadata)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 93a2e6afbd81..862aa3228f7a 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -906,8 +906,7 @@ static int dir_make_exhash(struct inode *inode)
 		*lp = cpu_to_be64(bn);
 
 	dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
-	dip->i_di.di_blocks++;
-	gfs2_set_inode_blocks(&dip->i_inode);
+	gfs2_add_inode_blocks(&dip->i_inode, 1);
 	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
 
 	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
@@ -1045,8 +1044,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
 		gfs2_trans_add_bh(dip->i_gl, dibh, 1);
-		dip->i_di.di_blocks++;
-		gfs2_set_inode_blocks(&dip->i_inode);
+		gfs2_add_inode_blocks(&dip->i_inode, 1);
 		gfs2_dinode_out(dip, dibh->b_data);
 		brelse(dibh);
 	}
@@ -1580,8 +1578,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 	if (error)
 		return error;
 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-	ip->i_di.di_blocks++;
-	gfs2_set_inode_blocks(&ip->i_inode);
+	gfs2_add_inode_blocks(&ip->i_inode, 1);
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
 	return 0;
@@ -1922,11 +1919,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 		brelse(bh);
 
 		gfs2_free_meta(dip, blk, 1);
-
-		if (!dip->i_di.di_blocks)
-			gfs2_consist_inode(dip);
-		dip->i_di.di_blocks--;
-		gfs2_set_inode_blocks(&dip->i_inode);
+		gfs2_add_inode_blocks(&dip->i_inode, -1);
 	}
 
 	error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 0e79cd543496..76ead1acfcc7 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -277,10 +277,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 		}
 
 		*dataptrs = 0;
-		if (!ip->i_di.di_blocks)
-			gfs2_consist_inode(ip);
-		ip->i_di.di_blocks--;
-		gfs2_set_inode_blocks(&ip->i_inode);
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
 	}
 	if (bstart)
 		gfs2_free_meta(ip, bstart, blen);
@@ -598,8 +595,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 	ea->ea_flags = GFS2_EAFLAG_LAST;
 	ea->ea_num_ptrs = 0;
 
-	ip->i_di.di_blocks++;
-	gfs2_set_inode_blocks(&ip->i_inode);
+	gfs2_add_inode_blocks(&ip->i_inode, 1);
 
 	return 0;
 }
@@ -651,8 +647,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
 			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
 
-			ip->i_di.di_blocks++;
-			gfs2_set_inode_blocks(&ip->i_inode);
+			gfs2_add_inode_blocks(&ip->i_inode, 1);
 
 			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
 							   data_len;
@@ -980,8 +975,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		*eablk = cpu_to_be64(ip->i_di.di_eattr);
 		ip->i_di.di_eattr = blk;
 		ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
-		ip->i_di.di_blocks++;
-		gfs2_set_inode_blocks(&ip->i_inode);
+		gfs2_add_inode_blocks(&ip->i_inode, 1);
 
 		eablk++;
 	}
@@ -1389,10 +1383,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 		}
 
 		*eablk = 0;
-		if (!ip->i_di.di_blocks)
-			gfs2_consist_inode(ip);
-		ip->i_di.di_blocks--;
-		gfs2_set_inode_blocks(&ip->i_inode);
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
 	}
 	if (bstart)
 		gfs2_free_meta(ip, bstart, blen);
@@ -1444,10 +1435,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 	gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
 
 	ip->i_di.di_eattr = 0;
-	if (!ip->i_di.di_blocks)
-		gfs2_consist_inode(ip);
-	ip->i_di.di_blocks--;
-	gfs2_set_inode_blocks(&ip->i_inode);
+	gfs2_add_inode_blocks(&ip->i_inode, -1);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 898b456b386f..4ba2ea63119d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -239,7 +239,6 @@ enum {
 
 struct gfs2_dinode_host {
 	u64 di_size;		/* number of bytes in file */
-	u64 di_blocks;		/* number of blocks in file */
 	u64 di_generation;	/* generation number for NFS */
 	u32 di_flags;		/* GFS2_DIF_... */
 	/* These only apply to directories  */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c3fe8aa03c4e..5f50dd53bf63 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -273,8 +273,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
 	di->di_size = be64_to_cpu(str->di_size);
 	i_size_write(&ip->i_inode, di->di_size);
-	di->di_blocks = be64_to_cpu(str->di_blocks);
-	gfs2_set_inode_blocks(&ip->i_inode);
+	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
 	ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
 	ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
 	ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
@@ -344,7 +343,7 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 	struct gfs2_rgrpd *rgd;
 	int error;
 
-	if (ip->i_di.di_blocks != 1) {
+	if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
 		if (gfs2_consist_inode(ip))
 			gfs2_dinode_print(ip);
 		return -EIO;
@@ -1398,7 +1397,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
 	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
 	str->di_size = cpu_to_be64(di->di_size);
-	str->di_blocks = cpu_to_be64(di->di_blocks);
+	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
 	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
 	str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
@@ -1430,8 +1429,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	printk(KERN_INFO "  no_addr = %llu\n",
 	       (unsigned long long)ip->i_no_addr);
 	printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
-	printk(KERN_INFO "  di_blocks = %llu\n",
-	       (unsigned long long)di->di_blocks);
+	printk(KERN_INFO "  blocks = %llu\n",
+	       (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
 	printk(KERN_INFO "  i_goal = %llu\n",
 	       (unsigned long long)ip->i_goal);
 	printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index db738686ca1d..580da454b38f 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,6 +10,8 @@
 #ifndef __INODE_DOT_H__
 #define __INODE_DOT_H__
 
+#include "util.h"
+
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
 {
 	return !ip->i_height;
@@ -37,13 +39,25 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip)
 	return S_ISDIR(ip->i_inode.i_mode);
 }
 
-static inline void gfs2_set_inode_blocks(struct inode *inode)
+static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
+{
+	inode->i_blocks = blocks <<
+		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+}
+
+static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
 {
-	struct gfs2_inode *ip = GFS2_I(inode);
-	inode->i_blocks = ip->i_di.di_blocks <<
+	return inode->i_blocks >>
 		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
 }
 
+static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
+{
+	gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change));
+	change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK);
+	inode->i_blocks += change;
+}
+
 static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr,
 				  u64 no_formal_ino)
 {
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e87412902bed..301c94596678 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -981,8 +981,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	brelse(dibh);
 
 	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
-		gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
-		gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
+		u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
+		gfs2_quota_change(ip, -blocks, ouid, ogid);
+		gfs2_quota_change(ip, blocks, nuid, ngid);
 	}
 
 out_end_trans:
-- 
cgit v1.2.3


From e23159d2a7b2df5bce5f0ee8d57d3292243abf66 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 12 Feb 2008 14:48:39 +0000
Subject: [GFS2] Get inode buffer only once per block map call

In the case that we needed to grow the height of the metadata tree
we were looking up the inode buffer and then brelse()ing it despite
the fact that it is needed later in the block map process.

This patch ensures that we look up the inode's buffer once and only
once during the block map process.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 48 +++++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e27e66046f0a..f1f38ca77a52 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -188,50 +188,45 @@ static int build_height(struct inode *inode, struct metapath *mp, unsigned heigh
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned new_height = height - ip->i_height;
-	struct buffer_head *dibh;
+	struct buffer_head *dibh = mp->mp_bh[0];
 	struct gfs2_dinode *di;
-	int error;
 	__be64 *bp;
 	u64 bn;
 	unsigned n, i = 0;
 
-	if (height <= ip->i_height)
-		return 0;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		return error;
+	BUG_ON(height <= ip->i_height);
 
 	do {
 		n = new_height - i;
 		bn = gfs2_alloc_block(ip, &n);
 		gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, n);
 		do {
-			mp->mp_bh[i] = gfs2_meta_new(ip->i_gl, bn++);
-			gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i], 1);
+			mp->mp_bh[i + 1] = gfs2_meta_new(ip->i_gl, bn++);
+			gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i + 1], 1);
 			i++;
 		} while(i < n);
 	} while(i < new_height);
 
 	n = 0;
-	bn = mp->mp_bh[0]->b_blocknr;
+	bn = mp->mp_bh[1]->b_blocknr;
 	if (new_height > 1) {
 		for(; n < new_height-1; n++) {
-			gfs2_metatype_set(mp->mp_bh[n], GFS2_METATYPE_IN,
+			gfs2_metatype_set(mp->mp_bh[n + 1], GFS2_METATYPE_IN,
 					  GFS2_FORMAT_IN);
-			gfs2_buffer_clear_tail(mp->mp_bh[n],
+			gfs2_buffer_clear_tail(mp->mp_bh[n + 1],
 					       sizeof(struct gfs2_meta_header));
-			bp = (__be64 *)(mp->mp_bh[n]->b_data +
+			bp = (__be64 *)(mp->mp_bh[n + 1]->b_data +
 				     sizeof(struct gfs2_meta_header));
-			*bp = cpu_to_be64(mp->mp_bh[n+1]->b_blocknr);
-			brelse(mp->mp_bh[n]);
-			mp->mp_bh[n] = NULL;
+			*bp = cpu_to_be64(mp->mp_bh[n+2]->b_blocknr);
+			brelse(mp->mp_bh[n+1]);
+			mp->mp_bh[n+1] = NULL;
 		}
 	}
-	gfs2_metatype_set(mp->mp_bh[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
-	gfs2_buffer_copy_tail(mp->mp_bh[n], sizeof(struct gfs2_meta_header),
+	gfs2_metatype_set(mp->mp_bh[n+1], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+	gfs2_buffer_copy_tail(mp->mp_bh[n+1], sizeof(struct gfs2_meta_header),
 			      dibh, sizeof(struct gfs2_dinode));
-	brelse(mp->mp_bh[n]);
+	brelse(mp->mp_bh[n+1]);
+	mp->mp_bh[n+1] = NULL;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	di = (struct gfs2_dinode *)dibh->b_data;
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -240,8 +235,7 @@ static int build_height(struct inode *inode, struct metapath *mp, unsigned heigh
 	gfs2_add_inode_blocks(&ip->i_inode, new_height);
 	di->di_height = cpu_to_be16(ip->i_height);
 	di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
-	brelse(dibh);
-	return error;
+	return 0;
 }
 
 /**
@@ -391,11 +385,7 @@ static int lookup_metapath(struct inode *inode, struct metapath *mp,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned int end_of_metadata = ip->i_height - 1;
 	unsigned int x;
-	int ret = gfs2_meta_inode_buffer(ip, &bh);
-	if (ret)
-		return ret;
-
-	mp->mp_bh[0] = bh;
+	int ret;
 
 	for (x = 0; x < end_of_metadata; x++) {
 		lookup_block(ip, x, mp, create, new, dblock);
@@ -515,6 +505,10 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	}
 	size = (lblock + 1) * bsize;
 
+	error = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
+	if (error)
+		goto out_fail;
+
 	if (size > arr[ip->i_height]) {
 		u8 height = ip->i_height;
 		if (!create)
-- 
cgit v1.2.3


From 840ca0ec70903ce8e0fba1596460876c796e4f60 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 12 Feb 2008 15:28:21 +0000
Subject: [GFS2] Fix bug where we called drop_bh incorrectly

As a result of an earlier patch, drop_bh was being called in cases
when it shouldn't have been. Since we never have a gh in the drop
case and we always have a gh in the promote case, we can use that
extra information to tell which case has been seen.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/glock.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d00dc37e3d51..63981e2fb835 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -765,7 +765,6 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
-	struct gfs2_holder *gh = gl->gl_req_gh;
 
 	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
 	gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -776,23 +775,11 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	if (glops->go_inval)
 		glops->go_inval(gl, DIO_METADATA);
 
-	if (gh) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
-		gh->gh_error = 0;
-		spin_unlock(&gl->gl_spin);
-	}
-
 	spin_lock(&gl->gl_spin);
 	gfs2_demote_wake(gl);
-	gl->gl_req_gh = NULL;
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 	spin_unlock(&gl->gl_spin);
-
 	gfs2_glock_put(gl);
-
-	if (gh)
-		gfs2_holder_wake(gh);
 }
 
 /**
@@ -810,7 +797,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 	int prev_state = gl->gl_state;
 	int op_done = 1;
 
-	if ((ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
+	if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
 		drop_bh(gl, ret);
 		return;
 	}
-- 
cgit v1.2.3


From bb16b342b2e2c83fa47dbb042400db91b748ded7 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Wed, 13 Feb 2008 00:06:10 +0100
Subject: [GFS2] be*_add_cpu conversion

replace all:
big_endian_variable = cpu_to_beX(beX_to_cpu(big_endian_variable) +
					expression_in_cpu_byteorder);
with:
	beX_add_cpu(&big_endian_variable, expression_in_cpu_byteorder);
generated with semantic patch

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/dir.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 862aa3228f7a..34dc8dfaba12 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1021,13 +1021,13 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 
 			new->de_inum = dent->de_inum; /* No endian worries */
 			new->de_type = dent->de_type; /* No endian worries */
-			nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
+			be16_add_cpu(&nleaf->lf_entries, 1);
 
 			dirent_del(dip, obh, prev, dent);
 
 			if (!oleaf->lf_entries)
 				gfs2_consist_inode(dip);
-			oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
+			be16_add_cpu(&oleaf->lf_entries, -1);
 
 			if (!prev)
 				prev = dent;
@@ -1614,7 +1614,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			dent->de_type = cpu_to_be16(type);
 			if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
 				leaf = (struct gfs2_leaf *)bh->b_data;
-				leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
+				be16_add_cpu(&leaf->lf_entries, 1);
 			}
 			brelse(bh);
 			error = gfs2_meta_inode_buffer(ip, &bh);
-- 
cgit v1.2.3


From 8af4c72f7df2442230fca3ff49a97f978cfb4a04 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sun, 17 Feb 2008 10:17:12 +0200
Subject: [GFS2] gfs2/ops_file.c should #include "ops_inode.h"

Every file should include the headers containing the prototypes for
its global functions (in this case for gfs2_set_inode_flags()).

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index f97a8b86c485..2b25a5f7a1c7 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -38,6 +38,7 @@
 #include "util.h"
 #include "eaops.h"
 #include "ops_address.h"
+#include "ops_inode.h"
 
 /**
  * gfs2_llseek - seek to a location in a file
-- 
cgit v1.2.3


From 60b779cfc1fa52034a996ee12a23b62d32e86000 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sun, 17 Feb 2008 10:20:34 +0200
Subject: [GFS2] proper extern for gfs2/locking/dlm/mount.c:gdlm_ops

This patch adds a proper extern declaration for gdlm_ops in
fs/gfs2/locking/dlm/lock_dlm.h

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/locking/dlm/lock_dlm.h | 5 +++++
 fs/gfs2/locking/dlm/main.c     | 2 --
 fs/gfs2/locking/dlm/sysfs.c    | 2 --
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 9e8265d28377..58fcf8c5bf39 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -183,5 +183,10 @@ int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
 		struct file_lock *);
 int gdlm_punlock(void *, struct lm_lockname *, struct file *,
 		struct file_lock *);
+
+/* mount.c */
+
+extern const struct lm_lockops gdlm_ops;
+
 #endif
 
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
index a0e7eda643ed..36a225850bd8 100644
--- a/fs/gfs2/locking/dlm/main.c
+++ b/fs/gfs2/locking/dlm/main.c
@@ -11,8 +11,6 @@
 
 #include "lock_dlm.h"
 
-extern struct lm_lockops gdlm_ops;
-
 static int __init init_lock_dlm(void)
 {
 	int error;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a87b09839761..8479da47049c 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -12,8 +12,6 @@
 
 #include "lock_dlm.h"
 
-extern struct lm_lockops gdlm_ops;
-
 static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
 {
 	return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
-- 
cgit v1.2.3


From 7afd88d9166a752b52517648bcbe923e05d393fc Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 Feb 2008 16:07:18 +0000
Subject: [GFS2] Fix a page lock / glock deadlock

We've previously been using a "try lock" in readpage on the basis that
it would prevent deadlocks due to the inverted lock ordering (our normal
lock ordering is glock first and then page lock). Unfortunately tests
have shown that this isn't enough. If the glock has a demote request
queued such that run_queue() in the glock code tries to do a demote when
its called under readpage then it will try and write out all the dirty
pages which requires locking them. This then deadlocks with the page
locked by readpage.

The solution is to always require two calls into readpage. The first
unlocks the page, gets the glock and returns AOP_TRUNCATED_PAGE, the
second does the actual readpage and unlocks the glock & page as
required.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.h       | 13 ++++++-------
 fs/gfs2/inode.c       |  2 +-
 fs/gfs2/ops_address.c | 27 +++++++++++++++------------
 fs/gfs2/ops_dentry.c  |  4 ++--
 fs/gfs2/ops_inode.c   |  4 ++--
 5 files changed, 26 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index ace5770760ce..cdad3e6f8150 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -32,24 +32,23 @@
 #define GLR_TRYFAILED		13
 #define GLR_CANCELED		14
 
-static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
+static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
 	struct gfs2_holder *gh;
-	int locked = 0;
 	struct pid *pid;
 
 	/* Look in glock's list of holders for one with current task as owner */
 	spin_lock(&gl->gl_spin);
 	pid = task_pid(current);
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-		if (gh->gh_owner_pid == pid) {
-			locked = 1;
-			break;
-		}
+		if (gh->gh_owner_pid == pid)
+			goto out;
 	}
+	gh = NULL;
+out:
 	spin_unlock(&gl->gl_spin);
 
-	return locked;
+	return gh;
 }
 
 static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5f50dd53bf63..810ff023fb14 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -493,7 +493,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 		return dir;
 	}
 
-	if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) {
+	if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) {
 		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
 		if (error)
 			return ERR_PTR(error);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 7523999afc53..fbb4a6aa1583 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -508,23 +508,26 @@ static int __gfs2_readpage(void *file, struct page *page)
 static int gfs2_readpage(struct file *file, struct page *page)
 {
 	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
-	struct gfs2_holder gh;
+	struct gfs2_holder *gh;
 	int error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
-	error = gfs2_glock_nq_atime(&gh);
-	if (unlikely(error)) {
+	gh = gfs2_glock_is_locked_by_me(ip->i_gl);
+	if (!gh) {
+		gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
+		if (!gh)
+			return -ENOBUFS;
+		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
 		unlock_page(page);
-		goto out;
+		error = gfs2_glock_nq_atime(gh);
+		if (likely(error != 0))
+			goto out;
+		return AOP_TRUNCATED_PAGE;
 	}
 	error = __gfs2_readpage(file, page);
-	gfs2_glock_dq(&gh);
+	gfs2_glock_dq(gh);
 out:
-	gfs2_holder_uninit(&gh);
-	if (error == GLR_TRYFAILED) {
-		yield();
-		return AOP_TRUNCATED_PAGE;
-	}
+	gfs2_holder_uninit(gh);
+	kfree(gh);
 	return error;
 }
 
@@ -826,7 +829,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	unsigned int to = from + len;
 	int ret;
 
-	BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0);
+	BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
 
 	ret = gfs2_meta_inode_buffer(ip, &dibh);
 	if (unlikely(ret)) {
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 793e334d098e..4a5e676b4420 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -43,7 +43,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 	struct gfs2_holder d_gh;
 	struct gfs2_inode *ip = NULL;
 	int error;
-	int had_lock=0;
+	int had_lock = 0;
 
 	if (inode) {
 		if (is_bad_inode(inode))
@@ -54,7 +54,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 	if (sdp->sd_args.ar_localcaching)
 		goto valid;
 
-	had_lock = gfs2_glock_is_locked_by_me(dip->i_gl);
+	had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
 	if (!had_lock) {
 		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
 		if (error)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 301c94596678..af7097a514c1 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -898,7 +898,7 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
 	int error;
 	int unlock = 0;
 
-	if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) {
+	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
 		if (error)
 			return error;
@@ -1065,7 +1065,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	int error;
 	int unlock = 0;
 
-	if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) {
+	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
 		if (error)
 			return error;
-- 
cgit v1.2.3


From 9b8c81d1de49943ec69d157234b8981008c30d31 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 Feb 2008 16:09:31 +0000
Subject: [GFS2] Allow bmap to allocate extents

We've supported mapping of extents when no block allocation is required
for some time. This patch extends that to mapping of extents when an
allocation has been requested. In that case we try to allocate as many
blocks as are requested, but we might return fewer in case there is
something preventing us from returning the complete amount (e.g. an
already allocated block is in the way).

Currently the only code path which can actually request multiple data
blocks in a single bmap call is the page_mkwrite path and even then it
only happens if there are multiple blocks per page. What this patch does
do however, is merge the allocation requests for metadata (growing the
metadata tree in either height or depth) with the allocation of the data
blocks in the case that both are needed. This results in lower overheads
even in the single block allocation case.

The one thing which we can't handle here at the moment is unstuffing. I
would like to be able to do that, but the problem which arises is that
in order to unstuff one has to get a locked page from the page cache
which results in locking problems in the (usual) case that the caller is
holding the page lock on the page it wishes to map. So that case will
have to be addressed in future patches.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 502 +++++++++++++++++++++++++++++++++------------------------
 fs/gfs2/dir.c  |   2 +-
 fs/gfs2/rgrp.c |  10 +-
 3 files changed, 295 insertions(+), 219 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index f1f38ca77a52..c1ee6355ced1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -175,74 +175,13 @@ out:
 	return error;
 }
 
-/**
- * build_height - Build a metadata tree of the requested height
- * @ip: The GFS2 inode
- * @height: The height to build to
- *
- *
- * Returns: errno
- */
-
-static int build_height(struct inode *inode, struct metapath *mp, unsigned height)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	unsigned new_height = height - ip->i_height;
-	struct buffer_head *dibh = mp->mp_bh[0];
-	struct gfs2_dinode *di;
-	__be64 *bp;
-	u64 bn;
-	unsigned n, i = 0;
-
-	BUG_ON(height <= ip->i_height);
-
-	do {
-		n = new_height - i;
-		bn = gfs2_alloc_block(ip, &n);
-		gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, n);
-		do {
-			mp->mp_bh[i + 1] = gfs2_meta_new(ip->i_gl, bn++);
-			gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i + 1], 1);
-			i++;
-		} while(i < n);
-	} while(i < new_height);
-
-	n = 0;
-	bn = mp->mp_bh[1]->b_blocknr;
-	if (new_height > 1) {
-		for(; n < new_height-1; n++) {
-			gfs2_metatype_set(mp->mp_bh[n + 1], GFS2_METATYPE_IN,
-					  GFS2_FORMAT_IN);
-			gfs2_buffer_clear_tail(mp->mp_bh[n + 1],
-					       sizeof(struct gfs2_meta_header));
-			bp = (__be64 *)(mp->mp_bh[n + 1]->b_data +
-				     sizeof(struct gfs2_meta_header));
-			*bp = cpu_to_be64(mp->mp_bh[n+2]->b_blocknr);
-			brelse(mp->mp_bh[n+1]);
-			mp->mp_bh[n+1] = NULL;
-		}
-	}
-	gfs2_metatype_set(mp->mp_bh[n+1], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
-	gfs2_buffer_copy_tail(mp->mp_bh[n+1], sizeof(struct gfs2_meta_header),
-			      dibh, sizeof(struct gfs2_dinode));
-	brelse(mp->mp_bh[n+1]);
-	mp->mp_bh[n+1] = NULL;
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	di = (struct gfs2_dinode *)dibh->b_data;
-	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
-	*(__be64 *)(di + 1) = cpu_to_be64(bn);
-	ip->i_height += new_height;
-	gfs2_add_inode_blocks(&ip->i_inode, new_height);
-	di->di_height = cpu_to_be16(ip->i_height);
-	di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
-	return 0;
-}
 
 /**
  * find_metapath - Find path through the metadata tree
- * @ip: The inode pointer
+ * @sdp: The superblock
  * @mp: The metapath to return the result in
  * @block: The disk block to look up
+ * @height: The pre-calculated height of the metadata tree
  *
  *   This routine returns a struct metapath structure that defines a path
  *   through the metadata of inode "ip" to get to block "block".
@@ -297,17 +236,27 @@ static int build_height(struct inode *inode, struct metapath *mp, unsigned heigh
  *
  */
 
-static void find_metapath(struct gfs2_inode *ip, u64 block,
-			  struct metapath *mp)
+static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
+			  struct metapath *mp, unsigned int height)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	unsigned int i;
 
-	for (i = ip->i_height; i--;)
+	for (i = height; i--;)
 		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 
 }
 
+static inline unsigned int zero_metapath_length(const struct metapath *mp,
+						unsigned height)
+{
+	unsigned int i;
+	for (i = 0; i < height - 1; i++) {
+		if (mp->mp_list[i] != 0)
+			return i;
+	}
+	return height;
+}
+
 /**
  * metapointer - Return pointer to start of metadata in a buffer
  * @height: The metadata height (0 = dinode)
@@ -318,95 +267,62 @@ static void find_metapath(struct gfs2_inode *ip, u64 block,
  * metadata tree.
  */
 
-static inline __be64 *metapointer(int *boundary, unsigned int height,
-				  const struct metapath *mp)
+static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 {
 	struct buffer_head *bh = mp->mp_bh[height];
 	unsigned int head_size = (height > 0) ?
 		sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
-	__be64 *ptr;
-	*boundary = 0;
-	ptr = ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
-	if (ptr + 1 == (__be64 *)(bh->b_data + bh->b_size))
-		*boundary = 1;
-	return ptr;
+	return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
 }
 
 /**
- * lookup_block - Get the next metadata block in metadata tree
- * @ip: The GFS2 inode
- * @height: The height of the tree (0 = dinode)
+ * lookup_metapath - Walk the metadata tree to a specific point
+ * @ip: The inode
  * @mp: The metapath
- * @create: Non-zero if we may create a new meatdata block
- * @new: Used to indicate if we did create a new metadata block
- * @block: the returned disk block number
  *
- * Given a metatree, complete to a particular height, checks to see if the next
- * height of the tree exists. If not the next height of the tree is created.
- * The block number of the next height of the metadata tree is returned.
+ * Assumes that the inode's buffer has already been looked up and
+ * hooked onto mp->mp_bh[0] and that the metapath has been initialised
+ * by find_metapath().
  *
+ * If this function encounters part of the tree which has not been
+ * allocated, it returns the current height of the tree at the point
+ * at which it found the unallocated block. Blocks which are found are
+ * added to the mp->mp_bh[] list.
+ *
+ * Returns: error or height of metadata tree
  */
 
-static int lookup_block(struct gfs2_inode *ip, unsigned int height,
-			struct metapath *mp, int create,
-			int *new, u64 *block)
+static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 {
-	int boundary;
-	__be64 *ptr = metapointer(&boundary, height, mp);
-	unsigned int n = 1;
-
-	if (*ptr) {
-		*block = be64_to_cpu(*ptr);
-		return boundary;
-	}
-
-	*block = 0;
-
-	if (!create)
-		return 0;
-
-	*block = gfs2_alloc_block(ip, &n);
-	if (height != ip->i_height - 1 || gfs2_is_dir(ip))
-		gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), *block, 1);
-
-	gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[height], 1);
-
-	*ptr = cpu_to_be64(*block);
-	gfs2_add_inode_blocks(&ip->i_inode, 1);
-
-	*new = 1;
-	return 0;
-}
-
-static int lookup_metapath(struct inode *inode, struct metapath *mp,
-			   int create, int *new, u64 *dblock)
-{
-	struct buffer_head *bh;
-	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned int end_of_metadata = ip->i_height - 1;
 	unsigned int x;
+	__be64 *ptr;
+	u64 dblock;
 	int ret;
 
 	for (x = 0; x < end_of_metadata; x++) {
-		lookup_block(ip, x, mp, create, new, dblock);
-		if (!*dblock)
-			return 0;
+		ptr = metapointer(x, mp);
+		dblock = be64_to_cpu(*ptr);
+		if (!dblock)
+			return x + 1;
 
-		ret = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &mp->mp_bh[x+1]);
+		ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]);
 		if (ret)
 			return ret;
 	}
 
-	return lookup_block(ip, end_of_metadata, mp, create, new, dblock);
+	return ip->i_height;
 }
 
-static void release_metapath(struct metapath *mp)
+static inline void release_metapath(struct metapath *mp)
 {
 	int i;
 
-	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++)
-		if (mp->mp_bh[i])
-			brelse(mp->mp_bh[i]);
+	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
+		if (mp->mp_bh[i] == NULL)
+			break;
+		brelse(mp->mp_bh[i]);
+	}
 }
 
 /**
@@ -445,32 +361,208 @@ static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __b
 	return (ptr - first);
 }
 
-static inline void bmap_lock(struct inode *inode, int create)
+static inline void bmap_lock(struct gfs2_inode *ip, int create)
 {
-	struct gfs2_inode *ip = GFS2_I(inode);
 	if (create)
 		down_write(&ip->i_rw_mutex);
 	else
 		down_read(&ip->i_rw_mutex);
 }
 
-static inline void bmap_unlock(struct inode *inode, int create)
+static inline void bmap_unlock(struct gfs2_inode *ip, int create)
 {
-	struct gfs2_inode *ip = GFS2_I(inode);
 	if (create)
 		up_write(&ip->i_rw_mutex);
 	else
 		up_read(&ip->i_rw_mutex);
 }
 
+static inline __be64 *gfs2_indirect_init(struct metapath *mp,
+					 struct gfs2_glock *gl, unsigned int i,
+					 unsigned offset, u64 bn)
+{
+	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
+		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
+				 sizeof(struct gfs2_dinode)));
+	BUG_ON(i < 1);
+	BUG_ON(mp->mp_bh[i] != NULL);
+	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
+	gfs2_trans_add_bh(gl, mp->mp_bh[i], 1);
+	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
+	ptr += offset;
+	*ptr = cpu_to_be64(bn);
+	return ptr;
+}
+
+enum alloc_state {
+	ALLOC_DATA = 0,
+	ALLOC_GROW_DEPTH = 1,
+	ALLOC_GROW_HEIGHT = 2,
+	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
+};
+
+/**
+ * gfs2_bmap_alloc - Build a metadata tree of the requested height
+ * @inode: The GFS2 inode
+ * @lblock: The logical starting block of the extent
+ * @bh_map: This is used to return the mapping details
+ * @mp: The metapath
+ * @sheight: The starting height (i.e. whats already mapped)
+ * @height: The height to build to
+ * @maxlen: The max number of data blocks to alloc
+ *
+ * In this routine we may have to alloc:
+ *   i) Indirect blocks to grow the metadata tree height
+ *  ii) Indirect blocks to fill in lower part of the metadata tree
+ * iii) Data blocks
+ *
+ * The function is in two parts. The first part works out the total
+ * number of blocks which we need. The second part does the actual
+ * allocation asking for an extent at a time (if enough contiguous free
+ * blocks are available, there will only be one request per bmap call)
+ * and uses the state machine to initialise the blocks in order.
+ *
+ * Returns: errno on error
+ */
+
+static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
+			   struct buffer_head *bh_map, struct metapath *mp,
+			   const unsigned int sheight,
+			   const unsigned int height,
+			   const unsigned int maxlen)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *dibh = mp->mp_bh[0];
+	u64 bn, dblock = 0;
+	unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0;
+	unsigned dblks = 0;
+	unsigned ptrs_per_blk;
+	const unsigned end_of_metadata = height - 1;
+	int eob = 0;
+	enum alloc_state state;
+	__be64 *ptr;
+	__be64 zero_bn = 0;
+
+	BUG_ON(sheight < 1);
+	BUG_ON(dibh == NULL);
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+	if (height == sheight) {
+		struct buffer_head *bh;
+		/* Bottom indirect block exists, find unalloced extent size */
+		ptr = metapointer(end_of_metadata, mp);
+		bh = mp->mp_bh[end_of_metadata];
+		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
+					   &eob);
+		BUG_ON(dblks < 1);
+		state = ALLOC_DATA;
+	} else {
+		/* Need to allocate indirect blocks */
+		ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
+		dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
+		if (height == ip->i_height) {
+			/* Writing into existing tree, extend tree down */
+			iblks = height - sheight;
+			state = ALLOC_GROW_DEPTH;
+		} else {
+			/* Building up tree height */
+			state = ALLOC_GROW_HEIGHT;
+			iblks = height - ip->i_height;
+			zmpl = zero_metapath_length(mp, height);
+			iblks -= zmpl;
+			iblks += height;
+		}
+	}
+
+	/* start of the second part of the function (state machine) */
+
+	blks = dblks + iblks;
+	i = sheight;
+	do {
+		n = blks - alloced;
+		bn = gfs2_alloc_block(ip, &n);
+		alloced += n;
+		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
+			gfs2_trans_add_unrevoke(sdp, bn, n);
+		switch (state) {
+		/* Growing height of tree */
+		case ALLOC_GROW_HEIGHT:
+			if (i == 1) {
+				ptr = (__be64 *)(dibh->b_data +
+						 sizeof(struct gfs2_dinode));
+				zero_bn = *ptr;
+			}
+			for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
+				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
+			if (i - 1 == height - ip->i_height) {
+				i--;
+				gfs2_buffer_copy_tail(mp->mp_bh[i],
+						sizeof(struct gfs2_meta_header),
+						dibh, sizeof(struct gfs2_dinode));
+				gfs2_buffer_clear_tail(dibh,
+						sizeof(struct gfs2_dinode) +
+						sizeof(__be64));
+				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
+					sizeof(struct gfs2_meta_header));
+				*ptr = zero_bn;
+				state = ALLOC_GROW_DEPTH;
+				for(i = zmpl; i < height; i++) {
+					if (mp->mp_bh[i] == NULL)
+						break;
+					brelse(mp->mp_bh[i]);
+					mp->mp_bh[i] = NULL;
+				}
+				i = zmpl;
+			}
+			if (n == 0)
+				break;
+		/* Branching from existing tree */
+		case ALLOC_GROW_DEPTH:
+			if (i > 1 && i < height)
+				gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1);
+			for (; i < height && n > 0; i++, n--)
+				gfs2_indirect_init(mp, ip->i_gl, i,
+						   mp->mp_list[i-1], bn++);
+			if (i == height)
+				state = ALLOC_DATA;
+			if (n == 0)
+				break;
+		/* Tree complete, adding data blocks */
+		case ALLOC_DATA:
+			BUG_ON(n > dblks);
+			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
+			gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1);
+			dblks = n;
+			ptr = metapointer(end_of_metadata, mp);
+			dblock = bn;
+			while (n-- > 0)
+				*ptr++ = cpu_to_be64(bn++);
+			break;
+		}
+	} while (state != ALLOC_DATA);
+
+	ip->i_height = height;
+	gfs2_add_inode_blocks(&ip->i_inode, alloced);
+	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
+	map_bh(bh_map, inode->i_sb, dblock);
+	bh_map->b_size = dblks << inode->i_blkbits;
+	set_buffer_new(bh_map);
+	return 0;
+}
+
 /**
  * gfs2_block_map - Map a block from an inode to a disk block
  * @inode: The inode
  * @lblock: The logical block number
  * @bh_map: The bh to be mapped
+ * @create: True if its ok to alloc blocks to satify the request
  *
- * Find the block number on the current device which corresponds to an
- * inode's block. If the block had to be created, "new" will be set.
+ * Sets buffer_mapped() if successful, sets buffer_boundary() if a
+ * read of metadata will be required before the next block can be
+ * mapped. Sets buffer_new() if new blocks were allocated.
  *
  * Returns: errno
  */
@@ -481,21 +573,21 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	unsigned int bsize = sdp->sd_sb.sb_bsize;
-	int error = 0;
-	int new = 0;
-	u64 dblock = 0;
-	int boundary;
-	unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
-	struct metapath mp;
-	u64 size;
+	const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
 	const u64 *arr = sdp->sd_heightsize;
-	BUG_ON(maxlen == 0);
+	__be64 *ptr;
+	u64 size;
+	struct metapath mp;
+	int ret;
+	int eob;
+	unsigned int len;
+	struct buffer_head *bh;
+	u8 height;
 
-	if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
-		return 0;
+	BUG_ON(maxlen == 0);
 
 	memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
-	bmap_lock(inode, create);
+	bmap_lock(ip, create);
 	clear_buffer_mapped(bh_map);
 	clear_buffer_new(bh_map);
 	clear_buffer_boundary(bh_map);
@@ -503,56 +595,50 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 		bsize = sdp->sd_jbsize;
 		arr = sdp->sd_jheightsize;
 	}
-	size = (lblock + 1) * bsize;
 
-	error = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
-	if (error)
-		goto out_fail;
-
-	if (size > arr[ip->i_height]) {
-		u8 height = ip->i_height;
-		if (!create)
-			goto out_ok;
-		while (size > arr[height])
-			height++;
-		error = build_height(inode, &mp, height);
-		if (error)
-			goto out_fail;
-	}
+	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
+	if (ret)
+		goto out;
 
-	find_metapath(ip, lblock, &mp);
-	error = lookup_metapath(inode, &mp, create, &new, &dblock);
-	if (error < 0)
-		goto out_fail;
-	boundary = error;
-
-	if (new) {
-		map_bh(bh_map, inode->i_sb, dblock);
-		if (boundary)
-			set_buffer_boundary(bh_map);
-		gfs2_trans_add_bh(ip->i_gl, mp.mp_bh[0], 1);
-		gfs2_dinode_out(ip, mp.mp_bh[0]->b_data);
-		set_buffer_new(bh_map);
-		goto out_ok;
-	}
+	height = ip->i_height;
+	size = (lblock + 1) * bsize;
+	while (size > arr[height])
+		height++;
+	find_metapath(sdp, lblock, &mp, height);
+	ret = 1;
+	if (height > ip->i_height || gfs2_is_stuffed(ip))
+		goto do_alloc;
+	ret = lookup_metapath(ip, &mp);
+	if (ret < 0)
+		goto out;
+	if (ret != ip->i_height)
+		goto do_alloc;
+	ptr = metapointer(ip->i_height - 1, &mp);
+	if (*ptr == 0)
+		goto do_alloc;
+	map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
+	bh = mp.mp_bh[ip->i_height - 1];
+	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
+	bh_map->b_size = (len << inode->i_blkbits);
+	if (eob)
+		set_buffer_boundary(bh_map);
+	ret = 0;
+out:
+	release_metapath(&mp);
+	bmap_unlock(ip, create);
+	return ret;
 
-	if (dblock) {
-		unsigned int len;
-		struct buffer_head *bh = mp.mp_bh[ip->i_height - 1];
-		__be64 *ptr = metapointer(&boundary, ip->i_height - 1, &mp);
-		map_bh(bh_map, inode->i_sb, dblock);
-		len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
-					 &boundary);
-		bh_map->b_size = (len << inode->i_blkbits);
-		if (boundary)
-			set_buffer_boundary(bh_map);
+do_alloc:
+	/* All allocations are done here, firstly check create flag */
+	if (!create) {
+		BUG_ON(gfs2_is_stuffed(ip));
+		ret = 0;
+		goto out;
 	}
-out_ok:
-	error = 0;
-out_fail:
-	release_metapath(&mp);
-	bmap_unlock(inode, create);
-	return error;
+
+	/* At this point ret is the tree depth of already allocated blocks */
+	ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
+	goto out;
 }
 
 /*
@@ -568,7 +654,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 	BUG_ON(!dblock);
 	BUG_ON(!new);
 
-	bh.b_size = 1 << (inode->i_blkbits + 5);
+	bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5));
 	ret = gfs2_block_map(inode, lblock, &bh, create);
 	*extlen = bh.b_size >> inode->i_blkbits;
 	*dblock = bh.b_blocknr;
@@ -835,38 +921,25 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 	if (error)
 		goto out_ipres;
 
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out_end_trans;
+
 	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
-		const u64 *arr = sdp->sd_heightsize;
 		if (gfs2_is_stuffed(ip)) {
 			error = gfs2_unstuff_dinode(ip, NULL);
 			if (error)
-				goto out_end_trans;
-		}
-
-		down_write(&ip->i_rw_mutex);
-		if (size > arr[ip->i_height]) {
-			struct metapath mp;
-			u8 height = ip->i_height;
-			while(size > arr[height])
-				height++;
-			error = build_height(&ip->i_inode, &mp, height);
+				goto out_brelse;
 		}
-		up_write(&ip->i_rw_mutex);
-		if (error)
-			goto out_end_trans;
 	}
 
 	ip->i_di.di_size = size;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out_end_trans;
-
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
 
+out_brelse:
+	brelse(dibh);
 out_end_trans:
 	gfs2_trans_end(sdp);
 out_ipres:
@@ -996,6 +1069,7 @@ out:
 
 static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
 {
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	unsigned int height = ip->i_height;
 	u64 lblock;
 	struct metapath mp;
@@ -1004,9 +1078,9 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
 	if (!size)
 		lblock = 0;
 	else
-		lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
+		lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
 
-	find_metapath(ip, lblock, &mp);
+	find_metapath(sdp, lblock, &mp, ip->i_height);
 	gfs2_alloc_get(ip);
 
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 34dc8dfaba12..a3753c7989f7 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -159,6 +159,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
 	unsigned int o;
 	int copied = 0;
 	int error = 0;
+	int new = 0;
 
 	if (!size)
 		return 0;
@@ -183,7 +184,6 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
 	while (copied < size) {
 		unsigned int amount;
 		struct buffer_head *bh;
-		int new = 0;
 
 		amount = size - copied;
 		if (amount > sdp->sd_sb.sb_bsize - o)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 77eba0a38040..4291375cecc6 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1357,16 +1357,18 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 		gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
 		gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
 			    bi->bi_len, blk, new_state);
-		while(*n < elen) {
+		goal = blk;
+		while (*n < elen) {
 			goal++;
-			if (goal >= (bi->bi_len / GFS2_NBBY))
+			if (goal >= (bi->bi_len * GFS2_NBBY))
 				break;
 			if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
 			    GFS2_BLKST_FREE)
 				break;
-			(*n)++;
 			gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
-				    bi->bi_offset, bi->bi_len, blk, new_state);
+				    bi->bi_offset, bi->bi_len, goal,
+				    new_state);
+			(*n)++;
 		}
 	}
 
-- 
cgit v1.2.3


From 7dc2cf1c8ffbd471722f1aa479bc68d4df1c9edc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 25 Feb 2008 18:58:36 +0100
Subject: [GFS2] fix file_system_type leak on gfs2meta mount

get_gfs2_sb does a get_fs_type without doing a put_filesystem and
thus leaking a file_system_type reference everytime it's called.

Just use gfs2_fs_type directly instead of doing the lookup and thus
fix the problem.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c4b7a210c0c0..63d5fd203d3f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -943,7 +943,6 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
 {
 	struct kstat stat;
 	struct nameidata nd;
-	struct file_system_type *fstype;
 	struct super_block *sb = NULL, *s;
 	int error;
 
@@ -955,8 +954,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
 	}
 	error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
 
-	fstype = get_fs_type("gfs2");
-	list_for_each_entry(s, &fstype->fs_supers, s_instances) {
+	list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
 		if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
 		    (S_ISDIR(stat.mode) &&
 		     s == nd.path.dentry->d_inode->i_sb)) {
-- 
cgit v1.2.3


From d83225d45d2b76175279abb2a3d7ee325a09aba8 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Tue, 26 Feb 2008 15:25:03 +0800
Subject: [GFS2] remove gfs2_dev_iops

struct inode_operations gfs2_dev_iops is always the same as gfs2_file_iops,
since Jan 2006, when GFS2 merged into mainstream kernel.

So one of them could be removed.

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c     |  2 +-
 fs/gfs2/ops_inode.c | 10 ----------
 fs/gfs2/ops_inode.h |  1 -
 3 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 810ff023fb14..7b9f31e5d6d3 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -149,7 +149,7 @@ void gfs2_set_iop(struct inode *inode)
 	} else if (S_ISLNK(mode)) {
 		inode->i_op = &gfs2_symlink_iops;
 	} else {
-		inode->i_op = &gfs2_dev_iops;
+		inode->i_op = &gfs2_file_iops;
 	}
 
 	unlock_new_inode(inode);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index af7097a514c1..6cbbb5b46ce4 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1149,16 +1149,6 @@ const struct inode_operations gfs2_file_iops = {
 	.removexattr = gfs2_removexattr,
 };
 
-const struct inode_operations gfs2_dev_iops = {
-	.permission = gfs2_permission,
-	.setattr = gfs2_setattr,
-	.getattr = gfs2_getattr,
-	.setxattr = gfs2_setxattr,
-	.getxattr = gfs2_getxattr,
-	.listxattr = gfs2_listxattr,
-	.removexattr = gfs2_removexattr,
-};
-
 const struct inode_operations gfs2_dir_iops = {
 	.create = gfs2_create,
 	.lookup = gfs2_lookup,
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index fd8cee231e1d..14b4b797622a 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -15,7 +15,6 @@
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
-extern const struct inode_operations gfs2_dev_iops;
 extern const struct file_operations gfs2_file_fops;
 extern const struct file_operations gfs2_dir_fops;
 extern const struct file_operations gfs2_file_fops_nolock;
-- 
cgit v1.2.3


From 43a33c53cc9131a537522ab9736c6e4c03ddf57a Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Tue, 26 Feb 2008 15:25:04 +0800
Subject: [GFS2] re-support special inode

a previous commit removed call to
init_special_inode from inode lookuping, this cause problems as:

 # mknod /mnt/gfs2/dev/null c 1 3
 # cat /mnt/gfs2/dev/null
 cat: /mnt/gfs2/dev/null: Invalid argument

without special inode, GFS2 cannot support char device file,
block device file, fifo pipe, and socket file, lose many important
features as a common file system.

this one line patch re add special inode support.

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7b9f31e5d6d3..92ea9afacb17 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -150,6 +150,7 @@ void gfs2_set_iop(struct inode *inode)
 		inode->i_op = &gfs2_symlink_iops;
 	} else {
 		inode->i_op = &gfs2_file_iops;
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 	}
 
 	unlock_new_inode(inode);
-- 
cgit v1.2.3


From 105284970ba7d0d0ff4b97e57728eac7adf6a42a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 27 Feb 2008 17:56:27 +0000
Subject: [GFS2] Need to ensure that sector_t is 64bits for GFS2

We need to ensure that sector_t is 64bits for GFS2, so that we need to
depend on LBD as well as LSF.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index d147b53ef527..7f7947e3dfbb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on EXPERIMENTAL && (64BIT || LSF)
+	depends on EXPERIMENTAL && (64BIT || (LSF && LBD))
 	select FS_POSIX_ACL
 	select CRC32
 	help
-- 
cgit v1.2.3


From 182fe5abd8ebbb3a00c1be91f44e4783e139918c Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Mon, 3 Mar 2008 21:54:21 +0300
Subject: [GFS2] possible null pointer dereference fixup

gfs2_alloc_get may fail so we have to check it to prevent
NULL pointer dereference.

Signed-off-by: Cyrill Gorcunov <gorcunov@gamil.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c        |  5 ++++-
 fs/gfs2/dir.c         | 10 +++++++---
 fs/gfs2/eattr.c       |  6 ++++++
 fs/gfs2/inode.c       |  7 ++++++-
 fs/gfs2/ops_address.c |  4 ++++
 fs/gfs2/ops_inode.c   | 11 ++++++++++-
 fs/gfs2/quota.c       |  9 +++++++--
 7 files changed, 44 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c1ee6355ced1..f7093aa69aae 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -900,6 +900,8 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 	int error;
 
 	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
 
 	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
@@ -1081,7 +1083,8 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
 		lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
 
 	find_metapath(sdp, lblock, &mp, ip->i_height);
-	gfs2_alloc_get(ip);
+	if (!gfs2_alloc_get(ip))
+		return -ENOMEM;
 
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index a3753c7989f7..94070ad8826b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1868,11 +1868,14 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 	if (!ht)
 		return -ENOMEM;
 
-	gfs2_alloc_get(dip);
+	if (!gfs2_alloc_get(dip)) {
+		error = -ENOMEM;
+		goto out;
+	}
 
 	error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
-		goto out;
+		goto out_put;
 
 	error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
 	if (error)
@@ -1946,8 +1949,9 @@ out_rlist:
 	gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
 out_qs:
 	gfs2_quota_unhold(dip);
-out:
+out_put:
 	gfs2_alloc_put(dip);
+out:
 	kfree(ht);
 	return error;
 }
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 76ead1acfcc7..288d5e6ad93a 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -318,6 +318,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 	int error;
 
 	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
 
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
@@ -681,6 +683,8 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	int error;
 
 	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
 
 	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
@@ -1464,6 +1468,8 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
 	int error;
 
 	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
 
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 92ea9afacb17..dcae2aa83f13 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -351,6 +351,8 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 	}
 
 	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
 
 	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
@@ -825,7 +827,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	int error;
 
 	munge_mode_uid_gid(dip, &mode, &uid, &gid);
-	gfs2_alloc_get(dip);
+	if (!gfs2_alloc_get(dip))
+		return -ENOMEM;
 
 	error = gfs2_quota_lock(dip, uid, gid);
 	if (error)
@@ -860,6 +863,8 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 	int error;
 
 	al = gfs2_alloc_get(dip);
+	if (!al)
+		return -ENOMEM;
 
 	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (error)
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index fbb4a6aa1583..2483d8741060 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -649,6 +649,10 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 
 	if (alloc_required) {
 		al = gfs2_alloc_get(ip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
 
 		error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 		if (error)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 6cbbb5b46ce4..34fe571e15ee 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -200,6 +200,10 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 
 	if (alloc_required) {
 		struct gfs2_alloc *al = gfs2_alloc_get(dip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_gunlock;
+		}
 
 		error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 		if (error)
@@ -716,6 +720,10 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 
 	if (alloc_required) {
 		struct gfs2_alloc *al = gfs2_alloc_get(ndip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_gunlock;
+		}
 
 		error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 		if (error)
@@ -953,7 +961,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
 		ogid = ngid = NO_QUOTA_CHANGE;
 
-	gfs2_alloc_get(ip);
+	if (!gfs2_alloc_get(ip))
+		return -ENOMEM;
 
 	error = gfs2_quota_lock(ip, nuid, ngid);
 	if (error)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a08dabd6ce90..636bccfd2bcf 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -617,8 +617,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	int err = -EIO;
 
 	if (gfs2_is_stuffed(ip)) {
-		struct gfs2_alloc *al = NULL;
-		al = gfs2_alloc_get(ip);
+		struct gfs2_alloc *al = gfs2_alloc_get(ip);
+		if (!al)
+			return -ENOMEM;
 		/* just request 1 blk */
 		al->al_requested = 1;
 		gfs2_inplace_reserve(ip);
@@ -729,6 +730,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 
 	if (nalloc) {
 		al = gfs2_alloc_get(ip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_gunlock;
+		}
 
 		al->al_requested = nalloc * (data_blocks + ind_blocks);
 
-- 
cgit v1.2.3


From 20b95bf2c4c5c28e093aa42699e67829b6cd7fd0 Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Thu, 6 Mar 2008 17:43:52 -0600
Subject: [GFS2] gfs2_adjust_quota has broken unstuffing code

This patch combines the 2 patches in bug 434736 to correct the lock
ordering in the unstuffing of the quota inode in gfs2_adjust_quota and
adjusting the number of revokes in gfs2_write_jdata_pagevec

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c |  3 +--
 fs/gfs2/quota.c       | 71 ++++++++++++++++++++++-----------------------------
 2 files changed, 31 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 2483d8741060..e72fd47d71eb 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -21,7 +21,6 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
 #include <linux/backing-dev.h>
-#include <linux/pagevec.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -278,7 +277,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 	int i;
 	int ret;
 
-	ret = gfs2_trans_begin(sdp, nrblocks, 0);
+	ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 636bccfd2bcf..c71f781db5d7 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -616,17 +616,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	s64 value;
 	int err = -EIO;
 
-	if (gfs2_is_stuffed(ip)) {
-		struct gfs2_alloc *al = gfs2_alloc_get(ip);
-		if (!al)
-			return -ENOMEM;
-		/* just request 1 blk */
-		al->al_requested = 1;
-		gfs2_inplace_reserve(ip);
+	if (gfs2_is_stuffed(ip))
 		gfs2_unstuff_dinode(ip, NULL);
-		gfs2_inplace_release(ip);
-		gfs2_alloc_put(ip);
-	}
+	
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return -ENOMEM;
@@ -691,7 +683,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	unsigned int qx, x;
 	struct gfs2_quota_data *qd;
 	loff_t offset;
-	unsigned int nalloc = 0;
+	unsigned int nalloc = 0, blocks;
 	struct gfs2_alloc *al = NULL;
 	int error;
 
@@ -728,34 +720,33 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 			nalloc++;
 	}
 
-	if (nalloc) {
-		al = gfs2_alloc_get(ip);
-		if (!al) {
-			error = -ENOMEM;
-			goto out_gunlock;
-		}
+	al = gfs2_alloc_get(ip);
+	if (!al) {
+		error = -ENOMEM;
+		goto out_gunlock;
+	}
+	/* 
+	 * 1 blk for unstuffing inode if stuffed. We add this extra
+	 * block to the reservation unconditionally. If the inode
+	 * doesn't need unstuffing, the block will be released to the 
+	 * rgrp since it won't be allocated during the transaction
+	 */
+	al->al_requested = 1;
+	/* +1 in the end for block requested above for unstuffing */
+	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1;
 
-		al->al_requested = nalloc * (data_blocks + ind_blocks);
+	if (nalloc)
+		al->al_requested += nalloc * (data_blocks + ind_blocks);		
+	error = gfs2_inplace_reserve(ip);
+	if (error)
+		goto out_alloc;
 
-		error = gfs2_inplace_reserve(ip);
-		if (error)
-			goto out_alloc;
-
-		error = gfs2_trans_begin(sdp,
-					 al->al_rgd->rd_length +
-					 num_qd * data_blocks +
-					 nalloc * ind_blocks +
-					 RES_DINODE + num_qd +
-					 RES_STATFS, 0);
-		if (error)
-			goto out_ipres;
-	} else {
-		error = gfs2_trans_begin(sdp,
-					 num_qd * data_blocks +
-					 RES_DINODE + num_qd, 0);
-		if (error)
-			goto out_gunlock;
-	}
+	if (nalloc)
+		blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
+
+	error = gfs2_trans_begin(sdp, blocks, 0);
+	if (error)
+		goto out_ipres;
 
 	for (x = 0; x < num_qd; x++) {
 		qd = qda[x];
@@ -774,11 +765,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 out_end_trans:
 	gfs2_trans_end(sdp);
 out_ipres:
-	if (nalloc)
-		gfs2_inplace_release(ip);
+	gfs2_inplace_release(ip);
 out_alloc:
-	if (nalloc)
-		gfs2_alloc_put(ip);
+	gfs2_alloc_put(ip);
 out_gunlock:
 	gfs2_glock_dq_uninit(&i_gh);
 out:
-- 
cgit v1.2.3


From 860b25d4a913a00331d333f8e207a088c7a1b84a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 10 Mar 2008 10:13:31 +0000
Subject: [GFS2] Remove drop of module ref where not needed

In an earlier patch "[GFS2] fix file_system_type leak on gfs2meta mount"
we removed the code to grab a ref to the module which was not needed
(since we know that the module cannot be unloaded at that time) so
this patch removes the code to drop that reference.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 63d5fd203d3f..5b518f73497a 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -998,7 +998,6 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 		error = PTR_ERR(new);
 		goto error;
 	}
-	module_put(fs_type->owner);
 	new->s_flags = flags;
 	strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
 	sb_set_blocksize(new, sb->s_blocksize);
-- 
cgit v1.2.3


From d82661d96993ac4efc1d54259ea85ffcd9b8bec6 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 10 Mar 2008 15:34:50 +0000
Subject: [GFS2] Streamline quota lock/check for no-quota case

This patch streamlines the quota checking in the "no quota" case by
making the check inline in the calling function, thus reducing the
number of function calls. Eventually we might be able to remove the
checks from the gfs2_quota_lock() and gfs2_quota_check() functions, but
currently we can't as there are a very few places in the code which need
to call these functions directly still.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/bmap.c        |  6 +-----
 fs/gfs2/eattr.c       |  6 +-----
 fs/gfs2/ops_address.c |  6 +-----
 fs/gfs2/ops_file.c    |  5 +----
 fs/gfs2/ops_inode.c   | 12 ++----------
 fs/gfs2/quota.h       | 17 +++++++++++++++++
 6 files changed, 23 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index f7093aa69aae..c19184f2e70e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -903,14 +903,10 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 	if (!al)
 		return -ENOMEM;
 
-	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	error = gfs2_quota_lock_check(ip);
 	if (error)
 		goto out;
 
-	error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-	if (error)
-		goto out_gunlock_q;
-
 	al->al_requested = sdp->sd_max_height + RES_DATA;
 
 	error = gfs2_inplace_reserve(ip);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 288d5e6ad93a..81755925a755 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -686,14 +686,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (!al)
 		return -ENOMEM;
 
-	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	error = gfs2_quota_lock_check(ip);
 	if (error)
 		goto out;
 
-	error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-	if (error)
-		goto out_gunlock_q;
-
 	al->al_requested = blks;
 
 	error = gfs2_inplace_reserve(ip);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index e72fd47d71eb..90a04a6e3789 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -653,14 +653,10 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 			goto out_unlock;
 		}
 
-		error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+		error = gfs2_quota_lock_check(ip);
 		if (error)
 			goto out_alloc_put;
 
-		error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-		if (error)
-			goto out_qunlock;
-
 		al->al_requested = data_blocks + ind_blocks;
 		error = gfs2_inplace_reserve(ip);
 		if (error)
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 2b25a5f7a1c7..e1b7d525a066 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -369,12 +369,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	if (al == NULL)
 		goto out_unlock;
 
-	ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	ret = gfs2_quota_lock_check(ip);
 	if (ret)
 		goto out_alloc_put;
-	ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-	if (ret)
-		goto out_quota_unlock;
 	al->al_requested = data_blocks + ind_blocks;
 	ret = gfs2_inplace_reserve(ip);
 	if (ret)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 34fe571e15ee..2686ad4c0029 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -205,14 +205,10 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 			goto out_gunlock;
 		}
 
-		error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+		error = gfs2_quota_lock_check(dip);
 		if (error)
 			goto out_alloc;
 
-		error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
-		if (error)
-			goto out_gunlock_q;
-
 		al->al_requested = sdp->sd_max_dirres;
 
 		error = gfs2_inplace_reserve(dip);
@@ -725,14 +721,10 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			goto out_gunlock;
 		}
 
-		error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+		error = gfs2_quota_lock_check(ndip);
 		if (error)
 			goto out_alloc;
 
-		error = gfs2_quota_check(ndip, ndip->i_inode.i_uid, ndip->i_inode.i_gid);
-		if (error)
-			goto out_gunlock_q;
-
 		al->al_requested = sdp->sd_max_dirres;
 
 		error = gfs2_inplace_reserve(ndip);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index a8be1417051f..3b7f4b0e5dfe 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -32,4 +32,21 @@ int gfs2_quota_init(struct gfs2_sbd *sdp);
 void gfs2_quota_scan(struct gfs2_sbd *sdp);
 void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
 
+static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int ret;
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return 0;
+	ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (ret)
+		return ret;
+	if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+		return 0;
+	ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
+	if (ret)
+		gfs2_quota_unlock(ip);
+	return ret;
+}
+
 #endif /* __QUOTA_DOT_H__ */
-- 
cgit v1.2.3


From 1f466a47e8a3a3e3b527b3285c7b9c8a837fb7ec Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 10 Mar 2008 18:17:47 -0500
Subject: [GFS2] Faster gfs2_bitfit algorithm

This version of the gfs2_bitfit algorithm includes the latest
suggestions from Steve Whitehouse.  It is typically eight to
ten times faster than the version we're using today.  If there
is a lot of metadata mixed in (lots of small files) the
algorithm is often 15 times faster, and given the right
conditions, I've seen peaks of 20 times faster.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 93 ++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 61 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 4291375cecc6..7e8f0b1d6c6e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/prefetch.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -33,6 +34,16 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
 
+#if BITS_PER_LONG == 32
+#define LBITMASK   (0x55555555UL)
+#define LBITSKIP55 (0x55555555UL)
+#define LBITSKIP00 (0x00000000UL)
+#else
+#define LBITMASK   (0x5555555555555555UL)
+#define LBITSKIP55 (0x5555555555555555UL)
+#define LBITSKIP00 (0x0000000000000000UL)
+#endif
+
 /*
  * These routines are used by the resource group routines (rgrp.c)
  * to keep track of block allocation.  Each block is represented by two
@@ -138,45 +149,63 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
 static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
 		       u8 old_state)
 {
-	const u8 *byte;
-	u32 blk = goal;
-	unsigned int bit, bitlong;
-	const unsigned long *plong;
-#if BITS_PER_LONG == 32
-	const unsigned long plong55 = 0x55555555;
-#else
-	const unsigned long plong55 = 0x5555555555555555;
-#endif
-
-	byte = buffer + (goal / GFS2_NBBY);
-	plong = (const unsigned long *)(buffer + (goal / GFS2_NBBY));
-	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-	bitlong = bit;
-
-	while (byte < buffer + buflen) {
-
-		if (bitlong == 0 && old_state == 0 && *plong == plong55) {
-			plong++;
-			byte += sizeof(unsigned long);
-			blk += sizeof(unsigned long) * GFS2_NBBY;
-			continue;
+	const u8 *byte, *start, *end;
+	int bit, startbit;
+	u32 g1, g2, misaligned;
+	unsigned long *plong;
+	unsigned long lskipval;
+
+	lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
+	g1 = (goal / GFS2_NBBY);
+	start = buffer + g1;
+	byte = start;
+        end = buffer + buflen;
+	g2 = ALIGN(g1, sizeof(unsigned long));
+	plong = (unsigned long *)(buffer + g2);
+	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+	misaligned = g2 - g1;
+	if (!misaligned)
+		goto ulong_aligned;
+/* parse the bitmap a byte at a time */
+misaligned:
+	while (byte < end) {
+		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
+			return goal +
+				(((byte - start) * GFS2_NBBY) +
+				 ((bit - startbit) >> 1));
 		}
-		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
-			return blk;
 		bit += GFS2_BIT_SIZE;
-		if (bit >= 8) {
+		if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
 			bit = 0;
 			byte++;
+			misaligned--;
+			if (!misaligned) {
+				plong = (unsigned long *)byte;
+				goto ulong_aligned;
+			}
 		}
-		bitlong += GFS2_BIT_SIZE;
-		if (bitlong >= sizeof(unsigned long) * 8) {
-			bitlong = 0;
-			plong++;
-		}
-
-		blk++;
 	}
+	return BFITNOENT;
 
+/* parse the bitmap a unsigned long at a time */
+ulong_aligned:
+	/* Stop at "end - 1" or else prefetch can go past the end and segfault.
+	   We could "if" it but we'd lose some of the performance gained.
+	   This way will only slow down searching the very last 4/8 bytes
+	   depending on architecture.  I've experimented with several ways
+	   of writing this section such as using an else before the goto
+	   but this one seems to be the fastest. */
+	while ((unsigned char *)plong < end - 1) {
+		prefetch(plong + 1);
+		if (((*plong) & LBITMASK) != lskipval)
+			break;
+		plong++;
+	}
+	if ((unsigned char *)plong < end) {
+		byte = (const u8 *)plong;
+		misaligned += sizeof(unsigned long) - 1;
+		goto misaligned;
+	}
 	return BFITNOENT;
 }
 
-- 
cgit v1.2.3


From f5a8cd020173c455705fc0095b7d299da6f8f87b Mon Sep 17 00:00:00 2001
From: "akpm@linux-foundation.org" <akpm@linux-foundation.org>
Date: Wed, 12 Mar 2008 14:01:29 -0700
Subject: [GFS2] fs/gfs2/recovery.c: suppress warnings

fs/gfs2/recovery.c: In function 'get_log_header':
fs/gfs2/recovery.c:152: warning: 'lh.lh_sequence' may be used uninitialized in this function
fs/gfs2/recovery.c:152: warning: 'lh.lh_flags' may be used uninitialized in this function
fs/gfs2/recovery.c:152: warning: 'lh.lh_tail' may be used uninitialized in this function
fs/gfs2/recovery.c:152: warning: 'lh.lh_blkno' may be used uninitialized in this function
fs/gfs2/recovery.c:152: warning: 'lh.lh_hash' may be used uninitialized in this function

Cc: David Teigland <teigland@redhat.com>
Cc: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/recovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index b17d3b8b2321..06dcdc04627d 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -149,7 +149,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
 			  struct gfs2_log_header_host *head)
 {
 	struct buffer_head *bh;
-	struct gfs2_log_header_host lh;
+	struct gfs2_log_header_host uninitialized_var(lh);
 	const u32 nothing = 0;
 	u32 hash;
 	int error;
-- 
cgit v1.2.3


From 58e9fee13e579df44922172dbe3c9e3ba3edf7a3 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Fri, 14 Mar 2008 13:52:52 -0500
Subject: [GFS2] Invalidate cache at correct point

GFS2 wasn't invalidating its cache before it called into the lock manager
with a request that could potentially drop a lock.  This was leaving a
window where the lock could be actually be held by another node, but the
file's page cache would still appear valid, causing coherency problems.
This patch moves the cache invalidation to before the lock manager call
when dropping a lock. It also adds the option to the lock_dlm lock
manager to not use conversion mode deadlock avoidance, which, on a
conversion from shared to exclusive, could internally drop the lock, and
then reacquire in. GFS2 now asks lock_dlm to not do this.  Instead, GFS2
manually drops the lock and reacquires it.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c              | 35 +++++++++++++++++++++--------------
 fs/gfs2/incore.h             |  1 +
 fs/gfs2/locking/dlm/lock.c   |  3 ++-
 fs/gfs2/locking/dlm/thread.c | 10 +++++++++-
 fs/gfs2/ops_fstype.c         |  2 +-
 include/linux/lm_interface.h | 10 ++++++++++
 6 files changed, 44 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 63981e2fb835..d636b3e80f5d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -764,7 +764,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	struct gfs2_holder *gh = gl->gl_req_gh;
 
 	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
 	gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -772,8 +772,14 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 
 	state_change(gl, LM_ST_UNLOCKED);
 
-	if (glops->go_inval)
-		glops->go_inval(gl, DIO_METADATA);
+	if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
+		spin_lock(&gl->gl_spin);
+		gh->gh_error = 0;
+		spin_unlock(&gl->gl_spin);
+		gfs2_glock_xmote_th(gl, gl->gl_req_gh);
+		gfs2_glock_put(gl);
+		return;
+	}
 
 	spin_lock(&gl->gl_spin);
 	gfs2_demote_wake(gl);
@@ -794,7 +800,6 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_holder *gh = gl->gl_req_gh;
-	int prev_state = gl->gl_state;
 	int op_done = 1;
 
 	if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
@@ -808,16 +813,6 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 
 	state_change(gl, ret & LM_OUT_ST_MASK);
 
-	if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
-		if (glops->go_inval)
-			glops->go_inval(gl, DIO_METADATA);
-	} else if (gl->gl_state == LM_ST_DEFERRED) {
-		/* We might not want to do this here.
-		   Look at moving to the inode glops. */
-		if (glops->go_inval)
-			glops->go_inval(gl, 0);
-	}
-
 	/*  Deal with each possible exit condition  */
 
 	if (!gh) {
@@ -837,6 +832,14 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 		}
 	} else {
 		spin_lock(&gl->gl_spin);
+		if (ret & LM_OUT_CONV_DEADLK) {
+			gh->gh_error = 0;
+			set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
+			spin_unlock(&gl->gl_spin);
+			gfs2_glock_drop_th(gl);
+			gfs2_glock_put(gl);
+			return;
+		}
 		list_del_init(&gh->gh_list);
 		gh->gh_error = -EIO;
 		if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 
@@ -910,6 +913,8 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
 
 	if (glops->go_xmote_th)
 		glops->go_xmote_th(gl);
+	if (state == LM_ST_DEFERRED && glops->go_inval)
+		glops->go_inval(gl, DIO_METADATA);
 
 	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
 	gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -952,6 +957,8 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
 
 	if (glops->go_xmote_th)
 		glops->go_xmote_th(gl);
+	if (glops->go_inval)
+		glops->go_inval(gl, DIO_METADATA);
 
 	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
 	gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4ba2ea63119d..9c2c0b90b22a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -167,6 +167,7 @@ enum {
 	GLF_DEMOTE_IN_PROGRESS	= 6,
 	GLF_LFLUSH		= 7,
 	GLF_WAITERS2		= 8,
+	GLF_CONV_DEADLK		= 9,
 };
 
 struct gfs2_glock {
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index 542a797ac89a..53a6ab3c0919 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -137,7 +137,8 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
 
 		/* Conversion deadlock avoidance by DLM */
 
-		if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
+		if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
+		    !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
 		    !(lkf & DLM_LKF_NOQUEUE) &&
 		    cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
 			lkf |= DLM_LKF_CONVDEADLK;
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index 521694fc19d6..e53db6fd28ab 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -135,7 +135,15 @@ static void process_complete(struct gdlm_lock *lp)
 			 lp->lksb.sb_status, lp->lockname.ln_type,
 			 (unsigned long long)lp->lockname.ln_number,
 			 lp->flags);
-		return;
+		if (lp->lksb.sb_status == -EDEADLOCK &&
+		    lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
+			lp->req = lp->cur;
+			acb.lc_ret |= LM_OUT_CONV_DEADLK;
+			if (lp->cur == DLM_LOCK_IV)
+				lp->lksb.sb_lkid = 0;
+			goto out;
+		} else
+			return;
 	}
 
 	/*
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 5b518f73497a..ef9c6c4f80f6 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -723,7 +723,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 {
 	char *proto = sdp->sd_proto_name;
 	char *table = sdp->sd_table_name;
-	int flags = 0;
+	int flags = LM_MFLAG_CONV_NODROP;
 	int error;
 
 	if (sdp->sd_args.ar_spectator)
diff --git a/include/linux/lm_interface.h b/include/linux/lm_interface.h
index 1418fdc9ac02..f274997bc283 100644
--- a/include/linux/lm_interface.h
+++ b/include/linux/lm_interface.h
@@ -21,9 +21,15 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data);
  * modify the filesystem.  The lock module shouldn't assign a journal to the FS
  * mount.  It shouldn't send recovery callbacks to the FS mount.  If the node
  * dies or withdraws, all locks can be wiped immediately.
+ *
+ * LM_MFLAG_CONV_NODROP
+ * Do not allow the dlm to internally resolve conversion deadlocks by demoting
+ * the lock to unlocked and then reacquiring it in the requested mode. Instead,
+ * it should cancel the request and return LM_OUT_CONV_DEADLK.
  */
 
 #define LM_MFLAG_SPECTATOR	0x00000001
+#define LM_MFLAG_CONV_NODROP	0x00000002
 
 /*
  * lm_lockstruct flags
@@ -110,6 +116,9 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data);
  *
  * LM_OUT_ASYNC
  * The result of the request will be returned in an LM_CB_ASYNC callback.
+ *
+ * LM_OUT_CONV_DEADLK
+ * The lock request was canceled do to a conversion deadlock.
  */
 
 #define LM_OUT_ST_MASK		0x00000003
@@ -117,6 +126,7 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data);
 #define LM_OUT_CANCELED		0x00000008
 #define LM_OUT_ASYNC		0x00000080
 #define LM_OUT_ERROR		0x00000100
+#define LM_OUT_CONV_DEADLK	0x00000200
 
 /*
  * lm_callback_t types
-- 
cgit v1.2.3


From 773adff8e983cba1f5844c3be3be224ca6645f26 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 24 Mar 2008 19:08:48 +0100
Subject: [GFS2] test for IS_ERR rather than 0

The function gfs2_inode_lookup always returns either a valid pointer or a
value made with ERR_PTR, so its result should be tested with IS_ERR, not
with a test for 0.

The problem was found using the following semantic match.
(http://www.emn.fr/x-info/coccinelle/)

//<smpl>
@a@
expression E, E1;
statement S,S1;
position p;
@@

E = gfs2_inode_lookup(...)
... when != E = E1
if@p (E) S else S1

@n@
position a.p;
expression E,E1;
statement S,S1;
@@

E = NULL
... when != E = E1
if@p (E) S else S1

@depends on !n@
expression E;
statement S,S1;
position a.p;
@@

* if@p (E)
  S else S1
//</smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_export.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 334c7f85351b..990d9f4bc463 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -204,8 +204,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
 					inum->no_addr,
 					0, 0);
-	if (!inode)
-		goto fail;
 	if (IS_ERR(inode)) {
 		error = PTR_ERR(inode);
 		goto fail;
-- 
cgit v1.2.3


From 856f6ff7a3132c8e412b23a7b9157b68ac9a2baf Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 2 Apr 2008 00:10:04 -0700
Subject: [NETNS]: Remove ifdef CONFIG_NET braces in fs/proc/proc_net.c.

They are redundant as this file is linked in iff CONFIG_NET is turned
on.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7034facf8b8f..13cd7835d0df 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -51,7 +51,6 @@ int seq_open_net(struct inode *ino, struct file *f,
 }
 EXPORT_SYMBOL_GPL(seq_open_net);
 
-#ifdef CONFIG_NET
 int seq_release_net(struct inode *ino, struct file *f)
 {
 	struct seq_file *seq;
@@ -219,4 +218,3 @@ int __init proc_net_init(void)
 
 	return register_pernet_subsys(&proc_net_ns_ops);
 }
-#endif /* CONFIG_NET */
-- 
cgit v1.2.3


From 16c5f06f15ad4e5a5d6e90b78ffb1ac14319e445 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 9 Apr 2008 09:33:41 -0400
Subject: [GFS2] fix GFP_KERNEL misuses

There are several places where GFP_KERNEL allocations happen under a glock,
which will result in hangs if we're under memory pressure and go to re-enter the
fs in order to flush stuff out.  This patch changes the culprits to GFS_NOFS to
keep this problem from happening.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/acl.c                 |  6 +++---
 fs/gfs2/dir.c                 | 10 +++++-----
 fs/gfs2/eattr.c               |  4 ++--
 fs/gfs2/inode.c               |  2 +-
 fs/gfs2/locking/dlm/lock.c    |  4 ++--
 fs/gfs2/locking/nolock/main.c |  2 +-
 fs/gfs2/quota.c               |  8 ++++----
 fs/gfs2/recovery.c            |  2 +-
 fs/gfs2/super.c               |  4 ++--
 9 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 1047a8c7226a..3e9bd46f27e3 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -116,7 +116,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
 		goto out;
 
 	er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
-	er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
+	er.er_data = kmalloc(er.er_data_len, GFP_NOFS);
 	error = -ENOMEM;
 	if (!er.er_data)
 		goto out;
@@ -222,7 +222,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 		return error;
 	}
 
-	clone = posix_acl_clone(acl, GFP_KERNEL);
+	clone = posix_acl_clone(acl, GFP_NOFS);
 	error = -ENOMEM;
 	if (!clone)
 		goto out;
@@ -272,7 +272,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
 	if (!acl)
 		return gfs2_setattr_simple(ip, attr);
 
-	clone = posix_acl_clone(acl, GFP_KERNEL);
+	clone = posix_acl_clone(acl, GFP_NOFS);
 	error = -ENOMEM;
 	if (!clone)
 		goto out;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 94070ad8826b..eed040d8ba3a 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1089,7 +1089,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 
 	/*  Allocate both the "from" and "to" buffers in one big chunk  */
 
-	buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
+	buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
 
 	for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
 		error = gfs2_dir_read_data(dip, (char *)buf,
@@ -1378,7 +1378,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	hash = gfs2_dir_offset2hash(*offset);
 	index = hash >> (32 - dip->i_depth);
 
-	lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+	lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
 	if (!lp)
 		return -ENOMEM;
 
@@ -1443,7 +1443,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 
 	error = -ENOMEM;
 	/* 96 is max number of dirents which can be stuffed into an inode */
-	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL);
+	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
 	if (darr) {
 		g.pdent = darr;
 		g.offset = 0;
@@ -1789,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 		return -EIO;
 	}
 
-	lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+	lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
 	if (!lp)
 		return -ENOMEM;
 
@@ -1864,7 +1864,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
-	ht = kzalloc(size, GFP_KERNEL);
+	ht = kzalloc(size, GFP_NOFS);
 	if (!ht)
 		return -ENOMEM;
 
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 81755925a755..e3f76f451b0a 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -448,7 +448,7 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 	unsigned int x;
 	int error = 0;
 
-	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
 	if (!bh)
 		return -ENOMEM;
 
@@ -1206,7 +1206,7 @@ static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
 	unsigned int x;
 	int error;
 
-	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
 	if (!bh)
 		return -ENOMEM;
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index dcae2aa83f13..3a9ef526c308 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1231,7 +1231,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
 
 	x = ip->i_di.di_size + 1;
 	if (x > *len) {
-		*buf = kmalloc(x, GFP_KERNEL);
+		*buf = kmalloc(x, GFP_NOFS);
 		if (!*buf) {
 			error = -ENOMEM;
 			goto out_brelse;
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index 53a6ab3c0919..cf7ea8abec87 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -165,7 +165,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
 {
 	struct gdlm_lock *lp;
 
-	lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
+	lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS);
 	if (!lp)
 		return -ENOMEM;
 
@@ -383,7 +383,7 @@ static int gdlm_add_lvb(struct gdlm_lock *lp)
 {
 	char *lvb;
 
-	lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
+	lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
 	if (!lvb)
 		return -ENOMEM;
 
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
index d3b8ce6fbbe3..284a5ece8d94 100644
--- a/fs/gfs2/locking/nolock/main.c
+++ b/fs/gfs2/locking/nolock/main.c
@@ -140,7 +140,7 @@ static int nolock_hold_lvb(void *lock, char **lvbp)
 	struct nolock_lockspace *nl = lock;
 	int error = 0;
 
-	*lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
+	*lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
 	if (!*lvbp)
 		error = -ENOMEM;
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c71f781db5d7..56aaf915c59a 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -94,7 +94,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
 	struct gfs2_quota_data *qd;
 	int error;
 
-	qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
+	qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS);
 	if (!qd)
 		return -ENOMEM;
 
@@ -690,7 +690,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
 			      &data_blocks, &ind_blocks);
 
-	ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
+	ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS);
 	if (!ghs)
 		return -ENOMEM;
 
@@ -1118,12 +1118,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 	error = -ENOMEM;
 
 	sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
-				       sizeof(unsigned char *), GFP_KERNEL);
+				       sizeof(unsigned char *), GFP_NOFS);
 	if (!sdp->sd_quota_bitmap)
 		return error;
 
 	for (x = 0; x < sdp->sd_quota_chunks; x++) {
-		sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
+		sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
 		if (!sdp->sd_quota_bitmap[x])
 			goto fail;
 	}
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 06dcdc04627d..2888e4b4b1c5 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -68,7 +68,7 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
 		return 0;
 	}
 
-	rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
+	rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS);
 	if (!rr)
 		return -ENOMEM;
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 88497b0b7339..7aeacbc65f35 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -210,7 +210,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 	struct page *page;
 	struct bio *bio;
 
-	page = alloc_page(GFP_KERNEL);
+	page = alloc_page(GFP_NOFS);
 	if (unlikely(!page))
 		return -ENOBUFS;
 
@@ -218,7 +218,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 	ClearPageDirty(page);
 	lock_page(page);
 
-	bio = bio_alloc(GFP_KERNEL, 1);
+	bio = bio_alloc(GFP_NOFS, 1);
 	if (unlikely(!bio)) {
 		__free_page(page);
 		return -ENOBUFS;
-- 
cgit v1.2.3


From f1f73ba8e9b0eb97f90c6256b94afeb035d97562 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 Feb 2008 12:38:02 +0100
Subject: udf: kill udf_set_blocksize

This helper has been quite useless since sb_min_blocksize was introduced
and is misnamed while we're at it.  Just opencode the few lines in the
caller instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 43 ++++---------------------------------------
 1 file changed, 4 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index f3ac4abfc946..3723f04c0799 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -587,44 +587,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	return 0;
 }
 
-/*
- * udf_set_blocksize
- *
- * PURPOSE
- *	Set the block size to be used in all transfers.
- *
- * DESCRIPTION
- *	To allow room for a DMA transfer, it is best to guess big when unsure.
- *	This routine picks 2048 bytes as the blocksize when guessing. This
- *	should be adequate until devices with larger block sizes become common.
- *
- *	Note that the Linux kernel can currently only deal with blocksizes of
- *	512, 1024, 2048, 4096, and 8192 bytes.
- *
- * PRE-CONDITIONS
- *	sb			Pointer to _locked_ superblock.
- *
- * POST-CONDITIONS
- *	sb->s_blocksize		Blocksize.
- *	sb->s_blocksize_bits	log2 of blocksize.
- *	<return>	0	Blocksize is valid.
- *	<return>	1	Blocksize is invalid.
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
- */
-static int udf_set_blocksize(struct super_block *sb, int bsize)
-{
-	if (!sb_min_blocksize(sb, bsize)) {
-		udf_debug("Bad block size (%d)\n", bsize);
-		printk(KERN_ERR "udf: bad block size (%d)\n", bsize);
-		return 0;
-	}
-
-	return sb->s_blocksize;
-}
-
 static int udf_vrs(struct super_block *sb, int silent)
 {
 	struct volStructDesc *vsd = NULL;
@@ -1776,8 +1738,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sbi->s_nls_map = uopt.nls_map;
 
 	/* Set the block size for all transfers */
-	if (!udf_set_blocksize(sb, uopt.blocksize))
+	if (!sb_min_blocksize(sb, uopt.blocksize)) {
+		udf_debug("Bad block size (%d)\n", uopt.blocksize);
+		printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
 		goto error_out;
+	}
 
 	if (uopt.session == 0xFFFFFFFF)
 		sbi->s_session = udf_get_last_session(sb);
-- 
cgit v1.2.3


From b1e321266d8797b21eac433b11458ac65b098938 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 Feb 2008 12:38:48 +0100
Subject: udf: kill useless file header comments for vfs method implementations

There's not need to document vfs method invocation rules, we have
Documentation/filesystems/vfs.txt and Documentation/filesystems/Locking
for that.  Also a lot of these comments where either plain wrong or
horrible out of date.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/dir.c   | 26 --------------------------
 fs/udf/file.c  | 46 ----------------------------------------------
 fs/udf/inode.c | 32 +-------------------------------
 fs/udf/namei.c | 33 ---------------------------------
 fs/udf/super.c | 42 ------------------------------------------
 5 files changed, 1 insertion(+), 178 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 8d8643ada199..31feef916fb7 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -188,32 +188,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	return 0;
 }
 
-/*
- * udf_readdir
- *
- * PURPOSE
- *	Read a directory entry.
- *
- * DESCRIPTION
- *	Optional - sys_getdents() will return -ENOTDIR if this routine is not
- *	available.
- *
- *	Refer to sys_getdents() in fs/readdir.c
- *	sys_getdents() -> .
- *
- * PRE-CONDITIONS
- *	filp			Pointer to directory file.
- *	buf			Pointer to directory entry buffer.
- *	filldir			Pointer to filldir function.
- *
- * POST-CONDITIONS
- *	<return>		>=0 on success.
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
- */
-
 static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct inode *dir = filp->f_path.dentry->d_inode;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 97c71ae7c689..822cc4071527 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -144,40 +144,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return retval;
 }
 
-/*
- * udf_ioctl
- *
- * PURPOSE
- *	Issue an ioctl.
- *
- * DESCRIPTION
- *	Optional - sys_ioctl() will return -ENOTTY if this routine is not
- *	available, and the ioctl cannot be handled without filesystem help.
- *
- *	sys_ioctl() handles these ioctls that apply only to regular files:
- *		FIBMAP [requires udf_block_map()], FIGETBSZ, FIONREAD
- *	These ioctls are also handled by sys_ioctl():
- *		FIOCLEX, FIONCLEX, FIONBIO, FIOASYNC
- *	All other ioctls are passed to the filesystem.
- *
- *	Refer to sys_ioctl() in fs/ioctl.c
- *	sys_ioctl() -> .
- *
- * PRE-CONDITIONS
- *	inode			Pointer to inode that ioctl was issued on.
- *	filp			Pointer to file that ioctl was issued on.
- *	cmd			The ioctl command.
- *	arg			The ioctl argument [can be interpreted as a
- *				user-space pointer if desired].
- *
- * POST-CONDITIONS
- *	<return>		Success (>=0) or an error code (<=0) that
- *				sys_ioctl() will return.
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
- */
 int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 	      unsigned long arg)
 {
@@ -225,18 +191,6 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 	return result;
 }
 
-/*
- * udf_release_file
- *
- * PURPOSE
- *  Called when all references to the file are closed
- *
- * DESCRIPTION
- *  Discard prealloced blocks
- *
- * HISTORY
- *
- */
 static int udf_release_file(struct inode *inode, struct file *filp)
 {
 	if (filp->f_mode & FMODE_WRITE) {
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 24cfa55d0fdc..dc2f946dfca9 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -66,22 +66,7 @@ static void udf_update_extents(struct inode *,
 			       struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
-/*
- * udf_delete_inode
- *
- * PURPOSE
- *	Clean-up before the specified inode is destroyed.
- *
- * DESCRIPTION
- *	This routine is called when the kernel destroys an inode structure
- *	ie. when iput() finds i_count == 0.
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
- *
- *  Called at the last iput() if i_nlink is zero.
- */
+
 void udf_delete_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
@@ -1416,21 +1401,6 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
 	return mode;
 }
 
-/*
- * udf_write_inode
- *
- * PURPOSE
- *	Write out the specified inode.
- *
- * DESCRIPTION
- *	This routine is called whenever an inode is synced.
- *	Currently this routine is just a placeholder.
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
- */
-
 int udf_write_inode(struct inode *inode, int sync)
 {
 	int ret;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 112a5fb0b27b..ac7f72779e93 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -251,39 +251,6 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	return NULL;
 }
 
-/*
- * udf_lookup
- *
- * PURPOSE
- *	Look-up the inode for a given name.
- *
- * DESCRIPTION
- *	Required - lookup_dentry() will return -ENOTDIR if this routine is not
- *	available for a directory. The filesystem is useless if this routine is
- *	not available for at least the filesystem's root directory.
- *
- *	This routine is passed an incomplete dentry - it must be completed by
- *	calling d_add(dentry, inode). If the name does not exist, then the
- *	specified inode must be set to null. An error should only be returned
- *	when the lookup fails for a reason other than the name not existing.
- *	Note that the directory inode semaphore is held during the call.
- *
- *	Refer to lookup_dentry() in fs/namei.c
- *	lookup_dentry() -> lookup() -> real_lookup() -> .
- *
- * PRE-CONDITIONS
- *	dir			Pointer to inode of parent directory.
- *	dentry			Pointer to dentry to complete.
- *	nd			Pointer to lookup nameidata
- *
- * POST-CONDITIONS
- *	<return>		Zero on success.
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
- */
-
 static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 				 struct nameidata *nd)
 {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3723f04c0799..53e1d6e6dd31 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1670,22 +1670,6 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
 		vfree(bitmap);
 }
 
-/*
- * udf_read_super
- *
- * PURPOSE
- *	Complete the specified super block.
- *
- * PRE-CONDITIONS
- *	sb			Pointer to superblock to complete - never NULL.
- *	sb->s_dev		Device to read suberblock from.
- *	options			Pointer to mount options.
- *	silent			Silent flag.
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
- */
 static int udf_fill_super(struct super_block *sb, void *options, int silent)
 {
 	int i;
@@ -1913,19 +1897,6 @@ void udf_warning(struct super_block *sb, const char *function,
 	       sb->s_id, function, error_buf);
 }
 
-/*
- * udf_put_super
- *
- * PURPOSE
- *	Prepare for destruction of the superblock.
- *
- * DESCRIPTION
- *	Called before the filesystem is unmounted.
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
- */
 static void udf_put_super(struct super_block *sb)
 {
 	int i;
@@ -1961,19 +1932,6 @@ static void udf_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
-/*
- * udf_stat_fs
- *
- * PURPOSE
- *	Return info about the filesystem.
- *
- * DESCRIPTION
- *	Called by sys_statfs()
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
- */
 static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
-- 
cgit v1.2.3


From 15aebd2866b21a568d8defec134bf29f9aea9088 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 Feb 2008 12:39:12 +0100
Subject: udf: move headers out include/linux/

There's really no reason to keep udf headers in include/linux as they're
not used by anything but fs/udf/.

This patch merges most of include/linux/udf_fs_i.h into fs/udf/udf_i.h,
include/linux/udf_fs_sb.h into fs/udf/udf_sb.h and
include/linux/udf_fs.h into fs/udf/udfdecl.h.

The only thing remaining in include/linux/ is a stub of udf_fs_i.h
defining the four user-visible udf ioctls.  It's also moved from
unifdef-y to headers-y because it can be included unconditionally now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/file.c             |   1 -
 fs/udf/ialloc.c           |   1 -
 fs/udf/lowlevel.c         |   1 -
 fs/udf/misc.c             |   1 -
 fs/udf/partition.c        |   1 -
 fs/udf/super.c            |   1 -
 fs/udf/symlink.c          |   1 -
 fs/udf/truncate.c         |   1 -
 fs/udf/udf_i.h            |  30 ++++++++++--
 fs/udf/udf_sb.h           |  93 ++++++++++++++++++++++++++++++++++++
 fs/udf/udfdecl.h          |  25 ++++++++--
 fs/udf/unicode.c          |   1 -
 include/linux/Kbuild      |   2 +-
 include/linux/udf_fs.h    |  51 --------------------
 include/linux/udf_fs_i.h  |  31 ------------
 include/linux/udf_fs_sb.h | 117 ----------------------------------------------
 16 files changed, 142 insertions(+), 216 deletions(-)
 delete mode 100644 include/linux/udf_fs.h
 delete mode 100644 include/linux/udf_fs_sb.h

(limited to 'fs')

diff --git a/fs/udf/file.c b/fs/udf/file.c
index 822cc4071527..0ed6e146a0d9 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -27,7 +27,6 @@
 
 #include "udfdecl.h"
 #include <linux/fs.h>
-#include <linux/udf_fs.h>
 #include <asm/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/string.h> /* memset */
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 84360315aca2..c3fb28eee646 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -21,7 +21,6 @@
 #include "udfdecl.h"
 #include <linux/fs.h>
 #include <linux/quotaops.h>
-#include <linux/udf_fs.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 579bae71e67e..703843f30ffd 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -23,7 +23,6 @@
 #include <linux/cdrom.h>
 #include <asm/uaccess.h>
 
-#include <linux/udf_fs.h>
 #include "udf_sb.h"
 
 unsigned int udf_get_last_session(struct super_block *sb)
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index a1d6da0caf71..581b6e4cc591 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -23,7 +23,6 @@
 
 #include <linux/fs.h>
 #include <linux/string.h>
-#include <linux/udf_fs.h>
 #include <linux/buffer_head.h>
 
 #include "udf_i.h"
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index fc533345ab89..307c9c33d184 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
 
 #include <linux/fs.h>
 #include <linux/string.h>
-#include <linux/udf_fs.h>
 #include <linux/slab.h>
 #include <linux/buffer_head.h>
 
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 53e1d6e6dd31..02815e92553b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -57,7 +57,6 @@
 #include <linux/seq_file.h>
 #include <asm/byteorder.h>
 
-#include <linux/udf_fs.h>
 #include "udf_sb.h"
 #include "udf_i.h"
 
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 6ec99221e50c..c3265e1385d4 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -23,7 +23,6 @@
 #include <asm/uaccess.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
-#include <linux/udf_fs.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index fe61be17cdab..6111d97902d7 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -22,7 +22,6 @@
 #include "udfdecl.h"
 #include <linux/fs.h>
 #include <linux/mm.h>
-#include <linux/udf_fs.h>
 #include <linux/buffer_head.h>
 
 #include "udf_i.h"
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index ccc52f16bf7d..d6d9a774a1c1 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,10 +1,32 @@
-#ifndef __LINUX_UDF_I_H
-#define __LINUX_UDF_I_H
+#ifndef _UDF_I_H
+#define _UDF_I_H
+
+struct udf_inode_info {
+	struct timespec		i_crtime;
+	/* Physical address of inode */
+	kernel_lb_addr		i_location;
+	__u64			i_unique;
+	__u32			i_lenEAttr;
+	__u32			i_lenAlloc;
+	__u64			i_lenExtents;
+	__u32			i_next_alloc_block;
+	__u32			i_next_alloc_goal;
+	unsigned		i_alloc_type : 3;
+	unsigned		i_efe : 1;
+	unsigned		i_use : 1;
+	unsigned		i_strat4096 : 1;
+	unsigned		reserved : 26;
+	union {
+		short_ad	*i_sad;
+		long_ad		*i_lad;
+		__u8		*i_data;
+	} i_ext;
+	struct inode vfs_inode;
+};
 
-#include <linux/udf_fs_i.h>
 static inline struct udf_inode_info *UDF_I(struct inode *inode)
 {
 	return list_entry(inode, struct udf_inode_info, vfs_inode);
 }
 
-#endif /* !defined(_LINUX_UDF_I_H) */
+#endif /* _UDF_I_H) */
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 737d1c604eea..fb2a6e9f8dac 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -1,6 +1,8 @@
 #ifndef __LINUX_UDF_SB_H
 #define __LINUX_UDF_SB_H
 
+#include <linux/mutex.h>
+
 /* Since UDF 2.01 is ISO 13346 based... */
 #define UDF_SUPER_MAGIC			0x15013346
 
@@ -38,6 +40,97 @@
 #define UDF_PART_FLAG_REWRITABLE	0x0040
 #define UDF_PART_FLAG_OVERWRITABLE	0x0080
 
+#define UDF_MAX_BLOCK_LOADED	8
+
+#define UDF_TYPE1_MAP15			0x1511U
+#define UDF_VIRTUAL_MAP15		0x1512U
+#define UDF_VIRTUAL_MAP20		0x2012U
+#define UDF_SPARABLE_MAP15		0x1522U
+
+#pragma pack(1) /* XXX(hch): Why?  This file just defines in-core structures */
+
+struct udf_sparing_data {
+	__u16	s_packet_len;
+	struct buffer_head *s_spar_map[4];
+};
+
+struct udf_virtual_data {
+	__u32	s_num_entries;
+	__u16	s_start_offset;
+};
+
+struct udf_bitmap {
+	__u32			s_extLength;
+	__u32			s_extPosition;
+	__u16			s_nr_groups;
+	struct buffer_head 	**s_block_bitmap;
+};
+
+struct udf_part_map {
+	union {
+		struct udf_bitmap	*s_bitmap;
+		struct inode		*s_table;
+	} s_uspace;
+	union {
+		struct udf_bitmap	*s_bitmap;
+		struct inode		*s_table;
+	} s_fspace;
+	__u32	s_partition_root;
+	__u32	s_partition_len;
+	__u16	s_partition_type;
+	__u16	s_partition_num;
+	union {
+		struct udf_sparing_data s_sparing;
+		struct udf_virtual_data s_virtual;
+	} s_type_specific;
+	__u32	(*s_partition_func)(struct super_block *, __u32, __u16, __u32);
+	__u16	s_volumeseqnum;
+	__u16	s_partition_flags;
+};
+
+#pragma pack()
+
+struct udf_sb_info {
+	struct udf_part_map	*s_partmaps;
+	__u8			s_volume_ident[32];
+
+	/* Overall info */
+	__u16			s_partitions;
+	__u16			s_partition;
+
+	/* Sector headers */
+	__s32			s_session;
+	__u32			s_anchor[4];
+	__u32			s_last_block;
+
+	struct buffer_head	*s_lvid_bh;
+
+	/* Default permissions */
+	mode_t			s_umask;
+	gid_t			s_gid;
+	uid_t			s_uid;
+
+	/* Root Info */
+	struct timespec		s_record_time;
+
+	/* Fileset Info */
+	__u16			s_serial_number;
+
+	/* highest UDF revision we have recorded to this media */
+	__u16			s_udfrev;
+
+	/* Miscellaneous flags */
+	__u32			s_flags;
+
+	/* Encoding info */
+	struct nls_table	*s_nls_map;
+
+	/* VAT inode */
+	struct inode		*s_vat_inode;
+
+	struct mutex		s_alloc_mutex;
+};
+
 static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 681dc2b66cdb..c9c75856af80 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -1,18 +1,37 @@
 #ifndef __UDF_DECL_H
 #define __UDF_DECL_H
 
-#include <linux/udf_fs.h>
 #include "ecma_167.h"
 #include "osta_udf.h"
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/udf_fs_i.h>
-#include <linux/udf_fs_sb.h>
 #include <linux/buffer_head.h>
+#include <linux/udf_fs_i.h>
 
+#include "udf_sb.h"
 #include "udfend.h"
 
+#define UDF_PREALLOCATE
+#define UDF_DEFAULT_PREALLOC_BLOCKS	8
+
+#undef UDFFS_DEBUG
+
+#ifdef UDFFS_DEBUG
+#define udf_debug(f, a...) \
+do { \
+	printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \
+		__FILE__, __LINE__, __func__); \
+	printk(f, ##a); \
+} while (0)
+#else
+#define udf_debug(f, a...) /**/
+#endif
+
+#define udf_info(f, a...) \
+	printk(KERN_INFO "UDF-fs INFO " f, ##a);
+
+
 #define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) )
 #define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) )
 
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index e533b11703bf..b9de050ad830 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -23,7 +23,6 @@
 #include <linux/kernel.h>
 #include <linux/string.h>	/* for memset */
 #include <linux/nls.h>
-#include <linux/udf_fs.h>
 
 #include "udf_sb.h"
 
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index aada32fffec2..897d84e04985 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -149,6 +149,7 @@ header-y += tiocl.h
 header-y += tipc.h
 header-y += tipc_config.h
 header-y += toshiba.h
+header-y += udf_fs_i.h
 header-y += ultrasound.h
 header-y += un.h
 header-y += utime.h
@@ -336,7 +337,6 @@ unifdef-y += time.h
 unifdef-y += timex.h
 unifdef-y += tty.h
 unifdef-y += types.h
-unifdef-y += udf_fs_i.h
 unifdef-y += udp.h
 unifdef-y += uinput.h
 unifdef-y += uio.h
diff --git a/include/linux/udf_fs.h b/include/linux/udf_fs.h
deleted file mode 100644
index aa88654eb76b..000000000000
--- a/include/linux/udf_fs.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * udf_fs.h
- *
- * PURPOSE
- *  Included by fs/filesystems.c
- *
- * DESCRIPTION
- *  OSTA-UDF(tm) = Optical Storage Technology Association
- *  Universal Disk Format.
- *
- *  This code is based on version 2.50 of the UDF specification,
- *  and revision 3 of the ECMA 167 standard [equivalent to ISO 13346].
- *    http://www.osta.org/ *    http://www.ecma.ch/
- *    http://www.iso.org/
- *
- * COPYRIGHT
- *	This file is distributed under the terms of the GNU General Public
- *	License (GPL). Copies of the GPL can be obtained from:
- *		ftp://prep.ai.mit.edu/pub/gnu/GPL
- *	Each contributing author retains all rights to their own work.
- *
- *  (C) 1999-2004 Ben Fennema
- *  (C) 1999-2000 Stelias Computing Inc
- *
- * HISTORY
- *
- */
-
-#ifndef _UDF_FS_H
-#define _UDF_FS_H 1
-
-#define UDF_PREALLOCATE
-#define UDF_DEFAULT_PREALLOC_BLOCKS	8
-
-#undef UDFFS_DEBUG
-
-#ifdef UDFFS_DEBUG
-#define udf_debug(f, a...) \
-	do { \
-		printk (KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \
-			__FILE__, __LINE__, __FUNCTION__); \
-		printk (f, ##a); \
-	} while (0)
-#else
-#define udf_debug(f, a...) /**/
-#endif
-
-#define udf_info(f, a...) \
-		printk (KERN_INFO "UDF-fs INFO " f, ##a);
-
-#endif /* _UDF_FS_H */
diff --git a/include/linux/udf_fs_i.h b/include/linux/udf_fs_i.h
index ffaf05679ffb..3536965913b0 100644
--- a/include/linux/udf_fs_i.h
+++ b/include/linux/udf_fs_i.h
@@ -9,41 +9,10 @@
  *		ftp://prep.ai.mit.edu/pub/gnu/GPL
  *	Each contributing author retains all rights to their own work.
  */
-
 #ifndef _UDF_FS_I_H
 #define _UDF_FS_I_H 1
 
-#ifdef __KERNEL__
-
-struct udf_inode_info
-{
-	struct timespec		i_crtime;
-	/* Physical address of inode */
-	kernel_lb_addr		i_location;
-	__u64			i_unique;
-	__u32			i_lenEAttr;
-	__u32			i_lenAlloc;
-	__u64			i_lenExtents;
-	__u32			i_next_alloc_block;
-	__u32			i_next_alloc_goal;
-	unsigned		i_alloc_type : 3;
-	unsigned		i_efe : 1;
-	unsigned		i_use : 1;
-	unsigned		i_strat4096 : 1;
-	unsigned		reserved : 26;
-	union
-	{
-		short_ad	*i_sad;
-		long_ad		*i_lad;
-		__u8		*i_data;
-	} i_ext;
-	struct inode vfs_inode;
-};
-
-#endif
-
 /* exported IOCTLs, we have 'l', 0x40-0x7f */
-
 #define UDF_GETEASIZE   _IOR('l', 0x40, int)
 #define UDF_GETEABLOCK  _IOR('l', 0x41, void *)
 #define UDF_GETVOLIDENT _IOR('l', 0x42, void *)
diff --git a/include/linux/udf_fs_sb.h b/include/linux/udf_fs_sb.h
deleted file mode 100644
index 9bc47352b6b4..000000000000
--- a/include/linux/udf_fs_sb.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * udf_fs_sb.h
- * 
- * This include file is for the Linux kernel/module.
- *
- * COPYRIGHT
- *	This file is distributed under the terms of the GNU General Public
- *	License (GPL). Copies of the GPL can be obtained from:
- *		ftp://prep.ai.mit.edu/pub/gnu/GPL
- *	Each contributing author retains all rights to their own work.
- */
-
-#ifndef _UDF_FS_SB_H
-#define _UDF_FS_SB_H 1
-
-#include <linux/mutex.h>
-
-#pragma pack(1)
-
-#define UDF_MAX_BLOCK_LOADED	8
-
-#define UDF_TYPE1_MAP15			0x1511U
-#define UDF_VIRTUAL_MAP15		0x1512U
-#define UDF_VIRTUAL_MAP20		0x2012U
-#define UDF_SPARABLE_MAP15		0x1522U
-
-struct udf_sparing_data
-{
-	__u16	s_packet_len;
-	struct buffer_head *s_spar_map[4];
-};
-
-struct udf_virtual_data
-{
-	__u32	s_num_entries;
-	__u16	s_start_offset;
-};
-
-struct udf_bitmap
-{
-	__u32			s_extLength;
-	__u32			s_extPosition;
-	__u16			s_nr_groups;
-	struct buffer_head 	**s_block_bitmap;
-};
-
-struct udf_part_map
-{
-	union
-	{
-		struct udf_bitmap	*s_bitmap;
-		struct inode		*s_table;
-	} s_uspace;
-	union
-	{
-		struct udf_bitmap	*s_bitmap;
-		struct inode		*s_table;
-	} s_fspace;
-	__u32	s_partition_root;
-	__u32	s_partition_len;
-	__u16	s_partition_type;
-	__u16	s_partition_num;
-	union
-	{
-		struct udf_sparing_data s_sparing;
-		struct udf_virtual_data s_virtual;
-	} s_type_specific;
-	__u32	(*s_partition_func)(struct super_block *, __u32, __u16, __u32);
-	__u16	s_volumeseqnum;
-	__u16	s_partition_flags;
-};
-
-#pragma pack()
-
-struct udf_sb_info
-{
-	struct udf_part_map	*s_partmaps;
-	__u8			s_volume_ident[32];
-
-	/* Overall info */
-	__u16			s_partitions;
-	__u16			s_partition;
-
-	/* Sector headers */
-	__s32			s_session;
-	__u32			s_anchor[4];
-	__u32			s_last_block;
-
-	struct buffer_head	*s_lvid_bh;
-
-	/* Default permissions */
-	mode_t			s_umask;
-	gid_t			s_gid;
-	uid_t			s_uid;
-
-	/* Root Info */
-	struct timespec		s_record_time;
-
-	/* Fileset Info */
-	__u16			s_serial_number;
-
-	/* highest UDF revision we have recorded to this media */
-	__u16			s_udfrev;
-
-	/* Miscellaneous flags */
-	__u32			s_flags;
-
-	/* Encoding info */
-	struct nls_table	*s_nls_map;
-
-	/* VAT inode */
-	struct inode		*s_vat_inode;
-
-	struct mutex		s_alloc_mutex;
-};
-
-#endif /* _UDF_FS_SB_H */
-- 
cgit v1.2.3


From 8dee00bb758f9e1d7fac9f5d2463d09444d4f255 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Thu, 14 Feb 2008 16:15:45 +0100
Subject: fs/udf: Use DIV_ROUND_UP

The kernel.h macro DIV_ROUND_UP performs the computation (((n) + (d) - 1) /
(d)) but is perhaps more readable.

An extract of the semantic patch that makes this change is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@haskernel@
@@

#include <linux/kernel.h>

@depends on haskernel@
expression n,d;
@@

(
- (n + d - 1) / d
+ DIV_ROUND_UP(n,d)
|
- (n + (d - 1)) / d
+ DIV_ROUND_UP(n,d)
)

@depends on haskernel@
expression n,d;
@@

- DIV_ROUND_UP((n),d)
+ DIV_ROUND_UP(n,d)

@depends on haskernel@
expression n,d;
@@

- DIV_ROUND_UP(n,(d))
+ DIV_ROUND_UP(n,d)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 02815e92553b..5f0d400385e0 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -986,10 +986,9 @@ static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
 int udf_compute_nr_groups(struct super_block *sb, u32 partition)
 {
 	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
-	return (map->s_partition_len +
-		(sizeof(struct spaceBitmapDesc) << 3) +
-		(sb->s_blocksize * 8) - 1) /
-		(sb->s_blocksize * 8);
+	return DIV_ROUND_UP(map->s_partition_len +
+			    (sizeof(struct spaceBitmapDesc) << 3),
+			    sb->s_blocksize * 8);
 }
 
 static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
-- 
cgit v1.2.3


From b8145a769765ce9688eb84080d4c48b22a3553f2 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sun, 17 Feb 2008 10:19:55 +0200
Subject: make udf_error() static

This patch makes the needlessly global udf_error() static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c   | 6 ++++--
 fs/udf/udfdecl.h | 1 -
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 5f0d400385e0..2666e5bc69ff 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -99,6 +99,8 @@ static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
 static int udf_statfs(struct dentry *, struct kstatfs *);
 static int udf_show_options(struct seq_file *, struct vfsmount *);
+static void udf_error(struct super_block *sb, const char *function,
+		      const char *fmt, ...);
 
 struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
 {
@@ -1867,8 +1869,8 @@ error_out:
 	return -EINVAL;
 }
 
-void udf_error(struct super_block *sb, const char *function,
-	       const char *fmt, ...)
+static void udf_error(struct super_block *sb, const char *function,
+		      const char *fmt, ...)
 {
 	va_list args;
 
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index c9c75856af80..d3aae5633722 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -102,7 +102,6 @@ struct extent_position {
 };
 
 /* super.c */
-extern void udf_error(struct super_block *, const char *, const char *, ...);
 extern void udf_warning(struct super_block *, const char *, const char *, ...);
 
 /* namei.c */
-- 
cgit v1.2.3


From 79cfe0ff5fb585b92126365a2881262945ac0ee9 Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Wed, 30 Jan 2008 22:03:51 +0100
Subject: udf: udf_CS0toUTF8 cleanup

- fix error handling - always zero output variable
- don't zero explicitely fields zeroed by memset
- mark "in" paramater as const
- remove outdated comment

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/udfdecl.h |  2 +-
 fs/udf/unicode.c | 27 ++++++++++-----------------
 2 files changed, 11 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index d3aae5633722..2310b5c8fd8b 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -175,7 +175,7 @@ extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
 extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
 			    int);
 extern int udf_build_ustr(struct ustr *, dstring *, int);
-extern int udf_CS0toUTF8(struct ustr *, struct ustr *);
+extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
 
 /* ialloc.c */
 extern void udf_free_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index b9de050ad830..05bc505ec01a 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -82,9 +82,6 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
  * PURPOSE
  *	Convert OSTA Compressed Unicode to the UTF-8 equivalent.
  *
- * DESCRIPTION
- *	This routine is only called by udf_filldir().
- *
  * PRE-CONDITIONS
  *	utf			Pointer to UTF-8 output buffer.
  *	ocu			Pointer to OSTA Compressed Unicode input buffer
@@ -98,43 +95,39 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
  *	November 12, 1997 - Andrew E. Mileski
  *	Written, tested, and released.
  */
-int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i)
+int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
 {
-	uint8_t *ocu;
-	uint32_t c;
+	const uint8_t *ocu;
 	uint8_t cmp_id, ocu_len;
 	int i;
 
-	ocu = ocu_i->u_name;
-
 	ocu_len = ocu_i->u_len;
-	cmp_id = ocu_i->u_cmpID;
-	utf_o->u_len = 0;
-
 	if (ocu_len == 0) {
 		memset(utf_o, 0, sizeof(struct ustr));
-		utf_o->u_cmpID = 0;
-		utf_o->u_len = 0;
 		return 0;
 	}
 
-	if ((cmp_id != 8) && (cmp_id != 16)) {
+	cmp_id = ocu_i->u_cmpID;
+	if (cmp_id != 8 && cmp_id != 16) {
+		memset(utf_o, 0, sizeof(struct ustr));
 		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
 		       cmp_id, ocu_i->u_name);
 		return 0;
 	}
 
+	ocu = ocu_i->u_name;
+	utf_o->u_len = 0;
 	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
 
 		/* Expand OSTA compressed Unicode to Unicode */
-		c = ocu[i++];
+		uint32_t c = ocu[i++];
 		if (cmp_id == 16)
 			c = (c << 8) | ocu[i++];
 
 		/* Compress Unicode to UTF-8 */
-		if (c < 0x80U) {
+		if (c < 0x80U)
 			utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
-		} else if (c < 0x800U) {
+		else if (c < 0x800U) {
 			utf_o->u_name[utf_o->u_len++] =
 						(uint8_t)(0xc0 | (c >> 6));
 			utf_o->u_name[utf_o->u_len++] =
-- 
cgit v1.2.3


From 6305a0a9d559e97807b8bc6d5250fd525dc571a7 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Mon, 4 Feb 2008 22:27:39 +0100
Subject: udf: fix udf_build_ustr

udf_build_ustr was broken:

- size == 1:
    dest->u_len = ptr[1 - 1], but at ptr[0] there's cmpID,
    so we created string with wrong length
    it should not happen, so we BUG() it
- size > 1 and size < UDF_NAME_LEN:
    we set u_len correctly, but memcpy copied one needless byte
- size == UDF_NAME_LEN - 1:
    memcpy overwrited u_len - with correct value, but...
- size >= UDF_NAME_LEN:
    we copied UDF_NAME_LEN - 1 bytes, but dest->u_name is array
    of UDF_NAME_LEN - 2 bytes, so we were overwriting u_len with
    character from input string

nobody noticed because all callers set size
to acceptable values (constants within range)

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/unicode.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 05bc505ec01a..24d6165d21a9 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -48,14 +48,16 @@ int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
 {
 	int usesize;
 
-	if ((!dest) || (!ptr) || (!size))
+	if (!dest || !ptr || !size)
 		return -1;
+	BUG_ON(size < 2);
 
-	memset(dest, 0, sizeof(struct ustr));
-	usesize = (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size;
+	usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
+	usesize = min(usesize, size - 2);
 	dest->u_cmpID = ptr[0];
-	dest->u_len = ptr[size - 1];
-	memcpy(dest->u_name, ptr + 1, usesize - 1);
+	dest->u_len = usesize;
+	memcpy(dest->u_name, ptr + 1, usesize);
+	memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 34f953ddfd15da8feb5b8383c93c35dc57202b66 Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Wed, 27 Feb 2008 22:38:36 +0100
Subject: udf: udf_CS0toNLS cleanup

- fix error handling - always zero output variable
- don't zero explicitely fields zeroed by memset
- mark "in" paramater as const

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/unicode.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 24d6165d21a9..f5872ae325a3 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -249,35 +249,32 @@ error_out:
 }
 
 static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
-			struct ustr *ocu_i)
+			const struct ustr *ocu_i)
 {
-	uint8_t *ocu;
-	uint32_t c;
+	const uint8_t *ocu;
 	uint8_t cmp_id, ocu_len;
 	int i;
 
-	ocu = ocu_i->u_name;
 
 	ocu_len = ocu_i->u_len;
-	cmp_id = ocu_i->u_cmpID;
-	utf_o->u_len = 0;
-
 	if (ocu_len == 0) {
 		memset(utf_o, 0, sizeof(struct ustr));
-		utf_o->u_cmpID = 0;
-		utf_o->u_len = 0;
 		return 0;
 	}
 
-	if ((cmp_id != 8) && (cmp_id != 16)) {
+	cmp_id = ocu_i->u_cmpID;
+	if (cmp_id != 8 && cmp_id != 16) {
+		memset(utf_o, 0, sizeof(struct ustr));
 		printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
 		       cmp_id, ocu_i->u_name);
 		return 0;
 	}
 
+	ocu = ocu_i->u_name;
+	utf_o->u_len = 0;
 	for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
 		/* Expand OSTA compressed Unicode to Unicode */
-		c = ocu[i++];
+		uint32_t c = ocu[i++];
 		if (cmp_id == 16)
 			c = (c << 8) | ocu[i++];
 
-- 
cgit v1.2.3


From c8ed837d371c24b678182a30e9f0b1f61dee212c Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Wed, 27 Feb 2008 22:38:38 +0100
Subject: udf: constify crc

- constify internal crc table
- mark udf_crc "in" parameter as const

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/crc.c     | 4 ++--
 fs/udf/udfdecl.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/crc.c b/fs/udf/crc.c
index b1661296e786..f178c63686e0 100644
--- a/fs/udf/crc.c
+++ b/fs/udf/crc.c
@@ -23,7 +23,7 @@
 
 #include "udfdecl.h"
 
-static uint16_t crc_table[256] = {
+static const uint16_t crc_table[256] = {
 	0x0000U, 0x1021U, 0x2042U, 0x3063U, 0x4084U, 0x50a5U, 0x60c6U, 0x70e7U,
 	0x8108U, 0x9129U, 0xa14aU, 0xb16bU, 0xc18cU, 0xd1adU, 0xe1ceU, 0xf1efU,
 	0x1231U, 0x0210U, 0x3273U, 0x2252U, 0x52b5U, 0x4294U, 0x72f7U, 0x62d6U,
@@ -79,7 +79,7 @@ static uint16_t crc_table[256] = {
  *	July 21, 1997 - Andrew E. Mileski
  *	Adapted from OSTA-UDF(tm) 1.50 standard.
  */
-uint16_t udf_crc(uint8_t *data, uint32_t size, uint16_t crc)
+uint16_t udf_crc(const uint8_t *data, uint32_t size, uint16_t crc)
 {
 	while (size--)
 		crc = crc_table[(crc >> 8 ^ *(data++)) & 0xffU] ^ (crc << 8);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 2310b5c8fd8b..26bc9c237cb7 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -210,7 +210,7 @@ extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
 extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
 
 /* crc.c */
-extern uint16_t udf_crc(uint8_t *, uint32_t, uint16_t);
+extern uint16_t udf_crc(const uint8_t *, uint32_t, uint16_t);
 
 /* udftime.c */
 extern time_t *udf_stamp_to_time(time_t *, long *, kernel_timestamp);
-- 
cgit v1.2.3


From 9de90b76eb96e7cdeac8b8dbe7d3db948b070f4d Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Wed, 30 Jan 2008 22:03:55 +0100
Subject: udf: simple cleanup of truncate.c

- remove one indentation level by little code reorganization
- convert "if (smth) BUG();" to "BUG_ON(smth);"

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/truncate.c | 76 +++++++++++++++++++++++++------------------------------
 1 file changed, 34 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 6111d97902d7..cde328f16775 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -223,34 +223,29 @@ void udf_truncate_extents(struct inode *inode)
 				if (indirect_ext_len) {
 					/* We managed to free all extents in the
 					 * indirect extent - free it too */
-					if (!epos.bh)
-						BUG();
+					BUG_ON(!epos.bh);
 					udf_free_blocks(sb, inode, epos.block,
 							0, indirect_ext_len);
+				} else if (!epos.bh) {
+					iinfo->i_lenAlloc = lenalloc;
+					mark_inode_dirty(inode);
 				} else {
-					if (!epos.bh) {
-						iinfo->i_lenAlloc =
-								lenalloc;
-						mark_inode_dirty(inode);
-					} else {
-						struct allocExtDesc *aed =
-							(struct allocExtDesc *)
-							(epos.bh->b_data);
-						int len =
-						    sizeof(struct allocExtDesc);
+					struct allocExtDesc *aed =
+						(struct allocExtDesc *)
+						(epos.bh->b_data);
+					int len = sizeof(struct allocExtDesc);
 
-						aed->lengthAllocDescs =
-						    cpu_to_le32(lenalloc);
-						if (!UDF_QUERY_FLAG(sb,
-							UDF_FLAG_STRICT) ||
-						    sbi->s_udfrev >= 0x0201)
-							len += lenalloc;
+					aed->lengthAllocDescs =
+						cpu_to_le32(lenalloc);
+					if (!UDF_QUERY_FLAG(sb,
+						UDF_FLAG_STRICT) ||
+						sbi->s_udfrev >= 0x0201)
+						len += lenalloc;
 
-						udf_update_tag(epos.bh->b_data,
-								len);
-						mark_buffer_dirty_inode(
-								epos.bh, inode);
-					}
+					udf_update_tag(epos.bh->b_data,
+							len);
+					mark_buffer_dirty_inode(
+							epos.bh, inode);
 				}
 				brelse(epos.bh);
 				epos.offset = sizeof(struct allocExtDesc);
@@ -271,28 +266,25 @@ void udf_truncate_extents(struct inode *inode)
 		}
 
 		if (indirect_ext_len) {
-			if (!epos.bh)
-				BUG();
+			BUG_ON(!epos.bh);
 			udf_free_blocks(sb, inode, epos.block, 0,
 					indirect_ext_len);
+		} else if (!epos.bh) {
+			iinfo->i_lenAlloc = lenalloc;
+			mark_inode_dirty(inode);
 		} else {
-			if (!epos.bh) {
-				iinfo->i_lenAlloc = lenalloc;
-				mark_inode_dirty(inode);
-			} else {
-				struct allocExtDesc *aed =
-				    (struct allocExtDesc *)(epos.bh->b_data);
-				aed->lengthAllocDescs = cpu_to_le32(lenalloc);
-				if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) ||
-				    sbi->s_udfrev >= 0x0201)
-					udf_update_tag(epos.bh->b_data,
-						lenalloc +
-						sizeof(struct allocExtDesc));
-				else
-					udf_update_tag(epos.bh->b_data,
-						sizeof(struct allocExtDesc));
-				mark_buffer_dirty_inode(epos.bh, inode);
-			}
+			struct allocExtDesc *aed =
+				(struct allocExtDesc *)(epos.bh->b_data);
+			aed->lengthAllocDescs = cpu_to_le32(lenalloc);
+			if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) ||
+				sbi->s_udfrev >= 0x0201)
+				udf_update_tag(epos.bh->b_data,
+					lenalloc +
+					sizeof(struct allocExtDesc));
+			else
+				udf_update_tag(epos.bh->b_data,
+					sizeof(struct allocExtDesc));
+			mark_buffer_dirty_inode(epos.bh, inode);
 		}
 	} else if (inode->i_size) {
 		if (byte_offset) {
-- 
cgit v1.2.3


From 456390de465e5a19c84bca5d78e2550971ab5a96 Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Wed, 30 Jan 2008 22:03:56 +0100
Subject: udf: truncate: create function for updating of Allocation Ext
 Descriptor

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/truncate.c | 56 +++++++++++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index cde328f16775..65e19b4f9424 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -179,6 +179,24 @@ void udf_discard_prealloc(struct inode *inode)
 	brelse(epos.bh);
 }
 
+static void udf_update_alloc_ext_desc(struct inode *inode,
+				      struct extent_position *epos,
+				      u32 lenalloc)
+{
+	struct super_block *sb = inode->i_sb;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+
+	struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data);
+	int len = sizeof(struct allocExtDesc);
+
+	aed->lengthAllocDescs =	cpu_to_le32(lenalloc);
+	if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201)
+		len += lenalloc;
+
+	udf_update_tag(epos->bh->b_data, len);
+	mark_buffer_dirty_inode(epos->bh, inode);
+}
+
 void udf_truncate_extents(struct inode *inode)
 {
 	struct extent_position epos;
@@ -186,7 +204,6 @@ void udf_truncate_extents(struct inode *inode)
 	uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
 	int8_t etype;
 	struct super_block *sb = inode->i_sb;
-	struct udf_sb_info *sbi = UDF_SB(sb);
 	sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset;
 	loff_t byte_offset;
 	int adsize;
@@ -229,24 +246,9 @@ void udf_truncate_extents(struct inode *inode)
 				} else if (!epos.bh) {
 					iinfo->i_lenAlloc = lenalloc;
 					mark_inode_dirty(inode);
-				} else {
-					struct allocExtDesc *aed =
-						(struct allocExtDesc *)
-						(epos.bh->b_data);
-					int len = sizeof(struct allocExtDesc);
-
-					aed->lengthAllocDescs =
-						cpu_to_le32(lenalloc);
-					if (!UDF_QUERY_FLAG(sb,
-						UDF_FLAG_STRICT) ||
-						sbi->s_udfrev >= 0x0201)
-						len += lenalloc;
-
-					udf_update_tag(epos.bh->b_data,
-							len);
-					mark_buffer_dirty_inode(
-							epos.bh, inode);
-				}
+				} else
+					udf_update_alloc_ext_desc(inode,
+							&epos, lenalloc);
 				brelse(epos.bh);
 				epos.offset = sizeof(struct allocExtDesc);
 				epos.block = eloc;
@@ -272,20 +274,8 @@ void udf_truncate_extents(struct inode *inode)
 		} else if (!epos.bh) {
 			iinfo->i_lenAlloc = lenalloc;
 			mark_inode_dirty(inode);
-		} else {
-			struct allocExtDesc *aed =
-				(struct allocExtDesc *)(epos.bh->b_data);
-			aed->lengthAllocDescs = cpu_to_le32(lenalloc);
-			if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) ||
-				sbi->s_udfrev >= 0x0201)
-				udf_update_tag(epos.bh->b_data,
-					lenalloc +
-					sizeof(struct allocExtDesc));
-			else
-				udf_update_tag(epos.bh->b_data,
-					sizeof(struct allocExtDesc));
-			mark_buffer_dirty_inode(epos.bh, inode);
-		}
+		} else
+			udf_update_alloc_ext_desc(inode, &epos, lenalloc);
 	} else if (inode->i_size) {
 		if (byte_offset) {
 			kernel_long_ad extent;
-- 
cgit v1.2.3


From c2104fda5e6a6981e385b2d11c5c591ab06d82a2 Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Wed, 30 Jan 2008 22:03:57 +0100
Subject: udf: replace all adds to little endians variables with le*_add_cpu

replace all:
	little_endian_variable = cpu_to_leX(leX_to_cpu(little_endian_variable) +
	                                    expression_in_cpu_byteorder);
with:
	leX_add_cpu(&little_endian_variable, expression_in_cpu_byteorder);
sparse didn't generate any new warning with this patch

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/balloc.c | 13 ++++---------
 fs/udf/ialloc.c | 12 ++++--------
 fs/udf/inode.c  | 16 ++++------------
 3 files changed, 12 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index f855dcbbdfb8..1b809bd494bd 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -149,8 +149,7 @@ static bool udf_add_free_space(struct udf_sb_info *sbi,
 		return false;
 
 	lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
-	lvid->freeSpaceTable[partition] = cpu_to_le32(le32_to_cpu(
-					lvid->freeSpaceTable[partition]) + cnt);
+	le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
 	return true;
 }
 
@@ -589,10 +588,8 @@ static void udf_table_free_blocks(struct super_block *sb,
 					sptr = oepos.bh->b_data + epos.offset;
 					aed = (struct allocExtDesc *)
 						oepos.bh->b_data;
-					aed->lengthAllocDescs =
-						cpu_to_le32(le32_to_cpu(
-							aed->lengthAllocDescs) +
-								adsize);
+					le32_add_cpu(&aed->lengthAllocDescs,
+							adsize);
 				} else {
 					sptr = iinfo->i_ext.i_data +
 								epos.offset;
@@ -645,9 +642,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 				mark_inode_dirty(table);
 			} else {
 				aed = (struct allocExtDesc *)epos.bh->b_data;
-				aed->lengthAllocDescs =
-					cpu_to_le32(le32_to_cpu(
-					    aed->lengthAllocDescs) + adsize);
+				le32_add_cpu(&aed->lengthAllocDescs, adsize);
 				udf_update_tag(epos.bh->b_data, epos.offset);
 				mark_buffer_dirty(epos.bh);
 			}
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c3fb28eee646..eb9cfa23dc3d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -46,11 +46,9 @@ void udf_free_inode(struct inode *inode)
 		struct logicalVolIntegrityDescImpUse *lvidiu =
 							udf_sb_lvidiu(sbi);
 		if (S_ISDIR(inode->i_mode))
-			lvidiu->numDirs =
-				cpu_to_le32(le32_to_cpu(lvidiu->numDirs) - 1);
+			le32_add_cpu(&lvidiu->numDirs, -1);
 		else
-			lvidiu->numFiles =
-				cpu_to_le32(le32_to_cpu(lvidiu->numFiles) - 1);
+			le32_add_cpu(&lvidiu->numFiles, -1);
 
 		mark_buffer_dirty(sbi->s_lvid_bh);
 	}
@@ -104,11 +102,9 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		lvhd = (struct logicalVolHeaderDesc *)
 				(lvid->logicalVolContentsUse);
 		if (S_ISDIR(mode))
-			lvidiu->numDirs =
-				cpu_to_le32(le32_to_cpu(lvidiu->numDirs) + 1);
+			le32_add_cpu(&lvidiu->numDirs, 1);
 		else
-			lvidiu->numFiles =
-				cpu_to_le32(le32_to_cpu(lvidiu->numFiles) + 1);
+			le32_add_cpu(&lvidiu->numFiles, 1);
 		iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
 		if (!(++uniqueID & 0x00000000FFFFFFFFUL))
 			uniqueID += 16;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index dc2f946dfca9..91d1f1d3d7a5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1748,9 +1748,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 
 			if (epos->bh) {
 				aed = (struct allocExtDesc *)epos->bh->b_data;
-				aed->lengthAllocDescs =
-					cpu_to_le32(le32_to_cpu(
-					aed->lengthAllocDescs) + adsize);
+				le32_add_cpu(&aed->lengthAllocDescs, adsize);
 			} else {
 				iinfo->i_lenAlloc += adsize;
 				mark_inode_dirty(inode);
@@ -1800,9 +1798,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		mark_inode_dirty(inode);
 	} else {
 		aed = (struct allocExtDesc *)epos->bh->b_data;
-		aed->lengthAllocDescs =
-			cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) +
-				    adsize);
+		le32_add_cpu(&aed->lengthAllocDescs, adsize);
 		if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
 				UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
 			udf_update_tag(epos->bh->b_data,
@@ -2016,9 +2012,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			mark_inode_dirty(inode);
 		} else {
 			aed = (struct allocExtDesc *)oepos.bh->b_data;
-			aed->lengthAllocDescs =
-				cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) -
-					    (2 * adsize));
+			le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize));
 			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
 			    UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
 				udf_update_tag(oepos.bh->b_data,
@@ -2035,9 +2029,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 			mark_inode_dirty(inode);
 		} else {
 			aed = (struct allocExtDesc *)oepos.bh->b_data;
-			aed->lengthAllocDescs =
-				cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) -
-					    adsize);
+			le32_add_cpu(&aed->lengthAllocDescs, -adsize);
 			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
 			    UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
 				udf_update_tag(oepos.bh->b_data,
-- 
cgit v1.2.3


From 1ab9278570077101d1e367399686be62b22c4019 Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Wed, 30 Jan 2008 22:03:58 +0100
Subject: udf: simplify __udf_read_inode

- move all brelse(ibh) after main if, because it's called
  on every path except one where ibh is null
- move variables to the most inner blocks

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 52 +++++++++++++++++++++++-----------------------------
 1 file changed, 23 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 91d1f1d3d7a5..ddd7780d1ab1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1101,42 +1101,36 @@ static void __udf_read_inode(struct inode *inode)
 	fe = (struct fileEntry *)bh->b_data;
 
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
-		struct buffer_head *ibh = NULL, *nbh = NULL;
-		struct indirectEntry *ie;
+		struct buffer_head *ibh;
 
 		ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
 					&ident);
-		if (ident == TAG_IDENT_IE) {
-			if (ibh) {
-				kernel_lb_addr loc;
-				ie = (struct indirectEntry *)ibh->b_data;
-
-				loc = lelb_to_cpu(ie->indirectICB.extLocation);
-
-				if (ie->indirectICB.extLength &&
-				    (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
-							    &ident))) {
-					if (ident == TAG_IDENT_FE ||
-					    ident == TAG_IDENT_EFE) {
-						memcpy(&iinfo->i_location,
-						       &loc,
-						       sizeof(kernel_lb_addr));
-						brelse(bh);
-						brelse(ibh);
-						brelse(nbh);
-						__udf_read_inode(inode);
-						return;
-					} else {
-						brelse(nbh);
-						brelse(ibh);
-					}
-				} else {
+		if (ident == TAG_IDENT_IE && ibh) {
+			struct buffer_head *nbh = NULL;
+			kernel_lb_addr loc;
+			struct indirectEntry *ie;
+
+			ie = (struct indirectEntry *)ibh->b_data;
+			loc = lelb_to_cpu(ie->indirectICB.extLocation);
+
+			if (ie->indirectICB.extLength &&
+				(nbh = udf_read_ptagged(inode->i_sb, loc, 0,
+							&ident))) {
+				if (ident == TAG_IDENT_FE ||
+					ident == TAG_IDENT_EFE) {
+					memcpy(&iinfo->i_location,
+						&loc,
+						sizeof(kernel_lb_addr));
+					brelse(bh);
 					brelse(ibh);
+					brelse(nbh);
+					__udf_read_inode(inode);
+					return;
 				}
+				brelse(nbh);
 			}
-		} else {
-			brelse(ibh);
 		}
+		brelse(ibh);
 	} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
 		printk(KERN_ERR "udf: unsupported strategy type: %d\n",
 		       le16_to_cpu(fe->icbTag.strategyType));
-- 
cgit v1.2.3


From d652eefb70142c64495f4188883f9dfc0430a96b Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Wed, 30 Jan 2008 22:03:59 +0100
Subject: udf: replace udf_*_offset macros with functions

- translate udf_file_entry_alloc_offset macro into function
- translate udf_ext0_offset macro into function
- add comment about crypticly named fields in struct udf_inode_info

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/udf_i.h   |  4 ++--
 fs/udf/udfdecl.h | 29 +++++++++++++++++++----------
 2 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index d6d9a774a1c1..4f86b1d98a5d 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -12,8 +12,8 @@ struct udf_inode_info {
 	__u32			i_next_alloc_block;
 	__u32			i_next_alloc_goal;
 	unsigned		i_alloc_type : 3;
-	unsigned		i_efe : 1;
-	unsigned		i_use : 1;
+	unsigned		i_efe : 1;	/* extendedFileEntry */
+	unsigned		i_use : 1;	/* unallocSpaceEntry */
 	unsigned		i_strat4096 : 1;
 	unsigned		reserved : 26;
 	union {
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 26bc9c237cb7..cc15f58d497a 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -11,6 +11,7 @@
 
 #include "udf_sb.h"
 #include "udfend.h"
+#include "udf_i.h"
 
 #define UDF_PREALLOCATE
 #define UDF_DEFAULT_PREALLOC_BLOCKS	8
@@ -42,16 +43,24 @@ do { \
 #define UDF_NAME_LEN		256
 #define UDF_PATH_LEN		1023
 
-#define udf_file_entry_alloc_offset(inode)\
-	(UDF_I(inode)->i_use ?\
-		sizeof(struct unallocSpaceEntry) :\
-		((UDF_I(inode)->i_efe ?\
-			sizeof(struct extendedFileEntry) :\
-			sizeof(struct fileEntry)) + UDF_I(inode)->i_lenEAttr))
-
-#define udf_ext0_offset(inode)\
-	(UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ?\
-		udf_file_entry_alloc_offset(inode) : 0)
+static inline size_t udf_file_entry_alloc_offset(struct inode *inode)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	if (iinfo->i_use)
+		return sizeof(struct unallocSpaceEntry);
+	else if (iinfo->i_efe)
+		return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr;
+	else
+		return sizeof(struct fileEntry) + iinfo->i_lenEAttr;
+}
+
+static inline size_t udf_ext0_offset(struct inode *inode)
+{
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		return udf_file_entry_alloc_offset(inode);
+	else
+		return 0;
+}
 
 #define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
 
-- 
cgit v1.2.3


From 01b954a36a03d90a66c9dd1fc13e4cb51269caf7 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Sat, 2 Feb 2008 22:37:07 +0100
Subject: udf: convert udf_count_free_bitmap to use bitmap_weight

replace handwritten bits counting with bitmap_weight

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 2666e5bc69ff..be0aa424b8f1 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -55,6 +55,7 @@
 #include <linux/errno.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/bitmap.h>
 #include <asm/byteorder.h>
 
 #include "udf_sb.h"
@@ -1958,10 +1959,6 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static unsigned char udf_bitmap_lookup[16] = {
-	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
-};
-
 static unsigned int udf_count_free_bitmap(struct super_block *sb,
 					  struct udf_bitmap *bitmap)
 {
@@ -1971,7 +1968,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 	int block = 0, newblock;
 	kernel_lb_addr loc;
 	uint32_t bytes;
-	uint8_t value;
 	uint8_t *ptr;
 	uint16_t ident;
 	struct spaceBitmapDesc *bm;
@@ -1997,13 +1993,10 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 	ptr = (uint8_t *)bh->b_data;
 
 	while (bytes > 0) {
-		while ((bytes > 0) && (index < sb->s_blocksize)) {
-			value = ptr[index];
-			accum += udf_bitmap_lookup[value & 0x0f];
-			accum += udf_bitmap_lookup[value >> 4];
-			index++;
-			bytes--;
-		}
+		u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index);
+		accum += bitmap_weight((const unsigned long *)(ptr + index),
+					cur_bytes * 8);
+		bytes -= cur_bytes;
 		if (bytes) {
 			brelse(bh);
 			newblock = udf_get_lb_pblock(sb, loc, ++block);
-- 
cgit v1.2.3


From f18f17b0338a388ad87a2f8078dbbfb83798bdfd Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Sun, 3 Feb 2008 19:36:06 +0100
Subject: udf: udf_get_block, inode_bmap - remove unneeded checks

block cannot be less than 0, because it's sector_t,
so remove unneeded checks

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ddd7780d1ab1..fb92476fcdd9 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -308,9 +308,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
 
 	lock_kernel();
 
-	if (block < 0)
-		goto abort_negative;
-
 	iinfo = UDF_I(inode);
 	if (block == iinfo->i_next_alloc_block + 1) {
 		iinfo->i_next_alloc_block++;
@@ -332,10 +329,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
 abort:
 	unlock_kernel();
 	return err;
-
-abort_negative:
-	udf_warning(inode->i_sb, "udf_get_block", "block < 0");
-	goto abort;
 }
 
 static struct buffer_head *udf_getblk(struct inode *inode, long block,
@@ -2051,11 +2044,6 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 	int8_t etype;
 	struct udf_inode_info *iinfo;
 
-	if (block < 0) {
-		printk(KERN_ERR "udf: inode_bmap: block < 0\n");
-		return -1;
-	}
-
 	iinfo = UDF_I(inode);
 	pos->offset = 0;
 	pos->block = iinfo->i_location;
-- 
cgit v1.2.3


From c87e8e90d0da1134e42c89dc89559f4bfe282ef9 Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Sun, 3 Feb 2008 19:36:07 +0100
Subject: udf: create function for conversion from timestamp to timespec

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 79 +++++++++++++++++-----------------------------------------
 1 file changed, 23 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fb92476fcdd9..3c0a60dad478 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1136,12 +1136,24 @@ static void __udf_read_inode(struct inode *inode)
 	brelse(bh);
 }
 
+static void udf_fill_inode_time(struct timespec *tspec,
+				const timestamp *tstamp,
+				struct udf_sb_info *sbi)
+{
+	time_t convtime;
+	long convtime_usec;
+	if (udf_stamp_to_time(&convtime, &convtime_usec,
+				lets_to_cpu(*tstamp))) {
+		tspec->tv_sec = convtime;
+		tspec->tv_nsec = convtime_usec * 1000;
+	} else
+		*tspec = sbi->s_record_time;
+}
+
 static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 {
 	struct fileEntry *fe;
 	struct extendedFileEntry *efe;
-	time_t convtime;
-	long convtime_usec;
 	int offset;
 	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
 	struct udf_inode_info *iinfo = UDF_I(inode);
@@ -1229,29 +1241,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
 			(inode->i_sb->s_blocksize_bits - 9);
 
-		if (udf_stamp_to_time(&convtime, &convtime_usec,
-				      lets_to_cpu(fe->accessTime))) {
-			inode->i_atime.tv_sec = convtime;
-			inode->i_atime.tv_nsec = convtime_usec * 1000;
-		} else {
-			inode->i_atime = sbi->s_record_time;
-		}
-
-		if (udf_stamp_to_time(&convtime, &convtime_usec,
-				      lets_to_cpu(fe->modificationTime))) {
-			inode->i_mtime.tv_sec = convtime;
-			inode->i_mtime.tv_nsec = convtime_usec * 1000;
-		} else {
-			inode->i_mtime = sbi->s_record_time;
-		}
-
-		if (udf_stamp_to_time(&convtime, &convtime_usec,
-				      lets_to_cpu(fe->attrTime))) {
-			inode->i_ctime.tv_sec = convtime;
-			inode->i_ctime.tv_nsec = convtime_usec * 1000;
-		} else {
-			inode->i_ctime = sbi->s_record_time;
-		}
+		udf_fill_inode_time(&inode->i_atime, &fe->accessTime, sbi);
+		udf_fill_inode_time(&inode->i_mtime, &fe->modificationTime,
+				    sbi);
+		udf_fill_inode_time(&inode->i_ctime, &fe->attrTime, sbi);
 
 		iinfo->i_unique = le64_to_cpu(fe->uniqueID);
 		iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1261,37 +1254,11 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
 		    (inode->i_sb->s_blocksize_bits - 9);
 
-		if (udf_stamp_to_time(&convtime, &convtime_usec,
-				      lets_to_cpu(efe->accessTime))) {
-			inode->i_atime.tv_sec = convtime;
-			inode->i_atime.tv_nsec = convtime_usec * 1000;
-		} else {
-			inode->i_atime = sbi->s_record_time;
-		}
-
-		if (udf_stamp_to_time(&convtime, &convtime_usec,
-				      lets_to_cpu(efe->modificationTime))) {
-			inode->i_mtime.tv_sec = convtime;
-			inode->i_mtime.tv_nsec = convtime_usec * 1000;
-		} else {
-			inode->i_mtime = sbi->s_record_time;
-		}
-
-		if (udf_stamp_to_time(&convtime, &convtime_usec,
-				      lets_to_cpu(efe->createTime))) {
-			iinfo->i_crtime.tv_sec = convtime;
-			iinfo->i_crtime.tv_nsec = convtime_usec * 1000;
-		} else {
-			iinfo->i_crtime = sbi->s_record_time;
-		}
-
-		if (udf_stamp_to_time(&convtime, &convtime_usec,
-				      lets_to_cpu(efe->attrTime))) {
-			inode->i_ctime.tv_sec = convtime;
-			inode->i_ctime.tv_nsec = convtime_usec * 1000;
-		} else {
-			inode->i_ctime = sbi->s_record_time;
-		}
+		udf_fill_inode_time(&inode->i_atime, &efe->accessTime, sbi);
+		udf_fill_inode_time(&inode->i_mtime, &efe->modificationTime,
+				    sbi);
+		udf_fill_inode_time(&iinfo->i_crtime, &efe->createTime, sbi);
+		udf_fill_inode_time(&inode->i_ctime, &efe->attrTime, sbi);
 
 		iinfo->i_unique = le64_to_cpu(efe->uniqueID);
 		iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
-- 
cgit v1.2.3


From cbf5676a0e0463f05e5073589f3194846dfb02e7 Mon Sep 17 00:00:00 2001
From: "marcin.slusarz@gmail.com" <marcin.slusarz@gmail.com>
Date: Wed, 27 Feb 2008 22:50:14 +0100
Subject: udf: convert udf_stamp_to_time to return struct timespec

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c   | 49 ++++++++++++++++++++++++++-----------------------
 fs/udf/super.c   |  9 ++-------
 fs/udf/udfdecl.h |  4 ++--
 fs/udf/udftime.c | 17 +++++++----------
 4 files changed, 37 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 3c0a60dad478..a7646e9bdbde 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1136,20 +1136,6 @@ static void __udf_read_inode(struct inode *inode)
 	brelse(bh);
 }
 
-static void udf_fill_inode_time(struct timespec *tspec,
-				const timestamp *tstamp,
-				struct udf_sb_info *sbi)
-{
-	time_t convtime;
-	long convtime_usec;
-	if (udf_stamp_to_time(&convtime, &convtime_usec,
-				lets_to_cpu(*tstamp))) {
-		tspec->tv_sec = convtime;
-		tspec->tv_nsec = convtime_usec * 1000;
-	} else
-		*tspec = sbi->s_record_time;
-}
-
 static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 {
 	struct fileEntry *fe;
@@ -1241,10 +1227,17 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
 			(inode->i_sb->s_blocksize_bits - 9);
 
-		udf_fill_inode_time(&inode->i_atime, &fe->accessTime, sbi);
-		udf_fill_inode_time(&inode->i_mtime, &fe->modificationTime,
-				    sbi);
-		udf_fill_inode_time(&inode->i_ctime, &fe->attrTime, sbi);
+		if (!udf_stamp_to_time(&inode->i_atime,
+					lets_to_cpu(fe->accessTime)))
+			inode->i_atime = sbi->s_record_time;
+
+		if (!udf_stamp_to_time(&inode->i_mtime,
+					lets_to_cpu(fe->modificationTime)))
+			inode->i_mtime = sbi->s_record_time;
+
+		if (!udf_stamp_to_time(&inode->i_ctime,
+					lets_to_cpu(fe->attrTime)))
+			inode->i_ctime = sbi->s_record_time;
 
 		iinfo->i_unique = le64_to_cpu(fe->uniqueID);
 		iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1254,11 +1247,21 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
 		    (inode->i_sb->s_blocksize_bits - 9);
 
-		udf_fill_inode_time(&inode->i_atime, &efe->accessTime, sbi);
-		udf_fill_inode_time(&inode->i_mtime, &efe->modificationTime,
-				    sbi);
-		udf_fill_inode_time(&iinfo->i_crtime, &efe->createTime, sbi);
-		udf_fill_inode_time(&inode->i_ctime, &efe->attrTime, sbi);
+		if (!udf_stamp_to_time(&inode->i_atime,
+					lets_to_cpu(efe->accessTime)))
+			inode->i_atime = sbi->s_record_time;
+
+		if (!udf_stamp_to_time(&inode->i_mtime,
+					lets_to_cpu(efe->modificationTime)))
+			inode->i_mtime = sbi->s_record_time;
+
+		if (!udf_stamp_to_time(&iinfo->i_crtime,
+					lets_to_cpu(efe->createTime)))
+			iinfo->i_crtime = sbi->s_record_time;
+
+		if (!udf_stamp_to_time(&inode->i_ctime,
+					lets_to_cpu(efe->attrTime)))
+			inode->i_ctime = sbi->s_record_time;
 
 		iinfo->i_unique = le64_to_cpu(efe->uniqueID);
 		iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index be0aa424b8f1..f4cdd530c65f 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -938,24 +938,19 @@ static int udf_find_fileset(struct super_block *sb,
 static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh)
 {
 	struct primaryVolDesc *pvoldesc;
-	time_t recording;
-	long recording_usec;
 	struct ustr instr;
 	struct ustr outstr;
 
 	pvoldesc = (struct primaryVolDesc *)bh->b_data;
 
-	if (udf_stamp_to_time(&recording, &recording_usec,
+	if (udf_stamp_to_time(&UDF_SB(sb)->s_record_time,
 			      lets_to_cpu(pvoldesc->recordingDateAndTime))) {
 		kernel_timestamp ts;
 		ts = lets_to_cpu(pvoldesc->recordingDateAndTime);
-		udf_debug("recording time %ld/%ld, %04u/%02u/%02u"
+		udf_debug("recording time %04u/%02u/%02u"
 			  " %02u:%02u (%x)\n",
-			  recording, recording_usec,
 			  ts.year, ts.month, ts.day, ts.hour,
 			  ts.minute, ts.typeAndTimezone);
-		UDF_SB(sb)->s_record_time.tv_sec = recording;
-		UDF_SB(sb)->s_record_time.tv_nsec = recording_usec * 1000;
 	}
 
 	if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32))
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index cc15f58d497a..b277524fe608 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -222,7 +222,7 @@ extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
 extern uint16_t udf_crc(const uint8_t *, uint32_t, uint16_t);
 
 /* udftime.c */
-extern time_t *udf_stamp_to_time(time_t *, long *, kernel_timestamp);
-extern kernel_timestamp *udf_time_to_stamp(kernel_timestamp *, struct timespec);
+extern struct timespec *udf_stamp_to_time(struct timespec *dest, kernel_timestamp src);
+extern kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec src);
 
 #endif				/* __UDF_DECL_H */
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index ce595732ba6f..12fae6cd444c 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,7 +85,7 @@ extern struct timezone sys_tz;
 #define SECS_PER_HOUR	(60 * 60)
 #define SECS_PER_DAY	(SECS_PER_HOUR * 24)
 
-time_t *udf_stamp_to_time(time_t *dest, long *dest_usec, kernel_timestamp src)
+struct timespec *udf_stamp_to_time(struct timespec *dest, kernel_timestamp src)
 {
 	int yday;
 	uint8_t type = src.typeAndTimezone >> 12;
@@ -97,23 +97,20 @@ time_t *udf_stamp_to_time(time_t *dest, long *dest_usec, kernel_timestamp src)
 		offset = (offset >> 4);
 		if (offset == -2047) /* unspecified offset */
 			offset = 0;
-	} else {
+	} else
 		offset = 0;
-	}
 
 	if ((src.year < EPOCH_YEAR) ||
 	    (src.year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) {
-		*dest = -1;
-		*dest_usec = -1;
 		return NULL;
 	}
-	*dest = year_seconds[src.year - EPOCH_YEAR];
-	*dest -= offset * 60;
+	dest->tv_sec = year_seconds[src.year - EPOCH_YEAR];
+	dest->tv_sec -= offset * 60;
 
 	yday = ((__mon_yday[__isleap(src.year)][src.month - 1]) + src.day - 1);
-	*dest += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second;
-	*dest_usec = src.centiseconds * 10000 +
-			src.hundredsOfMicroseconds * 100 + src.microseconds;
+	dest->tv_sec += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second;
+	dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
+			src.hundredsOfMicroseconds * 100 + src.microseconds);
 	return dest;
 }
 
-- 
cgit v1.2.3


From 56774805d5eeecd3f1fb700603e593a35dc523c8 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Sun, 10 Feb 2008 11:25:31 +0100
Subject: udf: convert udf_stamp_to_time and udf_time_to_stamp to use
 timestamps

* kernel_timestamp type was almost unused - only callers of udf_stamp_to_time
and udf_time_to_stamp used it, so let these functions handle endianness
internally and don't clutter code with conversions

* rename udf_stamp_to_time to udf_disk_stamp_to_time
  and udf_time_to_stamp to udf_time_to_disk_stamp

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c   | 45 ++++++++++++++++-----------------------------
 fs/udf/super.c   | 22 ++++++++++------------
 fs/udf/udfdecl.h |  5 +++--
 fs/udf/udftime.c | 22 ++++++++++++----------
 4 files changed, 41 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a7646e9bdbde..2362bf0c6900 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1227,16 +1227,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
 			(inode->i_sb->s_blocksize_bits - 9);
 
-		if (!udf_stamp_to_time(&inode->i_atime,
-					lets_to_cpu(fe->accessTime)))
+		if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime))
 			inode->i_atime = sbi->s_record_time;
 
-		if (!udf_stamp_to_time(&inode->i_mtime,
-					lets_to_cpu(fe->modificationTime)))
+		if (!udf_disk_stamp_to_time(&inode->i_mtime,
+					    fe->modificationTime))
 			inode->i_mtime = sbi->s_record_time;
 
-		if (!udf_stamp_to_time(&inode->i_ctime,
-					lets_to_cpu(fe->attrTime)))
+		if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime))
 			inode->i_ctime = sbi->s_record_time;
 
 		iinfo->i_unique = le64_to_cpu(fe->uniqueID);
@@ -1247,20 +1245,17 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
 		    (inode->i_sb->s_blocksize_bits - 9);
 
-		if (!udf_stamp_to_time(&inode->i_atime,
-					lets_to_cpu(efe->accessTime)))
+		if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime))
 			inode->i_atime = sbi->s_record_time;
 
-		if (!udf_stamp_to_time(&inode->i_mtime,
-					lets_to_cpu(efe->modificationTime)))
+		if (!udf_disk_stamp_to_time(&inode->i_mtime,
+					    efe->modificationTime))
 			inode->i_mtime = sbi->s_record_time;
 
-		if (!udf_stamp_to_time(&iinfo->i_crtime,
-					lets_to_cpu(efe->createTime)))
+		if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime))
 			iinfo->i_crtime = sbi->s_record_time;
 
-		if (!udf_stamp_to_time(&inode->i_ctime,
-					lets_to_cpu(efe->attrTime)))
+		if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime))
 			inode->i_ctime = sbi->s_record_time;
 
 		iinfo->i_unique = le64_to_cpu(efe->uniqueID);
@@ -1382,7 +1377,6 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	uint32_t udfperms;
 	uint16_t icbflags;
 	uint16_t crclen;
-	kernel_timestamp cpu_time;
 	int err = 0;
 	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -1485,12 +1479,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
 			(blocksize_bits - 9));
 
-		if (udf_time_to_stamp(&cpu_time, inode->i_atime))
-			fe->accessTime = cpu_to_lets(cpu_time);
-		if (udf_time_to_stamp(&cpu_time, inode->i_mtime))
-			fe->modificationTime = cpu_to_lets(cpu_time);
-		if (udf_time_to_stamp(&cpu_time, inode->i_ctime))
-			fe->attrTime = cpu_to_lets(cpu_time);
+		udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
+		udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
+		udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
 		memset(&(fe->impIdent), 0, sizeof(regid));
 		strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
 		fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1525,14 +1516,10 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		     iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec))
 			iinfo->i_crtime = inode->i_ctime;
 
-		if (udf_time_to_stamp(&cpu_time, inode->i_atime))
-			efe->accessTime = cpu_to_lets(cpu_time);
-		if (udf_time_to_stamp(&cpu_time, inode->i_mtime))
-			efe->modificationTime = cpu_to_lets(cpu_time);
-		if (udf_time_to_stamp(&cpu_time, iinfo->i_crtime))
-			efe->createTime = cpu_to_lets(cpu_time);
-		if (udf_time_to_stamp(&cpu_time, inode->i_ctime))
-			efe->attrTime = cpu_to_lets(cpu_time);
+		udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
+		udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
+		udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
+		udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
 
 		memset(&(efe->impIdent), 0, sizeof(regid));
 		strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index f4cdd530c65f..4d2ecee1970b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -943,8 +943,8 @@ static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh)
 
 	pvoldesc = (struct primaryVolDesc *)bh->b_data;
 
-	if (udf_stamp_to_time(&UDF_SB(sb)->s_record_time,
-			      lets_to_cpu(pvoldesc->recordingDateAndTime))) {
+	if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
+			      pvoldesc->recordingDateAndTime)) {
 		kernel_timestamp ts;
 		ts = lets_to_cpu(pvoldesc->recordingDateAndTime);
 		udf_debug("recording time %04u/%02u/%02u"
@@ -1589,7 +1589,6 @@ static void udf_open_lvid(struct super_block *sb)
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct buffer_head *bh = sbi->s_lvid_bh;
 	if (bh) {
-		kernel_timestamp cpu_time;
 		struct logicalVolIntegrityDesc *lvid =
 				(struct logicalVolIntegrityDesc *)bh->b_data;
 		struct logicalVolIntegrityDescImpUse *lvidiu =
@@ -1597,8 +1596,8 @@ static void udf_open_lvid(struct super_block *sb)
 
 		lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 		lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-		if (udf_time_to_stamp(&cpu_time, CURRENT_TIME))
-			lvid->recordingDateAndTime = cpu_to_lets(cpu_time);
+		udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
+					CURRENT_TIME);
 		lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
 
 		lvid->descTag.descCRC = cpu_to_le16(
@@ -1613,7 +1612,6 @@ static void udf_open_lvid(struct super_block *sb)
 
 static void udf_close_lvid(struct super_block *sb)
 {
-	kernel_timestamp cpu_time;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct buffer_head *bh = sbi->s_lvid_bh;
 	struct logicalVolIntegrityDesc *lvid;
@@ -1628,8 +1626,8 @@ static void udf_close_lvid(struct super_block *sb)
 							udf_sb_lvidiu(sbi);
 		lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 		lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-		if (udf_time_to_stamp(&cpu_time, CURRENT_TIME))
-			lvid->recordingDateAndTime = cpu_to_lets(cpu_time);
+		udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
+					CURRENT_TIME);
 		if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
 			lvidiu->maxUDFWriteRev =
 					cpu_to_le16(UDF_MAX_WRITE_VERSION);
@@ -1801,12 +1799,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	}
 
 	if (!silent) {
-		kernel_timestamp ts;
-		udf_time_to_stamp(&ts, sbi->s_record_time);
+		timestamp ts;
+		udf_time_to_disk_stamp(&ts, sbi->s_record_time);
 		udf_info("UDF: Mounting volume '%s', "
 			 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
-			 sbi->s_volume_ident, ts.year, ts.month, ts.day,
-			 ts.hour, ts.minute, ts.typeAndTimezone);
+			 sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day,
+			 ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
 	}
 	if (!(sb->s_flags & MS_RDONLY))
 		udf_open_lvid(sb);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index b277524fe608..2cb2f5de4245 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -222,7 +222,8 @@ extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
 extern uint16_t udf_crc(const uint8_t *, uint32_t, uint16_t);
 
 /* udftime.c */
-extern struct timespec *udf_stamp_to_time(struct timespec *dest, kernel_timestamp src);
-extern kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec src);
+extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
+						timestamp src);
+extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src);
 
 #endif				/* __UDF_DECL_H */
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 12fae6cd444c..5f811655c9b5 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,14 +85,16 @@ extern struct timezone sys_tz;
 #define SECS_PER_HOUR	(60 * 60)
 #define SECS_PER_DAY	(SECS_PER_HOUR * 24)
 
-struct timespec *udf_stamp_to_time(struct timespec *dest, kernel_timestamp src)
+struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
 {
 	int yday;
-	uint8_t type = src.typeAndTimezone >> 12;
+	u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
+	u16 year = le16_to_cpu(src.year);
+	uint8_t type = typeAndTimezone >> 12;
 	int16_t offset;
 
 	if (type == 1) {
-		offset = src.typeAndTimezone << 4;
+		offset = typeAndTimezone << 4;
 		/* sign extent offset */
 		offset = (offset >> 4);
 		if (offset == -2047) /* unspecified offset */
@@ -100,21 +102,21 @@ struct timespec *udf_stamp_to_time(struct timespec *dest, kernel_timestamp src)
 	} else
 		offset = 0;
 
-	if ((src.year < EPOCH_YEAR) ||
-	    (src.year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) {
+	if ((year < EPOCH_YEAR) ||
+	    (year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) {
 		return NULL;
 	}
-	dest->tv_sec = year_seconds[src.year - EPOCH_YEAR];
+	dest->tv_sec = year_seconds[year - EPOCH_YEAR];
 	dest->tv_sec -= offset * 60;
 
-	yday = ((__mon_yday[__isleap(src.year)][src.month - 1]) + src.day - 1);
+	yday = ((__mon_yday[__isleap(year)][src.month - 1]) + src.day - 1);
 	dest->tv_sec += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second;
 	dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
 			src.hundredsOfMicroseconds * 100 + src.microseconds);
 	return dest;
 }
 
-kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts)
+timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts)
 {
 	long int days, rem, y;
 	const unsigned short int *ip;
@@ -125,7 +127,7 @@ kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts)
 	if (!dest)
 		return NULL;
 
-	dest->typeAndTimezone = 0x1000 | (offset & 0x0FFF);
+	dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF));
 
 	ts.tv_sec += offset * 60;
 	days = ts.tv_sec / SECS_PER_DAY;
@@ -148,7 +150,7 @@ kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts)
 			 - LEAPS_THRU_END_OF(y - 1));
 		y = yg;
 	}
-	dest->year = y;
+	dest->year = cpu_to_le16(y);
 	ip = __mon_yday[__isleap(y)];
 	for (y = 11; days < (long int)ip[y]; --y)
 		continue;
-- 
cgit v1.2.3


From af15a298a49c9b5844cdaf70e10eb808e54ead2c Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Sun, 10 Feb 2008 11:29:10 +0100
Subject: udf: remove unneeded kernel_timestamp type

remove now unneeded kernel_timestamp type with conversion functions

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/ecma_167.h | 13 -------------
 fs/udf/super.c    |  9 +++++----
 fs/udf/udfend.h   | 22 ----------------------
 3 files changed, 5 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 56387711589b..a0974df82b31 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -70,19 +70,6 @@ typedef struct {
 	uint8_t		microseconds;
 } __attribute__ ((packed)) timestamp;
 
-typedef struct {
-	uint16_t	typeAndTimezone;
-	int16_t		year;
-	uint8_t		month;
-	uint8_t		day;
-	uint8_t		hour;
-	uint8_t		minute;
-	uint8_t		second;
-	uint8_t		centiseconds;
-	uint8_t		hundredsOfMicroseconds;
-	uint8_t		microseconds;
-} __attribute__ ((packed)) kernel_timestamp;
-
 /* Type and Time Zone (ECMA 167r3 1/7.3.1) */
 #define TIMESTAMP_TYPE_MASK		0xF000
 #define TIMESTAMP_TYPE_CUT		0x0000
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4d2ecee1970b..829ddfa5a148 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -945,12 +945,13 @@ static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh)
 
 	if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
 			      pvoldesc->recordingDateAndTime)) {
-		kernel_timestamp ts;
-		ts = lets_to_cpu(pvoldesc->recordingDateAndTime);
+#ifdef UDFFS_DEBUG
+		timestamp *ts = &pvoldesc->recordingDateAndTime;
 		udf_debug("recording time %04u/%02u/%02u"
 			  " %02u:%02u (%x)\n",
-			  ts.year, ts.month, ts.day, ts.hour,
-			  ts.minute, ts.typeAndTimezone);
+			  le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
+			  ts->minute, le16_to_cpu(ts->typeAndTimezone));
+#endif
 	}
 
 	if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32))
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index c4bd1203f857..489f52fb428c 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -24,17 +24,6 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
 	return out;
 }
 
-static inline kernel_timestamp lets_to_cpu(timestamp in)
-{
-	kernel_timestamp out;
-
-	memcpy(&out, &in, sizeof(timestamp));
-	out.typeAndTimezone = le16_to_cpu(in.typeAndTimezone);
-	out.year = le16_to_cpu(in.year);
-
-	return out;
-}
-
 static inline short_ad lesa_to_cpu(short_ad in)
 {
 	short_ad out;
@@ -85,15 +74,4 @@ static inline kernel_extent_ad leea_to_cpu(extent_ad in)
 	return out;
 }
 
-static inline timestamp cpu_to_lets(kernel_timestamp in)
-{
-	timestamp out;
-
-	memcpy(&out, &in, sizeof(timestamp));
-	out.typeAndTimezone = cpu_to_le16(in.typeAndTimezone);
-	out.year = cpu_to_le16(in.year);
-
-	return out;
-}
-
 #endif /* __UDF_ENDIAN_H */
-- 
cgit v1.2.3


From 165923fa4590b0eb77bec31af383ea16b2d5868f Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Sun, 10 Feb 2008 11:33:08 +0100
Subject: udf: super.c reorganization

reorganize few code blocks in super.c which
were needlessly indented (and hard to read):

so change from:
rettype fun()
{
	init;
	if (sth) {
		long block of code;
	}
}

to:
rettype fun()
{
	init;
	if (!sth)
		return;
	long block of code;
}

or

from:
rettype fun2()
{
	init;
	while (sth) {
		init2();
		if (sth2) {
			long block of code;
		}
	}
}

to:
rettype fun2()
{
	init;
	while (sth) {
		init2();
		if (!sth2)
			continue;
		long block of code;
	}
}

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 552 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 270 insertions(+), 282 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 829ddfa5a148..17ba054faaa4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -827,18 +827,18 @@ static void udf_find_anchor(struct super_block *sb)
 	}
 
 	for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
-		if (sbi->s_anchor[i]) {
-			bh = udf_read_tagged(sb, sbi->s_anchor[i],
-					     sbi->s_anchor[i], &ident);
-			if (!bh)
+		if (!sbi->s_anchor[i])
+			continue;
+		bh = udf_read_tagged(sb, sbi->s_anchor[i],
+					sbi->s_anchor[i], &ident);
+		if (!bh)
+			sbi->s_anchor[i] = 0;
+		else {
+			brelse(bh);
+			if ((ident != TAG_IDENT_AVDP) &&
+				(i || (ident != TAG_IDENT_FE &&
+					ident != TAG_IDENT_EFE)))
 				sbi->s_anchor[i] = 0;
-			else {
-				brelse(bh);
-				if ((ident != TAG_IDENT_AVDP) &&
-				    (i || (ident != TAG_IDENT_FE &&
-					   ident != TAG_IDENT_EFE)))
-					sbi->s_anchor[i] = 0;
-			}
 		}
 	}
 
@@ -1020,128 +1020,118 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
 
 static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 {
-	struct partitionDesc *p;
-	int i;
+	struct partitionHeaderDesc *phd;
+	struct partitionDesc *p = (struct partitionDesc *)bh->b_data;
 	struct udf_part_map *map;
-	struct udf_sb_info *sbi;
-
-	p = (struct partitionDesc *)bh->b_data;
-	sbi = UDF_SB(sb);
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	bool found = false;
+	int i;
+	u16 partitionNumber = le16_to_cpu(p->partitionNumber);
 
 	for (i = 0; i < sbi->s_partitions; i++) {
 		map = &sbi->s_partmaps[i];
 		udf_debug("Searching map: (%d == %d)\n",
-			  map->s_partition_num,
-			  le16_to_cpu(p->partitionNumber));
-		if (map->s_partition_num ==
-				le16_to_cpu(p->partitionNumber)) {
-			map->s_partition_len =
-				le32_to_cpu(p->partitionLength); /* blocks */
-			map->s_partition_root =
-				le32_to_cpu(p->partitionStartingLocation);
-			if (p->accessType ==
-					cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY))
-				map->s_partition_flags |=
-						UDF_PART_FLAG_READ_ONLY;
-			if (p->accessType ==
-					cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE))
-				map->s_partition_flags |=
-						UDF_PART_FLAG_WRITE_ONCE;
-			if (p->accessType ==
-					cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE))
-				map->s_partition_flags |=
-						UDF_PART_FLAG_REWRITABLE;
-			if (p->accessType ==
-				    cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
-				map->s_partition_flags |=
-						UDF_PART_FLAG_OVERWRITABLE;
-
-			if (!strcmp(p->partitionContents.ident,
-				    PD_PARTITION_CONTENTS_NSR02) ||
-			    !strcmp(p->partitionContents.ident,
-				    PD_PARTITION_CONTENTS_NSR03)) {
-				struct partitionHeaderDesc *phd;
-
-				phd = (struct partitionHeaderDesc *)
-						(p->partitionContentsUse);
-				if (phd->unallocSpaceTable.extLength) {
-					kernel_lb_addr loc = {
-						.logicalBlockNum = le32_to_cpu(phd->unallocSpaceTable.extPosition),
-						.partitionReferenceNum = i,
-					};
-
-					map->s_uspace.s_table =
-						udf_iget(sb, loc);
-					if (!map->s_uspace.s_table) {
-						udf_debug("cannot load unallocSpaceTable (part %d)\n", i);
-						return 1;
-					}
-					map->s_partition_flags |=
-						UDF_PART_FLAG_UNALLOC_TABLE;
-					udf_debug("unallocSpaceTable (part %d) @ %ld\n",
-						  i, map->s_uspace.s_table->i_ino);
-				}
-				if (phd->unallocSpaceBitmap.extLength) {
-					struct udf_bitmap *bitmap =
-						udf_sb_alloc_bitmap(sb, i);
-					map->s_uspace.s_bitmap = bitmap;
-					if (bitmap != NULL) {
-						bitmap->s_extLength =
-							le32_to_cpu(phd->unallocSpaceBitmap.extLength);
-						bitmap->s_extPosition =
-							le32_to_cpu(phd->unallocSpaceBitmap.extPosition);
-						map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
-						udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
-							  i, bitmap->s_extPosition);
-					}
-				}
-				if (phd->partitionIntegrityTable.extLength)
-					udf_debug("partitionIntegrityTable (part %d)\n", i);
-				if (phd->freedSpaceTable.extLength) {
-					kernel_lb_addr loc = {
-						.logicalBlockNum = le32_to_cpu(phd->freedSpaceTable.extPosition),
-						.partitionReferenceNum = i,
-					};
-
-					map->s_fspace.s_table =
-						udf_iget(sb, loc);
-					if (!map->s_fspace.s_table) {
-						udf_debug("cannot load freedSpaceTable (part %d)\n", i);
-						return 1;
-					}
-					map->s_partition_flags |=
-						UDF_PART_FLAG_FREED_TABLE;
-					udf_debug("freedSpaceTable (part %d) @ %ld\n",
-						  i, map->s_fspace.s_table->i_ino);
-				}
-				if (phd->freedSpaceBitmap.extLength) {
-					struct udf_bitmap *bitmap =
-						udf_sb_alloc_bitmap(sb, i);
-					map->s_fspace.s_bitmap = bitmap;
-					if (bitmap != NULL) {
-						bitmap->s_extLength =
-							le32_to_cpu(phd->freedSpaceBitmap.extLength);
-						bitmap->s_extPosition =
-							le32_to_cpu(phd->freedSpaceBitmap.extPosition);
-						map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
-						udf_debug("freedSpaceBitmap (part %d) @ %d\n",
-							  i, bitmap->s_extPosition);
-					}
-				}
-			}
+			  map->s_partition_num, partitionNumber);
+		found = map->s_partition_num == partitionNumber;
+		if (found)
 			break;
-		}
 	}
-	if (i == sbi->s_partitions)
+
+	if (!found) {
 		udf_debug("Partition (%d) not found in partition map\n",
-			  le16_to_cpu(p->partitionNumber));
-	else
-		udf_debug("Partition (%d:%d type %x) starts at physical %d, "
-			  "block length %d\n",
-			  le16_to_cpu(p->partitionNumber), i,
-			  map->s_partition_type,
-			  map->s_partition_root,
-			  map->s_partition_len);
+			  partitionNumber);
+		return 0;
+	}
+
+	map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */
+	map->s_partition_root = le32_to_cpu(p->partitionStartingLocation);
+
+	if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY))
+		map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY;
+	if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE))
+		map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE;
+	if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE))
+		map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE;
+	if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
+		map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
+
+	udf_debug("Partition (%d:%d type %x) starts at physical %d, "
+		  "block length %d\n", partitionNumber, i,
+		  map->s_partition_type, map->s_partition_root,
+		  map->s_partition_len);
+
+	if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
+	    strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+		return 0;
+
+	phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
+	if (phd->unallocSpaceTable.extLength) {
+		kernel_lb_addr loc = {
+			.logicalBlockNum = le32_to_cpu(
+				phd->unallocSpaceTable.extPosition),
+			.partitionReferenceNum = i,
+		};
+
+		map->s_uspace.s_table = udf_iget(sb, loc);
+		if (!map->s_uspace.s_table) {
+			udf_debug("cannot load unallocSpaceTable (part %d)\n",
+					i);
+			return 1;
+		}
+		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
+		udf_debug("unallocSpaceTable (part %d) @ %ld\n",
+				i, map->s_uspace.s_table->i_ino);
+	}
+
+	if (phd->unallocSpaceBitmap.extLength) {
+		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, i);
+		map->s_uspace.s_bitmap = bitmap;
+		if (bitmap != NULL) {
+			bitmap->s_extLength = le32_to_cpu(
+				phd->unallocSpaceBitmap.extLength);
+			bitmap->s_extPosition = le32_to_cpu(
+				phd->unallocSpaceBitmap.extPosition);
+			map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
+			udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
+					i, bitmap->s_extPosition);
+		}
+	}
+
+	if (phd->partitionIntegrityTable.extLength)
+		udf_debug("partitionIntegrityTable (part %d)\n", i);
+
+	if (phd->freedSpaceTable.extLength) {
+		kernel_lb_addr loc = {
+			.logicalBlockNum = le32_to_cpu(
+				phd->freedSpaceTable.extPosition),
+			.partitionReferenceNum = i,
+		};
+
+		map->s_fspace.s_table = udf_iget(sb, loc);
+		if (!map->s_fspace.s_table) {
+			udf_debug("cannot load freedSpaceTable (part %d)\n", i);
+			return 1;
+		}
+
+		map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
+		udf_debug("freedSpaceTable (part %d) @ %ld\n",
+				i, map->s_fspace.s_table->i_ino);
+	}
+
+	if (phd->freedSpaceBitmap.extLength) {
+		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, i);
+		map->s_fspace.s_bitmap = bitmap;
+		if (bitmap != NULL) {
+			bitmap->s_extLength = le32_to_cpu(
+				phd->freedSpaceBitmap.extLength);
+			bitmap->s_extPosition = le32_to_cpu(
+				phd->freedSpaceBitmap.extPosition);
+			map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
+			udf_debug("freedSpaceBitmap (part %d) @ %d\n",
+					i, bitmap->s_extPosition);
+		}
+	}
+
 	return 0;
 }
 
@@ -1215,19 +1205,17 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
 					map->s_type_specific.s_sparing.
 							s_spar_map[j] = bh2;
 
-					if (bh2 != NULL) {
-						st = (struct sparingTable *)
-								bh2->b_data;
-						if (ident != 0 || strncmp(
-							st->sparingIdent.ident,
-							UDF_ID_SPARING,
-							strlen(UDF_ID_SPARING))) {
-							brelse(bh2);
-							map->s_type_specific.
-								s_sparing.
-								s_spar_map[j] =
-									NULL;
-						}
+					if (bh2 == NULL)
+						continue;
+
+					st = (struct sparingTable *)bh2->b_data;
+					if (ident != 0 || strncmp(
+						st->sparingIdent.ident,
+						UDF_ID_SPARING,
+						strlen(UDF_ID_SPARING))) {
+						brelse(bh2);
+						map->s_type_specific.s_sparing.
+							s_spar_map[j] = NULL;
 					}
 				}
 				map->s_partition_func = udf_get_pblock_spar15;
@@ -1392,40 +1380,40 @@ static int udf_process_sequence(struct super_block *sb, long block,
 		brelse(bh);
 	}
 	for (i = 0; i < VDS_POS_LENGTH; i++) {
-		if (vds[i].block) {
-			bh = udf_read_tagged(sb, vds[i].block, vds[i].block,
-					     &ident);
-
-			if (i == VDS_POS_PRIMARY_VOL_DESC) {
-				udf_load_pvoldesc(sb, bh);
-			} else if (i == VDS_POS_LOGICAL_VOL_DESC) {
-				if (udf_load_logicalvol(sb, bh, fileset)) {
-					brelse(bh);
-					return 1;
-				}
-			} else if (i == VDS_POS_PARTITION_DESC) {
-				struct buffer_head *bh2 = NULL;
-				if (udf_load_partdesc(sb, bh)) {
-					brelse(bh);
-					return 1;
-				}
-				for (j = vds[i].block + 1;
-				     j <  vds[VDS_POS_TERMINATING_DESC].block;
-				     j++) {
-					bh2 = udf_read_tagged(sb, j, j, &ident);
-					gd = (struct generic_desc *)bh2->b_data;
-					if (ident == TAG_IDENT_PD)
-						if (udf_load_partdesc(sb,
-								      bh2)) {
-							brelse(bh);
-							brelse(bh2);
-							return 1;
-						}
-					brelse(bh2);
-				}
+		if (!vds[i].block)
+			continue;
+
+		bh = udf_read_tagged(sb, vds[i].block, vds[i].block,
+					&ident);
+
+		if (i == VDS_POS_PRIMARY_VOL_DESC)
+			udf_load_pvoldesc(sb, bh);
+		else if (i == VDS_POS_LOGICAL_VOL_DESC) {
+			if (udf_load_logicalvol(sb, bh, fileset)) {
+				brelse(bh);
+				return 1;
+			}
+		} else if (i == VDS_POS_PARTITION_DESC) {
+			struct buffer_head *bh2 = NULL;
+			if (udf_load_partdesc(sb, bh)) {
+				brelse(bh);
+				return 1;
+			}
+			for (j = vds[i].block + 1;
+				j <  vds[VDS_POS_TERMINATING_DESC].block;
+				j++) {
+				bh2 = udf_read_tagged(sb, j, j, &ident);
+				gd = (struct generic_desc *)bh2->b_data;
+				if (ident == TAG_IDENT_PD)
+					if (udf_load_partdesc(sb, bh2)) {
+						brelse(bh);
+						brelse(bh2);
+						return 1;
+					}
+				brelse(bh2);
 			}
-			brelse(bh);
 		}
+		brelse(bh);
 	}
 
 	return 0;
@@ -1437,6 +1425,7 @@ static int udf_process_sequence(struct super_block *sb, long block,
 static int udf_check_valid(struct super_block *sb, int novrs, int silent)
 {
 	long block;
+	struct udf_sb_info *sbi;
 
 	if (novrs) {
 		udf_debug("Validity check skipped because of novrs option\n");
@@ -1444,18 +1433,16 @@ static int udf_check_valid(struct super_block *sb, int novrs, int silent)
 	}
 	/* Check that it is NSR02 compliant */
 	/* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
-	else {
-		block = udf_vrs(sb, silent);
-		if (block == -1) {
-			struct udf_sb_info *sbi = UDF_SB(sb);
-			udf_debug("Failed to read byte 32768. Assuming open "
-				  "disc. Skipping validity check\n");
-			if (!sbi->s_last_block)
-				sbi->s_last_block = udf_get_last_block(sb);
-			return 0;
-		} else
-			return !block;
-	}
+	block = udf_vrs(sb, silent);
+	if (block != -1)
+		return !block;
+
+	sbi = UDF_SB(sb);
+	udf_debug("Failed to read byte 32768. Assuming open "
+			"disc. Skipping validity check\n");
+	if (!sbi->s_last_block)
+		sbi->s_last_block = udf_get_last_block(sb);
+	return 0;
 }
 
 static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
@@ -1474,6 +1461,7 @@ static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
 	for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
 		if (!sbi->s_anchor[i])
 			continue;
+
 		bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i],
 				     &ident);
 		if (!bh)
@@ -1515,72 +1503,73 @@ static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
 	for (i = 0; i < sbi->s_partitions; i++) {
 		kernel_lb_addr uninitialized_var(ino);
 		struct udf_part_map *map = &sbi->s_partmaps[i];
-		switch (map->s_partition_type) {
-		case UDF_VIRTUAL_MAP15:
-		case UDF_VIRTUAL_MAP20:
-			if (!sbi->s_last_block) {
-				sbi->s_last_block = udf_get_last_block(sb);
-				udf_find_anchor(sb);
-			}
 
-			if (!sbi->s_last_block) {
-				udf_debug("Unable to determine Lastblock (For "
-					  "Virtual Partition)\n");
-				return 1;
-			}
+		if (map->s_partition_type != UDF_VIRTUAL_MAP15 &&
+			map->s_partition_type != UDF_VIRTUAL_MAP20)
+			continue;
 
-			for (j = 0; j < sbi->s_partitions; j++) {
-				struct udf_part_map *map2 = &sbi->s_partmaps[j];
-				if (j != i &&
-				    map->s_volumeseqnum ==
-						map2->s_volumeseqnum &&
-				    map->s_partition_num ==
-						map2->s_partition_num) {
-					ino.partitionReferenceNum = j;
-					ino.logicalBlockNum =
-						sbi->s_last_block -
-							map2->s_partition_root;
-					break;
-				}
+		if (!sbi->s_last_block) {
+			sbi->s_last_block = udf_get_last_block(sb);
+			udf_find_anchor(sb);
+		}
+
+		if (!sbi->s_last_block) {
+			udf_debug("Unable to determine Lastblock (For "
+					"Virtual Partition)\n");
+			return 1;
+		}
+
+		for (j = 0; j < sbi->s_partitions; j++) {
+			struct udf_part_map *map2 = &sbi->s_partmaps[j];
+			if (j != i &&
+				map->s_volumeseqnum ==
+					map2->s_volumeseqnum &&
+				map->s_partition_num ==
+					map2->s_partition_num) {
+				ino.partitionReferenceNum = j;
+				ino.logicalBlockNum =
+					sbi->s_last_block -
+						map2->s_partition_root;
+				break;
 			}
+		}
 
-			if (j == sbi->s_partitions)
-				return 1;
+		if (j == sbi->s_partitions)
+			return 1;
 
-			sbi->s_vat_inode = udf_iget(sb, ino);
-			if (!sbi->s_vat_inode)
-				return 1;
+		sbi->s_vat_inode = udf_iget(sb, ino);
+		if (!sbi->s_vat_inode)
+			return 1;
 
-			if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
-				map->s_type_specific.s_virtual.s_start_offset =
-					udf_ext0_offset(sbi->s_vat_inode);
-				map->s_type_specific.s_virtual.s_num_entries =
-					(sbi->s_vat_inode->i_size - 36) >> 2;
-			} else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
-				uint32_t pos;
-				struct virtualAllocationTable20 *vat20;
-
-				pos = udf_block_map(sbi->s_vat_inode, 0);
-				bh = sb_bread(sb, pos);
-				if (!bh)
-					return 1;
-				vat20 = (struct virtualAllocationTable20 *)
-					bh->b_data +
-					udf_ext0_offset(sbi->s_vat_inode);
-				map->s_type_specific.s_virtual.s_start_offset =
-					le16_to_cpu(vat20->lengthHeader) +
-					udf_ext0_offset(sbi->s_vat_inode);
-				map->s_type_specific.s_virtual.s_num_entries =
-					(sbi->s_vat_inode->i_size -
-					 map->s_type_specific.s_virtual.
-							s_start_offset) >> 2;
-				brelse(bh);
-			}
-			map->s_partition_root = udf_get_pblock(sb, 0, i, 0);
-			map->s_partition_len =
-				sbi->s_partmaps[ino.partitionReferenceNum].
-								s_partition_len;
+		if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
+			map->s_type_specific.s_virtual.s_start_offset =
+				udf_ext0_offset(sbi->s_vat_inode);
+			map->s_type_specific.s_virtual.s_num_entries =
+				(sbi->s_vat_inode->i_size - 36) >> 2;
+		} else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
+			uint32_t pos;
+			struct virtualAllocationTable20 *vat20;
+
+			pos = udf_block_map(sbi->s_vat_inode, 0);
+			bh = sb_bread(sb, pos);
+			if (!bh)
+				return 1;
+			vat20 = (struct virtualAllocationTable20 *)
+				bh->b_data +
+				udf_ext0_offset(sbi->s_vat_inode);
+			map->s_type_specific.s_virtual.s_start_offset =
+				le16_to_cpu(vat20->lengthHeader) +
+				udf_ext0_offset(sbi->s_vat_inode);
+			map->s_type_specific.s_virtual.s_num_entries =
+				(sbi->s_vat_inode->i_size -
+					map->s_type_specific.s_virtual.
+						s_start_offset) >> 2;
+			brelse(bh);
 		}
+		map->s_partition_root = udf_get_pblock(sb, 0, i, 0);
+		map->s_partition_len =
+			sbi->s_partmaps[ino.partitionReferenceNum].
+							s_partition_len;
 	}
 	return 0;
 }
@@ -1589,26 +1578,26 @@ static void udf_open_lvid(struct super_block *sb)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct buffer_head *bh = sbi->s_lvid_bh;
-	if (bh) {
-		struct logicalVolIntegrityDesc *lvid =
-				(struct logicalVolIntegrityDesc *)bh->b_data;
-		struct logicalVolIntegrityDescImpUse *lvidiu =
-							udf_sb_lvidiu(sbi);
+	struct logicalVolIntegrityDesc *lvid;
+	struct logicalVolIntegrityDescImpUse *lvidiu;
+	if (!bh)
+		return;
 
-		lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
-		lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-		udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
-					CURRENT_TIME);
-		lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
+	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+	lvidiu = udf_sb_lvidiu(sbi);
 
-		lvid->descTag.descCRC = cpu_to_le16(
-			udf_crc((char *)lvid + sizeof(tag),
-				le16_to_cpu(lvid->descTag.descCRCLength),
-				0));
+	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
+	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
+	udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
+				CURRENT_TIME);
+	lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
 
-		lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
-		mark_buffer_dirty(bh);
-	}
+	lvid->descTag.descCRC = cpu_to_le16(
+		udf_crc((char *)lvid + sizeof(tag),
+			le16_to_cpu(lvid->descTag.descCRCLength), 0));
+
+	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+	mark_buffer_dirty(bh);
 }
 
 static void udf_close_lvid(struct super_block *sb)
@@ -1616,36 +1605,35 @@ static void udf_close_lvid(struct super_block *sb)
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct buffer_head *bh = sbi->s_lvid_bh;
 	struct logicalVolIntegrityDesc *lvid;
+	struct logicalVolIntegrityDescImpUse *lvidiu;
 
 	if (!bh)
 		return;
 
 	lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
 
-	if (lvid->integrityType == LVID_INTEGRITY_TYPE_OPEN) {
-		struct logicalVolIntegrityDescImpUse *lvidiu =
-							udf_sb_lvidiu(sbi);
-		lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
-		lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-		udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
-					CURRENT_TIME);
-		if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
-			lvidiu->maxUDFWriteRev =
-					cpu_to_le16(UDF_MAX_WRITE_VERSION);
-		if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
-			lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev);
-		if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev))
-			lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev);
-		lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
-
-		lvid->descTag.descCRC = cpu_to_le16(
-			udf_crc((char *)lvid + sizeof(tag),
-				le16_to_cpu(lvid->descTag.descCRCLength),
-				0));
-
-		lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
-		mark_buffer_dirty(bh);
-	}
+	if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
+		return;
+
+	lvidiu = udf_sb_lvidiu(sbi);
+	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
+	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
+	udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME);
+	if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
+		lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
+	if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
+		lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev);
+	if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev))
+		lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev);
+	lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
+
+	lvid->descTag.descCRC = cpu_to_le16(
+		udf_crc((char *)lvid + sizeof(tag),
+			le16_to_cpu(lvid->descTag.descCRCLength),
+			0));
+
+	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+	mark_buffer_dirty(bh);
 }
 
 static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
-- 
cgit v1.2.3


From 200a3592cda7bfc010d30c0b1eb8eff3791e9ba9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 4 Mar 2008 13:03:09 +0100
Subject: udf: Mark udf_process_sequence() as noinline

Mark udf_process_sequence() as noinline since stack usage is terrible
otherwise.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 17ba054faaa4..bb8d866deb3f 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1292,7 +1292,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
  *	July 1, 1997 - Andrew E. Mileski
  *	Written, tested, and released.
  */
-static int udf_process_sequence(struct super_block *sb, long block,
+static noinline int udf_process_sequence(struct super_block *sb, long block,
 				long lastblock, kernel_lb_addr *fileset)
 {
 	struct buffer_head *bh = NULL;
-- 
cgit v1.2.3


From 9bf2c6b834f4caad82b3e2d962c266153d39e411 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 4 Mar 2008 13:10:29 +0100
Subject: udf: Remove checking of existence of filename in udf_add_entry()

We don't have to check whether a directory entry already exists in a directory
when creating a new one since we've already checked that earlier by lookup and
we are holding directory i_mutex all the time.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/namei.c | 38 +-------------------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ac7f72779e93..e9f587201e17 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -303,11 +303,9 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fileIdentDesc *fi = NULL;
-	char name[UDF_NAME_LEN], fname[UDF_NAME_LEN];
+	char name[UDF_NAME_LEN];
 	int namelen;
 	loff_t f_pos;
-	int flen;
-	char *nameptr;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
 	int nfidlen;
 	uint8_t lfi;
@@ -385,26 +383,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 		liu = le16_to_cpu(cfi->lengthOfImpUse);
 		lfi = cfi->lengthFileIdent;
 
-		if (fibh->sbh == fibh->ebh)
-			nameptr = fi->fileIdent + liu;
-		else {
-			int poffset;	/* Unpaded ending offset */
-
-			poffset = fibh->soffset + sizeof(struct fileIdentDesc) +
-					liu + lfi;
-
-			if (poffset >= lfi)
-				nameptr = (char *)(fibh->ebh->b_data +
-						   poffset - lfi);
-			else {
-				nameptr = fname;
-				memcpy(nameptr, fi->fileIdent + liu,
-					lfi - poffset);
-				memcpy(nameptr + lfi - poffset,
-					fibh->ebh->b_data, poffset);
-			}
-		}
-
 		if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
 			if (((sizeof(struct fileIdentDesc) +
 					liu + lfi + 3) & ~3) == nfidlen) {
@@ -423,20 +401,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 				}
 			}
 		}
-
-		if (!lfi || !dentry)
-			continue;
-
-		flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
-		if (flen && udf_match(flen, fname, dentry->d_name.len,
-				      dentry->d_name.name)) {
-			if (fibh->sbh != fibh->ebh)
-				brelse(fibh->ebh);
-			brelse(fibh->sbh);
-			brelse(epos.bh);
-			*err = -EEXIST;
-			return NULL;
-		}
 	}
 
 add:
-- 
cgit v1.2.3


From b80697c14dcacd83ed1b78e26ad93b25ecc52c5e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 4 Mar 2008 14:14:05 +0100
Subject: udf: Remove declarations of arrays of size UDF_NAME_LEN (256 bytes)

There are several places in UDF where we declared temporary arrays of
UDF_NAME_LEN bytes on stack. This is not nice to stack usage so this patch
changes those places to use kmalloc() instead. Also clean up bail-out paths
in those functions when we are changing them.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/dir.c   |  57 ++++++++++++-------------
 fs/udf/namei.c | 128 +++++++++++++++++++++++++++++----------------------------
 2 files changed, 93 insertions(+), 92 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 31feef916fb7..62dc270c69d1 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -39,13 +39,13 @@
 static int do_udf_readdir(struct inode *dir, struct file *filp,
 			  filldir_t filldir, void *dirent)
 {
-	struct udf_fileident_bh fibh;
+	struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
 	struct fileIdentDesc *fi = NULL;
 	struct fileIdentDesc cfi;
 	int block, iblock;
 	loff_t nf_pos = (filp->f_pos - 1) << 2;
 	int flen;
-	char fname[UDF_NAME_LEN];
+	char *fname = NULL;
 	char *nameptr;
 	uint16_t liu;
 	uint8_t lfi;
@@ -54,23 +54,32 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	kernel_lb_addr eloc;
 	uint32_t elen;
 	sector_t offset;
-	int i, num;
+	int i, num, ret = 0;
 	unsigned int dt_type;
 	struct extent_position epos = { NULL, 0, {0, 0} };
 	struct udf_inode_info *iinfo;
 
 	if (nf_pos >= size)
-		return 0;
+		goto out;
+
+	fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	if (!fname) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	if (nf_pos == 0)
 		nf_pos = udf_ext0_offset(dir);
 
 	fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
 	iinfo = UDF_I(dir);
-	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
-		fibh.sbh = fibh.ebh = NULL;
-	} else if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
-			      &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) {
+	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+		if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
+		    &epos, &eloc, &elen, &offset)
+		    != (EXT_RECORDED_ALLOCATED >> 30)) {
+			ret = -ENOENT;
+			goto out;
+		}
 		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -83,8 +92,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 		}
 
 		if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) {
-			brelse(epos.bh);
-			return -EIO;
+			ret = -EIO;
+			goto out;
 		}
 
 		if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) {
@@ -105,9 +114,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 					brelse(bha[i]);
 			}
 		}
-	} else {
-		brelse(epos.bh);
-		return -ENOENT;
 	}
 
 	while (nf_pos < size) {
@@ -115,13 +121,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 
 		fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
 					&elen, &offset);
-		if (!fi) {
-			if (fibh.sbh != fibh.ebh)
-				brelse(fibh.ebh);
-			brelse(fibh.sbh);
-			brelse(epos.bh);
-			return 0;
-		}
+		if (!fi)
+			goto out;
 
 		liu = le16_to_cpu(cfi.lengthOfImpUse);
 		lfi = cfi.lengthFileIdent;
@@ -167,25 +168,21 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 			dt_type = DT_UNKNOWN;
 		}
 
-		if (flen) {
-			if (filldir(dirent, fname, flen, filp->f_pos, iblock, dt_type) < 0) {
-				if (fibh.sbh != fibh.ebh)
-					brelse(fibh.ebh);
-				brelse(fibh.sbh);
-				brelse(epos.bh);
-	 			return 0;
-			}
-		}
+		if (flen && filldir(dirent, fname, flen, filp->f_pos,
+				    iblock, dt_type) < 0)
+			goto out;
 	} /* end while */
 
 	filp->f_pos = (nf_pos >> 2) + 1;
 
+out:
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
 	brelse(epos.bh);
+	kfree(fname);
 
-	return 0;
+	return ret;
 }
 
 static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index e9f587201e17..68686b79650a 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -149,7 +149,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	struct fileIdentDesc *fi = NULL;
 	loff_t f_pos;
 	int block, flen;
-	char fname[UDF_NAME_LEN];
+	char *fname = NULL;
 	char *nameptr;
 	uint8_t lfi;
 	uint16_t liu;
@@ -163,12 +163,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	size = udf_ext0_offset(dir) + dir->i_size;
 	f_pos = udf_ext0_offset(dir);
 
+	fibh->sbh = fibh->ebh = NULL;
 	fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1);
-	if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		fibh->sbh = fibh->ebh = NULL;
-	else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
-			      &epos, &eloc, &elen, &offset) ==
-					(EXT_RECORDED_ALLOCATED >> 30)) {
+	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
+		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
+			goto out_err;
 		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -179,25 +179,19 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 			offset = 0;
 
 		fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
-		if (!fibh->sbh) {
-			brelse(epos.bh);
-			return NULL;
-		}
-	} else {
-		brelse(epos.bh);
-		return NULL;
+		if (!fibh->sbh)
+			goto out_err;
 	}
 
+	fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	if (!fname)
+		goto out_err;
+
 	while (f_pos < size) {
 		fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc,
 					&elen, &offset);
-		if (!fi) {
-			if (fibh->sbh != fibh->ebh)
-				brelse(fibh->ebh);
-			brelse(fibh->sbh);
-			brelse(epos.bh);
-			return NULL;
-		}
+		if (!fi)
+			goto out_err;
 
 		liu = le16_to_cpu(cfi->lengthOfImpUse);
 		lfi = cfi->lengthFileIdent;
@@ -237,18 +231,20 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 
 		flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
 		if (flen && udf_match(flen, fname, dentry->d_name.len,
-				      dentry->d_name.name)) {
-			brelse(epos.bh);
-			return fi;
-		}
+				      dentry->d_name.name))
+			goto out_ok;
 	}
 
+out_err:
+	fi = NULL;
 	if (fibh->sbh != fibh->ebh)
 		brelse(fibh->ebh);
 	brelse(fibh->sbh);
+out_ok:
 	brelse(epos.bh);
+	kfree(fname);
 
-	return NULL;
+	return fi;
 }
 
 static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
@@ -303,7 +299,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fileIdentDesc *fi = NULL;
-	char name[UDF_NAME_LEN];
+	char *name = NULL;
 	int namelen;
 	loff_t f_pos;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
@@ -317,16 +313,23 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 	struct extent_position epos = {};
 	struct udf_inode_info *dinfo;
 
+	fibh->sbh = fibh->ebh = NULL;
+	name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	if (!name) {
+		*err = -ENOMEM;
+		goto out_err;
+	}
+
 	if (dentry) {
 		if (!dentry->d_name.len) {
 			*err = -EINVAL;
-			return NULL;
+			goto out_err;
 		}
 		namelen = udf_put_filename(sb, dentry->d_name.name, name,
 						 dentry->d_name.len);
 		if (!namelen) {
 			*err = -ENAMETOOLONG;
-			return NULL;
+			goto out_err;
 		}
 	} else {
 		namelen = 0;
@@ -338,11 +341,14 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 
 	fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1);
 	dinfo = UDF_I(dir);
-	if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		fibh->sbh = fibh->ebh = NULL;
-	else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
-			      &epos, &eloc, &elen, &offset) ==
-					(EXT_RECORDED_ALLOCATED >> 30)) {
+	if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+		if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
+		    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
+			block = udf_get_lb_pblock(dir->i_sb,
+					dinfo->i_location, 0);
+			fibh->soffset = fibh->eoffset = sb->s_blocksize;
+			goto add;
+		}
 		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
 			if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -354,17 +360,11 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 
 		fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
 		if (!fibh->sbh) {
-			brelse(epos.bh);
 			*err = -EIO;
-			return NULL;
+			goto out_err;
 		}
 
 		block = dinfo->i_location.logicalBlockNum;
-	} else {
-		block = udf_get_lb_pblock(dir->i_sb, dinfo->i_location, 0);
-		fibh->sbh = fibh->ebh = NULL;
-		fibh->soffset = fibh->eoffset = sb->s_blocksize;
-		goto add;
 	}
 
 	while (f_pos < size) {
@@ -372,12 +372,8 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 					&elen, &offset);
 
 		if (!fi) {
-			if (fibh->sbh != fibh->ebh)
-				brelse(fibh->ebh);
-			brelse(fibh->sbh);
-			brelse(epos.bh);
 			*err = -EIO;
-			return NULL;
+			goto out_err;
 		}
 
 		liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -386,7 +382,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 		if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
 			if (((sizeof(struct fileIdentDesc) +
 					liu + lfi + 3) & ~3) == nfidlen) {
-				brelse(epos.bh);
 				cfi->descTag.tagSerialNum = cpu_to_le16(1);
 				cfi->fileVersionNum = cpu_to_le16(1);
 				cfi->fileCharacteristics = 0;
@@ -394,10 +389,10 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 				cfi->lengthOfImpUse = cpu_to_le16(0);
 				if (!udf_write_fi(dir, cfi, fi, fibh, NULL,
 						  name))
-					return fi;
+					goto out_ok;
 				else {
 					*err = -EIO;
-					return NULL;
+					goto out_err;
 				}
 			}
 		}
@@ -427,7 +422,7 @@ add:
 		fibh->sbh = fibh->ebh =
 				udf_expand_dir_adinicb(dir, &block, err);
 		if (!fibh->sbh)
-			return NULL;
+			goto out_err;
 		epos.block = dinfo->i_location;
 		epos.offset = udf_file_entry_alloc_offset(dir);
 		/* Load extent udf_expand_dir_adinicb() has created */
@@ -468,11 +463,8 @@ add:
 						dir->i_sb->s_blocksize_bits);
 		fibh->ebh = udf_bread(dir,
 				f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
-		if (!fibh->ebh) {
-			brelse(epos.bh);
-			brelse(fibh->sbh);
-			return NULL;
-		}
+		if (!fibh->ebh)
+			goto out_err;
 
 		if (!fibh->soffset) {
 			if (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
@@ -503,20 +495,25 @@ add:
 	cfi->lengthFileIdent = namelen;
 	cfi->lengthOfImpUse = cpu_to_le16(0);
 	if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) {
-		brelse(epos.bh);
 		dir->i_size += nfidlen;
 		if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 			dinfo->i_lenAlloc += nfidlen;
 		mark_inode_dirty(dir);
-		return fi;
+		goto out_ok;
 	} else {
-		brelse(epos.bh);
-		if (fibh->sbh != fibh->ebh)
-			brelse(fibh->ebh);
-		brelse(fibh->sbh);
 		*err = -EIO;
-		return NULL;
+		goto out_err;
 	}
+
+out_err:
+	fi = NULL;
+	if (fibh->sbh != fibh->ebh)
+		brelse(fibh->ebh);
+	brelse(fibh->sbh);
+out_ok:
+	brelse(epos.bh);
+	kfree(name);
+	return fi;
 }
 
 static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
@@ -871,7 +868,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	char *ea;
 	int err;
 	int block;
-	char name[UDF_NAME_LEN];
+	char *name = NULL;
 	int namelen;
 	struct buffer_head *bh;
 	struct udf_inode_info *iinfo;
@@ -881,6 +878,12 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	if (!inode)
 		goto out;
 
+	name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	if (!name) {
+		err = -ENOMEM;
+		goto out_no_entry;
+	}
+
 	iinfo = UDF_I(inode);
 	inode->i_mode = S_IFLNK | S_IRWXUGO;
 	inode->i_data.a_ops = &udf_symlink_aops;
@@ -1020,6 +1023,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	err = 0;
 
 out:
+	kfree(name);
 	unlock_kernel();
 	return err;
 
-- 
cgit v1.2.3


From d0db181c072259c21a375b290a4574e5ce6d2b5f Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 5 Mar 2008 00:03:36 +0100
Subject: udf: fix anchor point detection

According to ECMA 167 rev.  3 (see 3/8.4.2.1), Anchor Volume Descriptor
Pointer should be recorded at two or more anchor points located at sectors
256, N, N - 256, where N - is a largest logical sector number at volume
space.

So we should always try to detect N on UDF volume before trying to find
Anchor Volume Descriptor (i.e.  calling to udf_find_anchor()).

That said, all this patch does is updates the s_last_block even if the
udf_vrs() returns positive value.

Originally written and tested by Yuri Per, ported on latest mainline by me.

Signed-off-by: Yuri Per <Yuri.Per@acronis.com>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Max Lyadvinsky <Max.Lyadvinsky@acronis.com>
Cc: Vladimir Simonov <Vladimir.Simonov@acronis.com>
Cc: Andrew Neporada <Andrew.Neporada@acronis.com>
Cc: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index bb8d866deb3f..137153c5005b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1425,7 +1425,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
 static int udf_check_valid(struct super_block *sb, int novrs, int silent)
 {
 	long block;
-	struct udf_sb_info *sbi;
+	struct udf_sb_info *sbi = UDF_SB(sb);
 
 	if (novrs) {
 		udf_debug("Validity check skipped because of novrs option\n");
@@ -1434,15 +1434,12 @@ static int udf_check_valid(struct super_block *sb, int novrs, int silent)
 	/* Check that it is NSR02 compliant */
 	/* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
 	block = udf_vrs(sb, silent);
-	if (block != -1)
-		return !block;
-
-	sbi = UDF_SB(sb);
-	udf_debug("Failed to read byte 32768. Assuming open "
-			"disc. Skipping validity check\n");
-	if (!sbi->s_last_block)
+	if (block == -1)
+		udf_debug("Failed to read byte 32768. Assuming open "
+			  "disc. Skipping validity check\n");
+	if (block && !sbi->s_last_block)
 		sbi->s_last_block = udf_get_last_block(sb);
-	return 0;
+	return !block;
 }
 
 static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
-- 
cgit v1.2.3


From c0eb31ed130ab18e1858179a644e39aee2355a13 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 1 Apr 2008 16:50:35 +0200
Subject: udf: Cleanup volume descriptor sequence processing

Cleanup processing of volume descriptor sequence so that it is more readable,
make code handle errors (e.g. media problems) better.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 135 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 81 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 137153c5005b..500d1e4cc1c7 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -85,16 +85,12 @@ static int udf_remount_fs(struct super_block *, int *, char *);
 static int udf_check_valid(struct super_block *, int, int);
 static int udf_vrs(struct super_block *sb, int silent);
 static int udf_load_partition(struct super_block *, kernel_lb_addr *);
-static int udf_load_logicalvol(struct super_block *, struct buffer_head *,
-			       kernel_lb_addr *);
 static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad);
 static void udf_find_anchor(struct super_block *);
 static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
 			    kernel_lb_addr *);
-static void udf_load_pvoldesc(struct super_block *, struct buffer_head *);
 static void udf_load_fileset(struct super_block *, struct buffer_head *,
 			     kernel_lb_addr *);
-static int udf_load_partdesc(struct super_block *, struct buffer_head *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
@@ -935,11 +931,18 @@ static int udf_find_fileset(struct super_block *sb,
 	return 1;
 }
 
-static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh)
+static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 {
 	struct primaryVolDesc *pvoldesc;
 	struct ustr instr;
 	struct ustr outstr;
+	struct buffer_head *bh;
+	uint16_t ident;
+
+	bh = udf_read_tagged(sb, block, block, &ident);
+	if (!bh)
+		return 1;
+	BUG_ON(ident != TAG_IDENT_PVD);
 
 	pvoldesc = (struct primaryVolDesc *)bh->b_data;
 
@@ -965,6 +968,9 @@ static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh)
 	if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128))
 		if (udf_CS0toUTF8(&outstr, &instr))
 			udf_debug("volSetIdent[] = '%s'\n", outstr.u_name);
+
+	brelse(bh);
+	return 0;
 }
 
 static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
@@ -1018,16 +1024,27 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
 	return bitmap;
 }
 
-static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
+static int udf_load_partdesc(struct super_block *sb, sector_t block)
 {
+	struct buffer_head *bh;
 	struct partitionHeaderDesc *phd;
-	struct partitionDesc *p = (struct partitionDesc *)bh->b_data;
+	struct partitionDesc *p;
 	struct udf_part_map *map;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	bool found = false;
 	int i;
-	u16 partitionNumber = le16_to_cpu(p->partitionNumber);
+	uint16_t partitionNumber;
+	uint16_t ident;
+	int ret = 0;
+
+	bh = udf_read_tagged(sb, block, block, &ident);
+	if (!bh)
+		return 1;
+	if (ident != TAG_IDENT_PD)
+		goto out_bh;
 
+	p = (struct partitionDesc *)bh->b_data;
+	partitionNumber = le16_to_cpu(p->partitionNumber);
 	for (i = 0; i < sbi->s_partitions; i++) {
 		map = &sbi->s_partmaps[i];
 		udf_debug("Searching map: (%d == %d)\n",
@@ -1040,7 +1057,7 @@ static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 	if (!found) {
 		udf_debug("Partition (%d) not found in partition map\n",
 			  partitionNumber);
-		return 0;
+		goto out_bh;
 	}
 
 	map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */
@@ -1062,7 +1079,7 @@ static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 
 	if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
 	    strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
-		return 0;
+		goto out_bh;
 
 	phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
 	if (phd->unallocSpaceTable.extLength) {
@@ -1076,7 +1093,8 @@ static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 		if (!map->s_uspace.s_table) {
 			udf_debug("cannot load unallocSpaceTable (part %d)\n",
 					i);
-			return 1;
+			ret = 1;
+			goto out_bh;
 		}
 		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
 		udf_debug("unallocSpaceTable (part %d) @ %ld\n",
@@ -1110,7 +1128,8 @@ static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 		map->s_fspace.s_table = udf_iget(sb, loc);
 		if (!map->s_fspace.s_table) {
 			udf_debug("cannot load freedSpaceTable (part %d)\n", i);
-			return 1;
+			ret = 1;
+			goto out_bh;
 		}
 
 		map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
@@ -1132,10 +1151,12 @@ static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh)
 		}
 	}
 
-	return 0;
+out_bh:
+	brelse(bh);
+	return ret;
 }
 
-static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
+static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 			       kernel_lb_addr *fileset)
 {
 	struct logicalVolDesc *lvd;
@@ -1143,12 +1164,21 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
 	uint8_t type;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct genericPartitionMap *gpm;
+	uint16_t ident;
+	struct buffer_head *bh;
+	int ret = 0;
 
+	bh = udf_read_tagged(sb, block, block, &ident);
+	if (!bh)
+		return 1;
+	BUG_ON(ident != TAG_IDENT_LVD);
 	lvd = (struct logicalVolDesc *)bh->b_data;
 
 	i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
-	if (i != 0)
-		return i;
+	if (i != 0) {
+		ret = i;
+		goto out_bh;
+	}
 
 	for (i = 0, offset = 0;
 	     i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength);
@@ -1187,7 +1217,6 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
 						UDF_ID_SPARABLE,
 						strlen(UDF_ID_SPARABLE))) {
 				uint32_t loc;
-				uint16_t ident;
 				struct sparingTable *st;
 				struct sparablePartitionMap *spm =
 					(struct sparablePartitionMap *)gpm;
@@ -1243,7 +1272,9 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
 	if (lvd->integritySeqExt.extLength)
 		udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
 
-	return 0;
+out_bh:
+	brelse(bh);
+	return ret;
 }
 
 /*
@@ -1301,19 +1332,25 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
 	struct generic_desc *gd;
 	struct volDescPtr *vdp;
 	int done = 0;
-	int i, j;
 	uint32_t vdsn;
 	uint16_t ident;
 	long next_s = 0, next_e = 0;
 
 	memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
 
-	/* Read the main descriptor sequence */
+	/*
+	 * Read the main descriptor sequence and find which descriptors
+	 * are in it.
+	 */
 	for (; (!done && block <= lastblock); block++) {
 
 		bh = udf_read_tagged(sb, block, block, &ident);
-		if (!bh)
-			break;
+		if (!bh) {
+			printk(KERN_ERR "udf: Block %Lu of volume descriptor "
+			       "sequence is corrupted or we could not read "
+			       "it.\n", (unsigned long long)block);
+			return 1;
+		}
 
 		/* Process each descriptor (ISO 13346 3/8.3-8.4) */
 		gd = (struct generic_desc *)bh->b_data;
@@ -1379,41 +1416,31 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
 		}
 		brelse(bh);
 	}
-	for (i = 0; i < VDS_POS_LENGTH; i++) {
-		if (!vds[i].block)
-			continue;
+	/*
+	 * Now read interesting descriptors again and process them
+	 * in a suitable order
+	 */
+	if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
+		printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n");
+		return 1;
+	}
+	if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
+		return 1;
 
-		bh = udf_read_tagged(sb, vds[i].block, vds[i].block,
-					&ident);
+	if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb,
+	    vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset))
+		return 1;
 
-		if (i == VDS_POS_PRIMARY_VOL_DESC)
-			udf_load_pvoldesc(sb, bh);
-		else if (i == VDS_POS_LOGICAL_VOL_DESC) {
-			if (udf_load_logicalvol(sb, bh, fileset)) {
-				brelse(bh);
+	if (vds[VDS_POS_PARTITION_DESC].block) {
+		/*
+		 * We rescan the whole descriptor sequence to find
+		 * partition descriptor blocks and process them.
+		 */
+		for (block = vds[VDS_POS_PARTITION_DESC].block;
+		     block < vds[VDS_POS_TERMINATING_DESC].block;
+		     block++)
+			if (udf_load_partdesc(sb, block))
 				return 1;
-			}
-		} else if (i == VDS_POS_PARTITION_DESC) {
-			struct buffer_head *bh2 = NULL;
-			if (udf_load_partdesc(sb, bh)) {
-				brelse(bh);
-				return 1;
-			}
-			for (j = vds[i].block + 1;
-				j <  vds[VDS_POS_TERMINATING_DESC].block;
-				j++) {
-				bh2 = udf_read_tagged(sb, j, j, &ident);
-				gd = (struct generic_desc *)bh2->b_data;
-				if (ident == TAG_IDENT_PD)
-					if (udf_load_partdesc(sb, bh2)) {
-						brelse(bh);
-						brelse(bh2);
-						return 1;
-					}
-				brelse(bh2);
-			}
-		}
-		brelse(bh);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 2e0838fd0c0b8f0753a4be9af9f9712c4be881e1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 1 Apr 2008 18:08:51 +0200
Subject: udf: Improve error recovery on mount

Report error when we fail to allocate memory for a bitmap and properly
release allocated memory and inodes for all the partitions in case of
mount failure and umount.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 86 ++++++++++++++++++++++++++++------------------------------
 1 file changed, 42 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 500d1e4cc1c7..d50e3f5c46e8 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1103,16 +1103,18 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 
 	if (phd->unallocSpaceBitmap.extLength) {
 		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, i);
+		if (!bitmap) {
+			ret = 1;
+			goto out_bh;
+		}
 		map->s_uspace.s_bitmap = bitmap;
-		if (bitmap != NULL) {
-			bitmap->s_extLength = le32_to_cpu(
+		bitmap->s_extLength = le32_to_cpu(
 				phd->unallocSpaceBitmap.extLength);
-			bitmap->s_extPosition = le32_to_cpu(
+		bitmap->s_extPosition = le32_to_cpu(
 				phd->unallocSpaceBitmap.extPosition);
-			map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
-			udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
-					i, bitmap->s_extPosition);
-		}
+		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
+		udf_debug("unallocSpaceBitmap (part %d) @ %d\n", i,
+						bitmap->s_extPosition);
 	}
 
 	if (phd->partitionIntegrityTable.extLength)
@@ -1139,19 +1141,22 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 
 	if (phd->freedSpaceBitmap.extLength) {
 		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, i);
+		if (!bitmap) {
+			ret = 1;
+			goto out_bh;
+		}
 		map->s_fspace.s_bitmap = bitmap;
-		if (bitmap != NULL) {
-			bitmap->s_extLength = le32_to_cpu(
+		bitmap->s_extLength = le32_to_cpu(
 				phd->freedSpaceBitmap.extLength);
-			bitmap->s_extPosition = le32_to_cpu(
+		bitmap->s_extPosition = le32_to_cpu(
 				phd->freedSpaceBitmap.extPosition);
-			map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
-			udf_debug("freedSpaceBitmap (part %d) @ %d\n",
-					i, bitmap->s_extPosition);
-		}
+		map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
+		udf_debug("freedSpaceBitmap (part %d) @ %d\n", i,
+					bitmap->s_extPosition);
 	}
 
 out_bh:
+	/* In case loading failed, we handle cleanup in udf_fill_super */
 	brelse(bh);
 	return ret;
 }
@@ -1677,6 +1682,23 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
 		vfree(bitmap);
 }
 
+static void udf_free_partition(struct udf_part_map *map)
+{
+	int i;
+
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
+		iput(map->s_uspace.s_table);
+	if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
+		iput(map->s_fspace.s_table);
+	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
+		udf_sb_free_bitmap(map->s_uspace.s_bitmap);
+	if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
+		udf_sb_free_bitmap(map->s_fspace.s_bitmap);
+	if (map->s_partition_type == UDF_SPARABLE_MAP15)
+		for (i = 0; i < 4; i++)
+			brelse(map->s_type_specific.s_sparing.s_spar_map[i]);
+}
+
 static int udf_fill_super(struct super_block *sb, void *options, int silent)
 {
 	int i;
@@ -1846,21 +1868,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 error_out:
 	if (sbi->s_vat_inode)
 		iput(sbi->s_vat_inode);
-	if (sbi->s_partitions) {
-		struct udf_part_map *map = &sbi->s_partmaps[sbi->s_partition];
-		if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
-			iput(map->s_uspace.s_table);
-		if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-			iput(map->s_fspace.s_table);
-		if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
-			udf_sb_free_bitmap(map->s_uspace.s_bitmap);
-		if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-			udf_sb_free_bitmap(map->s_fspace.s_bitmap);
-		if (map->s_partition_type == UDF_SPARABLE_MAP15)
-			for (i = 0; i < 4; i++)
-				brelse(map->s_type_specific.s_sparing.
-						s_spar_map[i]);
-	}
+	if (sbi->s_partitions)
+		for (i = 0; i < sbi->s_partitions; i++)
+			udf_free_partition(&sbi->s_partmaps[i]);
 #ifdef CONFIG_UDF_NLS
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
 		unload_nls(sbi->s_nls_map);
@@ -1912,21 +1922,9 @@ static void udf_put_super(struct super_block *sb)
 	sbi = UDF_SB(sb);
 	if (sbi->s_vat_inode)
 		iput(sbi->s_vat_inode);
-	if (sbi->s_partitions) {
-		struct udf_part_map *map = &sbi->s_partmaps[sbi->s_partition];
-		if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
-			iput(map->s_uspace.s_table);
-		if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-			iput(map->s_fspace.s_table);
-		if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
-			udf_sb_free_bitmap(map->s_uspace.s_bitmap);
-		if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-			udf_sb_free_bitmap(map->s_fspace.s_bitmap);
-		if (map->s_partition_type == UDF_SPARABLE_MAP15)
-			for (i = 0; i < 4; i++)
-				brelse(map->s_type_specific.s_sparing.
-						s_spar_map[i]);
-	}
+	if (sbi->s_partitions)
+		for (i = 0; i < sbi->s_partitions; i++)
+			udf_free_partition(&sbi->s_partmaps[i]);
 #ifdef CONFIG_UDF_NLS
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
 		unload_nls(sbi->s_nls_map);
-- 
cgit v1.2.3


From 3fb38dfa0e28575170119c70cb2ab70fdb8478fb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 2 Apr 2008 12:26:36 +0200
Subject: udf: Move filling of partition descriptor info into a separate
 function

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 118 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 62 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index d50e3f5c46e8..c5fef85a9c15 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1024,41 +1024,14 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
 	return bitmap;
 }
 
-static int udf_load_partdesc(struct super_block *sb, sector_t block)
+static int udf_fill_partdesc_info(struct super_block *sb,
+		struct partitionDesc *p, int p_index)
 {
-	struct buffer_head *bh;
-	struct partitionHeaderDesc *phd;
-	struct partitionDesc *p;
 	struct udf_part_map *map;
 	struct udf_sb_info *sbi = UDF_SB(sb);
-	bool found = false;
-	int i;
-	uint16_t partitionNumber;
-	uint16_t ident;
-	int ret = 0;
-
-	bh = udf_read_tagged(sb, block, block, &ident);
-	if (!bh)
-		return 1;
-	if (ident != TAG_IDENT_PD)
-		goto out_bh;
-
-	p = (struct partitionDesc *)bh->b_data;
-	partitionNumber = le16_to_cpu(p->partitionNumber);
-	for (i = 0; i < sbi->s_partitions; i++) {
-		map = &sbi->s_partmaps[i];
-		udf_debug("Searching map: (%d == %d)\n",
-			  map->s_partition_num, partitionNumber);
-		found = map->s_partition_num == partitionNumber;
-		if (found)
-			break;
-	}
+	struct partitionHeaderDesc *phd;
 
-	if (!found) {
-		udf_debug("Partition (%d) not found in partition map\n",
-			  partitionNumber);
-		goto out_bh;
-	}
+	map = &sbi->s_partmaps[p_index];
 
 	map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */
 	map->s_partition_root = le32_to_cpu(p->partitionStartingLocation);
@@ -1073,88 +1046,121 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 		map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
 
 	udf_debug("Partition (%d:%d type %x) starts at physical %d, "
-		  "block length %d\n", partitionNumber, i,
+		  "block length %d\n", partitionNumber, p_index,
 		  map->s_partition_type, map->s_partition_root,
 		  map->s_partition_len);
 
 	if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
 	    strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
-		goto out_bh;
+		return 0;
 
 	phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
 	if (phd->unallocSpaceTable.extLength) {
 		kernel_lb_addr loc = {
 			.logicalBlockNum = le32_to_cpu(
 				phd->unallocSpaceTable.extPosition),
-			.partitionReferenceNum = i,
+			.partitionReferenceNum = p_index,
 		};
 
 		map->s_uspace.s_table = udf_iget(sb, loc);
 		if (!map->s_uspace.s_table) {
 			udf_debug("cannot load unallocSpaceTable (part %d)\n",
-					i);
-			ret = 1;
-			goto out_bh;
+					p_index);
+			return 1;
 		}
 		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
 		udf_debug("unallocSpaceTable (part %d) @ %ld\n",
-				i, map->s_uspace.s_table->i_ino);
+				p_index, map->s_uspace.s_table->i_ino);
 	}
 
 	if (phd->unallocSpaceBitmap.extLength) {
-		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, i);
-		if (!bitmap) {
-			ret = 1;
-			goto out_bh;
-		}
+		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
+		if (!bitmap)
+			return 1;
 		map->s_uspace.s_bitmap = bitmap;
 		bitmap->s_extLength = le32_to_cpu(
 				phd->unallocSpaceBitmap.extLength);
 		bitmap->s_extPosition = le32_to_cpu(
 				phd->unallocSpaceBitmap.extPosition);
 		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
-		udf_debug("unallocSpaceBitmap (part %d) @ %d\n", i,
+		udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index,
 						bitmap->s_extPosition);
 	}
 
 	if (phd->partitionIntegrityTable.extLength)
-		udf_debug("partitionIntegrityTable (part %d)\n", i);
+		udf_debug("partitionIntegrityTable (part %d)\n", p_index);
 
 	if (phd->freedSpaceTable.extLength) {
 		kernel_lb_addr loc = {
 			.logicalBlockNum = le32_to_cpu(
 				phd->freedSpaceTable.extPosition),
-			.partitionReferenceNum = i,
+			.partitionReferenceNum = p_index,
 		};
 
 		map->s_fspace.s_table = udf_iget(sb, loc);
 		if (!map->s_fspace.s_table) {
-			udf_debug("cannot load freedSpaceTable (part %d)\n", i);
-			ret = 1;
-			goto out_bh;
+			udf_debug("cannot load freedSpaceTable (part %d)\n",
+				p_index);
+			return 1;
 		}
 
 		map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
 		udf_debug("freedSpaceTable (part %d) @ %ld\n",
-				i, map->s_fspace.s_table->i_ino);
+				p_index, map->s_fspace.s_table->i_ino);
 	}
 
 	if (phd->freedSpaceBitmap.extLength) {
-		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, i);
-		if (!bitmap) {
-			ret = 1;
-			goto out_bh;
-		}
+		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
+		if (!bitmap)
+			return 1;
 		map->s_fspace.s_bitmap = bitmap;
 		bitmap->s_extLength = le32_to_cpu(
 				phd->freedSpaceBitmap.extLength);
 		bitmap->s_extPosition = le32_to_cpu(
 				phd->freedSpaceBitmap.extPosition);
 		map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
-		udf_debug("freedSpaceBitmap (part %d) @ %d\n", i,
+		udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index,
 					bitmap->s_extPosition);
 	}
+	return 0;
+}
+
+static int udf_load_partdesc(struct super_block *sb, sector_t block)
+{
+	struct buffer_head *bh;
+	struct partitionDesc *p;
+	struct udf_part_map *map;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	bool found = false;
+	int i;
+	uint16_t partitionNumber;
+	uint16_t ident;
+	int ret = 0;
+
+	bh = udf_read_tagged(sb, block, block, &ident);
+	if (!bh)
+		return 1;
+	if (ident != TAG_IDENT_PD)
+		goto out_bh;
+
+	p = (struct partitionDesc *)bh->b_data;
+	partitionNumber = le16_to_cpu(p->partitionNumber);
+	for (i = 0; i < sbi->s_partitions; i++) {
+		map = &sbi->s_partmaps[i];
+		udf_debug("Searching map: (%d == %d)\n",
+			  map->s_partition_num, partitionNumber);
+		found = map->s_partition_num == partitionNumber;
+		if (found)
+			break;
+	}
+
+	if (!found) {
+		udf_debug("Partition (%d) not found in partition map\n",
+			  partitionNumber);
+		goto out_bh;
+	}
 
+	ret = udf_fill_partdesc_info(sb, p, i);
 out_bh:
 	/* In case loading failed, we handle cleanup in udf_fill_super */
 	brelse(bh);
-- 
cgit v1.2.3


From 38b74a53e5625b7bbbd08918294b86f1db2f0565 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 2 Apr 2008 16:01:35 +0200
Subject: udf: Move processing of virtual partitions

This patch move processing of UDF virtual partitions close to the place
where other partition types are processed. As a result we now also
properly fill in partition access type.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 165 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 85 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index c5fef85a9c15..7ec828566df2 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -84,7 +84,6 @@ static void udf_write_super(struct super_block *);
 static int udf_remount_fs(struct super_block *, int *, char *);
 static int udf_check_valid(struct super_block *, int, int);
 static int udf_vrs(struct super_block *sb, int silent);
-static int udf_load_partition(struct super_block *, kernel_lb_addr *);
 static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad);
 static void udf_find_anchor(struct super_block *);
 static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
@@ -1125,14 +1124,54 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 	return 0;
 }
 
+static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map = &sbi->s_partmaps[p_index];
+	kernel_lb_addr ino;
+	struct buffer_head *bh;
+
+	/* VAT file entry is in the last recorded block */
+	ino.partitionReferenceNum = type1_index;
+	ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
+	sbi->s_vat_inode = udf_iget(sb, ino);
+	if (!sbi->s_vat_inode)
+		return 1;
+
+	if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
+		map->s_type_specific.s_virtual.s_start_offset =
+			udf_ext0_offset(sbi->s_vat_inode);
+		map->s_type_specific.s_virtual.s_num_entries =
+			(sbi->s_vat_inode->i_size - 36) >> 2;
+	} else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
+		uint32_t pos;
+		struct virtualAllocationTable20 *vat20;
+
+		pos = udf_block_map(sbi->s_vat_inode, 0);
+		bh = sb_bread(sb, pos);
+		if (!bh)
+			return 1;
+		vat20 = (struct virtualAllocationTable20 *)bh->b_data +
+				udf_ext0_offset(sbi->s_vat_inode);
+		map->s_type_specific.s_virtual.s_start_offset =
+			le16_to_cpu(vat20->lengthHeader) +
+			udf_ext0_offset(sbi->s_vat_inode);
+		map->s_type_specific.s_virtual.s_num_entries =
+			(sbi->s_vat_inode->i_size -
+				map->s_type_specific.s_virtual.
+					s_start_offset) >> 2;
+		brelse(bh);
+	}
+	return 0;
+}
+
 static int udf_load_partdesc(struct super_block *sb, sector_t block)
 {
 	struct buffer_head *bh;
 	struct partitionDesc *p;
 	struct udf_part_map *map;
 	struct udf_sb_info *sbi = UDF_SB(sb);
-	bool found = false;
-	int i;
+	int i, type1_idx;
 	uint16_t partitionNumber;
 	uint16_t ident;
 	int ret = 0;
@@ -1145,22 +1184,59 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 
 	p = (struct partitionDesc *)bh->b_data;
 	partitionNumber = le16_to_cpu(p->partitionNumber);
+
+	/* First scan for TYPE1 and SPARABLE partitions */
 	for (i = 0; i < sbi->s_partitions; i++) {
 		map = &sbi->s_partmaps[i];
 		udf_debug("Searching map: (%d == %d)\n",
 			  map->s_partition_num, partitionNumber);
-		found = map->s_partition_num == partitionNumber;
-		if (found)
+		if (map->s_partition_num == partitionNumber &&
+		    (map->s_partition_type == UDF_TYPE1_MAP15 ||
+		     map->s_partition_type == UDF_SPARABLE_MAP15))
 			break;
 	}
 
-	if (!found) {
+	if (i >= sbi->s_partitions) {
 		udf_debug("Partition (%d) not found in partition map\n",
 			  partitionNumber);
 		goto out_bh;
 	}
 
 	ret = udf_fill_partdesc_info(sb, p, i);
+
+	/*
+	 * Now rescan for VIRTUAL partitions when TYPE1 partitions are
+	 * already set up
+	 */
+	type1_idx = i;
+	for (i = 0; i < sbi->s_partitions; i++) {
+		map = &sbi->s_partmaps[i];
+
+		if (map->s_partition_num == partitionNumber &&
+		    (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
+		     map->s_partition_type == UDF_VIRTUAL_MAP20))
+			break;
+	}
+
+	if (i >= sbi->s_partitions)
+		goto out_bh;
+
+	ret = udf_fill_partdesc_info(sb, p, i);
+	if (ret)
+		goto out_bh;
+
+	if (!sbi->s_last_block) {
+		sbi->s_last_block = udf_get_last_block(sb);
+		udf_find_anchor(sb);
+		if (!sbi->s_last_block) {
+			udf_debug("Unable to determine Lastblock (For "
+					"Virtual Partition)\n");
+			ret = 1;
+			goto out_bh;
+		}
+	}
+
+	ret = udf_load_vat(sb, i, type1_idx);
 out_bh:
 	/* In case loading failed, we handle cleanup in udf_fill_super */
 	brelse(bh);
@@ -1480,13 +1556,13 @@ static int udf_check_valid(struct super_block *sb, int novrs, int silent)
 	return !block;
 }
 
-static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
+static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset)
 {
 	struct anchorVolDescPtr *anchor;
 	uint16_t ident;
 	struct buffer_head *bh;
 	long main_s, main_e, reserve_s, reserve_e;
-	int i, j;
+	int i;
 	struct udf_sb_info *sbi;
 
 	if (!sb)
@@ -1535,77 +1611,6 @@ static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
 	}
 	udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
 
-	for (i = 0; i < sbi->s_partitions; i++) {
-		kernel_lb_addr uninitialized_var(ino);
-		struct udf_part_map *map = &sbi->s_partmaps[i];
-
-		if (map->s_partition_type != UDF_VIRTUAL_MAP15 &&
-			map->s_partition_type != UDF_VIRTUAL_MAP20)
-			continue;
-
-		if (!sbi->s_last_block) {
-			sbi->s_last_block = udf_get_last_block(sb);
-			udf_find_anchor(sb);
-		}
-
-		if (!sbi->s_last_block) {
-			udf_debug("Unable to determine Lastblock (For "
-					"Virtual Partition)\n");
-			return 1;
-		}
-
-		for (j = 0; j < sbi->s_partitions; j++) {
-			struct udf_part_map *map2 = &sbi->s_partmaps[j];
-			if (j != i &&
-				map->s_volumeseqnum ==
-					map2->s_volumeseqnum &&
-				map->s_partition_num ==
-					map2->s_partition_num) {
-				ino.partitionReferenceNum = j;
-				ino.logicalBlockNum =
-					sbi->s_last_block -
-						map2->s_partition_root;
-				break;
-			}
-		}
-
-		if (j == sbi->s_partitions)
-			return 1;
-
-		sbi->s_vat_inode = udf_iget(sb, ino);
-		if (!sbi->s_vat_inode)
-			return 1;
-
-		if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
-			map->s_type_specific.s_virtual.s_start_offset =
-				udf_ext0_offset(sbi->s_vat_inode);
-			map->s_type_specific.s_virtual.s_num_entries =
-				(sbi->s_vat_inode->i_size - 36) >> 2;
-		} else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
-			uint32_t pos;
-			struct virtualAllocationTable20 *vat20;
-
-			pos = udf_block_map(sbi->s_vat_inode, 0);
-			bh = sb_bread(sb, pos);
-			if (!bh)
-				return 1;
-			vat20 = (struct virtualAllocationTable20 *)
-				bh->b_data +
-				udf_ext0_offset(sbi->s_vat_inode);
-			map->s_type_specific.s_virtual.s_start_offset =
-				le16_to_cpu(vat20->lengthHeader) +
-				udf_ext0_offset(sbi->s_vat_inode);
-			map->s_type_specific.s_virtual.s_num_entries =
-				(sbi->s_vat_inode->i_size -
-					map->s_type_specific.s_virtual.
-						s_start_offset) >> 2;
-			brelse(bh);
-		}
-		map->s_partition_root = udf_get_pblock(sb, 0, i, 0);
-		map->s_partition_len =
-			sbi->s_partmaps[ino.partitionReferenceNum].
-							s_partition_len;
-	}
 	return 0;
 }
 
@@ -1790,7 +1795,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sb->s_magic = UDF_SUPER_MAGIC;
 	sb->s_time_gran = 1000;
 
-	if (udf_load_partition(sb, &fileset)) {
+	if (udf_load_sequence(sb, &fileset)) {
 		printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
 		goto error_out;
 	}
-- 
cgit v1.2.3


From 423cf6dc04eb79d441bfda2b127bc4b57134b41d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 7 Apr 2008 15:59:23 +0200
Subject: udf: Cleanup anchor block detection.

UDF anchor block detection is complicated by several things - there are several
places where the anchor point can be, some of them relative to the last
recorded block which some devices report wrongly. Moreover some devices on some
media seem to have 7 spare blocks sectors for every 32 blocks (at least as far
as I understand the old code) so we have to count also with that possibility.

This patch splits anchor block detection into several functions so that it is
clearer what we actually try to do. We fix several bugs of the type "for such
and such media, we fail to check block blah" as a result of the cleanup.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c  | 239 +++++++++++++++++++++++---------------------------------
 fs/udf/udf_sb.h |   2 +-
 2 files changed, 99 insertions(+), 142 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 7ec828566df2..fe0dafebde71 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -678,149 +678,120 @@ static int udf_vrs(struct super_block *sb, int silent)
 }
 
 /*
- * udf_find_anchor
- *
- * PURPOSE
- *	Find an anchor volume descriptor.
- *
- * PRE-CONDITIONS
- *	sb			Pointer to _locked_ superblock.
- *	lastblock		Last block on media.
- *
- * POST-CONDITIONS
- *	<return>		1 if not found, 0 if ok
- *
- * HISTORY
- *	July 1, 1997 - Andrew E. Mileski
- *	Written, tested, and released.
+ * Check whether there is an anchor block in the given block
  */
-static void udf_find_anchor(struct super_block *sb)
+static int udf_check_anchor_block(struct super_block *sb, sector_t block,
+					bool varconv)
 {
-	int lastblock;
 	struct buffer_head *bh = NULL;
+	tag *t;
 	uint16_t ident;
 	uint32_t location;
+
+	if (varconv)
+		bh = sb_bread(sb, udf_fixed_to_variable(block));
+	else
+		bh = sb_bread(sb, block);
+
+	if (!bh)
+		return 0;
+
+	t = (tag *)bh->b_data;
+	ident = le16_to_cpu(t->tagIdent);
+	location = le32_to_cpu(t->tagLocation);
+	brelse(bh);
+	if (ident != TAG_IDENT_AVDP)
+		return 0;
+	return location == block;
+}
+
+/* Search for an anchor volume descriptor pointer */
+static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
+					sector_t lastblock)
+{
+	sector_t last[4];
 	int i;
-	struct udf_sb_info *sbi;
+	struct udf_sb_info *sbi = UDF_SB(sb);
 
-	sbi = UDF_SB(sb);
-	lastblock = sbi->s_last_block;
+	last[0] = lastblock;
+	last[1] = last[0] - 2;
+	last[2] = last[0] - 150;
+	last[3] = last[0] - 152;
 
-	if (lastblock) {
-		int varlastblock = udf_variable_to_fixed(lastblock);
-		int last[] =  { lastblock, lastblock - 2,
-				lastblock - 150, lastblock - 152,
-				varlastblock, varlastblock - 2,
-				varlastblock - 150, varlastblock - 152 };
-
-		lastblock = 0;
-
-		/* Search for an anchor volume descriptor pointer */
-
-		/*  according to spec, anchor is in either:
-		 *     block 256
-		 *     lastblock-256
-		 *     lastblock
-		 *  however, if the disc isn't closed, it could be 512 */
-
-		for (i = 0; !lastblock && i < ARRAY_SIZE(last); i++) {
-			ident = location = 0;
-			if (last[i] >= 0) {
-				bh = sb_bread(sb, last[i]);
-				if (bh) {
-					tag *t = (tag *)bh->b_data;
-					ident = le16_to_cpu(t->tagIdent);
-					location = le32_to_cpu(t->tagLocation);
-					brelse(bh);
-				}
-			}
+	/*  according to spec, anchor is in either:
+	 *     block 256
+	 *     lastblock-256
+	 *     lastblock
+	 *  however, if the disc isn't closed, it could be 512 */
 
-			if (ident == TAG_IDENT_AVDP) {
-				if (location == last[i] - sbi->s_session) {
-					lastblock = last[i] - sbi->s_session;
-					sbi->s_anchor[0] = lastblock;
-					sbi->s_anchor[1] = lastblock - 256;
-				} else if (location ==
-						udf_variable_to_fixed(last[i]) -
-							sbi->s_session) {
-					UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
-					lastblock =
-						udf_variable_to_fixed(last[i]) -
-							sbi->s_session;
-					sbi->s_anchor[0] = lastblock;
-					sbi->s_anchor[1] = lastblock - 256 -
-								sbi->s_session;
-				} else {
-					udf_debug("Anchor found at block %d, "
-						  "location mismatch %d.\n",
-						  last[i], location);
-				}
-			} else if (ident == TAG_IDENT_FE ||
-					ident == TAG_IDENT_EFE) {
-				lastblock = last[i];
-				sbi->s_anchor[3] = 512;
-			} else {
-				ident = location = 0;
-				if (last[i] >= 256) {
-					bh = sb_bread(sb, last[i] - 256);
-					if (bh) {
-						tag *t = (tag *)bh->b_data;
-						ident = le16_to_cpu(
-								t->tagIdent);
-						location = le32_to_cpu(
-								t->tagLocation);
-						brelse(bh);
-					}
-				}
+	for (i = 0; i < ARRAY_SIZE(last); i++) {
+		if (last[i] < 0)
+			continue;
 
-				if (ident == TAG_IDENT_AVDP &&
-				    location == last[i] - 256 -
-						sbi->s_session) {
-					lastblock = last[i];
-					sbi->s_anchor[1] = last[i] - 256;
-				} else {
-					ident = location = 0;
-					if (last[i] >= 312 + sbi->s_session) {
-						bh = sb_bread(sb,
-								last[i] - 312 -
-								sbi->s_session);
-						if (bh) {
-							tag *t = (tag *)
-								 bh->b_data;
-							ident = le16_to_cpu(
-								t->tagIdent);
-							location = le32_to_cpu(
-								t->tagLocation);
-							brelse(bh);
-						}
-					}
+		if (udf_check_anchor_block(sb, last[i], varconv)) {
+			sbi->s_anchor[0] = last[i];
+			sbi->s_anchor[1] = last[i] - 256;
+			return last[i];
+		}
 
-					if (ident == TAG_IDENT_AVDP &&
-					    location == udf_variable_to_fixed(last[i]) - 256) {
-						UDF_SET_FLAG(sb,
-							     UDF_FLAG_VARCONV);
-						lastblock = udf_variable_to_fixed(last[i]);
-						sbi->s_anchor[1] = lastblock - 256;
-					}
-				}
-			}
+		if (last[i] < 256)
+			continue;
+
+		if (udf_check_anchor_block(sb, last[i] - 256, varconv)) {
+			sbi->s_anchor[1] = last[i] - 256;
+			return last[i];
 		}
 	}
 
-	if (!lastblock) {
-		/* We haven't found the lastblock. check 312 */
-		bh = sb_bread(sb, 312 + sbi->s_session);
-		if (bh) {
-			tag *t = (tag *)bh->b_data;
-			ident = le16_to_cpu(t->tagIdent);
-			location = le32_to_cpu(t->tagLocation);
-			brelse(bh);
+	if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) {
+		sbi->s_anchor[0] = sbi->s_session + 256;
+		return last[0];
+	}
+	if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) {
+		sbi->s_anchor[0] = sbi->s_session + 512;
+		return last[0];
+	}
+	return 0;
+}
 
-			if (ident == TAG_IDENT_AVDP && location == 256)
-				UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
-		}
+/*
+ * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
+ * be the last block on the media.
+ *
+ * Return 1 if not found, 0 if ok
+ *
+ */
+static void udf_find_anchor(struct super_block *sb)
+{
+	sector_t lastblock;
+	struct buffer_head *bh = NULL;
+	uint16_t ident;
+	int i;
+	struct udf_sb_info *sbi = UDF_SB(sb);
+
+	lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block);
+	if (lastblock)
+		goto check_anchor;
+
+	/* No anchor found? Try VARCONV conversion of block numbers */
+	/* Firstly, we try to not convert number of the last block */
+	lastblock = udf_scan_anchors(sb, 1,
+				udf_variable_to_fixed(sbi->s_last_block));
+	if (lastblock) {
+		UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+		goto check_anchor;
 	}
 
+	/* Secondly, we try with converted number of the last block */
+	lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block);
+	if (lastblock)
+		UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+
+check_anchor:
+	/*
+	 * Check located anchors and the anchor block supplied via
+	 * mount options
+	 */
 	for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
 		if (!sbi->s_anchor[i])
 			continue;
@@ -830,9 +801,7 @@ static void udf_find_anchor(struct super_block *sb)
 			sbi->s_anchor[i] = 0;
 		else {
 			brelse(bh);
-			if ((ident != TAG_IDENT_AVDP) &&
-				(i || (ident != TAG_IDENT_FE &&
-					ident != TAG_IDENT_EFE)))
+			if (ident != TAG_IDENT_AVDP)
 				sbi->s_anchor[i] = 0;
 		}
 	}
@@ -1225,17 +1194,6 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 	if (ret)
 		goto out_bh;
 
-	if (!sbi->s_last_block) {
-		sbi->s_last_block = udf_get_last_block(sb);
-		udf_find_anchor(sb);
-		if (!sbi->s_last_block) {
-			udf_debug("Unable to determine Lastblock (For "
-					"Virtual Partition)\n");
-			ret = 1;
-			goto out_bh;
-		}
-	}
-
 	ret = udf_load_vat(sb, i, type1_idx);
 out_bh:
 	/* In case loading failed, we handle cleanup in udf_fill_super */
@@ -1778,7 +1736,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sbi->s_last_block = uopt.lastblock;
 	sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
 	sbi->s_anchor[2] = uopt.anchor;
-	sbi->s_anchor[3] = 256;
 
 	if (udf_check_valid(sb, uopt.novrs, silent)) {
 		/* read volume recognition sequences */
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index fb2a6e9f8dac..8308a12e1232 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -100,7 +100,7 @@ struct udf_sb_info {
 
 	/* Sector headers */
 	__s32			s_session;
-	__u32			s_anchor[4];
+	__u32			s_anchor[3];
 	__u32			s_last_block;
 
 	struct buffer_head	*s_lvid_bh;
-- 
cgit v1.2.3


From 5fb28aa25ab0b71af2e441d68e63ad257e610a04 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 7 Apr 2008 16:15:04 +0200
Subject: udf: Improve anchor block detection

Add <last block>+1 and <last block>-1 to a list of blocks which can be the
real last recorded block on a UDF media. Sebastian Manciulea
<manciuleas@yahoo.com> claims this helps some drive + media combinations
he is able to test.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index fe0dafebde71..b9e719de0704 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -709,14 +709,16 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
 static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
 					sector_t lastblock)
 {
-	sector_t last[4];
+	sector_t last[6];
 	int i;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 
 	last[0] = lastblock;
-	last[1] = last[0] - 2;
-	last[2] = last[0] - 150;
-	last[3] = last[0] - 152;
+	last[1] = last[0] - 1;
+	last[2] = last[0] + 1;
+	last[3] = last[0] - 2;
+	last[4] = last[0] - 150;
+	last[5] = last[0] - 152;
 
 	/*  according to spec, anchor is in either:
 	 *     block 256
@@ -727,6 +729,9 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
 	for (i = 0; i < ARRAY_SIZE(last); i++) {
 		if (last[i] < 0)
 			continue;
+		if (last[i] >= sb->s_bdev->bd_inode->i_size >>
+				sb->s_blocksize_bits)
+			continue;
 
 		if (udf_check_anchor_block(sb, last[i], varconv)) {
 			sbi->s_anchor[0] = last[i];
-- 
cgit v1.2.3


From 4f7874c868eaedd0e64b2f6c800bc852bdc7f38b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 7 Apr 2008 23:16:38 +0200
Subject: udf: Silence warning about accesses beyond end of device

Some of the computed positions of anchor block could be beyond the end of
device. Skip reading such blocks.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index b9e719de0704..28ed3f5ebd70 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -688,8 +688,12 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
 	uint16_t ident;
 	uint32_t location;
 
-	if (varconv)
+	if (varconv) {
+		if (udf_fixed_to_variable(block) >=
+		    sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+			return 0;
 		bh = sb_bread(sb, udf_fixed_to_variable(block));
+	}
 	else
 		bh = sb_bread(sb, block);
 
-- 
cgit v1.2.3


From c82a127505d39fa81c886eceed6fdf8c1ff4a06b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 8 Apr 2008 01:16:32 +0200
Subject: udf: Fix detection of VAT version

We incorrectly (way to strictly) checked version of VAT on loading and thus
refuse to mount correct media.  There are just two format versions - below 2.0
and above 2.0 and we understand both. So update the version check accordingly.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 28ed3f5ebd70..14f965e8a738 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1256,12 +1256,12 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 				u16 suf =
 					le16_to_cpu(((__le16 *)upm2->partIdent.
 							identSuffix)[0]);
-				if (suf == 0x0150) {
+				if (suf < 0x0200) {
 					map->s_partition_type =
 							UDF_VIRTUAL_MAP15;
 					map->s_partition_func =
 							udf_get_pblock_virt15;
-				} else if (suf == 0x0200) {
+				} else {
 					map->s_partition_type =
 							UDF_VIRTUAL_MAP20;
 					map->s_partition_func =
-- 
cgit v1.2.3


From 742e1795e2d9dc99657742e5bbbb7907596bf6c3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 8 Apr 2008 01:17:52 +0200
Subject: udf: Allow loading of VAT inode

UDF media with VAT could have never worked because udf_fill_inode() didn't
know about case FILE_TYPE_VAT20. Fix this.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2362bf0c6900..c150b6df6261 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1275,6 +1275,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	case ICBTAG_FILE_TYPE_REALTIME:
 	case ICBTAG_FILE_TYPE_REGULAR:
 	case ICBTAG_FILE_TYPE_UNDEF:
+	case ICBTAG_FILE_TYPE_VAT20:
 		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 			inode->i_data.a_ops = &udf_adinicb_aops;
 		else
-- 
cgit v1.2.3


From fa5e08156335d0687c85b4e724db9448fb166601 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 8 Apr 2008 02:08:53 +0200
Subject: udf: Handle VAT packed inside inode properly

We didn't handle VAT packed inside the inode - we tried to call udf_block_map()
on such file which lead to strange results at best. Add proper handling of
packed VAT as we do it with other packed files.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/partition.c | 10 +++++++---
 fs/udf/super.c     | 24 +++++++++++++++---------
 2 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 307c9c33d184..b2e6e1eddb90 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -54,11 +54,10 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct udf_part_map *map;
 	struct udf_virtual_data *vdata;
-	struct udf_inode_info *iinfo;
+	struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode);
 
 	map = &sbi->s_partmaps[partition];
 	vdata = &map->s_type_specific.s_virtual;
-	index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t);
 
 	if (block > vdata->s_num_entries) {
 		udf_debug("Trying to access block beyond end of VAT "
@@ -66,6 +65,11 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
 		return 0xFFFFFFFF;
 	}
 
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+		loc = le32_to_cpu(((__le32 *)iinfo->i_ext.i_data)[block]);
+		goto translate;
+	}
+	index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t);
 	if (block >= index) {
 		block -= index;
 		newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t)));
@@ -88,7 +92,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
 
 	brelse(bh);
 
-	iinfo = UDF_I(sbi->s_vat_inode);
+translate:
 	if (iinfo->i_location.partitionReferenceNum == partition) {
 		udf_debug("recursive call to udf_get_pblock!\n");
 		return 0xFFFFFFFF;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 14f965e8a738..abdb9b31da46 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1107,7 +1107,10 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct udf_part_map *map = &sbi->s_partmaps[p_index];
 	kernel_lb_addr ino;
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
+	struct udf_inode_info *vati;
+	uint32_t pos;
+	struct virtualAllocationTable20 *vat20;
 
 	/* VAT file entry is in the last recorded block */
 	ino.partitionReferenceNum = type1_index;
@@ -1122,15 +1125,18 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 		map->s_type_specific.s_virtual.s_num_entries =
 			(sbi->s_vat_inode->i_size - 36) >> 2;
 	} else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
-		uint32_t pos;
-		struct virtualAllocationTable20 *vat20;
+		vati = UDF_I(sbi->s_vat_inode);
+		if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+			pos = udf_block_map(sbi->s_vat_inode, 0);
+			bh = sb_bread(sb, pos);
+			if (!bh)
+				return 1;
+			vat20 = (struct virtualAllocationTable20 *)bh->b_data;
+		} else {
+			vat20 = (struct virtualAllocationTable20 *)
+							vati->i_ext.i_data;
+		}
 
-		pos = udf_block_map(sbi->s_vat_inode, 0);
-		bh = sb_bread(sb, pos);
-		if (!bh)
-			return 1;
-		vat20 = (struct virtualAllocationTable20 *)bh->b_data +
-				udf_ext0_offset(sbi->s_vat_inode);
 		map->s_type_specific.s_virtual.s_start_offset =
 			le16_to_cpu(vat20->lengthHeader) +
 			udf_ext0_offset(sbi->s_vat_inode);
-- 
cgit v1.2.3


From 96200be3077c5ede16a90b33aca815b444e66043 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 8 Apr 2008 13:29:20 +0200
Subject: udf: Mount filesystem read-only if it has pseudooverwrite partition

As we don't properly support writing to pseudooverwrite partition (we should
add entries to VAT and relocate blocks instead of just writing them), mount
filesystems with such partition as read-only.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index abdb9b31da46..650f20fe9d62 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1208,6 +1208,14 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 	ret = udf_fill_partdesc_info(sb, p, i);
 	if (ret)
 		goto out_bh;
+	/*
+	 * Mark filesystem read-only if we have a partition with virtual map
+	 * since we don't handle writing to it (we overwrite blocks instead of
+	 * relocating them).
+	 */
+	sb->s_flags |= MS_RDONLY;
+	printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only because "
+		"writing to pseudooverwrite partition is not implemented.\n");
 
 	ret = udf_load_vat(sb, i, type1_idx);
 out_bh:
-- 
cgit v1.2.3


From f4bcbbd92ebda971f7c2cd1132b399808ed6cf9b Mon Sep 17 00:00:00 2001
From: Sebastian Manciulea <manciuleas@yahoo.com>
Date: Tue, 8 Apr 2008 14:02:11 +0200
Subject: udf: Fix handling of multisession media

According to OSTA UDF specification, only anchor blocks and primary volume
descriptors are placed on media relative to the last session. All other block
numbers are absolute (in the partition or the whole media). This seems to be
confirmed by multisession media created by other systems.

Signed-off-by: Sebastian Manciulea <manciuleas@yahoo.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/misc.c  | 13 +++++--------
 fs/udf/super.c |  2 +-
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 581b6e4cc591..96996204d928 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -203,16 +203,15 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 {
 	tag *tag_p;
 	struct buffer_head *bh = NULL;
-	struct udf_sb_info *sbi = UDF_SB(sb);
 
 	/* Read the block */
 	if (block == 0xFFFFFFFF)
 		return NULL;
 
-	bh = udf_tread(sb, block + sbi->s_session);
+	bh = udf_tread(sb, block);
 	if (!bh) {
 		udf_debug("block=%d, location=%d: read failed\n",
-			  block + sbi->s_session, location);
+			  block, location);
 		return NULL;
 	}
 
@@ -222,8 +221,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 
 	if (location != le32_to_cpu(tag_p->tagLocation)) {
 		udf_debug("location mismatch block %u, tag %u != %u\n",
-			  block + sbi->s_session,
-			  le32_to_cpu(tag_p->tagLocation), location);
+			  block, le32_to_cpu(tag_p->tagLocation), location);
 		goto error_out;
 	}
 
@@ -247,9 +245,8 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 					le16_to_cpu(tag_p->descCRCLength), 0))
 		return bh;
 
-	udf_debug("Crc failure block %d: crc = %d, crclen = %d\n",
-		  block + sbi->s_session, le16_to_cpu(tag_p->descCRC),
-		  le16_to_cpu(tag_p->descCRCLength));
+	udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block,
+	    le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength));
 
 error_out:
 	brelse(bh);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 650f20fe9d62..787cedf6cc07 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -587,7 +587,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 static int udf_vrs(struct super_block *sb, int silent)
 {
 	struct volStructDesc *vsd = NULL;
-	int sector = 32768;
+	loff_t sector = 32768;
 	int sectorsize;
 	struct buffer_head *bh = NULL;
 	int iso9660 = 0;
-- 
cgit v1.2.3


From bfb257a5981af805a9394f00f75d3d9f7b611cc0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 8 Apr 2008 20:37:21 +0200
Subject: udf: Add read-only support for 2.50 UDF media

This patch implements parsing of metadata partitions and reading of Metadata
File thus allowing to read UDF 2.50 media. Error resilience is implemented
through accessing the Metadata Mirror File in case the data the Metadata File
cannot be read. The patch is based on the original patch by Sebastian Manciulea
<manciuleas@yahoo.com> and Mircea Fedoreanu <mirceaf_spl@yahoo.com>.

Signed-off-by: Sebastian Manciulea <manciuleas@yahoo.com>
Signed-off-by: Mircea Fedoreanu <mirceaf_spl@yahoo.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c     |   9 +++
 fs/udf/partition.c |  55 ++++++++++++++++
 fs/udf/super.c     | 190 +++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/udf/udf_sb.h    |  16 ++++-
 fs/udf/udfdecl.h   |   2 +
 5 files changed, 258 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c150b6df6261..6e151f170c08 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1301,6 +1301,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		inode->i_op = &page_symlink_inode_operations;
 		inode->i_mode = S_IFLNK | S_IRWXUGO;
 		break;
+	case ICBTAG_FILE_TYPE_MAIN:
+		udf_debug("METADATA FILE-----\n");
+		break;
+	case ICBTAG_FILE_TYPE_MIRROR:
+		udf_debug("METADATA MIRROR FILE-----\n");
+		break;
+	case ICBTAG_FILE_TYPE_BITMAP:
+		udf_debug("METADATA BITMAP FILE-----\n");
+		break;
 	default:
 		printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown "
 				"file type=%d\n", inode->i_ino,
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index b2e6e1eddb90..2dfe4be2eeb2 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -266,3 +266,58 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
 
 	return 0;
 }
+
+static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
+					uint16_t partition, uint32_t offset)
+{
+	struct super_block *sb = inode->i_sb;
+	struct udf_part_map *map;
+	kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t ext_offset;
+	struct extent_position epos = {};
+	uint32_t phyblock;
+
+	if (inode_bmap(inode, block, &epos, &eloc, &elen, &ext_offset) !=
+						(EXT_RECORDED_ALLOCATED >> 30))
+		phyblock = 0xFFFFFFFF;
+	else {
+		map = &UDF_SB(sb)->s_partmaps[partition];
+		/* map to sparable/physical partition desc */
+		phyblock = udf_get_pblock(sb, eloc.logicalBlockNum,
+			map->s_partition_num, ext_offset + offset);
+	}
+
+	brelse(epos.bh);
+	return phyblock;
+}
+
+uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
+				uint16_t partition, uint32_t offset)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map;
+	struct udf_meta_data *mdata;
+	uint32_t retblk;
+	struct inode *inode;
+
+	udf_debug("READING from METADATA\n");
+
+	map = &sbi->s_partmaps[partition];
+	mdata = &map->s_type_specific.s_metadata;
+	inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe;
+
+	/* We shouldn't mount such media... */
+	BUG_ON(!inode);
+	retblk = udf_try_read_meta(inode, block, partition, offset);
+	if (retblk == 0xFFFFFFFF) {
+		udf_warning(sb, __func__, "error reading from METADATA, "
+			"trying to read from MIRROR");
+		inode = mdata->s_mirror_fe;
+		if (!inode)
+			return 0xFFFFFFFF;
+		retblk = udf_try_read_meta(inode, block, partition, offset);
+	}
+
+	return retblk;
+}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 787cedf6cc07..29b19678327a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -950,6 +950,101 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 	return 0;
 }
 
+static int udf_load_metadata_files(struct super_block *sb, int partition)
+{
+	struct udf_sb_info *sbi = UDF_SB(sb);
+	struct udf_part_map *map;
+	struct udf_meta_data *mdata;
+	kernel_lb_addr addr;
+	int fe_error = 0;
+
+	map = &sbi->s_partmaps[partition];
+	mdata = &map->s_type_specific.s_metadata;
+
+	/* metadata address */
+	addr.logicalBlockNum =  mdata->s_meta_file_loc;
+	addr.partitionReferenceNum = map->s_partition_num;
+
+	udf_debug("Metadata file location: block = %d part = %d\n",
+			  addr.logicalBlockNum, addr.partitionReferenceNum);
+
+	mdata->s_metadata_fe = udf_iget(sb, addr);
+
+	if (mdata->s_metadata_fe == NULL) {
+		udf_warning(sb, __func__, "metadata inode efe not found, "
+				"will try mirror inode.");
+		fe_error = 1;
+	} else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type !=
+		 ICBTAG_FLAG_AD_SHORT) {
+		udf_warning(sb, __func__, "metadata inode efe does not have "
+			"short allocation descriptors!");
+		fe_error = 1;
+		iput(mdata->s_metadata_fe);
+		mdata->s_metadata_fe = NULL;
+	}
+
+	/* mirror file entry */
+	addr.logicalBlockNum = mdata->s_mirror_file_loc;
+	addr.partitionReferenceNum = map->s_partition_num;
+
+	udf_debug("Mirror metadata file location: block = %d part = %d\n",
+			  addr.logicalBlockNum, addr.partitionReferenceNum);
+
+	mdata->s_mirror_fe = udf_iget(sb, addr);
+
+	if (mdata->s_mirror_fe == NULL) {
+		if (fe_error) {
+			udf_error(sb, __func__, "mirror inode efe not found "
+			"and metadata inode is missing too, exiting...");
+			goto error_exit;
+		} else
+			udf_warning(sb, __func__, "mirror inode efe not found,"
+					" but metadata inode is OK");
+	} else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type !=
+		 ICBTAG_FLAG_AD_SHORT) {
+		udf_warning(sb, __func__, "mirror inode efe does not have "
+			"short allocation descriptors!");
+		iput(mdata->s_mirror_fe);
+		mdata->s_mirror_fe = NULL;
+		if (fe_error)
+			goto error_exit;
+	}
+
+	/*
+	 * bitmap file entry
+	 * Note:
+	 * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102)
+	*/
+	if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) {
+		addr.logicalBlockNum = mdata->s_bitmap_file_loc;
+		addr.partitionReferenceNum = map->s_partition_num;
+
+		udf_debug("Bitmap file location: block = %d part = %d\n",
+			addr.logicalBlockNum, addr.partitionReferenceNum);
+
+		mdata->s_bitmap_fe = udf_iget(sb, addr);
+
+		if (mdata->s_bitmap_fe == NULL) {
+			if (sb->s_flags & MS_RDONLY)
+				udf_warning(sb, __func__, "bitmap inode efe "
+					"not found but it's ok since the disc"
+					" is mounted read-only");
+			else {
+				udf_error(sb, __func__, "bitmap inode efe not "
+					"found and attempted read-write mount");
+				goto error_exit;
+			}
+		}
+	}
+
+	udf_debug("udf_load_metadata_files Ok\n");
+
+	return 0;
+
+error_exit:
+	return 1;
+}
+
 static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
 			     kernel_lb_addr *root)
 {
@@ -1169,7 +1264,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 	p = (struct partitionDesc *)bh->b_data;
 	partitionNumber = le16_to_cpu(p->partitionNumber);
 
-	/* First scan for TYPE1 and SPARABLE partitions */
+	/* First scan for TYPE1, SPARABLE and METADATA partitions */
 	for (i = 0; i < sbi->s_partitions; i++) {
 		map = &sbi->s_partmaps[i];
 		udf_debug("Searching map: (%d == %d)\n",
@@ -1189,8 +1284,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 	ret = udf_fill_partdesc_info(sb, p, i);
 
 	/*
-	 * Now rescan for VIRTUAL partitions when TYPE1 partitions are
-	 * already set up
+	 * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and
+	 * PHYSICAL partitions are already set up
 	 */
 	type1_idx = i;
 	for (i = 0; i < sbi->s_partitions; i++) {
@@ -1198,7 +1293,8 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 
 		if (map->s_partition_num == partitionNumber &&
 		    (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
-		     map->s_partition_type == UDF_VIRTUAL_MAP20))
+		     map->s_partition_type == UDF_VIRTUAL_MAP20 ||
+		     map->s_partition_type == UDF_METADATA_MAP25))
 			break;
 	}
 
@@ -1208,16 +1304,28 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 	ret = udf_fill_partdesc_info(sb, p, i);
 	if (ret)
 		goto out_bh;
-	/*
-	 * Mark filesystem read-only if we have a partition with virtual map
-	 * since we don't handle writing to it (we overwrite blocks instead of
-	 * relocating them).
-	 */
-	sb->s_flags |= MS_RDONLY;
-	printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only because "
-		"writing to pseudooverwrite partition is not implemented.\n");
 
-	ret = udf_load_vat(sb, i, type1_idx);
+	if (map->s_partition_type == UDF_METADATA_MAP25) {
+		ret = udf_load_metadata_files(sb, i);
+		if (ret) {
+			printk(KERN_ERR "UDF-fs: error loading MetaData "
+			"partition map %d\n", i);
+			goto out_bh;
+		}
+	} else {
+		ret = udf_load_vat(sb, i, type1_idx);
+		if (ret)
+			goto out_bh;
+		/*
+		 * Mark filesystem read-only if we have a partition with
+		 * virtual map since we don't handle writing to it (we
+		 * overwrite blocks instead of relocating them).
+		 */
+		sb->s_flags |= MS_RDONLY;
+		printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only "
+			"because writing to pseudooverwrite partition is "
+			"not implemented.\n");
+	}
 out_bh:
 	/* In case loading failed, we handle cleanup in udf_fill_super */
 	brelse(bh);
@@ -1316,6 +1424,50 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 					}
 				}
 				map->s_partition_func = udf_get_pblock_spar15;
+			} else if (!strncmp(upm2->partIdent.ident,
+						UDF_ID_METADATA,
+						strlen(UDF_ID_METADATA))) {
+				struct udf_meta_data *mdata =
+					&map->s_type_specific.s_metadata;
+				struct metadataPartitionMap *mdm =
+						(struct metadataPartitionMap *)
+						&(lvd->partitionMaps[offset]);
+				udf_debug("Parsing Logical vol part %d "
+					"type %d  id=%s\n", i, type,
+					UDF_ID_METADATA);
+
+				map->s_partition_type = UDF_METADATA_MAP25;
+				map->s_partition_func = udf_get_pblock_meta25;
+
+				mdata->s_meta_file_loc   =
+					le32_to_cpu(mdm->metadataFileLoc);
+				mdata->s_mirror_file_loc =
+					le32_to_cpu(mdm->metadataMirrorFileLoc);
+				mdata->s_bitmap_file_loc =
+					le32_to_cpu(mdm->metadataBitmapFileLoc);
+				mdata->s_alloc_unit_size =
+					le32_to_cpu(mdm->allocUnitSize);
+				mdata->s_align_unit_size =
+					le16_to_cpu(mdm->alignUnitSize);
+				mdata->s_dup_md_flag 	 =
+					mdm->flags & 0x01;
+
+				udf_debug("Metadata Ident suffix=0x%x\n",
+					(le16_to_cpu(
+					 ((__le16 *)
+					      mdm->partIdent.identSuffix)[0])));
+				udf_debug("Metadata part num=%d\n",
+					le16_to_cpu(mdm->partitionNum));
+				udf_debug("Metadata part alloc unit size=%d\n",
+					le32_to_cpu(mdm->allocUnitSize));
+				udf_debug("Metadata file loc=%d\n",
+					le32_to_cpu(mdm->metadataFileLoc));
+				udf_debug("Mirror file loc=%d\n",
+				       le32_to_cpu(mdm->metadataMirrorFileLoc));
+				udf_debug("Bitmap file loc=%d\n",
+				       le32_to_cpu(mdm->metadataBitmapFileLoc));
+				udf_debug("Duplicate Flag: %d %d\n",
+					mdata->s_dup_md_flag, mdm->flags);
 			} else {
 				udf_debug("Unknown ident: %s\n",
 					  upm2->partIdent.ident);
@@ -1677,6 +1829,7 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
 static void udf_free_partition(struct udf_part_map *map)
 {
 	int i;
+	struct udf_meta_data *mdata;
 
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
 		iput(map->s_uspace.s_table);
@@ -1689,6 +1842,17 @@ static void udf_free_partition(struct udf_part_map *map)
 	if (map->s_partition_type == UDF_SPARABLE_MAP15)
 		for (i = 0; i < 4; i++)
 			brelse(map->s_type_specific.s_sparing.s_spar_map[i]);
+	else if (map->s_partition_type == UDF_METADATA_MAP25) {
+		mdata = &map->s_type_specific.s_metadata;
+		iput(mdata->s_metadata_fe);
+		mdata->s_metadata_fe = NULL;
+
+		iput(mdata->s_mirror_fe);
+		mdata->s_mirror_fe = NULL;
+
+		iput(mdata->s_bitmap_fe);
+		mdata->s_bitmap_fe = NULL;
+	}
 }
 
 static int udf_fill_super(struct super_block *sb, void *options, int silent)
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 8308a12e1232..1c1c514a9725 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -6,7 +6,7 @@
 /* Since UDF 2.01 is ISO 13346 based... */
 #define UDF_SUPER_MAGIC			0x15013346
 
-#define UDF_MAX_READ_VERSION		0x0201
+#define UDF_MAX_READ_VERSION		0x0250
 #define UDF_MAX_WRITE_VERSION		0x0201
 
 #define UDF_FLAG_USE_EXTENDED_FE	0
@@ -46,9 +46,22 @@
 #define UDF_VIRTUAL_MAP15		0x1512U
 #define UDF_VIRTUAL_MAP20		0x2012U
 #define UDF_SPARABLE_MAP15		0x1522U
+#define UDF_METADATA_MAP25		0x2511U
 
 #pragma pack(1) /* XXX(hch): Why?  This file just defines in-core structures */
 
+struct udf_meta_data {
+	__u32	s_meta_file_loc;
+	__u32	s_mirror_file_loc;
+	__u32	s_bitmap_file_loc;
+	__u32	s_alloc_unit_size;
+	__u16	s_align_unit_size;
+	__u8 	s_dup_md_flag;
+	struct inode *s_metadata_fe;
+	struct inode *s_mirror_fe;
+	struct inode *s_bitmap_fe;
+};
+
 struct udf_sparing_data {
 	__u16	s_packet_len;
 	struct buffer_head *s_spar_map[4];
@@ -82,6 +95,7 @@ struct udf_part_map {
 	union {
 		struct udf_sparing_data s_sparing;
 		struct udf_virtual_data s_virtual;
+		struct udf_meta_data s_metadata;
 	} s_type_specific;
 	__u32	(*s_partition_func)(struct super_block *, __u32, __u16, __u32);
 	__u16	s_volumeseqnum;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 2cb2f5de4245..30f664ab0cd9 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -177,6 +177,8 @@ extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t,
 				      uint32_t);
 extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t,
 				      uint32_t);
+extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
+					  uint32_t);
 extern int udf_relocate_blocks(struct super_block *, long, long *);
 
 /* unicode.c */
-- 
cgit v1.2.3


From 47c9358a015199ed37c66235a2238271a7c8041f Mon Sep 17 00:00:00 2001
From: Sebastian Manciulea <manciuleas@yahoo.com>
Date: Mon, 14 Apr 2008 17:06:36 +0200
Subject: udf: Fix bug in VAT mapping code

Fix mapping of blocks using VAT when it is stored in an inode.
UDF_I(inode)->i_data already points to the beginning of VAT header so there's
no need to add udf_ext0_offset(inode).

Signed-off-by: Sebastian Manciulea <manciuleas@yahoo.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/partition.c | 3 ++-
 fs/udf/super.c     | 6 ++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 2dfe4be2eeb2..63610f026ae1 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -66,7 +66,8 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
 	}
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
-		loc = le32_to_cpu(((__le32 *)iinfo->i_ext.i_data)[block]);
+		loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data +
+			vdata->s_start_offset))[block]);
 		goto translate;
 	}
 	index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 29b19678327a..8f02b30c22ef 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1215,8 +1215,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 		return 1;
 
 	if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
-		map->s_type_specific.s_virtual.s_start_offset =
-			udf_ext0_offset(sbi->s_vat_inode);
+		map->s_type_specific.s_virtual.s_start_offset = 0;
 		map->s_type_specific.s_virtual.s_num_entries =
 			(sbi->s_vat_inode->i_size - 36) >> 2;
 	} else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
@@ -1233,8 +1232,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 		}
 
 		map->s_type_specific.s_virtual.s_start_offset =
-			le16_to_cpu(vat20->lengthHeader) +
-			udf_ext0_offset(sbi->s_vat_inode);
+			le16_to_cpu(vat20->lengthHeader);
 		map->s_type_specific.s_virtual.s_num_entries =
 			(sbi->s_vat_inode->i_size -
 				map->s_type_specific.s_virtual.
-- 
cgit v1.2.3


From 706047a79725b585cf272fdefc234b31b6545c72 Mon Sep 17 00:00:00 2001
From: Sebastian Manciulea <manciuleas@yahoo.com>
Date: Mon, 14 Apr 2008 17:13:01 +0200
Subject: udf: Fix compilation warnings when UDF debug is on

Fix two compilation warnings (and actual bugs in message formatting)
when UDF debugging is turned on.

Signed-off-by: Sebastian Manciulea <manciuleas@yahoo.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c   | 7 ++++---
 fs/udf/udfdecl.h | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 8f02b30c22ef..6823733c0121 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -608,7 +608,8 @@ static int udf_vrs(struct super_block *sb, int silent)
 	sector += (sbi->s_session << sb->s_blocksize_bits);
 
 	udf_debug("Starting at sector %u (%ld byte sectors)\n",
-		  (sector >> sb->s_blocksize_bits), sb->s_blocksize);
+		  (unsigned int)(sector >> sb->s_blocksize_bits),
+		  sb->s_blocksize);
 	/* Process the sequence (if applicable) */
 	for (; !nsr02 && !nsr03; sector += sectorsize) {
 		/* Read a block */
@@ -1117,8 +1118,8 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 	if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
 		map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
 
-	udf_debug("Partition (%d:%d type %x) starts at physical %d, "
-		  "block length %d\n", partitionNumber, p_index,
+	udf_debug("Partition (%d type %x) starts at physical %d, "
+		  "block length %d\n", p_index,
 		  map->s_partition_type, map->s_partition_root,
 		  map->s_partition_len);
 
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 30f664ab0cd9..61897d312a2e 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -16,7 +16,7 @@
 #define UDF_PREALLOCATE
 #define UDF_DEFAULT_PREALLOC_BLOCKS	8
 
-#undef UDFFS_DEBUG
+#define UDFFS_DEBUG
 
 #ifdef UDFFS_DEBUG
 #define udf_debug(f, a...) \
-- 
cgit v1.2.3


From f845fced913b1437659bb5baf187698547697afe Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Thu, 17 Apr 2008 09:47:48 +0200
Subject: udf: use crc_itu_t from lib instead of udf_crc

As pointed out by Sergey Vlasov, UDF implements its own version of
the CRC ITU-T V.41.  Convert it to use the one in the library.

Signed-off-by: Bob Copeland <me@bobcopeland.com>
Cc: Sergey Vlasov <vsu@altlinux.ru>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/Kconfig       |   1 +
 fs/udf/Makefile  |   2 +-
 fs/udf/crc.c     | 172 -------------------------------------------------------
 fs/udf/inode.c   |  11 ++--
 fs/udf/misc.c    |  12 ++--
 fs/udf/namei.c   |  21 ++++---
 fs/udf/super.c   |  10 ++--
 fs/udf/udfdecl.h |   3 -
 fs/udf/unicode.c |   3 +-
 9 files changed, 32 insertions(+), 203 deletions(-)
 delete mode 100644 fs/udf/crc.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d7312825592b..86229c913ace 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -663,6 +663,7 @@ config ZISOFS
 
 config UDF_FS
 	tristate "UDF file system support"
+	select CRC_ITU_T
 	help
 	  This is the new file system used on some CD-ROMs and DVDs. Say Y if
 	  you intend to mount DVD discs or CDRW's written in packet mode, or
diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index be845e7540ef..0d4503f7446d 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -6,4 +6,4 @@ obj-$(CONFIG_UDF_FS) += udf.o
 
 udf-objs     := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
 		partition.o super.o truncate.o symlink.o fsync.o \
-		crc.o directory.o misc.o udftime.o unicode.o
+		directory.o misc.o udftime.o unicode.o
diff --git a/fs/udf/crc.c b/fs/udf/crc.c
deleted file mode 100644
index f178c63686e0..000000000000
--- a/fs/udf/crc.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * crc.c
- *
- * PURPOSE
- *	Routines to generate, calculate, and test a 16-bit CRC.
- *
- * DESCRIPTION
- *	The CRC code was devised by Don P. Mitchell of AT&T Bell Laboratories
- *	and Ned W. Rhodes of Software Systems Group. It has been published in
- *	"Design and Validation of Computer Protocols", Prentice Hall,
- *	Englewood Cliffs, NJ, 1991, Chapter 3, ISBN 0-13-539925-4.
- *
- *	Copyright is held by AT&T.
- *
- *	AT&T gives permission for the free use of the CRC source code.
- *
- * COPYRIGHT
- *	This file is distributed under the terms of the GNU General Public
- *	License (GPL). Copies of the GPL can be obtained from:
- *		ftp://prep.ai.mit.edu/pub/gnu/GPL
- *	Each contributing author retains all rights to their own work.
- */
-
-#include "udfdecl.h"
-
-static const uint16_t crc_table[256] = {
-	0x0000U, 0x1021U, 0x2042U, 0x3063U, 0x4084U, 0x50a5U, 0x60c6U, 0x70e7U,
-	0x8108U, 0x9129U, 0xa14aU, 0xb16bU, 0xc18cU, 0xd1adU, 0xe1ceU, 0xf1efU,
-	0x1231U, 0x0210U, 0x3273U, 0x2252U, 0x52b5U, 0x4294U, 0x72f7U, 0x62d6U,
-	0x9339U, 0x8318U, 0xb37bU, 0xa35aU, 0xd3bdU, 0xc39cU, 0xf3ffU, 0xe3deU,
-	0x2462U, 0x3443U, 0x0420U, 0x1401U, 0x64e6U, 0x74c7U, 0x44a4U, 0x5485U,
-	0xa56aU, 0xb54bU, 0x8528U, 0x9509U, 0xe5eeU, 0xf5cfU, 0xc5acU, 0xd58dU,
-	0x3653U, 0x2672U, 0x1611U, 0x0630U, 0x76d7U, 0x66f6U, 0x5695U, 0x46b4U,
-	0xb75bU, 0xa77aU, 0x9719U, 0x8738U, 0xf7dfU, 0xe7feU, 0xd79dU, 0xc7bcU,
-	0x48c4U, 0x58e5U, 0x6886U, 0x78a7U, 0x0840U, 0x1861U, 0x2802U, 0x3823U,
-	0xc9ccU, 0xd9edU, 0xe98eU, 0xf9afU, 0x8948U, 0x9969U, 0xa90aU, 0xb92bU,
-	0x5af5U, 0x4ad4U, 0x7ab7U, 0x6a96U, 0x1a71U, 0x0a50U, 0x3a33U, 0x2a12U,
-	0xdbfdU, 0xcbdcU, 0xfbbfU, 0xeb9eU, 0x9b79U, 0x8b58U, 0xbb3bU, 0xab1aU,
-	0x6ca6U, 0x7c87U, 0x4ce4U, 0x5cc5U, 0x2c22U, 0x3c03U, 0x0c60U, 0x1c41U,
-	0xedaeU, 0xfd8fU, 0xcdecU, 0xddcdU, 0xad2aU, 0xbd0bU, 0x8d68U, 0x9d49U,
-	0x7e97U, 0x6eb6U, 0x5ed5U, 0x4ef4U, 0x3e13U, 0x2e32U, 0x1e51U, 0x0e70U,
-	0xff9fU, 0xefbeU, 0xdfddU, 0xcffcU, 0xbf1bU, 0xaf3aU, 0x9f59U, 0x8f78U,
-	0x9188U, 0x81a9U, 0xb1caU, 0xa1ebU, 0xd10cU, 0xc12dU, 0xf14eU, 0xe16fU,
-	0x1080U, 0x00a1U, 0x30c2U, 0x20e3U, 0x5004U, 0x4025U, 0x7046U, 0x6067U,
-	0x83b9U, 0x9398U, 0xa3fbU, 0xb3daU, 0xc33dU, 0xd31cU, 0xe37fU, 0xf35eU,
-	0x02b1U, 0x1290U, 0x22f3U, 0x32d2U, 0x4235U, 0x5214U, 0x6277U, 0x7256U,
-	0xb5eaU, 0xa5cbU, 0x95a8U, 0x8589U, 0xf56eU, 0xe54fU, 0xd52cU, 0xc50dU,
-	0x34e2U, 0x24c3U, 0x14a0U, 0x0481U, 0x7466U, 0x6447U, 0x5424U, 0x4405U,
-	0xa7dbU, 0xb7faU, 0x8799U, 0x97b8U, 0xe75fU, 0xf77eU, 0xc71dU, 0xd73cU,
-	0x26d3U, 0x36f2U, 0x0691U, 0x16b0U, 0x6657U, 0x7676U, 0x4615U, 0x5634U,
-	0xd94cU, 0xc96dU, 0xf90eU, 0xe92fU, 0x99c8U, 0x89e9U, 0xb98aU, 0xa9abU,
-	0x5844U, 0x4865U, 0x7806U, 0x6827U, 0x18c0U, 0x08e1U, 0x3882U, 0x28a3U,
-	0xcb7dU, 0xdb5cU, 0xeb3fU, 0xfb1eU, 0x8bf9U, 0x9bd8U, 0xabbbU, 0xbb9aU,
-	0x4a75U, 0x5a54U, 0x6a37U, 0x7a16U, 0x0af1U, 0x1ad0U, 0x2ab3U, 0x3a92U,
-	0xfd2eU, 0xed0fU, 0xdd6cU, 0xcd4dU, 0xbdaaU, 0xad8bU, 0x9de8U, 0x8dc9U,
-	0x7c26U, 0x6c07U, 0x5c64U, 0x4c45U, 0x3ca2U, 0x2c83U, 0x1ce0U, 0x0cc1U,
-	0xef1fU, 0xff3eU, 0xcf5dU, 0xdf7cU, 0xaf9bU, 0xbfbaU, 0x8fd9U, 0x9ff8U,
-	0x6e17U, 0x7e36U, 0x4e55U, 0x5e74U, 0x2e93U, 0x3eb2U, 0x0ed1U, 0x1ef0U
-};
-
-/*
- * udf_crc
- *
- * PURPOSE
- *	Calculate a 16-bit CRC checksum using ITU-T V.41 polynomial.
- *
- * DESCRIPTION
- *	The OSTA-UDF(tm) 1.50 standard states that using CRCs is mandatory.
- *	The polynomial used is:	x^16 + x^12 + x^15 + 1
- *
- * PRE-CONDITIONS
- *	data		Pointer to the data block.
- *	size		Size of the data block.
- *
- * POST-CONDITIONS
- *	<return>	CRC of the data block.
- *
- * HISTORY
- *	July 21, 1997 - Andrew E. Mileski
- *	Adapted from OSTA-UDF(tm) 1.50 standard.
- */
-uint16_t udf_crc(const uint8_t *data, uint32_t size, uint16_t crc)
-{
-	while (size--)
-		crc = crc_table[(crc >> 8 ^ *(data++)) & 0xffU] ^ (crc << 8);
-
-	return crc;
-}
-
-/****************************************************************************/
-#if defined(TEST)
-
-/*
- * PURPOSE
- *	Test udf_crc()
- *
- * HISTORY
- *	July 21, 1997 - Andrew E. Mileski
- *	Adapted from OSTA-UDF(tm) 1.50 standard.
- */
-
-unsigned char bytes[] = { 0x70U, 0x6AU, 0x77U };
-
-int main(void)
-{
-	unsigned short x;
-
-	x = udf_crc(bytes, sizeof bytes);
-	printf("udf_crc: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U);
-
-	return 0;
-}
-
-#endif /* defined(TEST) */
-
-/****************************************************************************/
-#if defined(GENERATE)
-
-/*
- * PURPOSE
- *	Generate a table for fast 16-bit CRC calculations (any polynomial).
- *
- * DESCRIPTION
- *	The ITU-T V.41 polynomial is 010041.
- *
- * HISTORY
- *	July 21, 1997 - Andrew E. Mileski
- *	Adapted from OSTA-UDF(tm) 1.50 standard.
- */
-
-#include <stdio.h>
-
-int main(int argc, char **argv)
-{
-	unsigned long crc, poly;
-	int n, i;
-
-	/* Get the polynomial */
-	sscanf(argv[1], "%lo", &poly);
-	if (poly & 0xffff0000U) {
-		fprintf(stderr, "polynomial is too large\en");
-		exit(1);
-	}
-
-	printf("/* CRC 0%o */\n", poly);
-
-	/* Create a table */
-	printf("static unsigned short crc_table[256] = {\n");
-	for (n = 0; n < 256; n++) {
-		if (n % 8 == 0)
-			printf("\t");
-		crc = n << 8;
-		for (i = 0; i < 8; i++) {
-			if (crc & 0x8000U)
-				crc = (crc << 1) ^ poly;
-			else
-				crc <<= 1;
-			crc &= 0xFFFFU;
-		}
-		if (n == 255)
-			printf("0x%04xU ", crc);
-		else
-			printf("0x%04xU, ", crc);
-		if (n % 8 == 7)
-			printf("\n");
-	}
-	printf("};\n");
-
-	return 0;
-}
-
-#endif /* defined(GENERATE) */
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 6e151f170c08..6e74b117aaf0 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -37,6 +37,7 @@
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/slab.h>
+#include <linux/crc-itu-t.h>
 
 #include "udf_i.h"
 #include "udf_sb.h"
@@ -1419,9 +1420,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 						iinfo->i_location.
 							logicalBlockNum);
 		use->descTag.descCRCLength = cpu_to_le16(crclen);
-		use->descTag.descCRC = cpu_to_le16(udf_crc((char *)use +
-							   sizeof(tag), crclen,
-							   0));
+		use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
+							   sizeof(tag),
+							   crclen));
 		use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
 
 		mark_buffer_dirty(bh);
@@ -1584,8 +1585,8 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
 								sizeof(tag);
 	fe->descTag.descCRCLength = cpu_to_le16(crclen);
-	fe->descTag.descCRC = cpu_to_le16(udf_crc((char *)fe + sizeof(tag),
-						  crclen, 0));
+	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
+						  crclen));
 	fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
 
 	/* write the data blocks */
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 96996204d928..84bf0fd4a4f1 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
+#include <linux/crc-itu-t.h>
 
 #include "udf_i.h"
 #include "udf_sb.h"
@@ -135,8 +136,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
 		/* rewrite CRC + checksum of eahd */
 		crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
 		eahd->descTag.descCRCLength = cpu_to_le16(crclen);
-		eahd->descTag.descCRC = cpu_to_le16(udf_crc((char *)eahd +
-						sizeof(tag), crclen, 0));
+		eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
+						sizeof(tag), crclen));
 		eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
 		iinfo->i_lenEAttr += size;
 		return (struct genericFormat *)&ea[offset];
@@ -241,8 +242,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
 
 	/* Verify the descriptor CRC */
 	if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize ||
-	    le16_to_cpu(tag_p->descCRC) == udf_crc(bh->b_data + sizeof(tag),
-					le16_to_cpu(tag_p->descCRCLength), 0))
+	    le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
+					bh->b_data + sizeof(tag),
+					le16_to_cpu(tag_p->descCRCLength)))
 		return bh;
 
 	udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block,
@@ -266,7 +268,7 @@ void udf_update_tag(char *data, int length)
 	length -= sizeof(tag);
 
 	tptr->descCRCLength = cpu_to_le16(length);
-	tptr->descCRC = cpu_to_le16(udf_crc(data + sizeof(tag), length, 0));
+	tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length));
 	tptr->tagChecksum = udf_tag_checksum(tptr);
 }
 
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 68686b79650a..ba5537d4bc15 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -31,6 +31,7 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
+#include <linux/crc-itu-t.h>
 
 static inline int udf_match(int len1, const char *name1, int len2,
 			    const char *name2)
@@ -97,25 +98,23 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 		memset(fibh->ebh->b_data, 0x00, padlen + offset);
 	}
 
-	crc = udf_crc((uint8_t *)cfi + sizeof(tag),
-		      sizeof(struct fileIdentDesc) - sizeof(tag), 0);
+	crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag),
+		      sizeof(struct fileIdentDesc) - sizeof(tag));
 
 	if (fibh->sbh == fibh->ebh) {
-		crc = udf_crc((uint8_t *)sfi->impUse,
+		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
 			      crclen + sizeof(tag) -
-			      sizeof(struct fileIdentDesc), crc);
+			      sizeof(struct fileIdentDesc));
 	} else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
-		crc = udf_crc(fibh->ebh->b_data +
+		crc = crc_itu_t(crc, fibh->ebh->b_data +
 					sizeof(struct fileIdentDesc) +
 					fibh->soffset,
 			      crclen + sizeof(tag) -
-					sizeof(struct fileIdentDesc),
-			      crc);
+					sizeof(struct fileIdentDesc));
 	} else {
-		crc = udf_crc((uint8_t *)sfi->impUse,
-			      -fibh->soffset - sizeof(struct fileIdentDesc),
-			      crc);
-		crc = udf_crc(fibh->ebh->b_data, fibh->eoffset, crc);
+		crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
+			      -fibh->soffset - sizeof(struct fileIdentDesc));
+		crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset);
 	}
 
 	cfi->descTag.descCRC = cpu_to_le16(crc);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6823733c0121..b564fc140fe4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -56,6 +56,7 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/bitmap.h>
+#include <linux/crc-itu-t.h>
 #include <asm/byteorder.h>
 
 #include "udf_sb.h"
@@ -1765,8 +1766,8 @@ static void udf_open_lvid(struct super_block *sb)
 	lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
 
 	lvid->descTag.descCRC = cpu_to_le16(
-		udf_crc((char *)lvid + sizeof(tag),
-			le16_to_cpu(lvid->descTag.descCRCLength), 0));
+		crc_itu_t(0, (char *)lvid + sizeof(tag),
+			le16_to_cpu(lvid->descTag.descCRCLength)));
 
 	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
 	mark_buffer_dirty(bh);
@@ -1800,9 +1801,8 @@ static void udf_close_lvid(struct super_block *sb)
 	lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
 
 	lvid->descTag.descCRC = cpu_to_le16(
-		udf_crc((char *)lvid + sizeof(tag),
-			le16_to_cpu(lvid->descTag.descCRCLength),
-			0));
+			crc_itu_t(0, (char *)lvid + sizeof(tag),
+				le16_to_cpu(lvid->descTag.descCRCLength)));
 
 	lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
 	mark_buffer_dirty(bh);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 61897d312a2e..f3f45d029277 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -220,9 +220,6 @@ extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
 extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
 extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
 
-/* crc.c */
-extern uint16_t udf_crc(const uint8_t *, uint32_t, uint16_t);
-
 /* udftime.c */
 extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
 						timestamp src);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index f5872ae325a3..9fdf8c93c58e 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -23,6 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>	/* for memset */
 #include <linux/nls.h>
+#include <linux/crc-itu-t.h>
 
 #include "udf_sb.h"
 
@@ -454,7 +455,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
 		} else if (newIndex > 250)
 			newIndex = 250;
 		newName[newIndex++] = CRC_MARK;
-		valueCRC = udf_crc(fidName, fidNameLen, 0);
+		valueCRC = crc_itu_t(0, fidName, fidNameLen);
 		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
 		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
 		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
-- 
cgit v1.2.3


From e12070a5dca8bfeee352e9655ae79772a96b32f8 Mon Sep 17 00:00:00 2001
From: Niv Sardi <xaiki@sgi.com>
Date: Thu, 6 Mar 2008 13:43:03 +1100
Subject: [XFS] actually check error returned by xfs_flush_pages, clean up and
 bailout if fails.

SGI-PV: 973041
SGI-Modid: xfs-linux-melb:xfs-kern:30462a

Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_bmap.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 2def273855a2..87f646749817 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5869,6 +5869,10 @@ xfs_getbmap(
 		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
 		error = xfs_flush_pages(ip, (xfs_off_t)0,
 					       -1, 0, FI_REMAPF);
+		if (error) {
+			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+		return error;
+		}
 	}
 
 	ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
-- 
cgit v1.2.3


From 461aa8a22595e3bd3e6f4dc2894d7c4315ea2bb9 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 6 Mar 2008 13:43:11 +1100
Subject: [XFS] make inode reclaim synchronise with xfs_iflush_done()

On a forced shutdown, xfs_finish_reclaim() will skip flushing the inode.
If the inode flush lock is not already held and there is an outstanding
xfs_iflush_done() then we might free the inode prematurely. By acquiring
and releasing the flush lock we will synchronise with xfs_iflush_done().

SGI-PV: 909874
SGI-Modid: xfs-linux-melb:xfs-kern:30468a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 64c5953feca4..ce82a2d0acbe 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3694,12 +3694,12 @@ xfs_finish_reclaim(
 	 * We get the flush lock regardless, though, just to make sure
 	 * we don't free it while it is being flushed.
 	 */
-	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		if (!locked) {
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_iflock(ip);
-		}
+	if (!locked) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_iflock(ip);
+	}
 
+	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		if (ip->i_update_core ||
 		    ((ip->i_itemp != NULL) &&
 		     (ip->i_itemp->ili_format.ilf_fields != 0))) {
@@ -3719,17 +3719,11 @@ xfs_finish_reclaim(
 		ASSERT(ip->i_update_core == 0);
 		ASSERT(ip->i_itemp == NULL ||
 		       ip->i_itemp->ili_format.ilf_fields == 0);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	} else if (locked) {
-		/*
-		 * We are not interested in doing an iflush if we're
-		 * in the process of shutting down the filesystem forcibly.
-		 * So, just reclaim the inode.
-		 */
-		xfs_ifunlock(ip);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
+	xfs_ifunlock(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
  reclaim:
 	xfs_ireclaim(ip);
 	return 0;
-- 
cgit v1.2.3


From 163d3686bb09d88e2120bffe780a3f2d7cc4c948 Mon Sep 17 00:00:00 2001
From: Donald Douwsma <donaldd@sgi.com>
Date: Thu, 6 Mar 2008 13:43:20 +1100
Subject: [XFS] Remove the xfs_refcache

Remove the xfs_refcache, it was only needed while we were still
building for 2.4 kernels.

SGI-PV: 971186
SGI-Modid: xfs-linux-melb:xfs-kern:30472a

Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_linux.h |  1 -
 fs/xfs/xfs_inode.h           |  4 ----
 fs/xfs/xfs_rename.c          |  5 +----
 fs/xfs/xfs_vfsops.c          | 20 --------------------
 fs/xfs/xfs_vnodeops.c        | 34 +---------------------------------
 5 files changed, 2 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 3ca39c4e5d2a..e5143323e71f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -99,7 +99,6 @@
 /*
  * Feature macros (disable/enable)
  */
-#undef  HAVE_REFCACHE	/* reference cache not needed for NFS in 2.6 */
 #define HAVE_SPLICE	/* a splice(2) exists in 2.6, but not in 2.4 */
 #ifdef CONFIG_SMP
 #define HAVE_PERCPU_SB	/* per cpu superblock counters are a 2.6 feature */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index bfcd72cbaeea..eaa01895ff93 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -240,10 +240,6 @@ typedef struct xfs_inode {
 	atomic_t		i_pincount;	/* inode pin count */
 	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
-#ifdef HAVE_REFCACHE
-	struct xfs_inode	**i_refcache;	/* ptr to entry in ref cache */
-	struct xfs_inode	*i_release;	/* inode to unref */
-#endif
 	/* Miscellaneous state. */
 	unsigned short		i_flags;	/* see defined flags below */
 	unsigned char		i_update_core;	/* timestamps/size is dirty */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 7eb157a59f9e..1c6d40ed6816 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -36,7 +36,6 @@
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
-#include "xfs_refcache.h"
 #include "xfs_utils.h"
 #include "xfs_trans_space.h"
 #include "xfs_vnodeops.h"
@@ -580,10 +579,8 @@ xfs_rename(
 	 * the vnode references.
 	 */
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (target_ip != NULL) {
-		xfs_refcache_purge_ip(target_ip);
+	if (target_ip != NULL)
 		IRELE(target_ip);
-	}
 	/*
 	 * Let interposed file systems know about removed links.
 	 */
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 7094caff13cf..79bdfb3d6081 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -43,7 +43,6 @@
 #include "xfs_error.h"
 #include "xfs_bmap.h"
 #include "xfs_rw.h"
-#include "xfs_refcache.h"
 #include "xfs_buf_item.h"
 #include "xfs_log_priv.h"
 #include "xfs_dir2_trace.h"
@@ -157,7 +156,6 @@ xfs_cleanup(void)
 
 	xfs_cleanup_procfs();
 	xfs_sysctl_unregister();
-	xfs_refcache_destroy();
 	xfs_filestream_uninit();
 	xfs_mru_cache_uninit();
 	xfs_acl_zone_destroy(xfs_acl_zone);
@@ -584,11 +582,6 @@ xfs_unmount(
 					0 : DM_FLAGS_UNWANTED;
 	}
 #endif
-	/*
-	 * First blow any referenced inode from this file system
-	 * out of the reference cache, and delete the timer.
-	 */
-	xfs_refcache_purge_mp(mp);
 
 	/*
 	 * Blow away any referenced inode in the filestreams cache.
@@ -652,7 +645,6 @@ xfs_quiesce_fs(
 {
 	int			count = 0, pincount;
 
-	xfs_refcache_purge_mp(mp);
 	xfs_flush_buftarg(mp->m_ddev_targp, 0);
 	xfs_finish_reclaim_all(mp, 0);
 
@@ -1322,18 +1314,6 @@ xfs_syncsub(
 		}
 	}
 
-	/*
-	 * If this is the periodic sync, then kick some entries out of
-	 * the reference cache.  This ensures that idle entries are
-	 * eventually kicked out of the cache.
-	 */
-	if (flags & SYNC_REFCACHE) {
-		if (flags & SYNC_WAIT)
-			xfs_refcache_purge_mp(mp);
-		else
-			xfs_refcache_purge_some(mp);
-	}
-
 	/*
 	 * If asked, update the disk superblock with incore counter values if we
 	 * are using non-persistent counters so that they don't get too far out
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ce82a2d0acbe..35ac59dabf8b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -48,7 +48,6 @@
 #include "xfs_quota.h"
 #include "xfs_utils.h"
 #include "xfs_rtalloc.h"
-#include "xfs_refcache.h"
 #include "xfs_trans_space.h"
 #include "xfs_log_priv.h"
 #include "xfs_filestream.h"
@@ -1520,12 +1519,6 @@ xfs_release(
 			xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
 	}
 
-#ifdef HAVE_REFCACHE
-	/* If we are in the NFS reference cache then don't do this now */
-	if (ip->i_refcache)
-		return 0;
-#endif
-
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
 		     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
@@ -2448,14 +2441,6 @@ xfs_remove(
 		goto std_return;
 	}
 
-	/*
-	 * Before we drop our extra reference to the inode, purge it
-	 * from the refcache if it is there.  By waiting until afterwards
-	 * to do the IRELE, we ensure that we won't go inactive in the
-	 * xfs_refcache_purge_ip routine (although that would be OK).
-	 */
-	xfs_refcache_purge_ip(ip);
-
 	/*
 	 * If we are using filestreams, kill the stream association.
 	 * If the file is still open it may get a new one but that
@@ -2495,14 +2480,6 @@ xfs_remove(
 	cancel_flags |= XFS_TRANS_ABORT;
 	xfs_trans_cancel(tp, cancel_flags);
 
-	/*
-	 * Before we drop our extra reference to the inode, purge it
-	 * from the refcache if it is there.  By waiting until afterwards
-	 * to do the IRELE, we ensure that we won't go inactive in the
-	 * xfs_refcache_purge_ip routine (although that would be OK).
-	 */
-	xfs_refcache_purge_ip(ip);
-
 	IRELE(ip);
 
 	goto std_return;
@@ -3460,16 +3437,7 @@ xfs_rwunlock(
 {
  	if (S_ISDIR(ip->i_d.di_mode))
   		return;
-	if (locktype == VRWLOCK_WRITE) {
-		/*
-		 * In the write case, we may have added a new entry to
-		 * the reference cache.  This might store a pointer to
-		 * an inode to be released in this inode.  If it is there,
-		 * clear the pointer and release the inode after unlocking
-		 * this one.
-		 */
-		xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
-	} else {
+	if (locktype != VRWLOCK_WRITE) {
 		ASSERT((locktype == VRWLOCK_READ) ||
 		       (locktype == VRWLOCK_WRITE_DIRECT));
 		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-- 
cgit v1.2.3


From e9a56b7cdaf6129892fd7c8d950b71a1a4304bb0 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 6 Mar 2008 13:43:27 +1100
Subject: [XFS] Fix regression due to refcache removal

SGI-PV: 971186
SGI-Modid: xfs-linux-melb:xfs-kern:30490a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Donald Douwsma <donaldd@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 35ac59dabf8b..40b95e3a88ba 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3437,7 +3437,9 @@ xfs_rwunlock(
 {
  	if (S_ISDIR(ip->i_d.di_mode))
   		return;
-	if (locktype != VRWLOCK_WRITE) {
+	if (locktype == VRWLOCK_WRITE) {
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	} else {
 		ASSERT((locktype == VRWLOCK_READ) ||
 		       (locktype == VRWLOCK_WRITE_DIRECT));
 		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-- 
cgit v1.2.3


From 4ae29b4321b99b711bcfde5527c4fbf249eac60f Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:43:34 +1100
Subject: [XFS] Factor xfs_itobp() and xfs_inotobp().

The only difference between the functions is one passes an inode for the
lookup, the other passes an inode number. However, they don't do the same
validity checking or set all the same state on the buffer that is returned
yet they should.

Factor the functions into a common implementation.

SGI-PV: 970925
SGI-Modid: xfs-linux-melb:xfs-kern:30500a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_inode.c | 261 ++++++++++++++++++++++-------------------------------
 1 file changed, 106 insertions(+), 155 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f43a6e01d68f..6f156faf9d46 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -125,6 +125,85 @@ xfs_inobp_check(
 }
 #endif
 
+/*
+ * Find the buffer associated with the given inode map
+ * We do basic validation checks on the buffer once it has been
+ * retrieved from disk.
+ */
+STATIC int
+xfs_imap_to_bp(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_imap_t	*imap,
+	xfs_buf_t	**bpp,
+	uint		buf_flags,
+	uint		imap_flags)
+{
+	int		error;
+	int		i;
+	int		ni;
+	xfs_buf_t	*bp;
+
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+				   (int)imap->im_len, XFS_BUF_LOCK, &bp);
+	if (error) {
+		cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned "
+				"an error %d on %s.  Returning error.",
+				error, mp->m_fsname);
+		return error;
+	}
+
+	/*
+	 * Validate the magic number and version of every inode in the buffer
+	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+	 */
+#ifdef DEBUG
+	ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
+#else	/* usual case */
+	ni = 1;
+#endif
+
+	for (i = 0; i < ni; i++) {
+		int		di_ok;
+		xfs_dinode_t	*dip;
+
+		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					(i << mp->m_sb.sb_inodelog));
+		di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
+			    XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+						XFS_ERRTAG_ITOBP_INOTOBP,
+						XFS_RANDOM_ITOBP_INOTOBP))) {
+			if (imap_flags & XFS_IMAP_BULKSTAT) {
+				xfs_trans_brelse(tp, bp);
+				return XFS_ERROR(EINVAL);
+			}
+			XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
+						XFS_ERRLEVEL_HIGH, mp, dip);
+#ifdef DEBUG
+			cmn_err(CE_PANIC,
+					"Device %s - bad inode magic/vsn "
+					"daddr %lld #%d (magic=%x)",
+				XFS_BUFTARG_NAME(mp->m_ddev_targp),
+				(unsigned long long)imap->im_blkno, i,
+				be16_to_cpu(dip->di_core.di_magic));
+#endif
+			xfs_trans_brelse(tp, bp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	}
+
+	xfs_inobp_check(mp, bp);
+
+	/*
+	 * Mark the buffer as an inode buffer now that it looks good
+	 */
+	XFS_BUF_SET_VTYPE(bp, B_FS_INO);
+
+	*bpp = bp;
+	return 0;
+}
+
 /*
  * This routine is called to map an inode number within a file
  * system to the buffer containing the on-disk version of the
@@ -147,72 +226,19 @@ xfs_inotobp(
 	xfs_buf_t	**bpp,
 	int		*offset)
 {
-	int		di_ok;
 	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
 	int		error;
-	xfs_dinode_t	*dip;
 
-	/*
-	 * Call the space management code to find the location of the
-	 * inode on disk.
-	 */
 	imap.im_blkno = 0;
 	error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
-	if (error != 0) {
-		cmn_err(CE_WARN,
-	"xfs_inotobp: xfs_imap()  returned an "
-	"error %d on %s.  Returning error.", error, mp->m_fsname);
+	if (error)
 		return error;
-	}
-
-	/*
-	 * If the inode number maps to a block outside the bounds of the
-	 * file system then return NULL rather than calling read_buf
-	 * and panicing when we get an error from the driver.
-	 */
-	if ((imap.im_blkno + imap.im_len) >
-	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-		cmn_err(CE_WARN,
-	"xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds "
-	"of the file system %s.  Returning EINVAL.",
-			(unsigned long long)imap.im_blkno,
-			imap.im_len, mp->m_fsname);
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
-	 * default to just a read_buf() call.
-	 */
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
-				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
 
-	if (error) {
-		cmn_err(CE_WARN,
-	"xfs_inotobp: xfs_trans_read_buf()  returned an "
-	"error %d on %s.  Returning error.", error, mp->m_fsname);
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
+	if (error)
 		return error;
-	}
-	dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
-	di_ok =
-		be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
-		XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
-	if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
-			XFS_RANDOM_ITOBP_INOTOBP))) {
-		XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
-		xfs_trans_brelse(tp, bp);
-		cmn_err(CE_WARN,
-	"xfs_inotobp: XFS_TEST_ERROR()  returned an "
-	"error on %s.  Returning EFSCORRUPTED.",  mp->m_fsname);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
 
-	xfs_inobp_check(mp, bp);
-
-	/*
-	 * Set *dipp to point to the on-disk inode in the buffer.
-	 */
 	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 	*bpp = bp;
 	*offset = imap.im_boffset;
@@ -253,40 +279,14 @@ xfs_itobp(
 	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
 	int		error;
-	int		i;
-	int		ni;
 
 	if (ip->i_blkno == (xfs_daddr_t)0) {
-		/*
-		 * Call the space management code to find the location of the
-		 * inode on disk.
-		 */
 		imap.im_blkno = bno;
-		if ((error = xfs_imap(mp, tp, ip->i_ino, &imap,
-					XFS_IMAP_LOOKUP | imap_flags)))
+		error = xfs_imap(mp, tp, ip->i_ino, &imap,
+					XFS_IMAP_LOOKUP | imap_flags);
+		if (error)
 			return error;
 
-		/*
-		 * If the inode number maps to a block outside the bounds
-		 * of the file system then return NULL rather than calling
-		 * read_buf and panicing when we get an error from the
-		 * driver.
-		 */
-		if ((imap.im_blkno + imap.im_len) >
-		    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-#ifdef DEBUG
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
-					"(imap.im_blkno (0x%llx) "
-					"+ imap.im_len (0x%llx)) > "
-					" XFS_FSB_TO_BB(mp, "
-					"mp->m_sb.sb_dblocks) (0x%llx)",
-					(unsigned long long) imap.im_blkno,
-					(unsigned long long) imap.im_len,
-					XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-#endif /* DEBUG */
-			return XFS_ERROR(EINVAL);
-		}
-
 		/*
 		 * Fill in the fields in the inode that will be used to
 		 * map the inode to its buffer from now on.
@@ -305,76 +305,10 @@ xfs_itobp(
 	}
 	ASSERT(bno == 0 || bno == imap.im_blkno);
 
-	/*
-	 * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
-	 * default to just a read_buf() call.
-	 */
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
-				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
-	if (error) {
-#ifdef DEBUG
-		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
-				"xfs_trans_read_buf() returned error %d, "
-				"imap.im_blkno 0x%llx, imap.im_len 0x%llx",
-				error, (unsigned long long) imap.im_blkno,
-				(unsigned long long) imap.im_len);
-#endif /* DEBUG */
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
+	if (error)
 		return error;
-	}
-
-	/*
-	 * Validate the magic number and version of every inode in the buffer
-	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
-	 * No validation is done here in userspace (xfs_repair).
-	 */
-#if !defined(__KERNEL__)
-	ni = 0;
-#elif defined(DEBUG)
-	ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
-#else	/* usual case */
-	ni = 1;
-#endif
-
-	for (i = 0; i < ni; i++) {
-		int		di_ok;
-		xfs_dinode_t	*dip;
 
-		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-					(i << mp->m_sb.sb_inodelog));
-		di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
-			    XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
-		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-						XFS_ERRTAG_ITOBP_INOTOBP,
-						XFS_RANDOM_ITOBP_INOTOBP))) {
-			if (imap_flags & XFS_IMAP_BULKSTAT) {
-				xfs_trans_brelse(tp, bp);
-				return XFS_ERROR(EINVAL);
-			}
-#ifdef DEBUG
-			cmn_err(CE_ALERT,
-					"Device %s - bad inode magic/vsn "
-					"daddr %lld #%d (magic=%x)",
-				XFS_BUFTARG_NAME(mp->m_ddev_targp),
-				(unsigned long long)imap.im_blkno, i,
-				be16_to_cpu(dip->di_core.di_magic));
-#endif
-			XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
-					     mp, dip);
-			xfs_trans_brelse(tp, bp);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-	}
-
-	xfs_inobp_check(mp, bp);
-
-	/*
-	 * Mark the buffer as an inode buffer now that it looks good
-	 */
-	XFS_BUF_SET_VTYPE(bp, B_FS_INO);
-
-	/*
-	 * Set *dipp to point to the on-disk inode in the buffer.
-	 */
 	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 	*bpp = bp;
 	return 0;
@@ -2678,14 +2612,31 @@ xfs_imap(
 	fsbno = imap->im_blkno ?
 		XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
 	error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
-	if (error != 0) {
+	if (error)
 		return error;
-	}
+
 	imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
 	imap->im_len = XFS_FSB_TO_BB(mp, len);
 	imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
 	imap->im_ioffset = (ushort)off;
 	imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
+
+	/*
+	 * If the inode number maps to a block outside the bounds
+	 * of the file system then return NULL rather than calling
+	 * read_buf and panicing when we get an error from the
+	 * driver.
+	 */
+	if ((imap->im_blkno + imap->im_len) >
+	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+			"(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
+			" XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
+			(unsigned long long) imap->im_blkno,
+			(unsigned long long) imap->im_len,
+			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+		return EINVAL;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From a3f74ffb6d1448d9a8f482e593b80ec15f1695d4 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:43:42 +1100
Subject: [XFS] Don't block pdflush when writing back inodes

When pdflush is writing back inodes, it can get stuck on inode cluster
buffers that are currently under I/O. This occurs when we write data to
multiple inodes in the same inode cluster at the same time.

Effectively, delayed allocation marks the inode dirty during the data
writeback. Hence if the inode cluster was flushed during the writeback of
the first inode, the writeback of the second inode will block waiting for
the inode cluster write to complete before writing it again for the newly
dirtied inode.

Basically, we want to avoid this from happening so we don't block pdflush
and slow down all of writeback. Hence we introduce a non-blocking async
inode flush flag that pdflush uses. If this flag is set, we use
non-blocking operations (e.g. try locks) whereever we can to avoid
blocking or extra I/O being issued.

SGI-PV: 970925
SGI-Modid: xfs-linux-melb:xfs-kern:30501a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c |   3 +-
 fs/xfs/linux-2.6/xfs_vnode.h |   5 +-
 fs/xfs/xfs_inode.c           | 135 ++++++++++++++++++++++++++-----------------
 fs/xfs/xfs_inode.h           |   3 +-
 fs/xfs/xfs_itable.c          |   3 +-
 fs/xfs/xfs_log_recover.c     |   3 +-
 fs/xfs/xfs_trans_buf.c       |   3 +-
 fs/xfs/xfs_vnodeops.c        |  55 ++++--------------
 8 files changed, 105 insertions(+), 105 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 8831d9518790..cb9ce90d1deb 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -896,7 +896,8 @@ xfs_fs_write_inode(
 	struct inode		*inode,
 	int			sync)
 {
-	int			error = 0, flags = FLUSH_INODE;
+	int			error = 0;
+	int			flags = 0;
 
 	xfs_itrace_entry(XFS_I(inode));
 	if (sync) {
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index b5ea418693b1..f200e0244082 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -73,12 +73,9 @@ typedef enum bhv_vrwlock {
 #define IO_INVIS	0x00020		/* don't update inode timestamps */
 
 /*
- * Flags for vop_iflush call
+ * Flags for xfs_inode_flush
  */
 #define FLUSH_SYNC		1	/* wait for flush to complete	*/
-#define FLUSH_INODE		2	/* flush the inode itself	*/
-#define FLUSH_LOG		4	/* force the last log entry for
-					 * this inode out to disk	*/
 
 /*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6f156faf9d46..3c3e9e3c1da8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -145,11 +145,16 @@ xfs_imap_to_bp(
 	xfs_buf_t	*bp;
 
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-				   (int)imap->im_len, XFS_BUF_LOCK, &bp);
+				   (int)imap->im_len, buf_flags, &bp);
 	if (error) {
-		cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned "
+		if (error != EAGAIN) {
+			cmn_err(CE_WARN,
+				"xfs_imap_to_bp: xfs_trans_read_buf()returned "
 				"an error %d on %s.  Returning error.",
 				error, mp->m_fsname);
+		} else {
+			ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+		}
 		return error;
 	}
 
@@ -274,7 +279,8 @@ xfs_itobp(
 	xfs_dinode_t	**dipp,
 	xfs_buf_t	**bpp,
 	xfs_daddr_t	bno,
-	uint		imap_flags)
+	uint		imap_flags,
+	uint		buf_flags)
 {
 	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
@@ -305,10 +311,17 @@ xfs_itobp(
 	}
 	ASSERT(bno == 0 || bno == imap.im_blkno);
 
-	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
 	if (error)
 		return error;
 
+	if (!bp) {
+		ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+		ASSERT(tp == NULL);
+		*bpp = NULL;
+		return EAGAIN;
+	}
+
 	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 	*bpp = bp;
 	return 0;
@@ -812,7 +825,7 @@ xfs_iread(
 	 * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
 	 * know that this is a new incore inode.
 	 */
-	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags);
+	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
 	if (error) {
 		kmem_zone_free(xfs_inode_zone, ip);
 		return error;
@@ -1901,7 +1914,7 @@ xfs_iunlink(
 		 * Here we put the head pointer into our next pointer,
 		 * and then we fall through to point the head at us.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
 		if (error)
 			return error;
 
@@ -2009,7 +2022,7 @@ xfs_iunlink_remove(
 		 * of dealing with the buffer when there is no need to
 		 * change it.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
 		if (error) {
 			cmn_err(CE_WARN,
 				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2071,7 +2084,7 @@ xfs_iunlink_remove(
 		 * Now last_ibp points to the buffer previous to us on
 		 * the unlinked list.  Pull us from the list.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
 		if (error) {
 			cmn_err(CE_WARN,
 				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2334,7 +2347,7 @@ xfs_ifree(
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0);
+	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
 	if (error)
 		return error;
 
@@ -2777,38 +2790,41 @@ xfs_iunpin(
 }
 
 /*
- * This is called to wait for the given inode to be unpinned.
- * It will sleep until this happens.  The caller must have the
- * inode locked in at least shared mode so that the buffer cannot
- * be subsequently pinned once someone is waiting for it to be
- * unpinned.
+ * This is called to unpin an inode. It can be directed to wait or to return
+ * immediately without waiting for the inode to be unpinned.  The caller must
+ * have the inode locked in at least shared mode so that the buffer cannot be
+ * subsequently pinned once someone is waiting for it to be unpinned.
  */
 STATIC void
-xfs_iunpin_wait(
-	xfs_inode_t	*ip)
+__xfs_iunpin_wait(
+	xfs_inode_t	*ip,
+	int		wait)
 {
-	xfs_inode_log_item_t	*iip;
-	xfs_lsn_t	lsn;
+	xfs_inode_log_item_t	*iip = ip->i_itemp;
 
 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
-
-	if (atomic_read(&ip->i_pincount) == 0) {
+	if (atomic_read(&ip->i_pincount) == 0)
 		return;
-	}
 
-	iip = ip->i_itemp;
-	if (iip && iip->ili_last_lsn) {
-		lsn = iip->ili_last_lsn;
-	} else {
-		lsn = (xfs_lsn_t)0;
-	}
+	/* Give the log a push to start the unpinning I/O */
+	xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
+				iip->ili_last_lsn : 0, XFS_LOG_FORCE);
+	if (wait)
+		wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+}
 
-	/*
-	 * Give the log a push so we don't wait here too long.
-	 */
-	xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
+static inline void
+xfs_iunpin_wait(
+	xfs_inode_t	*ip)
+{
+	__xfs_iunpin_wait(ip, 1);
+}
 
-	wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+static inline void
+xfs_iunpin_nowait(
+	xfs_inode_t	*ip)
+{
+	__xfs_iunpin_wait(ip, 0);
 }
 
 
@@ -3003,6 +3019,7 @@ xfs_iflush(
 	int			bufwasdelwri;
 	struct hlist_node	*entry;
 	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
+	int			noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
 
 	XFS_STATS_INC(xs_iflush_count);
 
@@ -3027,11 +3044,21 @@ xfs_iflush(
 	}
 
 	/*
-	 * We can't flush the inode until it is unpinned, so
-	 * wait for it.  We know noone new can pin it, because
-	 * we are holding the inode lock shared and you need
-	 * to hold it exclusively to pin the inode.
+	 * We can't flush the inode until it is unpinned, so wait for it if we
+	 * are allowed to block.  We know noone new can pin it, because we are
+	 * holding the inode lock shared and you need to hold it exclusively to
+	 * pin the inode.
+	 *
+	 * If we are not allowed to block, force the log out asynchronously so
+	 * that when we come back the inode will be unpinned. If other inodes
+	 * in the same cluster are dirty, they will probably write the inode
+	 * out for us if they occur after the log force completes.
 	 */
+	if (noblock && xfs_ipincount(ip)) {
+		xfs_iunpin_nowait(ip);
+		xfs_ifunlock(ip);
+		return EAGAIN;
+	}
 	xfs_iunpin_wait(ip);
 
 	/*
@@ -3047,15 +3074,6 @@ xfs_iflush(
 		return XFS_ERROR(EIO);
 	}
 
-	/*
-	 * Get the buffer containing the on-disk inode.
-	 */
-	error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
-	if (error) {
-		xfs_ifunlock(ip);
-		return error;
-	}
-
 	/*
 	 * Decide how buffer will be flushed out.  This is done before
 	 * the call to xfs_iflush_int because this field is zeroed by it.
@@ -3072,6 +3090,7 @@ xfs_iflush(
 		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
 			flags = 0;
 			break;
+		case XFS_IFLUSH_ASYNC_NOBLOCK:
 		case XFS_IFLUSH_ASYNC:
 		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
 			flags = INT_ASYNC;
@@ -3091,6 +3110,7 @@ xfs_iflush(
 		case XFS_IFLUSH_DELWRI:
 			flags = INT_DELWRI;
 			break;
+		case XFS_IFLUSH_ASYNC_NOBLOCK:
 		case XFS_IFLUSH_ASYNC:
 			flags = INT_ASYNC;
 			break;
@@ -3104,6 +3124,16 @@ xfs_iflush(
 		}
 	}
 
+	/*
+	 * Get the buffer containing the on-disk inode.
+	 */
+	error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
+				noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
+	if (error || !bp) {
+		xfs_ifunlock(ip);
+		return error;
+	}
+
 	/*
 	 * First flush out the inode that xfs_iflush was called with.
 	 */
@@ -3112,6 +3142,13 @@ xfs_iflush(
 		goto corrupt_out;
 	}
 
+	/*
+	 * If the buffer is pinned then push on the log now so we won't
+	 * get stuck waiting in the write for too long.
+	 */
+	if (XFS_BUF_ISPINNED(bp))
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+
 	/*
 	 * inode clustering:
 	 * see if other inodes can be gathered into this write
@@ -3181,14 +3218,6 @@ xfs_iflush(
 		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
 	}
 
-	/*
-	 * If the buffer is pinned then push on the log so we won't
-	 * get stuck waiting in the write for too long.
-	 */
-	if (XFS_BUF_ISPINNED(bp)){
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-	}
-
 	if (flags & INT_DELWRI) {
 		xfs_bdwrite(mp, bp);
 	} else if (flags & INT_ASYNC) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index eaa01895ff93..c3bfffca9214 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -457,6 +457,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 #define	XFS_IFLUSH_SYNC			3
 #define	XFS_IFLUSH_ASYNC		4
 #define	XFS_IFLUSH_DELWRI		5
+#define	XFS_IFLUSH_ASYNC_NOBLOCK	6
 
 /*
  * Flags for xfs_itruncate_start().
@@ -511,7 +512,7 @@ int		xfs_finish_reclaim_all(struct xfs_mount *, int);
  */
 int		xfs_itobp(struct xfs_mount *, struct xfs_trans *,
 			  xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
-			  xfs_daddr_t, uint);
+			  xfs_daddr_t, uint, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 			  xfs_inode_t **, xfs_daddr_t, uint);
 int		xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f615e04364f4..45d8776408ef 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -614,7 +614,8 @@ xfs_bulkstat(
 							xfs_buf_relse(bp);
 						error = xfs_itobp(mp, NULL, ip,
 								&dip, &bp, bno,
-								XFS_IMAP_BULKSTAT);
+								XFS_IMAP_BULKSTAT,
+								XFS_BUF_LOCK);
 						if (!error)
 							clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
 						kmem_zone_free(xfs_inode_zone, ip);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b2b70eba282c..cd24711ae276 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3214,7 +3214,8 @@ xlog_recover_process_iunlinks(
 					 * next inode in the bucket.
 					 */
 					error = xfs_itobp(mp, NULL, ip, &dip,
-							&ibp, 0, 0);
+							&ibp, 0, 0,
+							XFS_BUF_LOCK);
 					ASSERT(error || (dip != NULL));
 				}
 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 60b6b898022b..4e5c010f5040 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -304,7 +304,8 @@ xfs_trans_read_buf(
 	if (tp == NULL) {
 		bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
 		if (!bp)
-			return XFS_ERROR(ENOMEM);
+			return (flags & XFS_BUF_TRYLOCK) ?
+					EAGAIN : XFS_ERROR(ENOMEM);
 
 		if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
 			xfs_ioerror_alert("xfs_trans_read_buf", mp,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 40b95e3a88ba..14140f6225ba 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3468,29 +3468,6 @@ xfs_inode_flush(
 	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
 		return 0;
 
-	if (flags & FLUSH_LOG) {
-		if (iip && iip->ili_last_lsn) {
-			xlog_t		*log = mp->m_log;
-			xfs_lsn_t	sync_lsn;
-			int		log_flags = XFS_LOG_FORCE;
-
-			spin_lock(&log->l_grant_lock);
-			sync_lsn = log->l_last_sync_lsn;
-			spin_unlock(&log->l_grant_lock);
-
-			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
-				if (flags & FLUSH_SYNC)
-					log_flags |= XFS_LOG_SYNC;
-				error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
-				if (error)
-					return error;
-			}
-
-			if (ip->i_update_core == 0)
-				return 0;
-		}
-	}
-
 	/*
 	 * We make this non-blocking if the inode is contended,
 	 * return EAGAIN to indicate to the caller that they
@@ -3498,30 +3475,22 @@ xfs_inode_flush(
 	 * blocking on inodes inside another operation right
 	 * now, they get caught later by xfs_sync.
 	 */
-	if (flags & FLUSH_INODE) {
-		int	flush_flags;
-
-		if (flags & FLUSH_SYNC) {
-			xfs_ilock(ip, XFS_ILOCK_SHARED);
-			xfs_iflock(ip);
-		} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-			if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-				xfs_iunlock(ip, XFS_ILOCK_SHARED);
-				return EAGAIN;
-			}
-		} else {
+	if (flags & FLUSH_SYNC) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		xfs_iflock(ip);
+	} else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 			return EAGAIN;
 		}
-
-		if (flags & FLUSH_SYNC)
-			flush_flags = XFS_IFLUSH_SYNC;
-		else
-			flush_flags = XFS_IFLUSH_ASYNC;
-
-		error = xfs_iflush(ip, flush_flags);
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	} else {
+		return EAGAIN;
 	}
 
+	error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
+						    : XFS_IFLUSH_ASYNC_NOBLOCK);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
 	return error;
 }
 
-- 
cgit v1.2.3


From bad5584332e888ac40ca13584e8c114149ddb01e Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:43:49 +1100
Subject: [XFS] Remove the xfs_icluster structure

Remove the xfs_icluster structure and replace with a radix tree lookup.

We don't need to keep a list of inodes in each cluster around anymore as
we can look them up quickly when we need to. The only time we need to do
this now is during inode writeback.

Factor the inode cluster writeback code out of xfs_iflush and convert it
to use radix_tree_gang_lookup() instead of walking a list of inodes built
when we first read in the inodes.

This remove 3 pointers from each xfs_inode structure and the xfs_icluster
structure per inode cluster. Hence we reduce the cache footprint of the
xfs_inodes by between 5-10% depending on cluster sparseness.

To be truly efficient we need a radix_tree_gang_lookup_range() call to
stop searching once we are past the end of the cluster instead of trying
to find a full cluster's worth of inodes.

Before (ia64):

$ cat /sys/slab/xfs_inode/object_size 536

After:

$ cat /sys/slab/xfs_inode/object_size 512

SGI-PV: 977460
SGI-Modid: xfs-linux-melb:xfs-kern:30502a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_iget.c   |  49 +---------
 fs/xfs/xfs_inode.c  | 268 ++++++++++++++++++++++++++++++----------------------
 fs/xfs/xfs_inode.h  |  16 ----
 fs/xfs/xfs_vfsops.c |   5 -
 4 files changed, 156 insertions(+), 182 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 8e09b71f4104..e657c5128460 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -78,7 +78,6 @@ xfs_iget_core(
 	xfs_inode_t	*ip;
 	xfs_inode_t	*iq;
 	int		error;
-	xfs_icluster_t	*icl, *new_icl = NULL;
 	unsigned long	first_index, mask;
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
@@ -229,11 +228,9 @@ finish_inode:
 	}
 
 	/*
-	 * This is a bit messy - we preallocate everything we _might_
-	 * need before we pick up the ici lock. That way we don't have to
-	 * juggle locks and go all the way back to the start.
+	 * Preload the radix tree so we can insert safely under the
+	 * write spinlock.
 	 */
-	new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP);
 	if (radix_tree_preload(GFP_KERNEL)) {
 		xfs_idestroy(ip);
 		delay(1);
@@ -242,17 +239,6 @@ finish_inode:
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = agino & mask;
 	write_lock(&pag->pag_ici_lock);
-
-	/*
-	 * Find the cluster if it exists
-	 */
-	icl = NULL;
-	if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
-							first_index, 1)) {
-		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index)
-			icl = iq->i_cluster;
-	}
-
 	/*
 	 * insert the new inode
 	 */
@@ -267,30 +253,13 @@ finish_inode:
 	}
 
 	/*
-	 * These values _must_ be set before releasing ihlock!
+	 * These values _must_ be set before releasing the radix tree lock!
 	 */
 	ip->i_udquot = ip->i_gdquot = NULL;
 	xfs_iflags_set(ip, XFS_INEW);
 
-	ASSERT(ip->i_cluster == NULL);
-
-	if (!icl) {
-		spin_lock_init(&new_icl->icl_lock);
-		INIT_HLIST_HEAD(&new_icl->icl_inodes);
-		icl = new_icl;
-		new_icl = NULL;
-	} else {
-		ASSERT(!hlist_empty(&icl->icl_inodes));
-	}
-	spin_lock(&icl->icl_lock);
-	hlist_add_head(&ip->i_cnode, &icl->icl_inodes);
-	ip->i_cluster = icl;
-	spin_unlock(&icl->icl_lock);
-
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
-	if (new_icl)
-		kmem_zone_free(xfs_icluster_zone, new_icl);
 
 	/*
 	 * Link ip to its mount and thread it on the mount's inode list.
@@ -528,18 +497,6 @@ xfs_iextract(
 	write_unlock(&pag->pag_ici_lock);
 	xfs_put_perag(mp, pag);
 
-	/*
-	 * Remove from cluster list
-	 */
-	mp = ip->i_mount;
-	spin_lock(&ip->i_cluster->icl_lock);
-	hlist_del(&ip->i_cnode);
-	spin_unlock(&ip->i_cluster->icl_lock);
-
-	/* was last inode in cluster? */
-	if (hlist_empty(&ip->i_cluster->icl_inodes))
-		kmem_zone_free(xfs_icluster_zone, ip->i_cluster);
-
 	/*
 	 * Remove from mount's inode list.
 	 */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3c3e9e3c1da8..040c0e41729b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -55,7 +55,6 @@
 
 kmem_zone_t *xfs_ifork_zone;
 kmem_zone_t *xfs_inode_zone;
-kmem_zone_t *xfs_icluster_zone;
 
 /*
  * Used in xfs_itruncate().  This is the maximum number of extents
@@ -2994,6 +2993,153 @@ xfs_iflush_fork(
 	return 0;
 }
 
+STATIC int
+xfs_iflush_cluster(
+	xfs_inode_t	*ip,
+	xfs_buf_t	*bp)
+{
+	xfs_mount_t		*mp = ip->i_mount;
+	xfs_perag_t		*pag = xfs_get_perag(mp, ip->i_ino);
+	unsigned long		first_index, mask;
+	int			ilist_size;
+	xfs_inode_t		**ilist;
+	xfs_inode_t		*iq;
+	xfs_inode_log_item_t	*iip;
+	int			nr_found;
+	int			clcount = 0;
+	int			bufwasdelwri;
+	int			i;
+
+	ASSERT(pag->pagi_inodeok);
+	ASSERT(pag->pag_ici_init);
+
+	ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
+	ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
+	if (!ilist)
+		return 0;
+
+	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
+	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
+	read_lock(&pag->pag_ici_lock);
+	/* really need a gang lookup range call here */
+	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+					first_index,
+					XFS_INODE_CLUSTER_SIZE(mp));
+	if (nr_found == 0)
+		goto out_free;
+
+	for (i = 0; i < nr_found; i++) {
+		iq = ilist[i];
+		if (iq == ip)
+			continue;
+		/* if the inode lies outside this cluster, we're done. */
+		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
+			break;
+		/*
+		 * Do an un-protected check to see if the inode is dirty and
+		 * is a candidate for flushing.  These checks will be repeated
+		 * later after the appropriate locks are acquired.
+		 */
+		iip = iq->i_itemp;
+		if ((iq->i_update_core == 0) &&
+		    ((iip == NULL) ||
+		     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+		      xfs_ipincount(iq) == 0) {
+			continue;
+		}
+
+		/*
+		 * Try to get locks.  If any are unavailable or it is pinned,
+		 * then this inode cannot be flushed and is skipped.
+		 */
+
+		if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+			continue;
+		if (!xfs_iflock_nowait(iq)) {
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+			continue;
+		}
+		if (xfs_ipincount(iq)) {
+			xfs_ifunlock(iq);
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+			continue;
+		}
+
+		/*
+		 * arriving here means that this inode can be flushed.  First
+		 * re-check that it's dirty before flushing.
+		 */
+		iip = iq->i_itemp;
+		if ((iq->i_update_core != 0) || ((iip != NULL) &&
+		     (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+			int error;
+			error = xfs_iflush_int(iq, bp);
+			if (error) {
+				xfs_iunlock(iq, XFS_ILOCK_SHARED);
+				goto cluster_corrupt_out;
+			}
+			clcount++;
+		} else {
+			xfs_ifunlock(iq);
+		}
+		xfs_iunlock(iq, XFS_ILOCK_SHARED);
+	}
+
+	if (clcount) {
+		XFS_STATS_INC(xs_icluster_flushcnt);
+		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+	}
+
+out_free:
+	read_unlock(&pag->pag_ici_lock);
+	kmem_free(ilist, ilist_size);
+	return 0;
+
+
+cluster_corrupt_out:
+	/*
+	 * Corruption detected in the clustering loop.  Invalidate the
+	 * inode buffer and shut down the filesystem.
+	 */
+	read_unlock(&pag->pag_ici_lock);
+	/*
+	 * Clean up the buffer.  If it was B_DELWRI, just release it --
+	 * brelse can handle it with no problems.  If not, shut down the
+	 * filesystem before releasing the buffer.
+	 */
+	bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
+	if (bufwasdelwri)
+		xfs_buf_relse(bp);
+
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+
+	if (!bufwasdelwri) {
+		/*
+		 * Just like incore_relse: if we have b_iodone functions,
+		 * mark the buffer as an error and call them.  Otherwise
+		 * mark it as stale and brelse.
+		 */
+		if (XFS_BUF_IODONE_FUNC(bp)) {
+			XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+			XFS_BUF_UNDONE(bp);
+			XFS_BUF_STALE(bp);
+			XFS_BUF_SHUT(bp);
+			XFS_BUF_ERROR(bp,EIO);
+			xfs_biodone(bp);
+		} else {
+			XFS_BUF_STALE(bp);
+			xfs_buf_relse(bp);
+		}
+	}
+
+	/*
+	 * Unlocks the flush lock
+	 */
+	xfs_iflush_abort(iq);
+	kmem_free(ilist, ilist_size);
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
 /*
  * xfs_iflush() will write a modified inode's changes out to the
  * inode's on disk home.  The caller must have the inode lock held
@@ -3013,13 +3159,8 @@ xfs_iflush(
 	xfs_dinode_t		*dip;
 	xfs_mount_t		*mp;
 	int			error;
-	/* REFERENCED */
-	xfs_inode_t		*iq;
-	int			clcount;	/* count of inodes clustered */
-	int			bufwasdelwri;
-	struct hlist_node	*entry;
-	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
 	int			noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
+	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
 
 	XFS_STATS_INC(xs_iflush_count);
 
@@ -3138,9 +3279,8 @@ xfs_iflush(
 	 * First flush out the inode that xfs_iflush was called with.
 	 */
 	error = xfs_iflush_int(ip, bp);
-	if (error) {
+	if (error)
 		goto corrupt_out;
-	}
 
 	/*
 	 * If the buffer is pinned then push on the log now so we won't
@@ -3153,70 +3293,9 @@ xfs_iflush(
 	 * inode clustering:
 	 * see if other inodes can be gathered into this write
 	 */
-	spin_lock(&ip->i_cluster->icl_lock);
-	ip->i_cluster->icl_buf = bp;
-
-	clcount = 0;
-	hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
-		if (iq == ip)
-			continue;
-
-		/*
-		 * Do an un-protected check to see if the inode is dirty and
-		 * is a candidate for flushing.  These checks will be repeated
-		 * later after the appropriate locks are acquired.
-		 */
-		iip = iq->i_itemp;
-		if ((iq->i_update_core == 0) &&
-		    ((iip == NULL) ||
-		     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
-		      xfs_ipincount(iq) == 0) {
-			continue;
-		}
-
-		/*
-		 * Try to get locks.  If any are unavailable,
-		 * then this inode cannot be flushed and is skipped.
-		 */
-
-		/* get inode locks (just i_lock) */
-		if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
-			/* get inode flush lock */
-			if (xfs_iflock_nowait(iq)) {
-				/* check if pinned */
-				if (xfs_ipincount(iq) == 0) {
-					/* arriving here means that
-					 * this inode can be flushed.
-					 * first re-check that it's
-					 * dirty
-					 */
-					iip = iq->i_itemp;
-					if ((iq->i_update_core != 0)||
-					    ((iip != NULL) &&
-					     (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
-						clcount++;
-						error = xfs_iflush_int(iq, bp);
-						if (error) {
-							xfs_iunlock(iq,
-								    XFS_ILOCK_SHARED);
-							goto cluster_corrupt_out;
-						}
-					} else {
-						xfs_ifunlock(iq);
-					}
-				} else {
-					xfs_ifunlock(iq);
-				}
-			}
-			xfs_iunlock(iq, XFS_ILOCK_SHARED);
-		}
-	}
-	spin_unlock(&ip->i_cluster->icl_lock);
-
-	if (clcount) {
-		XFS_STATS_INC(xs_icluster_flushcnt);
-		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
-	}
+	error = xfs_iflush_cluster(ip, bp);
+	if (error)
+		goto cluster_corrupt_out;
 
 	if (flags & INT_DELWRI) {
 		xfs_bdwrite(mp, bp);
@@ -3230,52 +3309,11 @@ xfs_iflush(
 corrupt_out:
 	xfs_buf_relse(bp);
 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-	xfs_iflush_abort(ip);
-	/*
-	 * Unlocks the flush lock
-	 */
-	return XFS_ERROR(EFSCORRUPTED);
-
 cluster_corrupt_out:
-	/* Corruption detected in the clustering loop.  Invalidate the
-	 * inode buffer and shut down the filesystem.
-	 */
-	spin_unlock(&ip->i_cluster->icl_lock);
-
-	/*
-	 * Clean up the buffer.  If it was B_DELWRI, just release it --
-	 * brelse can handle it with no problems.  If not, shut down the
-	 * filesystem before releasing the buffer.
-	 */
-	if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
-		xfs_buf_relse(bp);
-	}
-
-	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-
-	if(!bufwasdelwri)  {
-		/*
-		 * Just like incore_relse: if we have b_iodone functions,
-		 * mark the buffer as an error and call them.  Otherwise
-		 * mark it as stale and brelse.
-		 */
-		if (XFS_BUF_IODONE_FUNC(bp)) {
-			XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-			XFS_BUF_UNDONE(bp);
-			XFS_BUF_STALE(bp);
-			XFS_BUF_SHUT(bp);
-			XFS_BUF_ERROR(bp,EIO);
-			xfs_biodone(bp);
-		} else {
-			XFS_BUF_STALE(bp);
-			xfs_buf_relse(bp);
-		}
-	}
-
-	xfs_iflush_abort(iq);
 	/*
 	 * Unlocks the flush lock
 	 */
+	xfs_iflush_abort(ip);
 	return XFS_ERROR(EFSCORRUPTED);
 }
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c3bfffca9214..93c37697a72c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -132,19 +132,6 @@ typedef struct dm_attrs_s {
 	__uint16_t	da_pad;		/* DMIG extra padding */
 } dm_attrs_t;
 
-/*
- * This is the xfs inode cluster structure.  This structure is used by
- * xfs_iflush to find inodes that share a cluster and can be flushed to disk at
- * the same time.
- */
-typedef struct xfs_icluster {
-	struct hlist_head	icl_inodes;	/* list of inodes on cluster */
-	xfs_daddr_t		icl_blkno;	/* starting block number of
-						 * the cluster */
-	struct xfs_buf		*icl_buf;	/* the inode buffer */
-	spinlock_t		icl_lock;	/* inode list lock */
-} xfs_icluster_t;
-
 /*
  * This is the xfs in-core inode structure.
  * Most of the on-disk inode is embedded in the i_d field.
@@ -248,8 +235,6 @@ typedef struct xfs_inode {
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
 
 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
-	xfs_icluster_t		*i_cluster;	/* cluster list header */
-	struct hlist_node	i_cnode;	/* cluster link node */
 
 	xfs_fsize_t		i_size;		/* in-memory size */
 	xfs_fsize_t		i_new_size;	/* size when write completes */
@@ -594,7 +579,6 @@ void		xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
 #define	xfs_inobp_check(mp, bp)
 #endif /* DEBUG */
 
-extern struct kmem_zone	*xfs_icluster_zone;
 extern struct kmem_zone	*xfs_ifork_zone;
 extern struct kmem_zone	*xfs_inode_zone;
 extern struct kmem_zone	*xfs_ili_zone;
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 79bdfb3d6081..3ec27bf8531c 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -112,9 +112,6 @@ xfs_init(void)
 	xfs_ili_zone =
 		kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
 					KM_ZONE_SPREAD, NULL);
-	xfs_icluster_zone =
-		kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster",
-					KM_ZONE_SPREAD, NULL);
 
 	/*
 	 * Allocate global trace buffers.
@@ -152,7 +149,6 @@ xfs_cleanup(void)
 	extern kmem_zone_t	*xfs_inode_zone;
 	extern kmem_zone_t	*xfs_efd_zone;
 	extern kmem_zone_t	*xfs_efi_zone;
-	extern kmem_zone_t	*xfs_icluster_zone;
 
 	xfs_cleanup_procfs();
 	xfs_sysctl_unregister();
@@ -187,7 +183,6 @@ xfs_cleanup(void)
 	kmem_zone_destroy(xfs_efi_zone);
 	kmem_zone_destroy(xfs_ifork_zone);
 	kmem_zone_destroy(xfs_ili_zone);
-	kmem_zone_destroy(xfs_icluster_zone);
 }
 
 /*
-- 
cgit v1.2.3


From 3354040897f828644be6ca5783588e9f64a53b8e Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:43:59 +1100
Subject: [XFS] Use xfs_inode_clean() in more places

Remove open coded checks for the whether the inode is clean and replace
them with an inlined function.

SGI-PV: 977461
SGI-Modid: xfs-linux-melb:xfs-kern:30503a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_inode.c      | 27 +++++----------------------
 fs/xfs/xfs_inode_item.h |  8 ++++++++
 fs/xfs/xfs_vnodeops.c   |  4 +---
 3 files changed, 14 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 040c0e41729b..d7514f8317df 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2118,13 +2118,6 @@ xfs_iunlink_remove(
 	return 0;
 }
 
-STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip)
-{
-	return (((ip->i_itemp == NULL) ||
-		!(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
-		(ip->i_update_core == 0));
-}
-
 STATIC void
 xfs_ifree_cluster(
 	xfs_inode_t	*free_ip,
@@ -3004,7 +2997,6 @@ xfs_iflush_cluster(
 	int			ilist_size;
 	xfs_inode_t		**ilist;
 	xfs_inode_t		*iq;
-	xfs_inode_log_item_t	*iip;
 	int			nr_found;
 	int			clcount = 0;
 	int			bufwasdelwri;
@@ -3040,13 +3032,8 @@ xfs_iflush_cluster(
 		 * is a candidate for flushing.  These checks will be repeated
 		 * later after the appropriate locks are acquired.
 		 */
-		iip = iq->i_itemp;
-		if ((iq->i_update_core == 0) &&
-		    ((iip == NULL) ||
-		     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
-		      xfs_ipincount(iq) == 0) {
+		if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
 			continue;
-		}
 
 		/*
 		 * Try to get locks.  If any are unavailable or it is pinned,
@@ -3069,10 +3056,8 @@ xfs_iflush_cluster(
 		 * arriving here means that this inode can be flushed.  First
 		 * re-check that it's dirty before flushing.
 		 */
-		iip = iq->i_itemp;
-		if ((iq->i_update_core != 0) || ((iip != NULL) &&
-		     (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
-			int error;
+		if (!xfs_inode_clean(iq)) {
+			int	error;
 			error = xfs_iflush_int(iq, bp);
 			if (error) {
 				xfs_iunlock(iq, XFS_ILOCK_SHARED);
@@ -3176,8 +3161,7 @@ xfs_iflush(
 	 * If the inode isn't dirty, then just release the inode
 	 * flush lock and do nothing.
 	 */
-	if ((ip->i_update_core == 0) &&
-	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+	if (xfs_inode_clean(ip)) {
 		ASSERT((iip != NULL) ?
 			 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
 		xfs_ifunlock(ip);
@@ -3343,8 +3327,7 @@ xfs_iflush_int(
 	 * If the inode isn't dirty, then just release the inode
 	 * flush lock and do nothing.
 	 */
-	if ((ip->i_update_core == 0) &&
-	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+	if (xfs_inode_clean(ip)) {
 		xfs_ifunlock(ip);
 		return 0;
 	}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index bfe92ea17952..40513077ab36 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -168,6 +168,14 @@ static inline int xfs_ilog_fext(int w)
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
 }
 
+static inline int xfs_inode_clean(xfs_inode_t *ip)
+{
+	return (!ip->i_itemp ||
+		!(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+	       !ip->i_update_core;
+}
+
+
 #ifdef __KERNEL__
 
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 14140f6225ba..5390d124ad35 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3454,7 +3454,6 @@ xfs_inode_flush(
 	int		flags)
 {
 	xfs_mount_t	*mp = ip->i_mount;
-	xfs_inode_log_item_t *iip = ip->i_itemp;
 	int		error = 0;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
@@ -3464,8 +3463,7 @@ xfs_inode_flush(
 	 * Bypass inodes which have already been cleaned by
 	 * the inode flush clustering code inside xfs_iflush
 	 */
-	if ((ip->i_update_core == 0) &&
-	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
+	if (xfs_inode_clean(ip))
 		return 0;
 
 	/*
-- 
cgit v1.2.3


From b589334c7a1fff85d2f009d5db4c34fad48925e9 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:44:06 +1100
Subject: [XFS] Prevent AIL lock contention during transaction completion

When hundreds of processors attempt to commit transactions at the same
time, they can contend on the AIL lock when updating the tail LSN held in
the in-core log structure.

At the moment, the tail LSN is only needed when actually writing out an
iclog, so it really does not need to be updated on every single
transaction completion - only those that result in switching iclogs and
flushing them to disk.

The result is that we reduce the number of times we need to grab the AIL
lock and the log grant lock by up to two orders of magnitude on large
processor count machines. The problem has previously been hidden by AIL
lock contention walking the AIL list which was recently solved and
uncovered this issue.

SGI-PV: 975671
SGI-Modid: xfs-linux-melb:xfs-kern:30504a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 31f2b04f2c97..2e35077ff6b2 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2813,15 +2813,13 @@ xlog_state_put_ticket(xlog_t	    *log,
  *
  */
 STATIC int
-xlog_state_release_iclog(xlog_t		*log,
-			 xlog_in_core_t	*iclog)
+xlog_state_release_iclog(
+	xlog_t		*log,
+	xlog_in_core_t	*iclog)
 {
 	int		sync = 0;	/* do we sync? */
 
-	xlog_assign_tail_lsn(log->l_mp);
-
 	spin_lock(&log->l_icloglock);
-
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		spin_unlock(&log->l_icloglock);
 		return XFS_ERROR(EIO);
@@ -2833,13 +2831,14 @@ xlog_state_release_iclog(xlog_t		*log,
 
 	if (--iclog->ic_refcnt == 0 &&
 	    iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+		/* update tail before writing to iclog */
+		xlog_assign_tail_lsn(log->l_mp);
 		sync++;
 		iclog->ic_state = XLOG_STATE_SYNCING;
 		iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
 		xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
 		/* cycle incremented when incrementing curr_block */
 	}
-
 	spin_unlock(&log->l_icloglock);
 
 	/*
@@ -2849,11 +2848,9 @@ xlog_state_release_iclog(xlog_t		*log,
 	 * this iclog has consistent data, so we ignore IOERROR
 	 * flags after this point.
 	 */
-	if (sync) {
+	if (sync)
 		return xlog_sync(log, iclog);
-	}
 	return 0;
-
 }	/* xlog_state_release_iclog */
 
 
-- 
cgit v1.2.3


From 155cc6b784a959ed456fe46dca522e1d28b3b718 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:44:14 +1100
Subject: [XFS] Use atomics for iclog reference counting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that we update the log tail LSN less frequently on transaction
completion, we pass the contention straight to the global log state lock
(l_iclog_lock) during transaction completion.

We currently have to take this lock to decrement the iclog reference
count. there is a reference count on each iclog, so we need to take �he
global lock for all refcount changes.

When large numbers of processes are all doing small trnasctions, the iclog
reference counts will be quite high, and the state change that absolutely
requires the l_iclog_lock is the except rather than the norm.

Change the reference counting on the iclogs to use atomic_inc/dec so that
we can use atomic_dec_and_lock during transaction completion and avoid the
need for grabbing the l_iclog_lock for every reference count decrement
except the one that matters - the last.

SGI-PV: 975671
SGI-Modid: xfs-linux-melb:xfs-kern:30505a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c      | 36 ++++++++++++++++++++----------------
 fs/xfs/xfs_log_priv.h |  2 +-
 2 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2e35077ff6b2..1fa980933895 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -675,7 +675,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 
 		spin_lock(&log->l_icloglock);
 		iclog = log->l_iclog;
-		iclog->ic_refcnt++;
+		atomic_inc(&iclog->ic_refcnt);
 		spin_unlock(&log->l_icloglock);
 		xlog_state_want_sync(log, iclog);
 		(void) xlog_state_release_iclog(log, iclog);
@@ -713,7 +713,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		 */
 		spin_lock(&log->l_icloglock);
 		iclog = log->l_iclog;
-		iclog->ic_refcnt++;
+		atomic_inc(&iclog->ic_refcnt);
 		spin_unlock(&log->l_icloglock);
 
 		xlog_state_want_sync(log, iclog);
@@ -1405,7 +1405,7 @@ xlog_sync(xlog_t		*log,
 	int		v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
 
 	XFS_STATS_INC(xs_log_writes);
-	ASSERT(iclog->ic_refcnt == 0);
+	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
 
 	/* Add for LR header */
 	count_init = log->l_iclog_hsize + iclog->ic_offset;
@@ -2309,7 +2309,7 @@ xlog_state_done_syncing(
 
 	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
 	       iclog->ic_state == XLOG_STATE_IOERROR);
-	ASSERT(iclog->ic_refcnt == 0);
+	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
 	ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
 
 
@@ -2391,7 +2391,7 @@ restart:
 	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
 	head = &iclog->ic_header;
 
-	iclog->ic_refcnt++;			/* prevents sync */
+	atomic_inc(&iclog->ic_refcnt);	/* prevents sync */
 	log_offset = iclog->ic_offset;
 
 	/* On the 1st write to an iclog, figure out lsn.  This works
@@ -2423,12 +2423,12 @@ restart:
 		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
 
 		/* If I'm the only one writing to this iclog, sync it to disk */
-		if (iclog->ic_refcnt == 1) {
+		if (atomic_read(&iclog->ic_refcnt) == 1) {
 			spin_unlock(&log->l_icloglock);
 			if ((error = xlog_state_release_iclog(log, iclog)))
 				return error;
 		} else {
-			iclog->ic_refcnt--;
+			atomic_dec(&iclog->ic_refcnt);
 			spin_unlock(&log->l_icloglock);
 		}
 		goto restart;
@@ -2819,18 +2819,21 @@ xlog_state_release_iclog(
 {
 	int		sync = 0;	/* do we sync? */
 
-	spin_lock(&log->l_icloglock);
+	if (iclog->ic_state & XLOG_STATE_IOERROR)
+		return XFS_ERROR(EIO);
+
+	ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
+	if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
+		return 0;
+
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		spin_unlock(&log->l_icloglock);
 		return XFS_ERROR(EIO);
 	}
-
-	ASSERT(iclog->ic_refcnt > 0);
 	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
 	       iclog->ic_state == XLOG_STATE_WANT_SYNC);
 
-	if (--iclog->ic_refcnt == 0 &&
-	    iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+	if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
 		/* update tail before writing to iclog */
 		xlog_assign_tail_lsn(log->l_mp);
 		sync++;
@@ -2950,7 +2953,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
 		 * previous iclog and go to sleep.
 		 */
 		if (iclog->ic_state == XLOG_STATE_DIRTY ||
-		    (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
+		    (atomic_read(&iclog->ic_refcnt) == 0
+		     && iclog->ic_offset == 0)) {
 			iclog = iclog->ic_prev;
 			if (iclog->ic_state == XLOG_STATE_ACTIVE ||
 			    iclog->ic_state == XLOG_STATE_DIRTY)
@@ -2958,14 +2962,14 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
 			else
 				goto maybe_sleep;
 		} else {
-			if (iclog->ic_refcnt == 0) {
+			if (atomic_read(&iclog->ic_refcnt) == 0) {
 				/* We are the only one with access to this
 				 * iclog.  Flush it out now.  There should
 				 * be a roundoff of zero to show that someone
 				 * has already taken care of the roundoff from
 				 * the previous sync.
 				 */
-				iclog->ic_refcnt++;
+				atomic_inc(&iclog->ic_refcnt);
 				lsn = be64_to_cpu(iclog->ic_header.h_lsn);
 				xlog_state_switch_iclogs(log, iclog, 0);
 				spin_unlock(&log->l_icloglock);
@@ -3097,7 +3101,7 @@ try_again:
 			already_slept = 1;
 			goto try_again;
 		} else {
-			iclog->ic_refcnt++;
+			atomic_inc(&iclog->ic_refcnt);
 			xlog_state_switch_iclogs(log, iclog, 0);
 			spin_unlock(&log->l_icloglock);
 			if (xlog_state_release_iclog(log, iclog))
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c6244cc733c0..01c63db25a1d 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -339,7 +339,7 @@ typedef struct xlog_iclog_fields {
 #endif
 	int			ic_size;
 	int			ic_offset;
-	int			ic_refcnt;
+	atomic_t		ic_refcnt;
 	int			ic_bwritecnt;
 	ushort_t		ic_state;
 	char			*ic_datap;	/* pointer to iclog data */
-- 
cgit v1.2.3


From db0bb7baa1533db156d8af3ebeda1f0473a0197a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:44:35 +1100
Subject: [XFS] cleanup xfs_vn_mknod

- use proper goto based unwinding instead of the current mess of
  multiple conditionals
- rename ip to inode because that's the normal convention for Linux
  inodes while ip is the convention for xfs_inodes
- remove unlikely checks for the default_acl - branches marked unlikely
  might lead to extreme branch bredictor slowdons if taken and for some
  workloads a default acl is quite common
- properly indent the switch statements
- remove xfs_has_fs_struct as nfsd has a fs_struct in any semi-recent
  kernel

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30529a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 67 ++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index cc4abd3daa49..346701183318 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -241,18 +241,6 @@ xfs_init_security(
 	return error;
 }
 
-/*
- * Determine whether a process has a valid fs_struct (kernel daemons
- * like knfsd don't have an fs_struct).
- *
- * XXX(hch):  nfsd is broken, better fix it instead.
- */
-STATIC_INLINE int
-xfs_has_fs_struct(struct task_struct *task)
-{
-	return (task->fs != init_task.fs);
-}
-
 STATIC void
 xfs_cleanup_inode(
 	struct inode	*dir,
@@ -284,7 +272,7 @@ xfs_vn_mknod(
 	int		mode,
 	dev_t		rdev)
 {
-	struct inode	*ip;
+	struct inode	*inode;
 	bhv_vnode_t	*vp = NULL, *dvp = vn_from_inode(dir);
 	xfs_acl_t	*default_acl = NULL;
 	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
@@ -297,7 +285,7 @@ xfs_vn_mknod(
 	if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
 		return -EINVAL;
 
-	if (unlikely(test_default_acl && test_default_acl(dvp))) {
+	if (test_default_acl && test_default_acl(dvp)) {
 		if (!_ACL_ALLOC(default_acl)) {
 			return -ENOMEM;
 		}
@@ -307,11 +295,14 @@ xfs_vn_mknod(
 		}
 	}
 
-	if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current))
+	if (IS_POSIXACL(dir) && !default_acl)
 		mode &= ~current->fs->umask;
 
 	switch (mode & S_IFMT) {
-	case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
 		rdev = sysv_encode_dev(rdev);
 	case S_IFREG:
 		error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL);
@@ -324,32 +315,34 @@ xfs_vn_mknod(
 		break;
 	}
 
-	if (unlikely(!error)) {
-		error = xfs_init_security(vp, dir);
-		if (error)
-			xfs_cleanup_inode(dir, vp, dentry, mode);
-	}
+	if (unlikely(error))
+		goto out_free_acl;
 
-	if (unlikely(default_acl)) {
-		if (!error) {
-			error = _ACL_INHERIT(vp, mode, default_acl);
-			if (!error)
-				xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED);
-			else
-				xfs_cleanup_inode(dir, vp, dentry, mode);
-		}
+	error = xfs_init_security(vp, dir);
+	if (unlikely(error))
+		goto out_cleanup_inode;
+
+	if (default_acl) {
+		error = _ACL_INHERIT(vp, mode, default_acl);
+		if (unlikely(error))
+			goto out_cleanup_inode;
+		xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED);
 		_ACL_FREE(default_acl);
 	}
 
-	if (likely(!error)) {
-		ASSERT(vp);
-		ip = vn_to_inode(vp);
+	inode = vn_to_inode(vp);
 
-		if (S_ISDIR(mode))
-			xfs_validate_fields(ip);
-		d_instantiate(dentry, ip);
-		xfs_validate_fields(dir);
-	}
+	if (S_ISDIR(mode))
+		xfs_validate_fields(inode);
+	d_instantiate(dentry, inode);
+	xfs_validate_fields(dir);
+	return -error;
+
+ out_cleanup_inode:
+	xfs_cleanup_inode(dir, vp, dentry, mode);
+ out_free_acl:
+	if (default_acl)
+		_ACL_FREE(default_acl);
 	return -error;
 }
 
-- 
cgit v1.2.3


From a8b3acd57e3aaaf73a863a28e0e9f6cca37cd8e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:44:41 +1100
Subject: [XFS] vnode cleanup in xfs_fs_subr.c

Cleanup the unneeded intermediate vnode step in the flushing helpers and
go directly from the xfs_inode to the struct address_space.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30530a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_fs_subr.c | 36 +++++++++++-------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index ac6d34cc355d..1eefe61f0e10 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -17,18 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_vnodeops.h"
-
-/*
- * The following six includes are needed so that we can include
- * xfs_inode.h.  What a mess..
- */
 #include "xfs_bmap_btree.h"
-#include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-
 #include "xfs_inode.h"
 
 int  fs_noerr(void) { return 0; }
@@ -42,11 +31,10 @@ xfs_tosspages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	bhv_vnode_t	*vp = XFS_ITOV(ip);
-	struct inode	*inode = vn_to_inode(vp);
+	struct address_space *mapping = ip->i_vnode->i_mapping;
 
-	if (VN_CACHED(vp))
-		truncate_inode_pages(inode->i_mapping, first);
+	if (mapping->nrpages)
+		truncate_inode_pages(mapping, first);
 }
 
 int
@@ -56,15 +44,14 @@ xfs_flushinval_pages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	bhv_vnode_t	*vp = XFS_ITOV(ip);
-	struct inode	*inode = vn_to_inode(vp);
+	struct address_space *mapping = ip->i_vnode->i_mapping;
 	int		ret = 0;
 
-	if (VN_CACHED(vp)) {
+	if (mapping->nrpages) {
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_write_and_wait(inode->i_mapping);
+		ret = filemap_write_and_wait(mapping);
 		if (!ret)
-			truncate_inode_pages(inode->i_mapping, first);
+			truncate_inode_pages(mapping, first);
 	}
 	return ret;
 }
@@ -77,17 +64,16 @@ xfs_flush_pages(
 	uint64_t	flags,
 	int		fiopt)
 {
-	bhv_vnode_t	*vp = XFS_ITOV(ip);
-	struct inode	*inode = vn_to_inode(vp);
+	struct address_space *mapping = ip->i_vnode->i_mapping;
 	int		ret = 0;
 	int		ret2;
 
-	if (VN_DIRTY(vp)) {
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_fdatawrite(inode->i_mapping);
+		ret = filemap_fdatawrite(mapping);
 		if (flags & XFS_B_ASYNC)
 			return ret;
-		ret2 = filemap_fdatawait(inode->i_mapping);
+		ret2 = filemap_fdatawait(mapping);
 		if (!ret)
 			ret = ret2;
 	}
-- 
cgit v1.2.3


From 43973964a386348af0a392266f008ba24170aa30 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:44:50 +1100
Subject: [XFS] kill xfs_get_dir_entry

Instead of of xfs_get_dir_entry use a macro to get the xfs_inode from the
dentry in the callers and grab the reference manually.

Only grab the reference once as it's fine to keep it over the dmapi calls.
(And even that reference is actually superflous in Linux but I'll leave
that for another patch)

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30531a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_vnode.h |  2 +-
 fs/xfs/xfs_rename.c          | 12 ++++-------
 fs/xfs/xfs_utils.c           | 22 -------------------
 fs/xfs/xfs_utils.h           |  1 -
 fs/xfs/xfs_vnodeops.c        | 51 ++++++++------------------------------------
 5 files changed, 14 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index f200e0244082..202231828283 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -227,7 +227,7 @@ static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
  */
 #define VNAME(dentry)		((char *) (dentry)->d_name.name)
 #define VNAMELEN(dentry)	((dentry)->d_name.len)
-#define VNAME_TO_VNODE(dentry)	(vn_from_inode((dentry)->d_inode))
+#define VNAME_TO_INODE(dentry)	(XFS_I((dentry)->d_inode))
 
 /*
  * Dealing with bad inodes
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 1c6d40ed6816..fd1244cf50a7 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -93,7 +93,8 @@ xfs_lock_for_rename(
 	xfs_inode_t	**i_tab,/* array of inode returned, sorted */
 	int		*num_inodes)  /* number of inodes in array */
 {
-	xfs_inode_t		*ip1, *ip2, *temp;
+	xfs_inode_t		*ip1 = VNAME_TO_INODE(vname1);
+	xfs_inode_t		*ip2, *temp;
 	xfs_ino_t		inum1, inum2;
 	int			error;
 	int			i, j;
@@ -109,16 +110,11 @@ xfs_lock_for_rename(
 	 * to see if we still have the right inodes, directories, etc.
 	 */
 	lock_mode = xfs_ilock_map_shared(dp1);
-	error = xfs_get_dir_entry(vname1, &ip1);
-	if (error) {
-		xfs_iunlock_map_shared(dp1, lock_mode);
-		return error;
-	}
+	IHOLD(ip1);
+	xfs_itrace_ref(ip1);
 
 	inum1 = ip1->i_ino;
 
-	ASSERT(ip1);
-	xfs_itrace_ref(ip1);
 
 	/*
 	 * Unlock dp1 and lock dp2 if they are different.
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 18a85e746680..47c45ff4a067 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -40,28 +40,6 @@
 #include "xfs_itable.h"
 #include "xfs_utils.h"
 
-/*
- * xfs_get_dir_entry is used to get a reference to an inode given
- * its parent directory inode and the name of the file.	 It does
- * not lock the child inode, and it unlocks the directory before
- * returning.  The directory's generation number is returned for
- * use by a later call to xfs_lock_dir_and_entry.
- */
-int
-xfs_get_dir_entry(
-	bhv_vname_t	*dentry,
-	xfs_inode_t	**ipp)
-{
-	bhv_vnode_t	*vp;
-
-	vp = VNAME_TO_VNODE(dentry);
-
-	*ipp = xfs_vtoi(vp);
-	if (!*ipp)
-		return XFS_ERROR(ENOENT);
-	VN_HOLD(vp);
-	return 0;
-}
 
 int
 xfs_dir_lookup_int(
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f857fcccb723..c4c4a6aa6549 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -21,7 +21,6 @@
 #define IRELE(ip)	VN_RELE(XFS_ITOV(ip))
 #define IHOLD(ip)	VN_HOLD(XFS_ITOV(ip))
 
-extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **);
 extern int xfs_dir_lookup_int (xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
 				xfs_inode_t **);
 extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 5390d124ad35..4765e7c4b75d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2270,41 +2270,30 @@ xfs_remove(
 	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	char			*name = VNAME(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t             *ip;
+	xfs_inode_t             *ip = VNAME_TO_INODE(dentry);
+	int			namelen = VNAMELEN(dentry);
 	xfs_trans_t             *tp = NULL;
 	int                     error = 0;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
 	int			cancel_flags;
 	int			committed;
-	int			dm_di_mode = 0;
 	int			link_zero;
 	uint			resblks;
-	int			namelen;
 
 	xfs_itrace_entry(dp);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	namelen = VNAMELEN(dentry);
-
-	if (!xfs_get_dir_entry(dentry, &ip)) {
-	        dm_di_mode = ip->i_d.di_mode;
-		IRELE(ip);
-	}
-
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					name, NULL, dm_di_mode, 0, 0);
+					name, NULL, ip->i_d.di_mode, 0, 0);
 		if (error)
 			return error;
 	}
 
-	/* From this point on, return through std_return */
-	ip = NULL;
-
 	/*
 	 * We need to get a reference to ip before we get our log
 	 * reservation. The reason for this is that we cannot call
@@ -2317,13 +2306,7 @@ xfs_remove(
 	 * when we call xfs_iget.  Instead we get an unlocked reference
 	 * to the inode before getting our log reservation.
 	 */
-	error = xfs_get_dir_entry(dentry, &ip);
-	if (error) {
-		REMOVE_DEBUG_TRACE(__LINE__);
-		goto std_return;
-	}
-
-	dm_di_mode = ip->i_d.di_mode;
+	IHOLD(ip);
 
 	xfs_itrace_entry(ip);
 	xfs_itrace_ref(ip);
@@ -2459,7 +2442,7 @@ xfs_remove(
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
 				dir_vp, DM_RIGHT_NULL,
 				NULL, DM_RIGHT_NULL,
-				name, NULL, dm_di_mode, error, 0);
+				name, NULL, ip->i_d.di_mode, error, 0);
 	}
 	return error;
 
@@ -2868,14 +2851,13 @@ xfs_rmdir(
 	char			*name = VNAME(dentry);
 	int			namelen = VNAMELEN(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
-  	xfs_inode_t             *cdp;   /* child directory */
+  	xfs_inode_t             *cdp = VNAME_TO_INODE(dentry);
 	xfs_trans_t             *tp;
 	int                     error;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
 	int			cancel_flags;
 	int			committed;
-	int			dm_di_mode = S_IFDIR;
 	int			last_cdp_link;
 	uint			resblks;
 
@@ -2884,24 +2866,15 @@ xfs_rmdir(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (!xfs_get_dir_entry(dentry, &cdp)) {
-	        dm_di_mode = cdp->i_d.di_mode;
-		IRELE(cdp);
-	}
-
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
 					dir_vp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
-					name, NULL, dm_di_mode, 0, 0);
+					name, NULL, cdp->i_d.di_mode, 0, 0);
 		if (error)
 			return XFS_ERROR(error);
 	}
 
-	/* Return through std_return after this point. */
-
-	cdp = NULL;
-
 	/*
 	 * We need to get a reference to cdp before we get our log
 	 * reservation.  The reason for this is that we cannot call
@@ -2914,13 +2887,7 @@ xfs_rmdir(
 	 * when we call xfs_iget.  Instead we get an unlocked reference
 	 * to the inode before getting our log reservation.
 	 */
-	error = xfs_get_dir_entry(dentry, &cdp);
-	if (error) {
-		REMOVE_DEBUG_TRACE(__LINE__);
-		goto std_return;
-	}
-	mp = dp->i_mount;
-	dm_di_mode = cdp->i_d.di_mode;
+	IHOLD(cdp);
 
 	/*
 	 * Get the dquots for the inodes.
@@ -3077,7 +3044,7 @@ xfs_rmdir(
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
 					dir_vp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
-					name, NULL, dm_di_mode,
+					name, NULL, cdp->i_d.di_mode,
 					error, 0);
 	}
 	return error;
-- 
cgit v1.2.3


From 126468b1156211e26d97f74b2f1767acd141005a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:44:57 +1100
Subject: [XFS] kill xfs_rwlock/xfs_rwunlock

We can just use xfs_ilock/xfs_iunlock instead and get rid of the ugly
bhv_vrwlock_t.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30533a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c  |  4 ++--
 fs/xfs/linux-2.6/xfs_lrw.c   | 31 +++++++++++++------------------
 fs/xfs/linux-2.6/xfs_vnode.h | 12 ------------
 fs/xfs/xfs_mount.h           |  2 +-
 fs/xfs/xfs_vnodeops.c        | 41 -----------------------------------------
 fs/xfs/xfs_vnodeops.h        |  2 --
 6 files changed, 16 insertions(+), 76 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index e0519529c26c..169e6c062794 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1532,9 +1532,9 @@ xfs_vm_bmap(
 	struct xfs_inode	*ip = XFS_I(inode);
 
 	xfs_itrace_entry(XFS_I(inode));
-	xfs_rwlock(ip, VRWLOCK_READ);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
-	xfs_rwunlock(ip, VRWLOCK_READ);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 	return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 166353388490..3c20007ab48f 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -228,11 +228,11 @@ xfs_read(
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		bhv_vrwlock_t locktype = VRWLOCK_READ;
 		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
+		int iolock = XFS_IOLOCK_SHARED;
 
 		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *offset, size,
-					dmflags, &locktype);
+					dmflags, &iolock);
 		if (ret) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 			if (unlikely(ioflags & IO_ISDIRECT))
@@ -287,11 +287,11 @@ xfs_splice_read(
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		bhv_vrwlock_t locktype = VRWLOCK_READ;
+		int iolock = XFS_IOLOCK_SHARED;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *ppos, count,
-					FILP_DELAY_FLAG(infilp), &locktype);
+					FILP_DELAY_FLAG(infilp), &iolock);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 			return -error;
@@ -330,11 +330,11 @@ xfs_splice_write(
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
-		bhv_vrwlock_t locktype = VRWLOCK_WRITE;
+		int iolock = XFS_IOLOCK_EXCL;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, *ppos, count,
-					FILP_DELAY_FLAG(outfilp), &locktype);
+					FILP_DELAY_FLAG(outfilp), &iolock);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 			return -error;
@@ -580,7 +580,6 @@ xfs_write(
 	xfs_fsize_t		isize, new_size;
 	int			iolock;
 	int			eventsent = 0;
-	bhv_vrwlock_t		locktype;
 	size_t			ocount = 0, count;
 	loff_t			pos;
 	int			need_i_mutex;
@@ -607,11 +606,9 @@ xfs_write(
 relock:
 	if (ioflags & IO_ISDIRECT) {
 		iolock = XFS_IOLOCK_SHARED;
-		locktype = VRWLOCK_WRITE_DIRECT;
 		need_i_mutex = 0;
 	} else {
 		iolock = XFS_IOLOCK_EXCL;
-		locktype = VRWLOCK_WRITE;
 		need_i_mutex = 1;
 		mutex_lock(&inode->i_mutex);
 	}
@@ -635,8 +632,7 @@ start:
 
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
 		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
-				      pos, count,
-				      dmflags, &locktype);
+				      pos, count, dmflags, &iolock);
 		if (error) {
 			goto out_unlock_internal;
 		}
@@ -667,7 +663,6 @@ start:
 		if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
 			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 			iolock = XFS_IOLOCK_EXCL;
-			locktype = VRWLOCK_WRITE;
 			need_i_mutex = 1;
 			mutex_lock(&inode->i_mutex);
 			xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
@@ -744,7 +739,6 @@ retry:
 			mutex_unlock(&inode->i_mutex);
 
 			iolock = XFS_IOLOCK_SHARED;
-			locktype = VRWLOCK_WRITE_DIRECT;
 			need_i_mutex = 0;
 		}
 
@@ -781,7 +775,7 @@ retry:
 
 	if (ret == -ENOSPC &&
 	    DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
-		xfs_rwunlock(xip, locktype);
+		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
 		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
@@ -789,7 +783,7 @@ retry:
 				0, 0, 0); /* Delay flag intentionally  unused */
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
-		xfs_rwlock(xip, locktype);
+		xfs_ilock(xip, iolock);
 		if (error)
 			goto out_unlock_internal;
 		pos = xip->i_size;
@@ -817,7 +811,8 @@ retry:
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
 		int error2;
-		xfs_rwunlock(xip, locktype);
+
+		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
 		error2 = sync_page_range(inode, mapping, pos, ret);
@@ -825,7 +820,7 @@ retry:
 			error = error2;
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
-		xfs_rwlock(xip, locktype);
+		xfs_ilock(xip, iolock);
 		error2 = xfs_write_sync_logforce(mp, xip);
 		if (!error)
 			error = error2;
@@ -846,7 +841,7 @@ retry:
 			xip->i_d.di_size = xip->i_size;
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
 	}
-	xfs_rwunlock(xip, locktype);
+	xfs_iunlock(xip, iolock);
  out_unlock_mutex:
 	if (need_i_mutex)
 		mutex_unlock(&inode->i_mutex);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 202231828283..4ed5914adefb 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -45,18 +45,6 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
 	return vnode;
 }
 
-/*
- * Values for the vop_rwlock/rwunlock flags parameter.
- */
-typedef enum bhv_vrwlock {
-	VRWLOCK_NONE,
-	VRWLOCK_READ,
-	VRWLOCK_WRITE,
-	VRWLOCK_WRITE_DIRECT,
-	VRWLOCK_TRY_READ,
-	VRWLOCK_TRY_WRITE
-} bhv_vrwlock_t;
-
 /*
  * Return values for xfs_inactive.  A return value of
  * VN_INACTIVE_NOCACHE implies that the file system behavior
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d8a4728d847..110ee83fcbec 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -67,7 +67,7 @@ struct xfs_mru_cache;
  */
 
 typedef int	(*xfs_send_data_t)(int, bhv_vnode_t *,
-			xfs_off_t, size_t, int, bhv_vrwlock_t *);
+			xfs_off_t, size_t, int, int *);
 typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
 typedef int	(*xfs_send_destroy_t)(bhv_vnode_t *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4765e7c4b75d..811ee874d868 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3374,47 +3374,6 @@ std_return:
 	goto std_return;
 }
 
-int
-xfs_rwlock(
-	xfs_inode_t	*ip,
-	bhv_vrwlock_t	locktype)
-{
-	if (S_ISDIR(ip->i_d.di_mode))
-		return 1;
-	if (locktype == VRWLOCK_WRITE) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-	} else if (locktype == VRWLOCK_TRY_READ) {
-		return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
-	} else if (locktype == VRWLOCK_TRY_WRITE) {
-		return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
-	} else {
-		ASSERT((locktype == VRWLOCK_READ) ||
-		       (locktype == VRWLOCK_WRITE_DIRECT));
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
-	return 1;
-}
-
-
-void
-xfs_rwunlock(
-	xfs_inode_t     *ip,
-	bhv_vrwlock_t	locktype)
-{
- 	if (S_ISDIR(ip->i_d.di_mode))
-  		return;
-	if (locktype == VRWLOCK_WRITE) {
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	} else {
-		ASSERT((locktype == VRWLOCK_READ) ||
-		       (locktype == VRWLOCK_WRITE_DIRECT));
-		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	}
-	return;
-}
-
-
 int
 xfs_inode_flush(
 	xfs_inode_t	*ip,
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 4e3970f0e5e3..85340bafd42d 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -38,8 +38,6 @@ int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry,
 		char *target_path, mode_t mode, bhv_vnode_t **vpp,
 		struct cred *credp);
-int xfs_rwlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
-void xfs_rwunlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
 int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
-- 
cgit v1.2.3


From 24bd861d1c3fff5248de7ba3bdddb3369087ad46 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:45:16 +1100
Subject: [XFS] don't encode parent in nfs filehandles unless nessecary

As Dave pointed out after the export ops changes we now always encode the
parent into the filehandle for regular files, but it's not actually needed
when the filesystem is export with no_subtree_check. This one-liner fixes
xfs_fs_encode_fh to skip encoding the parent unless nessecary.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30535a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_export.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index ca4f66c4de16..21f0e8257590 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -66,7 +66,7 @@ xfs_fs_encode_fh(
 	int			len;
 
 	/* Directories don't need their parent encoded, they have ".." */
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode) || !connectable)
 		fileid_type = FILEID_INO32_GEN;
 	else
 		fileid_type = FILEID_INO32_GEN_PARENT;
-- 
cgit v1.2.3


From 44d814ced4cffbfe6a775c5bb8b941a6e734e7d9 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:45:29 +1100
Subject: [XFS] Update c/mtime correctly on truncates

XFS changes the c/mtime of an inode when truncating it to the same size.
The c/mtime is only supposed to change if the size is changed. Not to be
confused with ftruncate, where the c/mtime is supposed to be changed even
if the size is not changed.

The Linux VFS encodes this semantic difference in the flags it sends down
to ->setattr, which XFS currently ignores. We need to make XFS pay
attention to the VFS flags and hence Do The Right Thing.

SGI-PV: 977547
SGI-Modid: xfs-linux-melb:xfs-kern:30536a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 811ee874d868..b77dede91b71 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -633,6 +633,15 @@ xfs_setattr(
 	 * Truncate file.  Must have write permission and not be a directory.
 	 */
 	if (mask & XFS_AT_SIZE) {
+		/*
+		 * Only change the c/mtime if we are changing the size
+		 * or we are explicitly asked to change it. This handles
+		 * the semantic difference between truncate() and ftruncate()
+		 * as implemented in the VFS.
+		 */
+		if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
+			timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
 		if (vap->va_size > ip->i_size) {
 			xfs_igrow_finish(tp, ip, vap->va_size,
 			    !(flags & ATTR_DMI));
@@ -661,10 +670,6 @@ xfs_setattr(
 			 */
 			xfs_iflags_set(ip, XFS_ITRUNCATED);
 		}
-		/*
-		 * Have to do this even if the file's size doesn't change.
-		 */
-		timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 	}
 
 	/*
-- 
cgit v1.2.3


From 6ee4752ffe782be6e86bea1403a2fe0f682aa71a Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:45:35 +1100
Subject: [XFS] Use atomic counters for ktrace buffer indexes

ktrace_enter() is consuming vast amounts of CPU time due to the use of a
single global lock for protecting buffer index increments. Change it to
use per-buffer atomic counters - this reduces ktrace_enter() overhead
during a trace intensive test on a 4p machine from 58% of all CPU time to
12% and halves test runtime.

SGI-PV: 977546
SGI-Modid: xfs-linux-melb:xfs-kern:30537a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/support/ktrace.c | 21 ++++++++-------------
 fs/xfs/support/ktrace.h |  2 +-
 2 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 129067cfcb86..4e0444c0aca6 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -92,7 +92,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
 
 	ktp->kt_entries  = ktep;
 	ktp->kt_nentries = nentries;
-	ktp->kt_index    = 0;
+	atomic_set(&ktp->kt_index, 0);
 	ktp->kt_rollover = 0;
 	return ktp;
 }
@@ -151,8 +151,6 @@ ktrace_enter(
 	void            *val14,
 	void            *val15)
 {
-	static DEFINE_SPINLOCK(wrap_lock);
-	unsigned long	flags;
 	int             index;
 	ktrace_entry_t  *ktep;
 
@@ -161,12 +159,8 @@ ktrace_enter(
 	/*
 	 * Grab an entry by pushing the index up to the next one.
 	 */
-	spin_lock_irqsave(&wrap_lock, flags);
-	index = ktp->kt_index;
-	if (++ktp->kt_index == ktp->kt_nentries)
-		ktp->kt_index = 0;
-	spin_unlock_irqrestore(&wrap_lock, flags);
-
+	index = atomic_add_return(1, &ktp->kt_index);
+	index = (index - 1) % ktp->kt_nentries;
 	if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
 		ktp->kt_rollover = 1;
 
@@ -199,11 +193,12 @@ int
 ktrace_nentries(
 	ktrace_t        *ktp)
 {
-	if (ktp == NULL) {
+	int	index;
+	if (ktp == NULL)
 		return 0;
-	}
 
-	return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
+	index = atomic_read(&ktp->kt_index) % ktp->kt_nentries;
+	return (ktp->kt_rollover ? ktp->kt_nentries : index);
 }
 
 /*
@@ -228,7 +223,7 @@ ktrace_first(ktrace_t   *ktp, ktrace_snap_t     *ktsp)
 	int             nentries;
 
 	if (ktp->kt_rollover)
-		index = ktp->kt_index;
+		index = atomic_read(&ktp->kt_index) % ktp->kt_nentries;
 	else
 		index = 0;
 
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
index 56e72b40a859..782dbbb6a9d0 100644
--- a/fs/xfs/support/ktrace.h
+++ b/fs/xfs/support/ktrace.h
@@ -30,7 +30,7 @@ typedef struct ktrace_entry {
  */
 typedef struct ktrace {
 	int		kt_nentries;	/* number of entries in trace buf */
-	int		kt_index;	/* current index in entries */
+	atomic_t	kt_index;	/* current index in entries */
 	int		kt_rollover;
 	ktrace_entry_t	*kt_entries;	/* buffer of entries */
 } ktrace_t;
-- 
cgit v1.2.3


From d234154125197053d5215711b5df867979e55ebd Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 6 Mar 2008 13:45:43 +1100
Subject: [XFS] Use power-of-2 sized buffers to reduce overhead

Now that the ktrace_enter() code is using atomics, the non-power-of-2
buffer sizes - which require modulus operations to get the index - are
showing up as using substantial CPU in the profiles.

Force the buffer sizes to be rounded up to the nearest power of two and
use masking rather than modulus operations to convert the index counter to
the buffer index. This reduces ktrace_enter overhead to 8% of a CPU time,
and again almost halves the trace intensive test runtime.

SGI-PV: 977546
SGI-Modid: xfs-linux-melb:xfs-kern:30538a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/support/ktrace.c | 22 ++++++++++++++--------
 fs/xfs/support/ktrace.h |  1 +
 2 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 4e0444c0aca6..0b75d302508f 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -24,7 +24,7 @@ static int          ktrace_zentries;
 void __init
 ktrace_init(int zentries)
 {
-	ktrace_zentries = zentries;
+	ktrace_zentries = roundup_pow_of_two(zentries);
 
 	ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
 					"ktrace_hdr");
@@ -47,13 +47,16 @@ ktrace_uninit(void)
  * ktrace_alloc()
  *
  * Allocate a ktrace header and enough buffering for the given
- * number of entries.
+ * number of entries. Round the number of entries up to a
+ * power of 2 so we can do fast masking to get the index from
+ * the atomic index counter.
  */
 ktrace_t *
 ktrace_alloc(int nentries, unsigned int __nocast sleep)
 {
 	ktrace_t        *ktp;
 	ktrace_entry_t  *ktep;
+	int		entries;
 
 	ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
 
@@ -70,11 +73,12 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
 	/*
 	 * Special treatment for buffers with the ktrace_zentries entries
 	 */
-	if (nentries == ktrace_zentries) {
+	entries = roundup_pow_of_two(nentries);
+	if (entries == ktrace_zentries) {
 		ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
 							    sleep);
 	} else {
-		ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
+		ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)),
 							    sleep | KM_LARGE);
 	}
 
@@ -91,7 +95,9 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
 	}
 
 	ktp->kt_entries  = ktep;
-	ktp->kt_nentries = nentries;
+	ktp->kt_nentries = entries;
+	ASSERT(is_power_of_2(entries));
+	ktp->kt_index_mask = entries - 1;
 	atomic_set(&ktp->kt_index, 0);
 	ktp->kt_rollover = 0;
 	return ktp;
@@ -160,7 +166,7 @@ ktrace_enter(
 	 * Grab an entry by pushing the index up to the next one.
 	 */
 	index = atomic_add_return(1, &ktp->kt_index);
-	index = (index - 1) % ktp->kt_nentries;
+	index = (index - 1) & ktp->kt_index_mask;
 	if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
 		ktp->kt_rollover = 1;
 
@@ -197,7 +203,7 @@ ktrace_nentries(
 	if (ktp == NULL)
 		return 0;
 
-	index = atomic_read(&ktp->kt_index) % ktp->kt_nentries;
+	index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
 	return (ktp->kt_rollover ? ktp->kt_nentries : index);
 }
 
@@ -223,7 +229,7 @@ ktrace_first(ktrace_t   *ktp, ktrace_snap_t     *ktsp)
 	int             nentries;
 
 	if (ktp->kt_rollover)
-		index = atomic_read(&ktp->kt_index) % ktp->kt_nentries;
+		index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
 	else
 		index = 0;
 
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
index 782dbbb6a9d0..741d6947ca60 100644
--- a/fs/xfs/support/ktrace.h
+++ b/fs/xfs/support/ktrace.h
@@ -31,6 +31,7 @@ typedef struct ktrace_entry {
 typedef struct ktrace {
 	int		kt_nentries;	/* number of entries in trace buf */
 	atomic_t	kt_index;	/* current index in entries */
+	unsigned int	kt_index_mask;
 	int		kt_rollover;
 	ktrace_entry_t	*kt_entries;	/* buffer of entries */
 } ktrace_t;
-- 
cgit v1.2.3


From bc4ac74a4e5bd7db02976eb1b681e1d11f81c9ce Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:45:58 +1100
Subject: [XFS] cleanup vnode use in dmapi calls

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30545a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_file.c | 13 ++++-------
 fs/xfs/linux-2.6/xfs_lrw.c  | 14 +++++-------
 fs/xfs/xfs_bmap.c           |  2 +-
 fs/xfs/xfs_mount.h          | 22 +++++++++---------
 fs/xfs/xfs_rename.c         |  8 +++----
 fs/xfs/xfs_vfsops.c         |  4 ++--
 fs/xfs/xfs_vnodeops.c       | 55 +++++++++++++++++++++------------------------
 7 files changed, 54 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index edab1ffbb163..05905246434d 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -469,16 +469,11 @@ xfs_file_open_exec(
 	struct inode	*inode)
 {
 	struct xfs_mount *mp = XFS_M(inode->i_sb);
+	struct xfs_inode *ip = XFS_I(inode);
 
-	if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI)) {
-		if (DM_EVENT_ENABLED(XFS_I(inode), DM_EVENT_READ)) {
-			bhv_vnode_t *vp = vn_from_inode(inode);
-
-			return -XFS_SEND_DATA(mp, DM_EVENT_READ,
-						vp, 0, 0, 0, NULL);
-		}
-	}
-
+	if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI) &&
+	             DM_EVENT_ENABLED(ip, DM_EVENT_READ))
+		return -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
 	return 0;
 }
 #endif /* HAVE_FOP_OPEN_EXEC */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 3c20007ab48f..01a8f26e1b17 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -231,7 +231,7 @@ xfs_read(
 		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
 		int iolock = XFS_IOLOCK_SHARED;
 
-		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *offset, size,
+		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
 					dmflags, &iolock);
 		if (ret) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -276,7 +276,6 @@ xfs_splice_read(
 	int			flags,
 	int			ioflags)
 {
-	bhv_vnode_t		*vp = XFS_ITOV(ip);
 	xfs_mount_t		*mp = ip->i_mount;
 	ssize_t			ret;
 
@@ -290,7 +289,7 @@ xfs_splice_read(
 		int iolock = XFS_IOLOCK_SHARED;
 		int error;
 
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *ppos, count,
+		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
 					FILP_DELAY_FLAG(infilp), &iolock);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -317,7 +316,6 @@ xfs_splice_write(
 	int			flags,
 	int			ioflags)
 {
-	bhv_vnode_t		*vp = XFS_ITOV(ip);
 	xfs_mount_t		*mp = ip->i_mount;
 	ssize_t			ret;
 	struct inode		*inode = outfilp->f_mapping->host;
@@ -333,7 +331,7 @@ xfs_splice_write(
 		int iolock = XFS_IOLOCK_EXCL;
 		int error;
 
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, *ppos, count,
+		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
 					FILP_DELAY_FLAG(outfilp), &iolock);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -631,7 +629,7 @@ start:
 			dmflags |= DM_FLAGS_IMUX;
 
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
-		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
+		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
 				      pos, count, dmflags, &iolock);
 		if (error) {
 			goto out_unlock_internal;
@@ -778,8 +776,8 @@ retry:
 		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
-		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
-				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
+		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
+				DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
 				0, 0, 0); /* Delay flag intentionally  unused */
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 87f646749817..19aae13b7f95 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5811,7 +5811,7 @@ xfs_getbmap(
 	if ((interface & BMV_IF_NO_DMAPI_READ) == 0 &&
 	    DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
 	    whichfork == XFS_DATA_FORK) {
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL);
+		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
 		if (error)
 			return XFS_ERROR(error);
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 110ee83fcbec..7b37fa009297 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,17 +66,17 @@ struct xfs_mru_cache;
  * Prototypes and functions for the Data Migration subsystem.
  */
 
-typedef int	(*xfs_send_data_t)(int, bhv_vnode_t *,
+typedef int	(*xfs_send_data_t)(int, struct xfs_inode *,
 			xfs_off_t, size_t, int, int *);
 typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int	(*xfs_send_destroy_t)(bhv_vnode_t *, dm_right_t);
+typedef int	(*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
-			bhv_vnode_t *,
-			dm_right_t, bhv_vnode_t *, dm_right_t,
+			struct xfs_inode *, dm_right_t,
+			struct xfs_inode *, dm_right_t,
 			char *, char *, mode_t, int, int);
 typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
 			char *, char *);
-typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, bhv_vnode_t *,
+typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
 			dm_right_t, mode_t, int, int);
 
 typedef struct xfs_dmops {
@@ -88,20 +88,20 @@ typedef struct xfs_dmops {
 	xfs_send_unmount_t	xfs_send_unmount;
 } xfs_dmops_t;
 
-#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \
-	(*(mp)->m_dm_ops->xfs_send_data)(ev,vp,off,len,fl,lock)
+#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
+	(*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
 #define XFS_SEND_MMAP(mp, vma,fl) \
 	(*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
-#define XFS_SEND_DESTROY(mp, vp,right) \
-	(*(mp)->m_dm_ops->xfs_send_destroy)(vp,right)
+#define XFS_SEND_DESTROY(mp, ip,right) \
+	(*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
 #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
 	(*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
 #define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
 	(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl)
 #define XFS_SEND_MOUNT(mp,right,path,name) \
 	(*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_UNMOUNT(mp, vp,right,mode,rval,fl) \
-	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,vp,right,mode,rval,fl)
+#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \
+	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
 
 
 /*
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index fd1244cf50a7..6f80cfdfbd88 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -258,8 +258,8 @@ xfs_rename(
 	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
 	    DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
-					src_dir_vp, DM_RIGHT_NULL,
-					target_dir_vp, DM_RIGHT_NULL,
+					src_dp, DM_RIGHT_NULL,
+					target_dp, DM_RIGHT_NULL,
 					src_name, target_name,
 					0, 0, 0);
 		if (error) {
@@ -591,8 +591,8 @@ std_return:
 	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
 	    DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
 		(void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
-					src_dir_vp, DM_RIGHT_NULL,
-					target_dir_vp, DM_RIGHT_NULL,
+					src_dp, DM_RIGHT_NULL,
+					target_dp, DM_RIGHT_NULL,
 					src_name, target_name,
 					0, error, 0);
 	}
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 3ec27bf8531c..4c132a87d437 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -566,7 +566,7 @@ xfs_unmount(
 #ifdef HAVE_DMAPI
 	if (mp->m_flags & XFS_MOUNT_DMAPI) {
 		error = XFS_SEND_PREUNMOUNT(mp,
-				rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL,
+				rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
 				NULL, NULL, 0, 0,
 				(mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
 					0:DM_FLAGS_UNWANTED);
@@ -617,7 +617,7 @@ out:
 		/* Note: mp structure must still exist for
 		 * XFS_SEND_UNMOUNT() call.
 		 */
-		XFS_SEND_UNMOUNT(mp, error == 0 ? rvp : NULL,
+		XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL,
 			DM_RIGHT_NULL, 0, error, unmount_event_flags);
 	}
 	if (xfs_unmountfs_needed) {
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index b77dede91b71..7e124b55c26b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -326,7 +326,7 @@ xfs_setattr(
 		if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
 		    !(flags & ATTR_DMI)) {
 			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
-			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
+			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
 				vap->va_size, 0, dmflags, NULL);
 			if (code) {
 				lock_flags = 0;
@@ -881,7 +881,7 @@ xfs_setattr(
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
 	    !(flags & ATTR_DMI)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
+		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL, NULL, NULL,
 					0, 0, AT_DELAY_FLAG(flags));
 	}
@@ -1586,9 +1586,8 @@ xfs_inactive(
 
 	mp = ip->i_mount;
 
-	if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
-		(void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
-	}
+	if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
+		XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
 
 	error = 0;
 
@@ -1820,7 +1819,7 @@ xfs_create(
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-				dir_vp, DM_RIGHT_NULL, NULL,
+				dp, DM_RIGHT_NULL, NULL,
 				DM_RIGHT_NULL, name, NULL,
 				mode, 0, 0);
 
@@ -1976,8 +1975,8 @@ std_return:
 	if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
 	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-			dir_vp, DM_RIGHT_NULL,
-			*vpp ? vp:NULL,
+			dp, DM_RIGHT_NULL,
+			*vpp ? ip : NULL,
 			DM_RIGHT_NULL, name, NULL,
 			mode, error, 0);
 	}
@@ -2272,7 +2271,6 @@ xfs_remove(
 	xfs_inode_t             *dp,
 	bhv_vname_t		*dentry)
 {
-	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	char			*name = VNAME(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_inode_t             *ip = VNAME_TO_INODE(dentry);
@@ -2292,7 +2290,7 @@ xfs_remove(
 		return XFS_ERROR(EIO);
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
+		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
 					name, NULL, ip->i_d.di_mode, 0, 0);
 		if (error)
@@ -2445,7 +2443,7 @@ xfs_remove(
  std_return:
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
-				dir_vp, DM_RIGHT_NULL,
+				dp, DM_RIGHT_NULL,
 				NULL, DM_RIGHT_NULL,
 				name, NULL, ip->i_d.di_mode, error, 0);
 	}
@@ -2504,8 +2502,8 @@ xfs_link(
 
 	if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
-					target_dir_vp, DM_RIGHT_NULL,
-					src_vp, DM_RIGHT_NULL,
+					tdp, DM_RIGHT_NULL,
+					sip, DM_RIGHT_NULL,
 					target_name, NULL, 0, 0, 0);
 		if (error)
 			return error;
@@ -2615,8 +2613,8 @@ xfs_link(
 std_return:
 	if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
-				target_dir_vp, DM_RIGHT_NULL,
-				src_vp, DM_RIGHT_NULL,
+				tdp, DM_RIGHT_NULL,
+				sip, DM_RIGHT_NULL,
 				target_name, NULL, 0, error, 0);
 	}
 	return error;
@@ -2665,7 +2663,7 @@ xfs_mkdir(
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-					dir_vp, DM_RIGHT_NULL, NULL,
+					dp, DM_RIGHT_NULL, NULL,
 					DM_RIGHT_NULL, dir_name, NULL,
 					mode, 0, 0);
 		if (error)
@@ -2823,8 +2821,8 @@ std_return:
 	if ((created || (error != 0 && dm_event_sent != 0)) &&
 	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-					dir_vp, DM_RIGHT_NULL,
-					created ? XFS_ITOV(cdp):NULL,
+					dp, DM_RIGHT_NULL,
+					created ? cdp : NULL,
 					DM_RIGHT_NULL,
 					dir_name, NULL,
 					mode, error, 0);
@@ -2873,7 +2871,7 @@ xfs_rmdir(
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
-					dir_vp, DM_RIGHT_NULL,
+					dp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
 					name, NULL, cdp->i_d.di_mode, 0, 0);
 		if (error)
@@ -3047,7 +3045,7 @@ xfs_rmdir(
  std_return:
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
-					dir_vp, DM_RIGHT_NULL,
+					dp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
 					name, NULL, cdp->i_d.di_mode,
 					error, 0);
@@ -3144,7 +3142,7 @@ xfs_symlink(
 	}
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
+		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
 					link_name, target_path, 0, 0, 0);
 		if (error)
@@ -3348,8 +3346,8 @@ xfs_symlink(
 std_return:
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
-					dir_vp, DM_RIGHT_NULL,
-					error ? NULL : XFS_ITOV(ip),
+					dp, DM_RIGHT_NULL,
+					error ? NULL : ip,
 					DM_RIGHT_NULL, link_name, target_path,
 					0, error, 0);
 	}
@@ -3707,9 +3705,8 @@ xfs_alloc_file_space(
 		end_dmi_offset = offset+len;
 		if (end_dmi_offset > ip->i_size)
 			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
-			offset, end_dmi_offset - offset,
-			0, NULL);
+		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
+				      end_dmi_offset - offset, 0, NULL);
 		if (error)
 			return error;
 	}
@@ -3818,8 +3815,8 @@ dmapi_enospc_check:
 	if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
 	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
-				XFS_ITOV(ip), DM_RIGHT_NULL,
-				XFS_ITOV(ip), DM_RIGHT_NULL,
+				ip, DM_RIGHT_NULL,
+				ip, DM_RIGHT_NULL,
 				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
 		if (error == 0)
 			goto retry;	/* Maybe DMAPI app. has made space */
@@ -3964,7 +3961,7 @@ xfs_free_file_space(
 	    DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
 		if (end_dmi_offset > ip->i_size)
 			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
+		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
 				offset, end_dmi_offset - offset,
 				AT_DELAY_FLAG(attr_flags), NULL);
 		if (error)
-- 
cgit v1.2.3


From 979ebab11623894528d4d37b947533ea4e8649d1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:05 +1100
Subject: [XFS] cleanup vnode use in xfs_create/mknod/mkdir

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30546a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 21 +++++++++++----------
 fs/xfs/xfs_vnodeops.c       | 25 +++++++++----------------
 fs/xfs/xfs_vnodeops.h       |  4 ++--
 3 files changed, 22 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 346701183318..62899a1ec7f7 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -273,7 +273,7 @@ xfs_vn_mknod(
 	dev_t		rdev)
 {
 	struct inode	*inode;
-	bhv_vnode_t	*vp = NULL, *dvp = vn_from_inode(dir);
+	struct xfs_inode *ip = NULL;
 	xfs_acl_t	*default_acl = NULL;
 	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
 	int		error;
@@ -285,11 +285,11 @@ xfs_vn_mknod(
 	if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
 		return -EINVAL;
 
-	if (test_default_acl && test_default_acl(dvp)) {
+	if (test_default_acl && test_default_acl(dir)) {
 		if (!_ACL_ALLOC(default_acl)) {
 			return -ENOMEM;
 		}
-		if (!_ACL_GET_DEFAULT(dvp, default_acl)) {
+		if (!_ACL_GET_DEFAULT(dir, default_acl)) {
 			_ACL_FREE(default_acl);
 			default_acl = NULL;
 		}
@@ -305,10 +305,10 @@ xfs_vn_mknod(
 	case S_IFSOCK:
 		rdev = sysv_encode_dev(rdev);
 	case S_IFREG:
-		error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL);
+		error = xfs_create(XFS_I(dir), dentry, mode, rdev, &ip, NULL);
 		break;
 	case S_IFDIR:
-		error = xfs_mkdir(XFS_I(dir), dentry, mode, &vp, NULL);
+		error = xfs_mkdir(XFS_I(dir), dentry, mode, &ip, NULL);
 		break;
 	default:
 		error = EINVAL;
@@ -318,19 +318,20 @@ xfs_vn_mknod(
 	if (unlikely(error))
 		goto out_free_acl;
 
-	error = xfs_init_security(vp, dir);
+	inode = ip->i_vnode;
+
+	error = xfs_init_security(inode, dir);
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
 	if (default_acl) {
-		error = _ACL_INHERIT(vp, mode, default_acl);
+		error = _ACL_INHERIT(inode, mode, default_acl);
 		if (unlikely(error))
 			goto out_cleanup_inode;
-		xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED);
+		xfs_iflags_set(ip, XFS_IMODIFIED);
 		_ACL_FREE(default_acl);
 	}
 
-	inode = vn_to_inode(vp);
 
 	if (S_ISDIR(mode))
 		xfs_validate_fields(inode);
@@ -339,7 +340,7 @@ xfs_vn_mknod(
 	return -error;
 
  out_cleanup_inode:
-	xfs_cleanup_inode(dir, vp, dentry, mode);
+	xfs_cleanup_inode(dir, inode, dentry, mode);
  out_free_acl:
 	if (default_acl)
 		_ACL_FREE(default_acl);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7e124b55c26b..a42d7fe6a5e8 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1791,14 +1791,12 @@ xfs_create(
 	bhv_vname_t		*dentry,
 	mode_t			mode,
 	xfs_dev_t		rdev,
-	bhv_vnode_t		**vpp,
+	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
 	char			*name = VNAME(dentry);
 	xfs_mount_t	        *mp = dp->i_mount;
-	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	xfs_inode_t		*ip;
-	bhv_vnode_t	        *vp = NULL;
 	xfs_trans_t		*tp;
 	int                     error;
 	xfs_bmap_free_t		free_list;
@@ -1812,7 +1810,7 @@ xfs_create(
 	uint			resblks;
 	int			namelen;
 
-	ASSERT(!*vpp);
+	ASSERT(!*ipp);
 	xfs_itrace_entry(dp);
 
 	namelen = VNAMELEN(dentry);
@@ -1911,7 +1909,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	VN_HOLD(dir_vp);
+	IHOLD(dp);
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
@@ -1949,7 +1947,6 @@ xfs_create(
 	 * vnode to the caller, we bump the vnode ref count now.
 	 */
 	IHOLD(ip);
-	vp = XFS_ITOV(ip);
 
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error) {
@@ -1967,16 +1964,16 @@ xfs_create(
 	XFS_QM_DQRELE(mp, udqp);
 	XFS_QM_DQRELE(mp, gdqp);
 
-	*vpp = vp;
+	*ipp = ip;
 
 	/* Fallthrough to std_return with error = 0  */
 
 std_return:
-	if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
+	if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
 	    DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
 			dp, DM_RIGHT_NULL,
-			*vpp ? ip : NULL,
+			*ipp ? ip : NULL,
 			DM_RIGHT_NULL, name, NULL,
 			mode, error, 0);
 	}
@@ -2634,15 +2631,13 @@ xfs_mkdir(
 	xfs_inode_t             *dp,
 	bhv_vname_t		*dentry,
 	mode_t			mode,
-	bhv_vnode_t		**vpp,
+	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	char			*dir_name = VNAME(dentry);
 	int			dir_namelen = VNAMELEN(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_inode_t		*cdp;	/* inode of created dir */
-	bhv_vnode_t		*cvp;	/* vnode of created dir */
 	xfs_trans_t		*tp;
 	int			cancel_flags;
 	int			error;
@@ -2749,7 +2744,7 @@ xfs_mkdir(
 	 * from here on will result in the transaction cancel
 	 * unlocking dp so don't do it explicitly in the error path.
 	 */
-	VN_HOLD(dir_vp);
+	IHOLD(dp);
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
@@ -2780,11 +2775,9 @@ xfs_mkdir(
 	if (error)
 		goto error2;
 
-	cvp = XFS_ITOV(cdp);
-
 	created = B_TRUE;
 
-	*vpp = cvp;
+	*ipp = cdp;
 	IHOLD(cdp);
 
 	/*
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 85340bafd42d..0acef1231417 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -26,12 +26,12 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
 		bhv_vnode_t **vpp);
 int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
-		xfs_dev_t rdev, bhv_vnode_t **vpp, struct cred *credp);
+		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
 int xfs_remove(struct xfs_inode *dp, bhv_vname_t	*dentry);
 int xfs_link(struct xfs_inode *tdp, bhv_vnode_t *src_vp,
 		bhv_vname_t *dentry);
 int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry,
-		mode_t mode, bhv_vnode_t **vpp, struct cred *credp);
+		mode_t mode, struct xfs_inode **ipp, struct cred *credp);
 int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry);
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
-- 
cgit v1.2.3


From a3da789640871c897901c5f766e33be78d56f35a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:12 +1100
Subject: [XFS] cleanup vnode use in xfs_link

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30547a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 23 +++++++++++------------
 fs/xfs/xfs_vnodeops.c       | 12 +++++-------
 fs/xfs/xfs_vnodeops.h       |  2 +-
 3 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 62899a1ec7f7..1df48209d60a 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -395,23 +395,22 @@ xfs_vn_link(
 	struct inode	*dir,
 	struct dentry	*dentry)
 {
-	struct inode	*ip;	/* inode of guy being linked to */
-	bhv_vnode_t	*vp;	/* vp of name being linked */
+	struct inode	*inode;	/* inode of guy being linked to */
 	int		error;
 
-	ip = old_dentry->d_inode;	/* inode being linked to */
-	vp = vn_from_inode(ip);
+	inode = old_dentry->d_inode;
 
-	VN_HOLD(vp);
-	error = xfs_link(XFS_I(dir), vp, dentry);
+	igrab(inode);
+	error = xfs_link(XFS_I(dir), XFS_I(inode), dentry);
 	if (unlikely(error)) {
-		VN_RELE(vp);
-	} else {
-		xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
-		xfs_validate_fields(ip);
-		d_instantiate(dentry, ip);
+		iput(inode);
+		return -error;
 	}
-	return -error;
+
+	xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
+	xfs_validate_fields(inode);
+	d_instantiate(dentry, inode);
+	return 0;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index a42d7fe6a5e8..10d2d22eb037 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2471,12 +2471,10 @@ xfs_remove(
 int
 xfs_link(
 	xfs_inode_t		*tdp,
-	bhv_vnode_t		*src_vp,
+	xfs_inode_t		*sip,
 	bhv_vname_t		*dentry)
 {
-	bhv_vnode_t		*target_dir_vp = XFS_ITOV(tdp);
 	xfs_mount_t		*mp = tdp->i_mount;
-	xfs_inode_t		*sip = xfs_vtoi(src_vp);
 	xfs_trans_t		*tp;
 	xfs_inode_t		*ips[2];
 	int			error;
@@ -2489,10 +2487,10 @@ xfs_link(
 	int			target_namelen;
 
 	xfs_itrace_entry(tdp);
-	xfs_itrace_entry(xfs_vtoi(src_vp));
+	xfs_itrace_entry(sip);
 
 	target_namelen = VNAMELEN(dentry);
-	ASSERT(!VN_ISDIR(src_vp));
+	ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2544,8 +2542,8 @@ xfs_link(
 	 * xfs_trans_cancel will both unlock the inodes and
 	 * decrement the associated ref counts.
 	 */
-	VN_HOLD(src_vp);
-	VN_HOLD(target_dir_vp);
+	IHOLD(sip);
+	IHOLD(tdp);
 	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
 
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 0acef1231417..79c13f57a819 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -28,7 +28,7 @@ int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
 int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
 		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
 int xfs_remove(struct xfs_inode *dp, bhv_vname_t	*dentry);
-int xfs_link(struct xfs_inode *tdp, bhv_vnode_t *src_vp,
+int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		bhv_vname_t *dentry);
 int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry,
 		mode_t mode, struct xfs_inode **ipp, struct cred *credp);
-- 
cgit v1.2.3


From 3937be5ba836a204d3d1df96b518eecd6cdacbb9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:19 +1100
Subject: [XFS] cleanup vnode use in xfs_symlink and xfs_rename

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30548a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 41 +++++++++++++++++++++--------------------
 fs/xfs/xfs_rename.c         | 20 +++++---------------
 fs/xfs/xfs_vnodeops.c       | 16 +++++-----------
 fs/xfs/xfs_vnodeops.h       |  4 ++--
 4 files changed, 33 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 1df48209d60a..215158cbac43 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -437,29 +437,33 @@ xfs_vn_symlink(
 	struct dentry	*dentry,
 	const char	*symname)
 {
-	struct inode	*ip;
-	bhv_vnode_t	*cvp;	/* used to lookup symlink to put in dentry */
+	struct inode	*inode;
+	struct xfs_inode *cip = NULL;
 	int		error;
 	mode_t		mode;
 
-	cvp = NULL;
-
 	mode = S_IFLNK |
 		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
 
 	error = xfs_symlink(XFS_I(dir), dentry, (char *)symname, mode,
-			    &cvp, NULL);
-	if (likely(!error && cvp)) {
-		error = xfs_init_security(cvp, dir);
-		if (likely(!error)) {
-			ip = vn_to_inode(cvp);
-			d_instantiate(dentry, ip);
-			xfs_validate_fields(dir);
-			xfs_validate_fields(ip);
-		} else {
-			xfs_cleanup_inode(dir, cvp, dentry, 0);
-		}
-	}
+			    &cip, NULL);
+	if (unlikely(error))
+		goto out;
+
+	inode = cip->i_vnode;
+
+	error = xfs_init_security(inode, dir);
+	if (unlikely(error))
+		goto out_cleanup_inode;
+
+	d_instantiate(dentry, inode);
+	xfs_validate_fields(dir);
+	xfs_validate_fields(inode);
+	return 0;
+
+ out_cleanup_inode:
+	xfs_cleanup_inode(dir, inode, dentry, 0);
+ out:
 	return -error;
 }
 
@@ -487,12 +491,9 @@ xfs_vn_rename(
 	struct dentry	*ndentry)
 {
 	struct inode	*new_inode = ndentry->d_inode;
-	bhv_vnode_t	*tvp;	/* target directory */
 	int		error;
 
-	tvp = vn_from_inode(ndir);
-
-	error = xfs_rename(XFS_I(odir), odentry, tvp, ndentry);
+	error = xfs_rename(XFS_I(odir), odentry, XFS_I(ndir), ndentry);
 	if (likely(!error)) {
 		if (new_inode)
 			xfs_validate_fields(new_inode);
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 6f80cfdfbd88..c4d0bac56a5a 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -219,12 +219,11 @@ int
 xfs_rename(
 	xfs_inode_t	*src_dp,
 	bhv_vname_t	*src_vname,
-	bhv_vnode_t	*target_dir_vp,
+	xfs_inode_t	*target_dp,
 	bhv_vname_t	*target_vname)
 {
-	bhv_vnode_t	*src_dir_vp = XFS_ITOV(src_dp);
 	xfs_trans_t	*tp;
-	xfs_inode_t	*target_dp, *src_ip, *target_ip;
+	xfs_inode_t	*src_ip, *target_ip;
 	xfs_mount_t	*mp = src_dp->i_mount;
 	int		new_parent;		/* moving to a new dir */
 	int		src_is_directory;	/* src_name is a directory */
@@ -244,16 +243,7 @@ xfs_rename(
 	int		target_namelen = VNAMELEN(target_vname);
 
 	xfs_itrace_entry(src_dp);
-	xfs_itrace_entry(xfs_vtoi(target_dir_vp));
-
-	/*
-	 * Find the XFS behavior descriptor for the target directory
-	 * vnode since it was not handed to us.
-	 */
-	target_dp = xfs_vtoi(target_dir_vp);
-	if (target_dp == NULL) {
-		return XFS_ERROR(EXDEV);
-	}
+	xfs_itrace_entry(target_dp);
 
 	if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
 	    DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
@@ -360,10 +350,10 @@ xfs_rename(
 	 * them when they unlock the inodes.  Also, we need to be careful
 	 * not to add an inode to the transaction more than once.
 	 */
-	VN_HOLD(src_dir_vp);
+	IHOLD(src_dp);
 	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
 	if (new_parent) {
-		VN_HOLD(target_dir_vp);
+		IHOLD(target_dp);
 		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
 	}
 	if ((src_ip != src_dp) && (src_ip != target_dp)) {
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 10d2d22eb037..fa694dc5d309 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3059,10 +3059,9 @@ xfs_symlink(
 	bhv_vname_t		*dentry,
 	char			*target_path,
 	mode_t			mode,
-	bhv_vnode_t		**vpp,
+	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t		*tp;
 	xfs_inode_t		*ip;
@@ -3088,7 +3087,7 @@ xfs_symlink(
 	char			*link_name = VNAME(dentry);
 	int			link_namelen;
 
-	*vpp = NULL;
+	*ipp = NULL;
 	error = 0;
 	ip = NULL;
 	tp = NULL;
@@ -3227,7 +3226,7 @@ xfs_symlink(
 	 * transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	VN_HOLD(dir_vp);
+	IHOLD(dp);
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
@@ -3343,13 +3342,8 @@ std_return:
 					0, error, 0);
 	}
 
-	if (!error) {
-		bhv_vnode_t *vp;
-
-		ASSERT(ip);
-		vp = XFS_ITOV(ip);
-		*vpp = vp;
-	}
+	if (!error)
+		*ipp = ip;
 	return error;
 
  error2:
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 79c13f57a819..71e9b15276f5 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -36,7 +36,7 @@ int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry);
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry,
-		char *target_path, mode_t mode, bhv_vnode_t **vpp,
+		char *target_path, mode_t mode, struct xfs_inode **ipp,
 		struct cred *credp);
 int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
@@ -45,7 +45,7 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset,
 		struct cred *credp, int	attr_flags);
 int xfs_rename(struct xfs_inode *src_dp, bhv_vname_t *src_vname,
-		bhv_vnode_t *target_dir_vp, bhv_vname_t *target_vname);
+		struct xfs_inode *target_dp, bhv_vname_t *target_vname);
 int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
 		int *valuelenp, int flags, cred_t *cred);
 int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
-- 
cgit v1.2.3


From ef1f5e7ad38e5414d016983a8cc5a8db7654a61d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:25 +1100
Subject: [XFS] cleanup vnode use in xfs_lookup

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30550a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_export.c | 9 ++++-----
 fs/xfs/linux-2.6/xfs_iops.c   | 6 +++---
 fs/xfs/xfs_vnodeops.c         | 4 ++--
 fs/xfs/xfs_vnodeops.h         | 2 +-
 4 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 21f0e8257590..66a9a9e76cbe 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -213,17 +213,16 @@ xfs_fs_get_parent(
 	struct dentry		*child)
 {
 	int			error;
-	bhv_vnode_t		*cvp;
+	struct xfs_inode	*cip;
 	struct dentry		*parent;
 
-	cvp = NULL;
-	error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cvp);
+	error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cip);
 	if (unlikely(error))
 		return ERR_PTR(-error);
 
-	parent = d_alloc_anon(vn_to_inode(cvp));
+	parent = d_alloc_anon(cip->i_vnode);
 	if (unlikely(!parent)) {
-		VN_RELE(cvp);
+		iput(cip->i_vnode);
 		return ERR_PTR(-ENOMEM);
 	}
 	return parent;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 215158cbac43..01d9b3f1e044 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -372,13 +372,13 @@ xfs_vn_lookup(
 	struct dentry	*dentry,
 	struct nameidata *nd)
 {
-	bhv_vnode_t	*cvp;
+	struct xfs_inode *cip;
 	int		error;
 
 	if (dentry->d_name.len >= MAXNAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	error = xfs_lookup(XFS_I(dir), dentry, &cvp);
+	error = xfs_lookup(XFS_I(dir), dentry, &cip);
 	if (unlikely(error)) {
 		if (unlikely(error != ENOENT))
 			return ERR_PTR(-error);
@@ -386,7 +386,7 @@ xfs_vn_lookup(
 		return NULL;
 	}
 
-	return d_splice_alias(vn_to_inode(cvp), dentry);
+	return d_splice_alias(cip->i_vnode, dentry);
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index fa694dc5d309..3418c94bcf17 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1763,7 +1763,7 @@ int
 xfs_lookup(
 	xfs_inode_t		*dp,
 	bhv_vname_t		*dentry,
-	bhv_vnode_t		**vpp)
+	xfs_inode_t		**ipp)
 {
 	xfs_inode_t		*ip;
 	xfs_ino_t		e_inum;
@@ -1778,7 +1778,7 @@ xfs_lookup(
 	lock_mode = xfs_ilock_map_shared(dp);
 	error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
 	if (!error) {
-		*vpp = XFS_ITOV(ip);
+		*ipp = ip;
 		xfs_itrace_ref(ip);
 	}
 	xfs_iunlock_map_shared(dp, lock_mode);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 71e9b15276f5..12e581865bdf 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -24,7 +24,7 @@ int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
-		bhv_vnode_t **vpp);
+		struct xfs_inode **ipp);
 int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
 		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
 int xfs_remove(struct xfs_inode *dp, bhv_vname_t	*dentry);
-- 
cgit v1.2.3


From dcf49cc5cfbbc0070ad4307428f8282dc7e04e58 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:37 +1100
Subject: [XFS] cleanup vnode use in xfs_lrw.c

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30551a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 01a8f26e1b17..1d95dca96cfe 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -176,7 +176,6 @@ xfs_read(
 {
 	struct file		*file = iocb->ki_filp;
 	struct inode		*inode = file->f_mapping->host;
-	bhv_vnode_t		*vp = XFS_ITOV(ip);
 	xfs_mount_t		*mp = ip->i_mount;
 	size_t			size = 0;
 	ssize_t			ret = 0;
@@ -242,7 +241,7 @@ xfs_read(
 	}
 
 	if (unlikely(ioflags & IO_ISDIRECT)) {
-		if (VN_CACHED(vp))
+		if (inode->i_mapping->nrpages)
 			ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
 						    -1, FI_REMAPF_LOCKED);
 		mutex_unlock(&inode->i_mutex);
@@ -571,7 +570,6 @@ xfs_write(
 	struct file		*file = iocb->ki_filp;
 	struct address_space	*mapping = file->f_mapping;
 	struct inode		*inode = mapping->host;
-	bhv_vnode_t		*vp = XFS_ITOV(xip);
 	unsigned long		segs = nsegs;
 	xfs_mount_t		*mp;
 	ssize_t			ret = 0, error = 0;
@@ -658,7 +656,7 @@ start:
 			return XFS_ERROR(-EINVAL);
 		}
 
-		if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
+		if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
 			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 			iolock = XFS_IOLOCK_EXCL;
 			need_i_mutex = 1;
@@ -720,7 +718,7 @@ retry:
 	current->backing_dev_info = mapping->backing_dev_info;
 
 	if ((ioflags & IO_ISDIRECT)) {
-		if (VN_CACHED(vp)) {
+		if (mapping->nrpages) {
 			WARN_ON(need_i_mutex == 0);
 			xfs_inval_cached_trace(xip, pos, -1,
 					(pos & PAGE_CACHE_MASK), -1);
-- 
cgit v1.2.3


From af048193fcfe2650e7ed3b1ab3d48b1ed0efb467 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:43 +1100
Subject: [XFS] cleanup vnode use in xfs_iops.c

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30552a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 01d9b3f1e044..53f8feb28e58 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,12 +62,11 @@ void
 xfs_synchronize_atime(
 	xfs_inode_t	*ip)
 {
-	bhv_vnode_t	*vp;
+	struct inode	*inode = ip->i_vnode;
 
-	vp = XFS_ITOV_NULL(ip);
-	if (vp) {
-		ip->i_d.di_atime.t_sec = (__int32_t)vp->i_atime.tv_sec;
-		ip->i_d.di_atime.t_nsec = (__int32_t)vp->i_atime.tv_nsec;
+	if (inode) {
+		ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+		ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
 	}
 }
 
@@ -80,11 +79,10 @@ void
 xfs_mark_inode_dirty_sync(
 	xfs_inode_t	*ip)
 {
-	bhv_vnode_t	*vp;
+	struct inode	*inode = ip->i_vnode;
 
-	vp = XFS_ITOV_NULL(ip);
-	if (vp)
-		mark_inode_dirty_sync(vn_to_inode(vp));
+	if (inode)
+		mark_inode_dirty_sync(inode);
 }
 
 /*
@@ -215,26 +213,26 @@ xfs_validate_fields(
  */
 STATIC int
 xfs_init_security(
-	bhv_vnode_t	*vp,
+	struct inode	*inode,
 	struct inode	*dir)
 {
-	struct inode	*ip = vn_to_inode(vp);
+	struct xfs_inode *ip = XFS_I(inode);
 	size_t		length;
 	void		*value;
 	char		*name;
 	int		error;
 
-	error = security_inode_init_security(ip, dir, &name, &value, &length);
+	error = security_inode_init_security(inode, dir, &name,
+					     &value, &length);
 	if (error) {
 		if (error == -EOPNOTSUPP)
 			return 0;
 		return -error;
 	}
 
-	error = xfs_attr_set(XFS_I(ip), name, value,
-			length, ATTR_SECURE);
+	error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
 	if (!error)
-		xfs_iflags_set(XFS_I(ip), XFS_IMODIFIED);
+		xfs_iflags_set(ip, XFS_IMODIFIED);
 
 	kfree(name);
 	kfree(value);
@@ -244,7 +242,7 @@ xfs_init_security(
 STATIC void
 xfs_cleanup_inode(
 	struct inode	*dir,
-	bhv_vnode_t	*vp,
+	struct inode	*inode,
 	struct dentry	*dentry,
 	int		mode)
 {
@@ -255,14 +253,14 @@ xfs_cleanup_inode(
 	 * xfs_init_security we must back out.
 	 * ENOSPC can hit here, among other things.
 	 */
-	teardown.d_inode = vn_to_inode(vp);
+	teardown.d_inode = inode;
 	teardown.d_name = dentry->d_name;
 
 	if (S_ISDIR(mode))
 		xfs_rmdir(XFS_I(dir), &teardown);
 	else
 		xfs_remove(XFS_I(dir), &teardown);
-	VN_RELE(vp);
+	iput(inode);
 }
 
 STATIC int
-- 
cgit v1.2.3


From 5f90150abad61b49dbb4a6ca1087fe0a75001ef9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:46:49 +1100
Subject: [XFS] cleanup vnode use in xfs_bmap.c

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30553a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_bmap.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 19aae13b7f95..bce8e3bd8ad1 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5772,7 +5772,6 @@ xfs_getbmap(
 	int			error;		/* return value */
 	__int64_t		fixlen;		/* length for -1 case */
 	int			i;		/* extent number */
-	bhv_vnode_t		*vp;		/* corresponding vnode */
 	int			lock;		/* lock state */
 	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
 	xfs_mount_t		*mp;		/* file system mount point */
@@ -5789,7 +5788,6 @@ xfs_getbmap(
 	int			bmapi_flags;	/* flags for xfs_bmapi */
 	__int32_t		oflags;		/* getbmapx bmv_oflags field */
 
-	vp = XFS_ITOV(ip);
 	mp = ip->i_mount;
 
 	whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
-- 
cgit v1.2.3


From dfa18b117974d7667a2d5b941853fac3f2e256db Mon Sep 17 00:00:00 2001
From: Niv Sardi <xaiki@sgi.com>
Date: Thu, 6 Mar 2008 13:49:26 +1100
Subject: [XFS] kill t_sema member of struct xfs_trans

It's completely unused so we might aswell kill it. Note that there is
another t_sema in struct xlog_ticket, which is used and actually an sv_t
despite the name. That one is left untouched by this patch.

SGI-PV: 971186
SGI-Modid: xfs-linux-melb:xfs-kern:30591a

Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_trans.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 7f40628d85c7..b5effce00089 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -341,7 +341,6 @@ typedef struct xfs_trans {
 	unsigned int		t_rtx_res;	/* # of rt extents resvd */
 	unsigned int		t_rtx_res_used;	/* # of resvd rt extents used */
 	xfs_log_ticket_t	t_ticket;	/* log mgr ticket */
-	sema_t			t_sema;		/* sema for commit completion */
 	xfs_lsn_t		t_lsn;		/* log seq num of start of
 						 * transaction. */
 	xfs_lsn_t		t_commit_lsn;	/* log seq num of end of
-- 
cgit v1.2.3


From a45c796867df8dabc8eed6e72898d7ba1609bd7e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 6 Mar 2008 13:49:36 +1100
Subject: [XFS] Remove superflous xfs_readsb call in xfs_mountfs.

When xfs_mountfs is called by xfs_mount xfs_readsb was called 35 lines
above unconditionally, so there is no need to try to read the superblock
if it's not present. If any other port doesn't have the superblock read at
this point it should just call it directly from it's xfs_mount equivalent.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30603a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_mount.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8ed164eb9544..41b690e6896c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -964,11 +964,6 @@ xfs_mountfs(
 	int		uuid_mounted = 0;
 	int		error = 0;
 
-	if (mp->m_sb_bp == NULL) {
-		error = xfs_readsb(mp, mfsi_flags);
-		if (error)
-			return error;
-	}
 	xfs_mount_common(mp, sbp);
 
 	/*
-- 
cgit v1.2.3


From 535f6b3735db6ef6026537bfe55ae00c3d9cc1ee Mon Sep 17 00:00:00 2001
From: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Date: Thu, 27 Mar 2008 17:58:27 +1100
Subject: [XFS] Replace custom AIL linked-list code with struct list_head

Replace the xfs_ail_entry_t with a struct list_head and clean the
surrounding code up. Also fixes a livelock in xfs_trans_first_push_ail()
by terminating the loop at the head of the list correctly.

SGI-PV: 978682
SGI-Modid: xfs-linux-melb:xfs-kern:30636a

Signed-off-by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_mount.h     |   2 +-
 fs/xfs/xfs_trans.h     |   7 +--
 fs/xfs/xfs_trans_ail.c | 149 ++++++++++++++++++++-----------------------------
 3 files changed, 62 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7b37fa009297..77b39f66cead 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -220,7 +220,7 @@ extern void	xfs_icsb_sync_counters_flags(struct xfs_mount *, int);
 #endif
 
 typedef struct xfs_ail {
-	xfs_ail_entry_t		xa_ail;
+	struct list_head	xa_ail;
 	uint			xa_gen;
 	struct task_struct	*xa_task;
 	xfs_lsn_t		xa_target;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b5effce00089..0804207c7391 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -113,13 +113,8 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
 
-typedef struct xfs_ail_entry {
-	struct xfs_log_item	*ail_forw;	/* AIL forw pointer */
-	struct xfs_log_item	*ail_back;	/* AIL back pointer */
-} xfs_ail_entry_t;
-
 typedef struct xfs_log_item {
-	xfs_ail_entry_t			li_ail;		/* AIL pointers */
+	struct list_head		li_ail;		/* AIL pointers */
 	xfs_lsn_t			li_lsn;		/* last on-disk lsn */
 	struct xfs_log_item_desc	*li_desc;	/* ptr to current desc*/
 	struct xfs_mount		*li_mountp;	/* ptr to fs mount */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 76d470d8a1e6..13235ae9a582 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,13 +28,13 @@
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
 
-STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *);
-STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *);
+STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *);
 
 #ifdef DEBUG
-STATIC void xfs_ail_check(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
 #else
 #define	xfs_ail_check(a,l)
 #endif /* DEBUG */
@@ -57,7 +57,7 @@ xfs_trans_tail_ail(
 	xfs_log_item_t	*lip;
 
 	spin_lock(&mp->m_ail_lock);
-	lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+	lip = xfs_ail_min(&mp->m_ail);
 	if (lip == NULL) {
 		lsn = (xfs_lsn_t)0;
 	} else {
@@ -91,7 +91,7 @@ xfs_trans_push_ail(
 {
 	xfs_log_item_t		*lip;
 
-	lip = xfs_ail_min(&mp->m_ail.xa_ail);
+	lip = xfs_ail_min(&mp->m_ail);
 	if (lip && !XFS_FORCED_SHUTDOWN(mp)) {
 		if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0)
 			xfsaild_wakeup(mp, threshold_lsn);
@@ -111,15 +111,17 @@ xfs_trans_first_push_ail(
 {
 	xfs_log_item_t	*lip;
 
-	lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+	lip = xfs_ail_min(&mp->m_ail);
 	*gen = (int)mp->m_ail.xa_gen;
 	if (lsn == 0)
 		return lip;
 
-	while (lip && (XFS_LSN_CMP(lip->li_lsn, lsn) < 0))
-		lip = lip->li_ail.ail_forw;
+	list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) {
+		if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
+			return lip;
+	}
 
-	return lip;
+	return NULL;
 }
 
 /*
@@ -329,7 +331,7 @@ xfs_trans_unlocked_item(
 	 * the call to xfs_log_move_tail() doesn't do anything if there's
 	 * not enough free space to wake people up so we're safe calling it.
 	 */
-	min_lip = xfs_ail_min(&mp->m_ail.xa_ail);
+	min_lip = xfs_ail_min(&mp->m_ail);
 
 	if (min_lip == lip)
 		xfs_log_move_tail(mp, 1);
@@ -357,15 +359,13 @@ xfs_trans_update_ail(
 	xfs_log_item_t	*lip,
 	xfs_lsn_t	lsn) __releases(mp->m_ail_lock)
 {
-	xfs_ail_entry_t		*ailp;
 	xfs_log_item_t		*dlip=NULL;
 	xfs_log_item_t		*mlip;	/* ptr to minimum lip */
 
-	ailp = &(mp->m_ail.xa_ail);
-	mlip = xfs_ail_min(ailp);
+	mlip = xfs_ail_min(&mp->m_ail);
 
 	if (lip->li_flags & XFS_LI_IN_AIL) {
-		dlip = xfs_ail_delete(ailp, lip);
+		dlip = xfs_ail_delete(&mp->m_ail, lip);
 		ASSERT(dlip == lip);
 	} else {
 		lip->li_flags |= XFS_LI_IN_AIL;
@@ -373,11 +373,11 @@ xfs_trans_update_ail(
 
 	lip->li_lsn = lsn;
 
-	xfs_ail_insert(ailp, lip);
+	xfs_ail_insert(&mp->m_ail, lip);
 	mp->m_ail.xa_gen++;
 
 	if (mlip == dlip) {
-		mlip = xfs_ail_min(&(mp->m_ail.xa_ail));
+		mlip = xfs_ail_min(&mp->m_ail);
 		spin_unlock(&mp->m_ail_lock);
 		xfs_log_move_tail(mp, mlip->li_lsn);
 	} else {
@@ -407,14 +407,12 @@ xfs_trans_delete_ail(
 	xfs_mount_t	*mp,
 	xfs_log_item_t	*lip) __releases(mp->m_ail_lock)
 {
-	xfs_ail_entry_t		*ailp;
 	xfs_log_item_t		*dlip;
 	xfs_log_item_t		*mlip;
 
 	if (lip->li_flags & XFS_LI_IN_AIL) {
-		ailp = &(mp->m_ail.xa_ail);
-		mlip = xfs_ail_min(ailp);
-		dlip = xfs_ail_delete(ailp, lip);
+		mlip = xfs_ail_min(&mp->m_ail);
+		dlip = xfs_ail_delete(&mp->m_ail, lip);
 		ASSERT(dlip == lip);
 
 
@@ -423,7 +421,7 @@ xfs_trans_delete_ail(
 		mp->m_ail.xa_gen++;
 
 		if (mlip == dlip) {
-			mlip = xfs_ail_min(&(mp->m_ail.xa_ail));
+			mlip = xfs_ail_min(&mp->m_ail);
 			spin_unlock(&mp->m_ail_lock);
 			xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
 		} else {
@@ -461,7 +459,7 @@ xfs_trans_first_ail(
 {
 	xfs_log_item_t	*lip;
 
-	lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+	lip = xfs_ail_min(&mp->m_ail);
 	*gen = (int)mp->m_ail.xa_gen;
 
 	return lip;
@@ -485,9 +483,9 @@ xfs_trans_next_ail(
 
 	ASSERT(mp && lip && gen);
 	if (mp->m_ail.xa_gen == *gen) {
-		nlip = xfs_ail_next(&(mp->m_ail.xa_ail), lip);
+		nlip = xfs_ail_next(&mp->m_ail, lip);
 	} else {
-		nlip = xfs_ail_min(&(mp->m_ail).xa_ail);
+		nlip = xfs_ail_min(&mp->m_ail);
 		*gen = (int)mp->m_ail.xa_gen;
 		if (restarts != NULL) {
 			XFS_STATS_INC(xs_push_ail_restarts);
@@ -517,8 +515,7 @@ int
 xfs_trans_ail_init(
 	xfs_mount_t	*mp)
 {
-	mp->m_ail.xa_ail.ail_forw = (xfs_log_item_t*)&mp->m_ail.xa_ail;
-	mp->m_ail.xa_ail.ail_back = (xfs_log_item_t*)&mp->m_ail.xa_ail;
+	INIT_LIST_HEAD(&mp->m_ail.xa_ail);
 	return xfsaild_start(mp);
 }
 
@@ -537,7 +534,7 @@ xfs_trans_ail_destroy(
  */
 STATIC void
 xfs_ail_insert(
-	xfs_ail_entry_t	*base,
+	xfs_ail_t	*ailp,
 	xfs_log_item_t	*lip)
 /* ARGSUSED */
 {
@@ -546,27 +543,22 @@ xfs_ail_insert(
 	/*
 	 * If the list is empty, just insert the item.
 	 */
-	if (base->ail_back == (xfs_log_item_t*)base) {
-		base->ail_forw = lip;
-		base->ail_back = lip;
-		lip->li_ail.ail_forw = (xfs_log_item_t*)base;
-		lip->li_ail.ail_back = (xfs_log_item_t*)base;
+	if (list_empty(&ailp->xa_ail)) {
+		list_add(&lip->li_ail, &ailp->xa_ail);
 		return;
 	}
 
-	next_lip = base->ail_back;
-	while ((next_lip != (xfs_log_item_t*)base) &&
-	       (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) {
-		next_lip = next_lip->li_ail.ail_back;
+	list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
+		if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
+			break;
 	}
-	ASSERT((next_lip == (xfs_log_item_t*)base) ||
+
+	ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
 	       (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
-	lip->li_ail.ail_forw = next_lip->li_ail.ail_forw;
-	lip->li_ail.ail_back = next_lip;
-	next_lip->li_ail.ail_forw = lip;
-	lip->li_ail.ail_forw->li_ail.ail_back = lip;
 
-	xfs_ail_check(base, lip);
+	list_add(&lip->li_ail, &next_lip->li_ail);
+
+	xfs_ail_check(ailp, lip);
 	return;
 }
 
@@ -576,15 +568,13 @@ xfs_ail_insert(
 /*ARGSUSED*/
 STATIC xfs_log_item_t *
 xfs_ail_delete(
-	xfs_ail_entry_t	*base,
+	xfs_ail_t	*ailp,
 	xfs_log_item_t	*lip)
 /* ARGSUSED */
 {
-	xfs_ail_check(base, lip);
-	lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back;
-	lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw;
-	lip->li_ail.ail_forw = NULL;
-	lip->li_ail.ail_back = NULL;
+	xfs_ail_check(ailp, lip);
+
+	list_del(&lip->li_ail);
 
 	return lip;
 }
@@ -595,14 +585,13 @@ xfs_ail_delete(
  */
 STATIC xfs_log_item_t *
 xfs_ail_min(
-	xfs_ail_entry_t	*base)
+	xfs_ail_t	*ailp)
 /* ARGSUSED */
 {
-	register xfs_log_item_t *forw = base->ail_forw;
-	if (forw == (xfs_log_item_t*)base) {
+	if (list_empty(&ailp->xa_ail))
 		return NULL;
-	}
-	return forw;
+
+	return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
 }
 
 /*
@@ -612,15 +601,14 @@ xfs_ail_min(
  */
 STATIC xfs_log_item_t *
 xfs_ail_next(
-	xfs_ail_entry_t	*base,
+	xfs_ail_t	*ailp,
 	xfs_log_item_t	*lip)
 /* ARGSUSED */
 {
-	if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) {
+	if (lip->li_ail.next == &ailp->xa_ail)
 		return NULL;
-	}
-	return lip->li_ail.ail_forw;
 
+	return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
 }
 
 #ifdef DEBUG
@@ -629,57 +617,40 @@ xfs_ail_next(
  */
 STATIC void
 xfs_ail_check(
-	xfs_ail_entry_t *base,
+	xfs_ail_t 	*ailp,
 	xfs_log_item_t	*lip)
 {
 	xfs_log_item_t	*prev_lip;
 
-	prev_lip = base->ail_forw;
-	if (prev_lip == (xfs_log_item_t*)base) {
-		/*
-		 * Make sure the pointers are correct when the list
-		 * is empty.
-		 */
-		ASSERT(base->ail_back == (xfs_log_item_t*)base);
+	if (list_empty(&ailp->xa_ail))
 		return;
-	}
 
 	/*
 	 * Check the next and previous entries are valid.
 	 */
 	ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
-	prev_lip = lip->li_ail.ail_back;
-	if (prev_lip != (xfs_log_item_t*)base) {
-		ASSERT(prev_lip->li_ail.ail_forw == lip);
+	prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
+	if (&prev_lip->li_ail != &ailp->xa_ail)
 		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-	}
-	prev_lip = lip->li_ail.ail_forw;
-	if (prev_lip != (xfs_log_item_t*)base) {
-		ASSERT(prev_lip->li_ail.ail_back == lip);
+
+	prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
+	if (&prev_lip->li_ail != &ailp->xa_ail)
 		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
-	}
 
 
 #ifdef XFS_TRANS_DEBUG
 	/*
-	 * Walk the list checking forward and backward pointers,
-	 * lsn ordering, and that every entry has the XFS_LI_IN_AIL
-	 * flag set. This is really expensive, so only do it when
-	 * specifically debugging the transaction subsystem.
+	 * Walk the list checking lsn ordering, and that every entry has the
+	 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
+	 * when specifically debugging the transaction subsystem.
 	 */
-	prev_lip = (xfs_log_item_t*)base;
-	while (lip != (xfs_log_item_t*)base) {
-		if (prev_lip != (xfs_log_item_t*)base) {
-			ASSERT(prev_lip->li_ail.ail_forw == lip);
+	prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
+	list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
+		if (&prev_lip->li_ail != &ailp->xa_ail)
 			ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-		}
-		ASSERT(lip->li_ail.ail_back == prev_lip);
 		ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
 		prev_lip = lip;
-		lip = lip->li_ail.ail_forw;
 	}
-	ASSERT(lip == (xfs_log_item_t*)base);
-	ASSERT(base->ail_back == prev_lip);
 #endif /* XFS_TRANS_DEBUG */
 }
 #endif /* DEBUG */
-- 
cgit v1.2.3


From 75de2a91c98a6f486f261c1367fe59f5583e15a3 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 27 Mar 2008 18:00:38 +1100
Subject: [XFS] Account for inode cluster alignment in all allocations

At ENOSPC, we can get a filesystem shutdown due to a cancelling a dirty
transaction in xfs_mkdir or xfs_create. This is due to the initial
allocation attempt not taking into account inode alignment and hence we
can prepare the AGF freelist for allocation when it's not actually
possible to do an allocation. This results in inode allocation returning
ENOSPC with a dirty transaction, and hence we shut down the filesystem.

Because the first allocation is an exact allocation attempt, we must tell
the allocator that the alignment does not affect the allocation attempt.
i.e. we will accept any extent alignment as long as the extent starts at
the block we want. Unfortunately, this means that if the longest free
extent is less than the length + alignment necessary for fallback
allocation attempts but is long enough to attempt a non-aligned
allocation, we will modify the free list.

If we then have the exact allocation fail, all other allocation attempts
will also fail due to the alignment constraint being taken into account.
Hence the initial attempt needs to set the "alignment slop" field so that
alignment, while not required, must be taken into account when determining
if there is enough space left in the AG to do the allocation.

That means if the exact allocation fails, we will not dirty the freelist
if there is not enough space available fo a subsequent allocation to
succeed. Hence we get an ENOSPC error back to userspace without shutting
down the filesystem.

SGI-PV: 978886
SGI-Modid: xfs-linux-melb:xfs-kern:30699a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5a146cb22980..a64dfbd565a5 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -107,6 +107,16 @@ xfs_ialloc_log_di(
 /*
  * Allocation group level functions.
  */
+static inline int
+xfs_ialloc_cluster_alignment(
+	xfs_alloc_arg_t	*args)
+{
+	if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
+	    args->mp->m_sb.sb_inoalignmt >=
+	     XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
+		return args->mp->m_sb.sb_inoalignmt;
+	return 1;
+}
 
 /*
  * Allocate new inodes in the allocation group specified by agbp.
@@ -167,10 +177,24 @@ xfs_ialloc_ag_alloc(
 		args.mod = args.total = args.wasdel = args.isfl =
 			args.userdata = args.minalignslop = 0;
 		args.prod = 1;
-		args.alignment = 1;
+
 		/*
-		 * Allow space for the inode btree to split.
+		 * We need to take into account alignment here to ensure that
+		 * we don't modify the free list if we fail to have an exact
+		 * block. If we don't have an exact match, and every oher
+		 * attempt allocation attempt fails, we'll end up cancelling
+		 * a dirty transaction and shutting down.
+		 *
+		 * For an exact allocation, alignment must be 1,
+		 * however we need to take cluster alignment into account when
+		 * fixing up the freelist. Use the minalignslop field to
+		 * indicate that extra blocks might be required for alignment,
+		 * but not to use them in the actual exact allocation.
 		 */
+		args.alignment = 1;
+		args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+
+		/* Allow space for the inode btree to split. */
 		args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
@@ -191,13 +215,8 @@ xfs_ialloc_ag_alloc(
 			ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
 			args.alignment = args.mp->m_dalign;
 			isaligned = 1;
-		} else if (xfs_sb_version_hasalign(&args.mp->m_sb) &&
-			   args.mp->m_sb.sb_inoalignmt >=
-			   XFS_B_TO_FSBT(args.mp,
-			  	XFS_INODE_CLUSTER_SIZE(args.mp)))
-				args.alignment = args.mp->m_sb.sb_inoalignmt;
-		else
-			args.alignment = 1;
+		} else
+			args.alignment = xfs_ialloc_cluster_alignment(&args);
 		/*
 		 * Need to figure out where to allocate the inode blocks.
 		 * Ideally they should be spaced out through the a.g.
@@ -230,12 +249,7 @@ xfs_ialloc_ag_alloc(
 		args.agbno = be32_to_cpu(agi->agi_root);
 		args.fsbno = XFS_AGB_TO_FSB(args.mp,
 				be32_to_cpu(agi->agi_seqno), args.agbno);
-		if (xfs_sb_version_hasalign(&args.mp->m_sb) &&
-			args.mp->m_sb.sb_inoalignmt >=
-			XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
-				args.alignment = args.mp->m_sb.sb_inoalignmt;
-		else
-			args.alignment = 1;
+		args.alignment = xfs_ialloc_cluster_alignment(&args);
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 	}
-- 
cgit v1.2.3


From 59a33f9f776b051018ec98af95bd9fe8ba9d0f3e Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 27 Mar 2008 18:00:45 +1100
Subject: [XFS] Ensure a btree insert returns a valid cursor.

When writing into preallocated regions there is a case where XFS can oops
or hang doing the unwritten extent conversion on I/O completion. It turns
out that the problem is related to the btree cursor being invalid.

When we do an insert into the tree, we may need to split blocks in the
tree. When we only split at the leaf level (i.e. level 0), everything
works just fine. However, if we have a multi-level split in the btreee,
the cursor passed to the insert function is no longer valid once the
insert is complete.

The leaf level split is handled correctly because all the operations at
level 0 are done using the original cursor, hence it is updated correctly.
However, when we need to update the next level up the tree, we don't use
that cursor - we use a cloned cursor that points to the index in the next
level up where we need to do the insert.

Hence if we need to split a second level, the changes to the tree are
reflected in the cloned cursor and not the original cursor. This
clone-and-move-up-a-level-on-split behaviour recurses all the way to the
top of the tree.

The complexity here is that these cloned cursors do not point to the
original index that was inserted - they point to the newly allocated block
(the right block) and the original cursor pointer to that level may still
point to the left block. Hence, without deep examination of the cloned
cursor and buffers, we cannot update the original cursor with the new path
from the cloned cursor.

In these cases the original cursor could be pointing to the wrong block(s)
and hence a subsequent modification to the tree using that cursor will
lead to corruption of the tree.

The crash case occurs when the tree changes height - we insert a new level
in the tree, and the cursor does not have a buffer in it's path for that
level. Hence any attempt to walk back up the cursor to the root block will
result in a null pointer dereference.

To make matters even more complex, the BMAP BT is rooted in an inode, so
we can have a change of height in the btree *without a root split*. That
is, if the root block in the inode is full when we split a leaf node, we
cannot fit the pointer to the new block in the root, so we allocate a new
block, migrate all the ptrs out of the inode into the new block and point
the inode root block at the newly allocated block. This changes the height
of the tree without a root split having occurred and hence invalidates the
path in the original cursor.

The patch below prevents xfs_bmbt_insert() from returning with an invalid
cursor by detecting the cases that invalidate the original cursor and
refresh it by do a lookup into the btree for the original index we were
inserting at.

Note that the INOBT, AGFBNO and AGFCNT btree implementations also have
this bug, but the cursor is currently always destroyed or revalidated
after an insert for those trees. Hence this patch only address the problem
in the BMBT code.

SGI-PV: 979339
SGI-Modid: xfs-linux-melb:xfs-kern:30701a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_bmap_btree.c | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bd18987326a3..93470b728dd0 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -2027,6 +2027,24 @@ xfs_bmbt_increment(
 
 /*
  * Insert the current record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor. It appears, however, that some callers assume that the cursor is
+ * always valid. Hence if we do a multi-level split we need to revalidate the
+ * cursor.
+ *
+ * When a split occurs, we will see a new cursor returned. Use that as a
+ * trigger to determine if we need to revalidate the original cursor. If we get
+ * a split, then use the original irec to lookup up the path of the record we
+ * just inserted.
+ *
+ * Note that the fact that the btree root is in the inode means that we can
+ * have the level of the tree change without a "split" occurring at the root
+ * level. What happens is that the root is migrated to an allocated block and
+ * the inode root is pointed to it. This means a single split can change the
+ * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence
+ * the level change should be accounted as a split so as to correctly trigger a
+ * revalidation of the old cursor.
  */
 int					/* error */
 xfs_bmbt_insert(
@@ -2039,11 +2057,14 @@ xfs_bmbt_insert(
 	xfs_fsblock_t	nbno;
 	xfs_btree_cur_t	*ncur;
 	xfs_bmbt_rec_t	nrec;
+	xfs_bmbt_irec_t	oirec;		/* original irec */
 	xfs_btree_cur_t	*pcur;
+	int		splits = 0;
 
 	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
 	level = 0;
 	nbno = NULLFSBLOCK;
+	oirec = cur->bc_rec.b;
 	xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
 	ncur = NULL;
 	pcur = cur;
@@ -2052,11 +2073,13 @@ xfs_bmbt_insert(
 				&i))) {
 			if (pcur != cur)
 				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-			return error;
+			goto error0;
 		}
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
+			/* allocating a new root is effectively a split */
+			if (cur->bc_nlevels != pcur->bc_nlevels)
+				splits++;
 			cur->bc_nlevels = pcur->bc_nlevels;
 			cur->bc_private.b.allocated +=
 				pcur->bc_private.b.allocated;
@@ -2070,10 +2093,21 @@ xfs_bmbt_insert(
 			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
 		}
 		if (ncur) {
+			splits++;
 			pcur = ncur;
 			ncur = NULL;
 		}
 	} while (nbno != NULLFSBLOCK);
+
+	if (splits > 1) {
+		/* revalidate the old cursor as we had a multi-level split */
+		error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff,
+				oirec.br_startblock, oirec.br_blockcount, &i);
+		if (error)
+			goto error0;
+		ASSERT(i == 1);
+	}
+
 	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
 	*stat = i;
 	return 0;
-- 
cgit v1.2.3


From f3dcc13f6fa20af1171eac7a537a4b89b1a84849 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 27 Mar 2008 18:00:54 +1100
Subject: [XFS] cleanup root inode handling in xfs_fs_fill_super

- rename rootvp to root for clarify
- remove useless vn_to_inode call
- check is_bad_inode before calling d_alloc_root
- use iput instead of VN_RELE in the error case

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30708a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index cb9ce90d1deb..72e55db948d2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1307,7 +1307,7 @@ xfs_fs_fill_super(
 	void			*data,
 	int			silent)
 {
-	struct inode		*rootvp;
+	struct inode		*root;
 	struct xfs_mount	*mp = NULL;
 	struct xfs_mount_args	*args = xfs_args_allocate(sb, silent);
 	int			error;
@@ -1345,19 +1345,18 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	rootvp = igrab(mp->m_rootip->i_vnode);
-	if (!rootvp) {
+	root = igrab(mp->m_rootip->i_vnode);
+	if (!root) {
 		error = ENOENT;
 		goto fail_unmount;
 	}
-
-	sb->s_root = d_alloc_root(vn_to_inode(rootvp));
-	if (!sb->s_root) {
-		error = ENOMEM;
+	if (is_bad_inode(root)) {
+		error = EINVAL;
 		goto fail_vnrele;
 	}
-	if (is_bad_inode(sb->s_root->d_inode)) {
-		error = EINVAL;
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		error = ENOMEM;
 		goto fail_vnrele;
 	}
 
@@ -1379,7 +1378,7 @@ fail_vnrele:
 		dput(sb->s_root);
 		sb->s_root = NULL;
 	} else {
-		VN_RELE(rootvp);
+		iput(root);
 	}
 
 fail_unmount:
-- 
cgit v1.2.3


From df26cfe849d8fd767b26fcd4bfebfff67bda9f3a Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>
Date: Fri, 18 Apr 2008 11:44:03 +1000
Subject: [XFS] split xfs_ioc_xattr

The three subcases of xfs_ioc_xattr don't share any semantics and almost
no code, so split it into three separate helpers.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30709a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 368 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 312 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f34bd010eb51..c6399b2cf17c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1179,85 +1179,85 @@ xfs_ioc_fsgetxattr(
 }
 
 STATIC int
-xfs_ioc_xattr(
+xfs_ioc_fssetxattr(
 	xfs_inode_t		*ip,
 	struct file		*filp,
-	unsigned int		cmd,
 	void			__user *arg)
 {
 	struct fsxattr		fa;
 	struct bhv_vattr	*vattr;
-	int			error = 0;
+	int			error;
 	int			attr_flags;
-	unsigned int		flags;
+
+	if (copy_from_user(&fa, arg, sizeof(fa)))
+		return -EFAULT;
 
 	vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
 	if (unlikely(!vattr))
 		return -ENOMEM;
 
-	switch (cmd) {
-	case XFS_IOC_FSSETXATTR: {
-		if (copy_from_user(&fa, arg, sizeof(fa))) {
-			error = -EFAULT;
-			break;
-		}
+	attr_flags = 0;
+	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+		attr_flags |= ATTR_NONBLOCK;
 
-		attr_flags = 0;
-		if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-			attr_flags |= ATTR_NONBLOCK;
+	vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
+	vattr->va_xflags  = fa.fsx_xflags;
+	vattr->va_extsize = fa.fsx_extsize;
+	vattr->va_projid  = fa.fsx_projid;
 
-		vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
-		vattr->va_xflags  = fa.fsx_xflags;
-		vattr->va_extsize = fa.fsx_extsize;
-		vattr->va_projid  = fa.fsx_projid;
+	error = -xfs_setattr(ip, vattr, attr_flags, NULL);
+	if (!error)
+		vn_revalidate(XFS_ITOV(ip));	/* update flags */
+	kfree(vattr);
+	return 0;
+}
 
-		error = xfs_setattr(ip, vattr, attr_flags, NULL);
-		if (likely(!error))
-			vn_revalidate(XFS_ITOV(ip));	/* update flags */
-		error = -error;
-		break;
-	}
+STATIC int
+xfs_ioc_getxflags(
+	xfs_inode_t		*ip,
+	void			__user *arg)
+{
+	unsigned int		flags;
 
-	case XFS_IOC_GETXFLAGS: {
-		flags = xfs_di2lxflags(ip->i_d.di_flags);
-		if (copy_to_user(arg, &flags, sizeof(flags)))
-			error = -EFAULT;
-		break;
-	}
+	flags = xfs_di2lxflags(ip->i_d.di_flags);
+	if (copy_to_user(arg, &flags, sizeof(flags)))
+		return -EFAULT;
+	return 0;
+}
 
-	case XFS_IOC_SETXFLAGS: {
-		if (copy_from_user(&flags, arg, sizeof(flags))) {
-			error = -EFAULT;
-			break;
-		}
+STATIC int
+xfs_ioc_setxflags(
+	xfs_inode_t		*ip,
+	struct file		*filp,
+	void			__user *arg)
+{
+	struct bhv_vattr	*vattr;
+	unsigned int		flags;
+	int			attr_flags;
+	int			error;
 
-		if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
-			      FS_NOATIME_FL | FS_NODUMP_FL | \
-			      FS_SYNC_FL)) {
-			error = -EOPNOTSUPP;
-			break;
-		}
+	if (copy_from_user(&flags, arg, sizeof(flags)))
+		return -EFAULT;
 
-		attr_flags = 0;
-		if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-			attr_flags |= ATTR_NONBLOCK;
+	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+		      FS_NOATIME_FL | FS_NODUMP_FL | \
+		      FS_SYNC_FL))
+		return -EOPNOTSUPP;
 
-		vattr->va_mask = XFS_AT_XFLAGS;
-		vattr->va_xflags = xfs_merge_ioc_xflags(flags,
-							xfs_ip2xflags(ip));
+	vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
+	if (unlikely(!vattr))
+		return -ENOMEM;
 
-		error = xfs_setattr(ip, vattr, attr_flags, NULL);
-		if (likely(!error))
-			vn_revalidate(XFS_ITOV(ip));	/* update flags */
-		error = -error;
-		break;
-	}
+	attr_flags = 0;
+	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+		attr_flags |= ATTR_NONBLOCK;
 
-	default:
-		error = -ENOTTY;
-		break;
-	}
+	vattr->va_mask = XFS_AT_XFLAGS;
+	vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
 
+	error = -xfs_setattr(ip, vattr, attr_flags, NULL);
+	if (likely(!error))
+		vn_revalidate(XFS_ITOV(ip));	/* update flags */
 	kfree(vattr);
 	return error;
 }
@@ -1332,3 +1332,259 @@ xfs_ioc_getbmapx(
 
 	return 0;
 }
+
+int
+xfs_ioctl(
+	xfs_inode_t		*ip,
+	struct file		*filp,
+	int			ioflags,
+	unsigned int		cmd,
+	void			__user *arg)
+{
+	struct inode		*inode = filp->f_path.dentry->d_inode;
+	xfs_mount_t		*mp = ip->i_mount;
+	int			error;
+
+	xfs_itrace_entry(XFS_I(inode));
+	switch (cmd) {
+
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP64:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP64:
+		/*
+		 * Only allow the sys admin to reserve space unless
+		 * unwritten extents are enabled.
+		 */
+		if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
+		    !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
+
+	case XFS_IOC_DIOINFO: {
+		struct dioattr	da;
+		xfs_buftarg_t	*target =
+			XFS_IS_REALTIME_INODE(ip) ?
+			mp->m_rtdev_targp : mp->m_ddev_targp;
+
+		da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
+		da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
+
+		if (copy_to_user(arg, &da, sizeof(da)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_FSBULKSTAT_SINGLE:
+	case XFS_IOC_FSBULKSTAT:
+	case XFS_IOC_FSINUMBERS:
+		return xfs_ioc_bulkstat(mp, cmd, arg);
+
+	case XFS_IOC_FSGEOMETRY_V1:
+		return xfs_ioc_fsgeometry_v1(mp, arg);
+
+	case XFS_IOC_FSGEOMETRY:
+		return xfs_ioc_fsgeometry(mp, arg);
+
+	case XFS_IOC_GETVERSION:
+		return put_user(inode->i_generation, (int __user *)arg);
+
+	case XFS_IOC_FSGETXATTR:
+		return xfs_ioc_fsgetxattr(ip, 0, arg);
+	case XFS_IOC_FSGETXATTRA:
+		return xfs_ioc_fsgetxattr(ip, 1, arg);
+	case XFS_IOC_FSSETXATTR:
+		return xfs_ioc_fssetxattr(ip, filp, arg);
+	case XFS_IOC_GETXFLAGS:
+		return xfs_ioc_getxflags(ip, arg);
+	case XFS_IOC_SETXFLAGS:
+		return xfs_ioc_setxflags(ip, filp, arg);
+
+	case XFS_IOC_FSSETDM: {
+		struct fsdmidata	dmi;
+
+		if (copy_from_user(&dmi, arg, sizeof(dmi)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
+				dmi.fsd_dmstate);
+		return -error;
+	}
+
+	case XFS_IOC_GETBMAP:
+	case XFS_IOC_GETBMAPA:
+		return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+
+	case XFS_IOC_GETBMAPX:
+		return xfs_ioc_getbmapx(ip, arg);
+
+	case XFS_IOC_FD_TO_HANDLE:
+	case XFS_IOC_PATH_TO_HANDLE:
+	case XFS_IOC_PATH_TO_FSHANDLE:
+		return xfs_find_handle(cmd, arg);
+
+	case XFS_IOC_OPEN_BY_HANDLE:
+		return xfs_open_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_FSSETDM_BY_HANDLE:
+		return xfs_fssetdm_by_handle(mp, arg, inode);
+
+	case XFS_IOC_READLINK_BY_HANDLE:
+		return xfs_readlink_by_handle(mp, arg, inode);
+
+	case XFS_IOC_ATTRLIST_BY_HANDLE:
+		return xfs_attrlist_by_handle(mp, arg, inode);
+
+	case XFS_IOC_ATTRMULTI_BY_HANDLE:
+		return xfs_attrmulti_by_handle(mp, arg, inode);
+
+	case XFS_IOC_SWAPEXT: {
+		error = xfs_swapext((struct xfs_swapext __user *)arg);
+		return -error;
+	}
+
+	case XFS_IOC_FSCOUNTS: {
+		xfs_fsop_counts_t out;
+
+		error = xfs_fs_counts(mp, &out);
+		if (error)
+			return -error;
+
+		if (copy_to_user(arg, &out, sizeof(out)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_SET_RESBLKS: {
+		xfs_fsop_resblks_t inout;
+		__uint64_t	   in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&inout, arg, sizeof(inout)))
+			return -XFS_ERROR(EFAULT);
+
+		/* input parameter is passed in resblks field of structure */
+		in = inout.resblks;
+		error = xfs_reserve_blocks(mp, &in, &inout);
+		if (error)
+			return -error;
+
+		if (copy_to_user(arg, &inout, sizeof(inout)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_GET_RESBLKS: {
+		xfs_fsop_resblks_t out;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		error = xfs_reserve_blocks(mp, NULL, &out);
+		if (error)
+			return -error;
+
+		if (copy_to_user(arg, &out, sizeof(out)))
+			return -XFS_ERROR(EFAULT);
+
+		return 0;
+	}
+
+	case XFS_IOC_FSGROWFSDATA: {
+		xfs_growfs_data_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_data(mp, &in);
+		return -error;
+	}
+
+	case XFS_IOC_FSGROWFSLOG: {
+		xfs_growfs_log_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_log(mp, &in);
+		return -error;
+	}
+
+	case XFS_IOC_FSGROWFSRT: {
+		xfs_growfs_rt_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_rt(mp, &in);
+		return -error;
+	}
+
+	case XFS_IOC_FREEZE:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (inode->i_sb->s_frozen == SB_UNFROZEN)
+			freeze_bdev(inode->i_sb->s_bdev);
+		return 0;
+
+	case XFS_IOC_THAW:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		if (inode->i_sb->s_frozen != SB_UNFROZEN)
+			thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
+		return 0;
+
+	case XFS_IOC_GOINGDOWN: {
+		__uint32_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (get_user(in, (__uint32_t __user *)arg))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_fs_goingdown(mp, in);
+		return -error;
+	}
+
+	case XFS_IOC_ERROR_INJECTION: {
+		xfs_error_injection_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_errortag_add(in.errtag, mp);
+		return -error;
+	}
+
+	case XFS_IOC_ERROR_CLEARALL:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		error = xfs_errortag_clearall(mp, 1);
+		return -error;
+
+	default:
+		return -ENOTTY;
+	}
+}
-- 
cgit v1.2.3


From 433550990e6c2e94995239bac6a52b4df454cae0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 27 Mar 2008 18:01:08 +1100
Subject: [XFS] remove most calls to VN_RELE

Most VN_RELE calls either directly contain a XFS_ITOV or have the
corresponding xfs_inode already in scope. Use the IRELE helper instead of
VN_RELE to clarify the code. With a little more work we can kill VN_RELE
altogether and define IRELE in terms of iput directly.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30710a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_qm.c          |  6 +++---
 fs/xfs/quota/xfs_qm_syscalls.c | 10 +++++-----
 fs/xfs/xfs_log_recover.c       |  3 ++-
 fs/xfs/xfs_mount.c             |  5 ++---
 fs/xfs/xfs_rtalloc.c           |  3 ++-
 fs/xfs/xfs_vfsops.c            | 13 +++++++------
 6 files changed, 21 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8e9c5ae6504d..adbc7bb9fbaa 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1810,7 +1810,7 @@ xfs_qm_dqusage_adjust(
 	 * Now release the inode. This will send it to 'inactive', and
 	 * possibly even free blocks.
 	 */
-	VN_RELE(XFS_ITOV(ip));
+	IRELE(ip);
 
 	/*
 	 * Goto next inode.
@@ -1968,7 +1968,7 @@ xfs_qm_init_quotainos(
 			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
 					     0, 0, &gip, 0))) {
 				if (uip)
-					VN_RELE(XFS_ITOV(uip));
+					IRELE(uip);
 				return XFS_ERROR(error);
 			}
 		}
@@ -1999,7 +1999,7 @@ xfs_qm_init_quotainos(
 					  sbflags | XFS_SB_GQUOTINO, flags);
 		if (error) {
 			if (uip)
-				VN_RELE(XFS_ITOV(uip));
+				IRELE(uip);
 
 			return XFS_ERROR(error);
 		}
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index d2b8be7e75f9..3dc161f39d13 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -386,7 +386,7 @@ xfs_qm_scall_trunc_qfiles(
 		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
 		if (! error) {
 			(void) xfs_truncate_file(mp, qip);
-			VN_RELE(XFS_ITOV(qip));
+			IRELE(qip);
 		}
 	}
 
@@ -395,7 +395,7 @@ xfs_qm_scall_trunc_qfiles(
 		error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
 		if (! error) {
 			(void) xfs_truncate_file(mp, qip);
-			VN_RELE(XFS_ITOV(qip));
+			IRELE(qip);
 		}
 	}
 
@@ -552,13 +552,13 @@ xfs_qm_scall_getqstat(
 		out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
 		out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
 		if (tempuqip)
-			VN_RELE(XFS_ITOV(uip));
+			IRELE(uip);
 	}
 	if (gip) {
 		out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
 		out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
 		if (tempgqip)
-			VN_RELE(XFS_ITOV(gip));
+			IRELE(gip);
 	}
 	if (mp->m_quotainfo) {
 		out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
@@ -1095,7 +1095,7 @@ again:
 		 * inactive code in hell.
 		 */
 		if (vnode_refd)
-			VN_RELE(vp);
+			IRELE(ip);
 		XFS_MOUNT_ILOCK(mp);
 		/*
 		 * If an inode was inserted or removed, we gotta
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index cd24711ae276..962d74a9ea7e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -46,6 +46,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
 #include "xfs_rw.h"
+#include "xfs_utils.h"
 
 STATIC int	xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
 STATIC int	xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
@@ -3248,7 +3249,7 @@ xlog_recover_process_iunlinks(
 					if (ip->i_d.di_mode == 0)
 						xfs_iput_new(ip, 0);
 					else
-						VN_RELE(XFS_ITOV(ip));
+						IRELE(ip);
 				} else {
 					/*
 					 * We can't read in the inode
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 41b690e6896c..c2aafeb8c6cb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,6 +43,7 @@
 #include "xfs_rw.h"
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
+#include "xfs_utils.h"
 
 STATIC void	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 STATIC int	xfs_uuid_mount(xfs_mount_t *);
@@ -956,7 +957,6 @@ xfs_mountfs(
 {
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
-	bhv_vnode_t	*rvp = NULL;
 	__uint64_t	resblks;
 	__int64_t	update_flags = 0LL;
 	uint		quotamount, quotaflags;
@@ -1158,7 +1158,6 @@ xfs_mountfs(
 	}
 
 	ASSERT(rip != NULL);
-	rvp = XFS_ITOV(rip);
 
 	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
 		cmn_err(CE_WARN, "XFS: corrupted root inode");
@@ -1241,7 +1240,7 @@ xfs_mountfs(
 	/*
 	 * Free up the root inode.
 	 */
-	VN_RELE(rvp);
+	IRELE(rip);
  error3:
 	xfs_log_unmount_dealloc(mp);
  error2:
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 47082c01872d..9cd6471cd60f 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -44,6 +44,7 @@
 #include "xfs_rw.h"
 #include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
+#include "xfs_utils.h"
 
 
 /*
@@ -2278,7 +2279,7 @@ xfs_rtmount_inodes(
 	ASSERT(sbp->sb_rsumino != NULLFSINO);
 	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0);
 	if (error) {
-		VN_RELE(XFS_ITOV(mp->m_rbmip));
+		IRELE(mp->m_rbmip);
 		return error;
 	}
 	ASSERT(mp->m_rsumip != NULL);
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 4c132a87d437..c21e4d168297 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -55,6 +55,7 @@
 #include "xfs_fsops.h"
 #include "xfs_vnodeops.h"
 #include "xfs_vfsops.h"
+#include "xfs_utils.h"
 
 
 int __init
@@ -595,7 +596,7 @@ xfs_unmount(
 	/*
 	 * Drop the reference count
 	 */
-	VN_RELE(rvp);
+	IRELE(rip);
 
 	/*
 	 * If we're forcing a shutdown, typically because of a media error,
@@ -777,8 +778,8 @@ xfs_unmount_flush(
 		goto fscorrupt_out2;
 
 	if (rbmip) {
-		VN_RELE(XFS_ITOV(rbmip));
-		VN_RELE(XFS_ITOV(rsumip));
+		IRELE(rbmip);
+		IRELE(rsumip);
 	}
 
 	xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1156,10 +1157,10 @@ xfs_sync_inodes(
 			 * above, then wait until after we've unlocked
 			 * the inode to release the reference.  This is
 			 * because we can be already holding the inode
-			 * lock when VN_RELE() calls xfs_inactive().
+			 * lock when IRELE() calls xfs_inactive().
 			 *
 			 * Make sure to drop the mount lock before calling
-			 * VN_RELE() so that we don't trip over ourselves if
+			 * IRELE() so that we don't trip over ourselves if
 			 * we have to go for the mount lock again in the
 			 * inactive code.
 			 */
@@ -1167,7 +1168,7 @@ xfs_sync_inodes(
 				IPOINTER_INSERT(ip, mp);
 			}
 
-			VN_RELE(vp);
+			IRELE(ip);
 
 			vnode_refed = B_FALSE;
 		}
-- 
cgit v1.2.3


From 2abdb8c88110bab78bfe17e51346e735560daa02 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 27 Mar 2008 18:01:14 +1100
Subject: [XFS] Prevent xfs_bmap_check_leaf_extents() referencing unmapped
 memory.

While investigating the extent corruption bug I ran into this bug in debug
only code. xfs_bmap_check_leaf_extents() loops through the leaf blocks of
the extent btree checking that every extent is entirely before the next
extent. It also compares the last extent in the previous block to the
first extent in the current block when the previous block has been
released and potentially unmapped. So take a copy of the last extent
instead of a pointer. Also move the last extent check out of the loop
because we only need to do it once.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30718a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/xfs_bmap.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index bce8e3bd8ad1..7d683e0b8ef7 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -6194,7 +6194,7 @@ xfs_bmap_check_leaf_extents(
 	xfs_mount_t		*mp;	/* file system mount structure */
 	__be64			*pp;	/* pointer to block address */
 	xfs_bmbt_rec_t		*ep;	/* pointer to current extent */
-	xfs_bmbt_rec_t		*lastp; /* pointer to previous extent */
+	xfs_bmbt_rec_t		last = {0, 0}; /* last extent in prev block */
 	xfs_bmbt_rec_t		*nextp;	/* pointer to next extent */
 	int			bp_release = 0;
 
@@ -6264,7 +6264,6 @@ xfs_bmap_check_leaf_extents(
 	/*
 	 * Loop over all leaf nodes checking that all extents are in the right order.
 	 */
-	lastp = NULL;
 	for (;;) {
 		xfs_fsblock_t	nextbno;
 		xfs_extnum_t	num_recs;
@@ -6285,18 +6284,16 @@ xfs_bmap_check_leaf_extents(
 		 */
 
 		ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+		if (i) {
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep);
+		}
 		for (j = 1; j < num_recs; j++) {
 			nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1);
-			if (lastp) {
-				xfs_btree_check_rec(XFS_BTNUM_BMAP,
-					(void *)lastp, (void *)ep);
-			}
-			xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
-				(void *)(nextp));
-			lastp = ep;
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp);
 			ep = nextp;
 		}
 
+		last = *ep;
 		i += num_recs;
 		if (bp_release) {
 			bp_release = 0;
-- 
cgit v1.2.3


From 114d23aae51233b2bc62d8e2a632bcb55de1953d Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:18:39 +1000
Subject: [XFS] Per iclog callback chain lock

Rather than use the icloglock for protecting the iclog completion callback
chain, use a new per-iclog lock so that walking the callback chain doesn't
require holding a global lock.

This reduces contention on the icloglock during transaction commit and log
I/O completion by reducing the number of times we need to hold the global
icloglock during these operations.

SGI-PV: 978729
SGI-Modid: xfs-linux-melb:xfs-kern:30770a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c      | 35 +++++++++++++++++++----------------
 fs/xfs/xfs_log_priv.h | 33 ++++++++++++++++++++++++++-------
 2 files changed, 45 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 1fa980933895..7a5b12d93537 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -397,12 +397,10 @@ xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
 	       void		  *iclog_hndl,	/* iclog to hang callback off */
 	       xfs_log_callback_t *cb)
 {
-	xlog_t *log = mp->m_log;
 	xlog_in_core_t	  *iclog = (xlog_in_core_t *)iclog_hndl;
 	int	abortflg;
 
-	cb->cb_next = NULL;
-	spin_lock(&log->l_icloglock);
+	spin_lock(&iclog->ic_callback_lock);
 	abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
 	if (!abortflg) {
 		ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
@@ -411,7 +409,7 @@ xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
 		*(iclog->ic_callback_tail) = cb;
 		iclog->ic_callback_tail = &(cb->cb_next);
 	}
-	spin_unlock(&log->l_icloglock);
+	spin_unlock(&iclog->ic_callback_lock);
 	return abortflg;
 }	/* xfs_log_notify */
 
@@ -1257,6 +1255,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
 		iclog->ic_state = XLOG_STATE_ACTIVE;
 		iclog->ic_log = log;
+		atomic_set(&iclog->ic_refcnt, 0);
+		spin_lock_init(&iclog->ic_callback_lock);
 		iclog->ic_callback_tail = &(iclog->ic_callback);
 		iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
 
@@ -1987,7 +1987,7 @@ xlog_state_clean_log(xlog_t *log)
 		if (iclog->ic_state == XLOG_STATE_DIRTY) {
 			iclog->ic_state	= XLOG_STATE_ACTIVE;
 			iclog->ic_offset       = 0;
-			iclog->ic_callback	= NULL;   /* don't need to free */
+			ASSERT(iclog->ic_callback == NULL);
 			/*
 			 * If the number of ops in this iclog indicate it just
 			 * contains the dummy transaction, we can
@@ -2190,37 +2190,40 @@ xlog_state_do_callback(
 					be64_to_cpu(iclog->ic_header.h_lsn);
 				spin_unlock(&log->l_grant_lock);
 
-				/*
-				 * Keep processing entries in the callback list
-				 * until we come around and it is empty.  We
-				 * need to atomically see that the list is
-				 * empty and change the state to DIRTY so that
-				 * we don't miss any more callbacks being added.
-				 */
-				spin_lock(&log->l_icloglock);
 			} else {
+				spin_unlock(&log->l_icloglock);
 				ioerrors++;
 			}
-			cb = iclog->ic_callback;
 
+			/*
+			 * Keep processing entries in the callback list until
+			 * we come around and it is empty.  We need to
+			 * atomically see that the list is empty and change the
+			 * state to DIRTY so that we don't miss any more
+			 * callbacks being added.
+			 */
+			spin_lock(&iclog->ic_callback_lock);
+			cb = iclog->ic_callback;
 			while (cb) {
 				iclog->ic_callback_tail = &(iclog->ic_callback);
 				iclog->ic_callback = NULL;
-				spin_unlock(&log->l_icloglock);
+				spin_unlock(&iclog->ic_callback_lock);
 
 				/* perform callbacks in the order given */
 				for (; cb; cb = cb_next) {
 					cb_next = cb->cb_next;
 					cb->cb_func(cb->cb_arg, aborted);
 				}
-				spin_lock(&log->l_icloglock);
+				spin_lock(&iclog->ic_callback_lock);
 				cb = iclog->ic_callback;
 			}
 
 			loopdidcallbacks++;
 			funcdidcallbacks++;
 
+			spin_lock(&log->l_icloglock);
 			ASSERT(iclog->ic_callback == NULL);
+			spin_unlock(&iclog->ic_callback_lock);
 			if (!(iclog->ic_state & XLOG_STATE_IOERROR))
 				iclog->ic_state = XLOG_STATE_DIRTY;
 
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 01c63db25a1d..104b623aa082 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -324,6 +324,19 @@ typedef struct xlog_rec_ext_header {
  * - ic_offset is the current number of bytes written to in this iclog.
  * - ic_refcnt is bumped when someone is writing to the log.
  * - ic_state is the state of the iclog.
+ *
+ * Because of cacheline contention on large machines, we need to separate
+ * various resources onto different cachelines. To start with, make the
+ * structure cacheline aligned. The following fields can be contended on
+ * by independent processes:
+ *
+ *	- ic_callback_*
+ *	- ic_refcnt
+ *	- fields protected by the global l_icloglock
+ *
+ * so we need to ensure that these fields are located in separate cachelines.
+ * We'll put all the read-only and l_icloglock fields in the first cacheline,
+ * and move everything else out to subsequent cachelines.
  */
 typedef struct xlog_iclog_fields {
 	sv_t			ic_forcesema;
@@ -332,18 +345,23 @@ typedef struct xlog_iclog_fields {
 	struct xlog_in_core	*ic_prev;
 	struct xfs_buf		*ic_bp;
 	struct log		*ic_log;
-	xfs_log_callback_t	*ic_callback;
-	xfs_log_callback_t	**ic_callback_tail;
-#ifdef XFS_LOG_TRACE
-	struct ktrace		*ic_trace;
-#endif
 	int			ic_size;
 	int			ic_offset;
-	atomic_t		ic_refcnt;
 	int			ic_bwritecnt;
 	ushort_t		ic_state;
 	char			*ic_datap;	/* pointer to iclog data */
-} xlog_iclog_fields_t;
+#ifdef XFS_LOG_TRACE
+	struct ktrace		*ic_trace;
+#endif
+
+	/* Callback structures need their own cacheline */
+	spinlock_t		ic_callback_lock ____cacheline_aligned_in_smp;
+	xfs_log_callback_t	*ic_callback;
+	xfs_log_callback_t	**ic_callback_tail;
+
+	/* reference counts need their own cacheline */
+	atomic_t		ic_refcnt ____cacheline_aligned_in_smp;
+} xlog_iclog_fields_t ____cacheline_aligned_in_smp;
 
 typedef union xlog_in_core2 {
 	xlog_rec_header_t	hic_header;
@@ -366,6 +384,7 @@ typedef struct xlog_in_core {
 #define	ic_bp		hic_fields.ic_bp
 #define	ic_log		hic_fields.ic_log
 #define	ic_callback	hic_fields.ic_callback
+#define	ic_callback_lock hic_fields.ic_callback_lock
 #define	ic_callback_tail hic_fields.ic_callback_tail
 #define	ic_trace	hic_fields.ic_trace
 #define	ic_size		hic_fields.ic_size
-- 
cgit v1.2.3


From eb01c9cd87c7a9998c2edf209721ea069e3e3652 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:18:46 +1000
Subject: [XFS] Remove the xlog_ticket allocator

The ticket allocator is just a simple slab implementation internal to the
log. It requires the icloglock to be held when manipulating it and this
contributes to contention on that lock.

Just kill the entire allocator and use a memory zone instead. While there,
allow us to gracefully fail allocation with ENOMEM.

SGI-PV: 978729
SGI-Modid: xfs-linux-melb:xfs-kern:30771a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c      | 137 ++++----------------------------------------------
 fs/xfs/xfs_log_priv.h |   9 ++--
 fs/xfs/xfs_vfsops.c   |  12 +++--
 3 files changed, 21 insertions(+), 137 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7a5b12d93537..3cf115d8de75 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -41,6 +41,7 @@
 #include "xfs_inode.h"
 #include "xfs_rw.h"
 
+kmem_zone_t	*xfs_log_ticket_zone;
 
 #define xlog_write_adv_cnt(ptr, len, off, bytes) \
 	{ (ptr) += (bytes); \
@@ -73,8 +74,6 @@ STATIC int  xlog_state_get_iclog_space(xlog_t		*log,
 				       xlog_ticket_t	*ticket,
 				       int		*continued_write,
 				       int		*logoffsetp);
-STATIC void xlog_state_put_ticket(xlog_t	*log,
-				  xlog_ticket_t *tic);
 STATIC int  xlog_state_release_iclog(xlog_t		*log,
 				     xlog_in_core_t	*iclog);
 STATIC void xlog_state_switch_iclogs(xlog_t		*log,
@@ -101,7 +100,6 @@ STATIC void xlog_ungrant_log_space(xlog_t	 *log,
 
 
 /* local ticket functions */
-STATIC void		xlog_state_ticket_alloc(xlog_t *log);
 STATIC xlog_ticket_t	*xlog_ticket_get(xlog_t *log,
 					 int	unit_bytes,
 					 int	count,
@@ -330,7 +328,7 @@ xfs_log_done(xfs_mount_t	*mp,
 		 */
 		xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
 		xlog_ungrant_log_space(log, ticket);
-		xlog_state_put_ticket(log, ticket);
+		xlog_ticket_put(log, ticket);
 	} else {
 		xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
 		xlog_regrant_reserve_log_space(log, ticket);
@@ -469,6 +467,8 @@ xfs_log_reserve(xfs_mount_t	 *mp,
 		/* may sleep if need to allocate more tickets */
 		internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
 						  client, flags);
+		if (!internal_ticket)
+			return XFS_ERROR(ENOMEM);
 		internal_ticket->t_trans_type = t_type;
 		*ticket = internal_ticket;
 		xlog_trace_loggrant(log, internal_ticket, 
@@ -693,7 +693,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		if (tic) {
 			xlog_trace_loggrant(log, tic, "unmount rec");
 			xlog_ungrant_log_space(log, tic);
-			xlog_state_put_ticket(log, tic);
+			xlog_ticket_put(log, tic);
 		}
 	} else {
 		/*
@@ -1208,7 +1208,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	spin_lock_init(&log->l_icloglock);
 	spin_lock_init(&log->l_grant_lock);
 	initnsema(&log->l_flushsema, 0, "ic-flush");
-	xlog_state_ticket_alloc(log);  /* wait until after icloglock inited */
 
 	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
 	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1538,7 +1537,6 @@ STATIC void
 xlog_dealloc_log(xlog_t *log)
 {
 	xlog_in_core_t	*iclog, *next_iclog;
-	xlog_ticket_t	*tic, *next_tic;
 	int		i;
 
 	iclog = log->l_iclog;
@@ -1559,22 +1557,6 @@ xlog_dealloc_log(xlog_t *log)
 	spinlock_destroy(&log->l_icloglock);
 	spinlock_destroy(&log->l_grant_lock);
 
-	/* XXXsup take a look at this again. */
-	if ((log->l_ticket_cnt != log->l_ticket_tcnt)  &&
-	    !XLOG_FORCED_SHUTDOWN(log)) {
-		xfs_fs_cmn_err(CE_WARN, log->l_mp,
-			"xlog_dealloc_log: (cnt: %d, total: %d)",
-			log->l_ticket_cnt, log->l_ticket_tcnt);
-		/* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
-
-	} else {
-		tic = log->l_unmount_free;
-		while (tic) {
-			next_tic = tic->t_next;
-			kmem_free(tic, PAGE_SIZE);
-			tic = next_tic;
-		}
-	}
 	xfs_buf_free(log->l_xbuf);
 #ifdef XFS_LOG_TRACE
 	if (log->l_trace != NULL) {
@@ -2794,18 +2776,6 @@ xlog_ungrant_log_space(xlog_t	     *log,
 }	/* xlog_ungrant_log_space */
 
 
-/*
- * Atomically put back used ticket.
- */
-STATIC void
-xlog_state_put_ticket(xlog_t	    *log,
-		      xlog_ticket_t *tic)
-{
-	spin_lock(&log->l_icloglock);
-	xlog_ticket_put(log, tic);
-	spin_unlock(&log->l_icloglock);
-}	/* xlog_state_put_ticket */
-
 /*
  * Flush iclog to disk if this is the last reference to the given iclog and
  * the WANT_SYNC bit is set.
@@ -3176,92 +3146,19 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
  */
 
 /*
- *	Algorithm doesn't take into account page size. ;-(
- */
-STATIC void
-xlog_state_ticket_alloc(xlog_t *log)
-{
-	xlog_ticket_t	*t_list;
-	xlog_ticket_t	*next;
-	xfs_caddr_t	buf;
-	uint		i = (PAGE_SIZE / sizeof(xlog_ticket_t)) - 2;
-
-	/*
-	 * The kmem_zalloc may sleep, so we shouldn't be holding the
-	 * global lock.  XXXmiken: may want to use zone allocator.
-	 */
-	buf = (xfs_caddr_t) kmem_zalloc(PAGE_SIZE, KM_SLEEP);
-
-	spin_lock(&log->l_icloglock);
-
-	/* Attach 1st ticket to Q, so we can keep track of allocated memory */
-	t_list = (xlog_ticket_t *)buf;
-	t_list->t_next = log->l_unmount_free;
-	log->l_unmount_free = t_list++;
-	log->l_ticket_cnt++;
-	log->l_ticket_tcnt++;
-
-	/* Next ticket becomes first ticket attached to ticket free list */
-	if (log->l_freelist != NULL) {
-		ASSERT(log->l_tail != NULL);
-		log->l_tail->t_next = t_list;
-	} else {
-		log->l_freelist = t_list;
-	}
-	log->l_ticket_cnt++;
-	log->l_ticket_tcnt++;
-
-	/* Cycle through rest of alloc'ed memory, building up free Q */
-	for ( ; i > 0; i--) {
-		next = t_list + 1;
-		t_list->t_next = next;
-		t_list = next;
-		log->l_ticket_cnt++;
-		log->l_ticket_tcnt++;
-	}
-	t_list->t_next = NULL;
-	log->l_tail = t_list;
-	spin_unlock(&log->l_icloglock);
-}	/* xlog_state_ticket_alloc */
-
-
-/*
- * Put ticket into free list
- *
- * Assumption: log lock is held around this call.
+ * Free a used ticket.
  */
 STATIC void
 xlog_ticket_put(xlog_t		*log,
 		xlog_ticket_t	*ticket)
 {
 	sv_destroy(&ticket->t_sema);
-
-	/*
-	 * Don't think caching will make that much difference.  It's
-	 * more important to make debug easier.
-	 */
-#if 0
-	/* real code will want to use LIFO for caching */
-	ticket->t_next = log->l_freelist;
-	log->l_freelist = ticket;
-	/* no need to clear fields */
-#else
-	/* When we debug, it is easier if tickets are cycled */
-	ticket->t_next     = NULL;
-	if (log->l_tail) {
-		log->l_tail->t_next = ticket;
-	} else {
-		ASSERT(log->l_freelist == NULL);
-		log->l_freelist = ticket;
-	}
-	log->l_tail	    = ticket;
-#endif /* DEBUG */
-	log->l_ticket_cnt++;
+	kmem_zone_free(xfs_log_ticket_zone, ticket);
 }	/* xlog_ticket_put */
 
 
 /*
- * Grab ticket off freelist or allocation some more
+ * Allocate and initialise a new log ticket.
  */
 STATIC xlog_ticket_t *
 xlog_ticket_get(xlog_t		*log,
@@ -3273,21 +3170,9 @@ xlog_ticket_get(xlog_t		*log,
 	xlog_ticket_t	*tic;
 	uint		num_headers;
 
- alloc:
-	if (log->l_freelist == NULL)
-		xlog_state_ticket_alloc(log);		/* potentially sleep */
-
-	spin_lock(&log->l_icloglock);
-	if (log->l_freelist == NULL) {
-		spin_unlock(&log->l_icloglock);
-		goto alloc;
-	}
-	tic		= log->l_freelist;
-	log->l_freelist	= tic->t_next;
-	if (log->l_freelist == NULL)
-		log->l_tail = NULL;
-	log->l_ticket_cnt--;
-	spin_unlock(&log->l_icloglock);
+	tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
+	if (!tic)
+		return NULL;
 
 	/*
 	 * Permanent reservations have up to 'cnt'-1 active log operations
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 104b623aa082..c1583960009d 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -242,7 +242,7 @@ typedef struct xlog_res {
 
 typedef struct xlog_ticket {
 	sv_t		   t_sema;	 /* sleep on this semaphore      : 20 */
- 	struct xlog_ticket *t_next;	 /*			         :4|8 */
+	struct xlog_ticket *t_next;	 /*			         :4|8 */
 	struct xlog_ticket *t_prev;	 /*				 :4|8 */
 	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4  */
 	int		   t_curr_res;	 /* current reservation in bytes : 4  */
@@ -406,13 +406,8 @@ typedef struct log {
 	sema_t			l_flushsema;    /* iclog flushing semaphore */
 	int			l_flushcnt;	/* # of procs waiting on this
 						 * sema */
-	int			l_ticket_cnt;	/* free ticket count */
-	int			l_ticket_tcnt;	/* total ticket count */
 	int			l_covered_state;/* state of "covering disk
 						 * log entries" */
-	xlog_ticket_t		*l_freelist;    /* free list of tickets */
-	xlog_ticket_t		*l_unmount_free;/* kmem_free these addresses */
-	xlog_ticket_t		*l_tail;        /* free list of tickets */
 	xlog_in_core_t		*l_iclog;       /* head log queue	*/
 	spinlock_t		l_icloglock;    /* grab to change iclog state */
 	xfs_lsn_t		l_tail_lsn;     /* lsn of 1st LR with unflushed
@@ -478,6 +473,8 @@ extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void	 xlog_put_bp(struct xfs_buf *);
 extern int	 xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
 
+extern kmem_zone_t	*xfs_log_ticket_zone;
+
 /* iclog tracing */
 #define XLOG_TRACE_GRAB_FLUSH  1
 #define XLOG_TRACE_REL_FLUSH   2
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index c21e4d168297..ea94593b5313 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -69,15 +69,17 @@ xfs_init(void)
 	/*
 	 * Initialize all of the zone allocators we use.
 	 */
+	xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
+						"xfs_log_ticket");
 	xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
-						 "xfs_bmap_free_item");
+						"xfs_bmap_free_item");
 	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
-					    "xfs_btree_cur");
-	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
-	xfs_da_state_zone =
-		kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
+						"xfs_btree_cur");
+	xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
+						"xfs_da_state");
 	xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
 	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
 	xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
 	xfs_mru_cache_init();
 	xfs_filestream_init();
-- 
cgit v1.2.3


From 4679b2d36d53ed508c956337972fbbea8db99a77 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:18:54 +1000
Subject: [XFS] Reorganise xlog_t for better cacheline isolation of contention

To reduce contention on the log in large CPU count, separate out different
parts of the xlog_t structure onto different cachelines. Move each lock
onto a different cacheline along with all the members that are
accessed/modified while that lock is held.

Also, move the debugging code into debug code.

SGI-PV: 978729
SGI-Modid: xfs-linux-melb:xfs-kern:30772a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c      |  5 ++---
 fs/xfs/xfs_log_priv.h | 55 ++++++++++++++++++++++++++++-----------------------
 2 files changed, 32 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3cf115d8de75..319b98eb410c 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1237,9 +1237,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
 		iclog->ic_bp = bp;
 		iclog->hic_data = bp->b_addr;
-
+#ifdef DEBUG
 		log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
-
+#endif
 		head = &iclog->ic_header;
 		memset(head, 0, sizeof(xlog_rec_header_t));
 		head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -1250,7 +1250,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		head->h_fmt = cpu_to_be32(XLOG_FMT);
 		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
 
-
 		iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
 		iclog->ic_state = XLOG_STATE_ACTIVE;
 		iclog->ic_log = log;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c1583960009d..8952a392b5f3 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -361,7 +361,7 @@ typedef struct xlog_iclog_fields {
 
 	/* reference counts need their own cacheline */
 	atomic_t		ic_refcnt ____cacheline_aligned_in_smp;
-} xlog_iclog_fields_t ____cacheline_aligned_in_smp;
+} xlog_iclog_fields_t;
 
 typedef union xlog_in_core2 {
 	xlog_rec_header_t	hic_header;
@@ -402,8 +402,29 @@ typedef struct xlog_in_core {
  * that round off problems won't occur when releasing partial reservations.
  */
 typedef struct log {
+	/* The following fields don't need locking */
+	struct xfs_mount	*l_mp;	        /* mount point */
+	struct xfs_buf		*l_xbuf;        /* extra buffer for log
+						 * wrapping */
+	struct xfs_buftarg	*l_targ;        /* buftarg of log */
+	uint			l_flags;
+	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
+	struct xfs_buf_cancel	**l_buf_cancel_table;
+	int			l_iclog_hsize;  /* size of iclog header */
+	int			l_iclog_heads;  /* # of iclog header sectors */
+	uint			l_sectbb_log;   /* log2 of sector size in BBs */
+	uint			l_sectbb_mask;  /* sector size (in BBs)
+						 * alignment mask */
+	int			l_iclog_size;	/* size of log in bytes */
+	int			l_iclog_size_log; /* log power size of log */
+	int			l_iclog_bufs;	/* number of iclog buffers */
+	xfs_daddr_t		l_logBBstart;   /* start block of log */
+	int			l_logsize;      /* size of log in bytes */
+	int			l_logBBsize;    /* size of log in BB chunks */
+
 	/* The following block of fields are changed while holding icloglock */
-	sema_t			l_flushsema;    /* iclog flushing semaphore */
+	sema_t			l_flushsema ____cacheline_aligned_in_smp;
+						/* iclog flushing semaphore */
 	int			l_flushcnt;	/* # of procs waiting on this
 						 * sema */
 	int			l_covered_state;/* state of "covering disk
@@ -413,27 +434,14 @@ typedef struct log {
 	xfs_lsn_t		l_tail_lsn;     /* lsn of 1st LR with unflushed
 						 * buffers */
 	xfs_lsn_t		l_last_sync_lsn;/* lsn of last LR on disk */
-	struct xfs_mount	*l_mp;	        /* mount point */
-	struct xfs_buf		*l_xbuf;        /* extra buffer for log
-						 * wrapping */
-	struct xfs_buftarg	*l_targ;        /* buftarg of log */
-	xfs_daddr_t		l_logBBstart;   /* start block of log */
-	int			l_logsize;      /* size of log in bytes */
-	int			l_logBBsize;    /* size of log in BB chunks */
 	int			l_curr_cycle;   /* Cycle number of log writes */
 	int			l_prev_cycle;   /* Cycle number before last
 						 * block increment */
 	int			l_curr_block;   /* current logical log block */
 	int			l_prev_block;   /* previous logical log block */
-	int			l_iclog_size;	/* size of log in bytes */
-	int			l_iclog_size_log; /* log power size of log */
-	int			l_iclog_bufs;	/* number of iclog buffers */
-
-	/* The following field are used for debugging; need to hold icloglock */
-	char			*l_iclog_bak[XLOG_MAX_ICLOGS];
 
 	/* The following block of fields are changed while holding grant_lock */
-	spinlock_t		l_grant_lock;
+	spinlock_t		l_grant_lock ____cacheline_aligned_in_smp;
 	xlog_ticket_t		*l_reserve_headq;
 	xlog_ticket_t		*l_write_headq;
 	int			l_grant_reserve_cycle;
@@ -441,19 +449,16 @@ typedef struct log {
 	int			l_grant_write_cycle;
 	int			l_grant_write_bytes;
 
-	/* The following fields don't need locking */
 #ifdef XFS_LOG_TRACE
 	struct ktrace		*l_trace;
 	struct ktrace		*l_grant_trace;
 #endif
-	uint			l_flags;
-	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
-	struct xfs_buf_cancel	**l_buf_cancel_table;
-	int			l_iclog_hsize;  /* size of iclog header */
-	int			l_iclog_heads;  /* # of iclog header sectors */
-	uint			l_sectbb_log;   /* log2 of sector size in BBs */
-	uint			l_sectbb_mask;  /* sector size (in BBs)
-						 * alignment mask */
+
+	/* The following field are used for debugging; need to hold icloglock */
+#ifdef DEBUG
+	char			*l_iclog_bak[XLOG_MAX_ICLOGS];
+#endif
+
 } xlog_t;
 
 #define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)
-- 
cgit v1.2.3


From 6b1d1a732f886936fe515d911b1a01d9cc50e179 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:19:02 +1000
Subject: [XFS] Fix lock inversion in forced shutdown.

Recent changes to xlog_state_release_iclog() placed the grant_lock inside
the icloglock. forced unmount of the log does this the opposite way
around, but does not depend on the order for correct working. Fix the
inversion by changing the order locks are gained in
xfs_log_force_umount().

SGI-PV: 979661
SGI-Modid: xfs-linux-melb:xfs-kern:30773a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 319b98eb410c..4a6f7c5d1459 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3499,8 +3499,8 @@ xfs_log_force_umount(
 	 * before we mark the filesystem SHUTDOWN and wake
 	 * everybody up to tell the bad news.
 	 */
-	spin_lock(&log->l_grant_lock);
 	spin_lock(&log->l_icloglock);
+	spin_lock(&log->l_grant_lock);
 	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
 	XFS_BUF_DONE(mp->m_sb_bp);
 	/*
-- 
cgit v1.2.3


From 0225da1f35df46c67785eb08526995d7cdb4e3b0 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Thu, 10 Apr 2008 12:19:10 +1000
Subject: [XFS] Replace __inline with inline

Remove the remaining uses of __inline in the XFS code base.

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30774a

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_cred.h  | 2 +-
 fs/xfs/linux-2.6/xfs_stats.h | 4 ++--
 fs/xfs/quota/xfs_qm_stats.h  | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index e7f3da61c6c3..652721ce0ea5 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -30,7 +30,7 @@ typedef struct cred {
 extern struct cred *sys_cred;
 
 /* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
-static __inline int capable_cred(cred_t *cr, int cid)
+static inline int capable_cred(cred_t *cr, int cid)
 {
 	return (cr == sys_cred) ? 1 : capable(cid);
 }
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index 8ba7a2fa6c1d..afd0b0d5fdb2 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -144,8 +144,8 @@ extern void xfs_cleanup_procfs(void);
 # define XFS_STATS_DEC(count)
 # define XFS_STATS_ADD(count, inc)
 
-static __inline void xfs_init_procfs(void) { };
-static __inline void xfs_cleanup_procfs(void) { };
+static inline void xfs_init_procfs(void) { };
+static inline void xfs_cleanup_procfs(void) { };
 
 #endif	/* !CONFIG_PROC_FS */
 
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
index a50ffabcf554..5b964fc0dc09 100644
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ b/fs/xfs/quota/xfs_qm_stats.h
@@ -45,8 +45,8 @@ extern void xfs_qm_cleanup_procfs(void);
 
 # define XQM_STATS_INC(count)	do { } while (0)
 
-static __inline void xfs_qm_init_procfs(void) { };
-static __inline void xfs_qm_cleanup_procfs(void) { };
+static inline void xfs_qm_init_procfs(void) { };
+static inline void xfs_qm_cleanup_procfs(void) { };
 
 #endif
 
-- 
cgit v1.2.3


From 34a622b2e1c8e11c8990184634f101c1aad42fec Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Thu, 10 Apr 2008 12:19:21 +1000
Subject: [XFS] replace remaining __FUNCTION__ occurrences

__FUNCTION__ is gcc-specific, use __func__

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30775a

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/kmem.c      |  6 +++---
 fs/xfs/linux-2.6/xfs_buf.c   |  6 +++---
 fs/xfs/linux-2.6/xfs_super.c |  2 +-
 fs/xfs/linux-2.6/xfs_vnode.h |  4 ++--
 fs/xfs/xfs_alloc.c           | 18 +++++++++---------
 fs/xfs/xfs_bmap.c            | 18 +++++++++---------
 fs/xfs/xfs_bmap.h            |  2 +-
 fs/xfs/xfs_bmap_btree.c      | 16 ++++++++--------
 fs/xfs/xfs_filestream.c      |  2 +-
 fs/xfs/xfs_log.c             |  2 +-
 fs/xfs/xfs_log_recover.c     |  4 ++--
 fs/xfs/xfs_trans_ail.c       |  2 +-
 12 files changed, 41 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index e040f1ce1b6a..9b1bb17a0501 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -37,7 +37,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 #ifdef DEBUG
 	if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
 		printk(KERN_WARNING "Large %s attempt, size=%ld\n",
-			__FUNCTION__, (long)size);
+			__func__, (long)size);
 		dump_stack();
 	}
 #endif
@@ -52,7 +52,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 		if (!(++retries % 100))
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
-					__FUNCTION__, lflags);
+					__func__, lflags);
 		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
@@ -129,7 +129,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 		if (!(++retries % 100))
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
-					__FUNCTION__, lflags);
+					__func__, lflags);
 		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e347bfd47c91..142ddbece374 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -400,7 +400,7 @@ _xfs_buf_lookup_pages(
 				printk(KERN_ERR
 					"XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
-					__FUNCTION__, gfp_mask);
+					__func__, gfp_mask);
 
 			XFS_STATS_INC(xb_page_retries);
 			xfsbufd_wakeup(0, gfp_mask);
@@ -598,7 +598,7 @@ xfs_buf_get_flags(
 		error = _xfs_buf_map_pages(bp, flags);
 		if (unlikely(error)) {
 			printk(KERN_WARNING "%s: failed to map pages\n",
-					__FUNCTION__);
+					__func__);
 			goto no_buffer;
 		}
 	}
@@ -778,7 +778,7 @@ xfs_buf_get_noaddr(
 	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
 	if (unlikely(error)) {
 		printk(KERN_WARNING "%s: failed to map pages\n",
-				__FUNCTION__);
+				__func__);
 		goto fail_free_mem;
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 72e55db948d2..fb561beea373 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -935,7 +935,7 @@ xfs_fs_clear_inode(
 		xfs_inactive(ip);
 		xfs_iflags_clear(ip, XFS_IMODIFIED);
 		if (xfs_reclaim(ip))
-			panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, inode);
+			panic("%s: cannot reclaim 0x%p\n", __func__, inode);
 	}
 
 	ASSERT(XFS_I(inode) == NULL);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 4ed5914adefb..dbb8a5d27f78 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -288,9 +288,9 @@ extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
 extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
 extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
 #define xfs_itrace_entry(ip)	\
-	_xfs_itrace_entry(ip, __FUNCTION__, (inst_t *)__return_address)
+	_xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
 #define xfs_itrace_exit(ip)	\
-	_xfs_itrace_exit(ip, __FUNCTION__, (inst_t *)__return_address)
+	_xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
 #define xfs_itrace_exit_tag(ip, tag)	\
 	_xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
 #define xfs_itrace_ref(ip)	\
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bdbfbbee4959..bd5c01788eff 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -55,17 +55,17 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 ktrace_t *xfs_alloc_trace_buf;
 
 #define	TRACE_ALLOC(s,a)	\
-	xfs_alloc_trace_alloc(__FUNCTION__, s, a, __LINE__)
+	xfs_alloc_trace_alloc(__func__, s, a, __LINE__)
 #define	TRACE_FREE(s,a,b,x,f)	\
-	xfs_alloc_trace_free(__FUNCTION__, s, mp, a, b, x, f, __LINE__)
+	xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__)
 #define	TRACE_MODAGF(s,a,f)	\
-	xfs_alloc_trace_modagf(__FUNCTION__, s, mp, a, f, __LINE__)
-#define	TRACE_BUSY(__FUNCTION__,s,ag,agb,l,sl,tp)	\
-	xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
-#define	TRACE_UNBUSY(__FUNCTION__,s,ag,sl,tp)	\
-	xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
-#define	TRACE_BUSYSEARCH(__FUNCTION__,s,ag,agb,l,sl,tp)	\
-	xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
+	xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__)
+#define	TRACE_BUSY(__func__,s,ag,agb,l,sl,tp)	\
+	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
+#define	TRACE_UNBUSY(__func__,s,ag,sl,tp)	\
+	xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
+#define	TRACE_BUSYSEARCH(__func__,s,ag,agb,l,sl,tp)	\
+	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
 #else
 #define	TRACE_ALLOC(s,a)
 #define	TRACE_FREE(s,a,b,x,f)
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 7d683e0b8ef7..65b8fa83e078 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -323,13 +323,13 @@ xfs_bmap_trace_pre_update(
 	int		whichfork);	/* data or attr fork */
 
 #define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)	\
-	xfs_bmap_trace_delete(__FUNCTION__,d,ip,i,c,w)
+	xfs_bmap_trace_delete(__func__,d,ip,i,c,w)
 #define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)	\
-	xfs_bmap_trace_insert(__FUNCTION__,d,ip,i,c,r1,r2,w)
+	xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w)
 #define	XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_post_update(__FUNCTION__,d,ip,i,w)
+	xfs_bmap_trace_post_update(__func__,d,ip,i,w)
 #define	XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)	\
-	xfs_bmap_trace_pre_update(__FUNCTION__,d,ip,i,w)
+	xfs_bmap_trace_pre_update(__func__,d,ip,i,w)
 #else
 #define	XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)
 #define	XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)
@@ -6164,10 +6164,10 @@ xfs_check_block(
 			}
 			if (*thispa == *pp) {
 				cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
-					__FUNCTION__, j, i,
+					__func__, j, i,
 					(unsigned long long)be64_to_cpu(*thispa));
 				panic("%s: ptrs are equal in node\n",
-					__FUNCTION__);
+					__func__);
 			}
 		}
 	}
@@ -6324,13 +6324,13 @@ xfs_bmap_check_leaf_extents(
 	return;
 
 error0:
-	cmn_err(CE_WARN, "%s: at error0", __FUNCTION__);
+	cmn_err(CE_WARN, "%s: at error0", __func__);
 	if (bp_release)
 		xfs_trans_brelse(NULL, bp);
 error_norelse:
 	cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents",
-		__FUNCTION__, i);
-	panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__);
+		__func__, i);
+	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
 	return;
 }
 #endif
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 87224b7d7984..6ff70cda451c 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -151,7 +151,7 @@ xfs_bmap_trace_exlist(
 	xfs_extnum_t		cnt,		/* count of entries in list */
 	int			whichfork);	/* data or attr fork */
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)	\
-	xfs_bmap_trace_exlist(__FUNCTION__,ip,c,w)
+	xfs_bmap_trace_exlist(__func__,ip,c,w)
 #else
 #define	XFS_BMAP_TRACE_EXLIST(ip,c,w)
 #endif
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 93470b728dd0..4f0e849d973e 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -275,21 +275,21 @@ xfs_bmbt_trace_cursor(
 }
 
 #define	XFS_BMBT_TRACE_ARGBI(c,b,i)	\
-	xfs_bmbt_trace_argbi(__FUNCTION__, c, b, i, __LINE__)
+	xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
 #define	XFS_BMBT_TRACE_ARGBII(c,b,i,j)	\
-	xfs_bmbt_trace_argbii(__FUNCTION__, c, b, i, j, __LINE__)
+	xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
 #define	XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)	\
-	xfs_bmbt_trace_argfffi(__FUNCTION__, c, o, b, i, j, __LINE__)
+	xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
 #define	XFS_BMBT_TRACE_ARGI(c,i)	\
-	xfs_bmbt_trace_argi(__FUNCTION__, c, i, __LINE__)
+	xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
 #define	XFS_BMBT_TRACE_ARGIFK(c,i,f,s)	\
-	xfs_bmbt_trace_argifk(__FUNCTION__, c, i, f, s, __LINE__)
+	xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
 #define	XFS_BMBT_TRACE_ARGIFR(c,i,f,r)	\
-	xfs_bmbt_trace_argifr(__FUNCTION__, c, i, f, r, __LINE__)
+	xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
 #define	XFS_BMBT_TRACE_ARGIK(c,i,k)	\
-	xfs_bmbt_trace_argik(__FUNCTION__, c, i, k, __LINE__)
+	xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
 #define	XFS_BMBT_TRACE_CURSOR(c,s)	\
-	xfs_bmbt_trace_cursor(__FUNCTION__, c, s, __LINE__)
+	xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
 #else
 #define	XFS_BMBT_TRACE_ARGBI(c,b,i)
 #define	XFS_BMBT_TRACE_ARGBII(c,b,i,j)
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index eb03eab5ca52..3f3785b10804 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -73,7 +73,7 @@ xfs_filestreams_trace(
 #define TRACE4(mp,t,a0,a1,a2,a3)	TRACE6(mp,t,a0,a1,a2,a3,0,0)
 #define TRACE5(mp,t,a0,a1,a2,a3,a4)	TRACE6(mp,t,a0,a1,a2,a3,a4,0)
 #define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
-	xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \
+	xfs_filestreams_trace(mp, t, __func__, __LINE__, \
 				(__psunsigned_t)a0, (__psunsigned_t)a1, \
 				(__psunsigned_t)a2, (__psunsigned_t)a3, \
 				(__psunsigned_t)a4, (__psunsigned_t)a5)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4a6f7c5d1459..bece882f99ec 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2225,7 +2225,7 @@ xlog_state_do_callback(
 			repeats = 0;
 			xfs_fs_cmn_err(CE_WARN, log->l_mp,
 				"%s: possible infinite loop (%d iterations)",
-				__FUNCTION__, flushcnt);
+				__func__, flushcnt);
 		}
 	} while (!ioerrors && loopdidcallbacks);
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 962d74a9ea7e..c37521467fdc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -192,7 +192,7 @@ xlog_header_check_dump(
 {
 	int			b;
 
-	cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __FUNCTION__);
+	cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __func__);
 	for (b = 0; b < 16; b++)
 		cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
 	cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
@@ -3447,7 +3447,7 @@ xlog_valid_rec_header(
 	    (!rhead->h_version ||
 	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
 		xlog_warn("XFS: %s: unrecognised log version (%d).",
-			__FUNCTION__, be32_to_cpu(rhead->h_version));
+			__func__, be32_to_cpu(rhead->h_version));
 		return XFS_ERROR(EIO);
 	}
 
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 13235ae9a582..1f77c00af566 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -438,7 +438,7 @@ xfs_trans_delete_ail(
 		else {
 			xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
 		"%s: attempting to delete a log item that is not in the AIL",
-					__FUNCTION__);
+					__func__);
 			spin_unlock(&mp->m_ail_lock);
 			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 		}
-- 
cgit v1.2.3


From b6ddc4e6fed9c6f4adb273c8b36e1731f90ec17e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 10 Apr 2008 12:19:27 +1000
Subject: [XFS] Don't validate symlink target component length

This target component validation is not POSIX conformant and it is not
done by any other Linux filesystem so remove it from XFS.

SGI-PV: 980080
SGI-Modid: xfs-linux-melb:xfs-kern:30776a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 3418c94bcf17..d46f24c68498 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3105,31 +3105,6 @@ xfs_symlink(
 	pathlen = strlen(target_path);
 	if (pathlen >= MAXPATHLEN)      /* total string too long */
 		return XFS_ERROR(ENAMETOOLONG);
-	if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
-		int len, total;
-		char *path;
-
-		for (total = 0, path = target_path; total < pathlen;) {
-			/*
-			 * Skip any slashes.
-			 */
-			while(*path == '/') {
-				total++;
-				path++;
-			}
-
-			/*
-			 * Count up to the next slash or end of path.
-			 * Error out if the component is bigger than MAXNAMELEN.
-			 */
-			for(len = 0; *path != '/' && total < pathlen;total++, path++) {
-				if (++len >= MAXNAMELEN) {
-					error = ENAMETOOLONG;
-					return error;
-				}
-			}
-		}
-	}
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
-- 
cgit v1.2.3


From 3c85c36cc2e87018d38fcd033f41bbdf1360c07a Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:19:40 +1000
Subject: [XFS] xfs_quiesce_fs() never returns an error. Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30780a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_vfsops.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index ea94593b5313..6351efb569c7 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -637,7 +637,7 @@ out:
 	return XFS_ERROR(error);
 }
 
-STATIC int
+STATIC void
 xfs_quiesce_fs(
 	xfs_mount_t		*mp)
 {
@@ -661,8 +661,6 @@ xfs_quiesce_fs(
 			count++;
 		}
 	} while (count < 2);
-
-	return 0;
 }
 
 /*
-- 
cgit v1.2.3


From a414047fc97aea7db6237176ce00013117839cd5 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:19:47 +1000
Subject: [XFS] Remove useless whitespace in function prototypes

Makes it simpler to annotate function prototypes with __must_check via sed
scripts.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30781a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_utils.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index c4c4a6aa6549..701accbbaea1 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -21,14 +21,14 @@
 #define IRELE(ip)	VN_RELE(XFS_ITOV(ip))
 #define IHOLD(ip)	VN_HOLD(XFS_ITOV(ip))
 
-extern int xfs_dir_lookup_int (xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
+extern int xfs_dir_lookup_int(xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
 				xfs_inode_t **);
-extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
-extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
+extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
+extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
 				xfs_dev_t, cred_t *, prid_t, int,
 				xfs_inode_t **, int *);
-extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *);
-extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *);
-extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *);
+extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
+extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
+extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
 
 #endif	/* __XFS_UTILS_H__ */
-- 
cgit v1.2.3


From 36fbe6e6bd5408b09341043dfece978b4a7a0f34 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:19:56 +1000
Subject: [XFS] xfs_icsb_counter_disabled() never returns an error.

Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30782a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_mount.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c2aafeb8c6cb..eb348c168505 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -58,7 +58,7 @@ STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
 STATIC void	xfs_icsb_sync_counters(xfs_mount_t *);
 STATIC int	xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
 						int64_t, int);
-STATIC int	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
+STATIC void	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
 
 #else
 
@@ -2183,7 +2183,7 @@ xfs_icsb_counter_disabled(
 	return test_bit(field, &mp->m_icsb_counters);
 }
 
-STATIC int
+STATIC void
 xfs_icsb_disable_counter(
 	xfs_mount_t	*mp,
 	xfs_sb_field_t	field)
@@ -2201,7 +2201,7 @@ xfs_icsb_disable_counter(
 	 * the m_icsb_mutex.
 	 */
 	if (xfs_icsb_counter_disabled(mp, field))
-		return 0;
+		return;
 
 	xfs_icsb_lock_all_counters(mp);
 	if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
@@ -2224,8 +2224,6 @@ xfs_icsb_disable_counter(
 	}
 
 	xfs_icsb_unlock_all_counters(mp);
-
-	return 0;
 }
 
 STATIC void
-- 
cgit v1.2.3


From 714082bc12b6c305f825411df02177efcb0085f1 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:03 +1000
Subject: [XFS] Report errors from xfs_reserve_blocks().

xfs_reserve_blocks() can fail in interesting ways. In neither case is it a
fatal error, but the result can lead to sub-optimal behaviour. Warn to the
syslog if the call fails but otherwise continue.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30784a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_mount.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index eb348c168505..244aa1b9f134 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1227,12 +1227,15 @@ xfs_mountfs(
 	 *
 	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
 	 * This may drive us straight to ENOSPC on mount, but that implies
-	 * we were already there on the last unmount.
+	 * we were already there on the last unmount. Warn if this occurs.
 	 */
 	resblks = mp->m_sb.sb_dblocks;
 	do_div(resblks, 20);
 	resblks = min_t(__uint64_t, resblks, 1024);
-	xfs_reserve_blocks(mp, &resblks, NULL);
+	error = xfs_reserve_blocks(mp, &resblks, NULL);
+	if (error)
+		cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
+				"Continuing without a reserve pool.");
 
 	return 0;
 
@@ -1268,6 +1271,7 @@ int
 xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 {
 	__uint64_t	resblks;
+	int		error = 0;
 
 	/*
 	 * We can potentially deadlock here if we have an inode cluster
@@ -1311,7 +1315,11 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 	 * value does not matter....
 	 */
 	resblks = 0;
-	xfs_reserve_blocks(mp, &resblks, NULL);
+	error = xfs_reserve_blocks(mp, &resblks, NULL);
+	if (error)
+		cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. "
+				"Freespace may not be correct on next mount.");
+
 
 	xfs_log_sbcount(mp, 1);
 	xfs_unmountfs_writesb(mp);
-- 
cgit v1.2.3


From 5b1397385bf536cbdb60f3362f44079d15d5f5ee Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:10 +1000
Subject: [XFS] xfs_qm_reset_dqcounts() does not return errors.

Declare it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30785a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index adbc7bb9fbaa..dec5f95e8470 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1438,7 +1438,7 @@ xfs_qm_qino_alloc(
 }
 
 
-STATIC int
+STATIC void
 xfs_qm_reset_dqcounts(
 	xfs_mount_t	*mp,
 	xfs_buf_t	*bp,
@@ -1478,8 +1478,6 @@ xfs_qm_reset_dqcounts(
 		ddq->d_rtbwarns = 0;
 		ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
 	}
-
-	return 0;
 }
 
 STATIC int
@@ -1520,7 +1518,7 @@ xfs_qm_dqiter_bufs(
 		if (error)
 			break;
 
-		(void) xfs_qm_reset_dqcounts(mp, bp, firstid, type);
+		xfs_qm_reset_dqcounts(mp, bp, firstid, type);
 		xfs_bdwrite(mp, bp);
 		/*
 		 * goto the next block.
-- 
cgit v1.2.3


From 4b8879df8c21bed3efd1eb2da5d72501199aba29 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:17 +1000
Subject: [XFS] Propagate xfs_qm_dqflush_all() errors.

xfs_qm_dqflush_all() can return flush errors. Ensure they are propagated
into the quotacheck code to determine if the quotacheck succeeded or not.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30786a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index dec5f95e8470..04b29c672141 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1877,6 +1877,14 @@ xfs_qm_quotacheck(
 
 	} while (! done);
 
+	/*
+	 * We've made all the changes that we need to make incore.
+	 * Flush them down to disk buffers if everything was updated
+	 * successfully.
+	 */
+	if (!error)
+		error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
+
 	/*
 	 * We can get this error if we couldn't do a dquot allocation inside
 	 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
@@ -1888,11 +1896,6 @@ xfs_qm_quotacheck(
 		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
 		goto error_return;
 	}
-	/*
-	 * We've made all the changes that we need to make incore.
-	 * Now flush_them down to disk buffers.
-	 */
-	xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
 
 	/*
 	 * We didn't log anything, because if we crashed, we'll have to
-- 
cgit v1.2.3


From 3c56836f92683cb871ebbf44c512069b0d48a08f Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:24 +1000
Subject: [XFS] Check for dquot flush errors

xfs_qm_dqflush() can fail, but the return is not checked anywhere. Hence
we never know if we've failed to flush a dquot to disk. Propagate the
error and warn to the syslog if a flush ever fails.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30787a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_dquot.c      | 10 ++++++----
 fs/xfs/quota/xfs_dquot_item.c |  7 ++++++-
 fs/xfs/quota/xfs_qm.c         | 14 ++++++++++++--
 3 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 665babcca6a6..15214fbb9aa7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1439,9 +1439,7 @@ xfs_qm_dqpurge(
 	uint		flags)
 {
 	xfs_dqhash_t	*thishash;
-	xfs_mount_t	*mp;
-
-	mp = dqp->q_mount;
+	xfs_mount_t	*mp = dqp->q_mount;
 
 	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
 	ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
@@ -1485,6 +1483,7 @@ xfs_qm_dqpurge(
 	 * we're unmounting, we do care, so we flush it and wait.
 	 */
 	if (XFS_DQ_IS_DIRTY(dqp)) {
+		int	error;
 		xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
 		/* dqflush unlocks dqflock */
 		/*
@@ -1495,7 +1494,10 @@ xfs_qm_dqpurge(
 		 * We don't care about getting disk errors here. We need
 		 * to purge this dquot anyway, so we go ahead regardless.
 		 */
-		(void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+		error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+		if (error)
+			xfs_fs_cmn_err(CE_WARN, mp,
+				"xfs_qm_dqpurge: dquot %p flush failed", dqp);
 		xfs_dqflock(dqp);
 	}
 	ASSERT(dqp->q_pincount == 0);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1800e8d1f646..3dedce1d9cde 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -146,6 +146,7 @@ xfs_qm_dquot_logitem_push(
 	xfs_dq_logitem_t	*logitem)
 {
 	xfs_dquot_t	*dqp;
+	int		error;
 
 	dqp = logitem->qli_dquot;
 
@@ -161,7 +162,11 @@ xfs_qm_dquot_logitem_push(
 	 * lock without sleeping, then there must not have been
 	 * anyone in the process of flushing the dquot.
 	 */
-	xfs_qm_dqflush(dqp, XFS_B_DELWRI);
+	error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+	if (error)
+		xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+			"xfs_qm_dquot_logitem_push: push error %d on dqp %p",
+			error, dqp);
 	xfs_dqunlock(dqp);
 }
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 04b29c672141..0ed3c8277fcd 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -2094,12 +2094,17 @@ xfs_qm_shake_freelist(
 		 * dirty dquots.
 		 */
 		if (XFS_DQ_IS_DIRTY(dqp)) {
+			int	error;
 			xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
 			/*
 			 * We flush it delayed write, so don't bother
 			 * releasing the mplock.
 			 */
-			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			if (error) {
+				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+			"xfs_qm_dqflush_all: dquot %p flush failed", dqp);
+			}
 			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
 			dqp = dqp->dq_flnext;
 			continue;
@@ -2266,12 +2271,17 @@ xfs_qm_dqreclaim_one(void)
 		 * dirty dquots.
 		 */
 		if (XFS_DQ_IS_DIRTY(dqp)) {
+			int	error;
 			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
 			/*
 			 * We flush it delayed write, so don't bother
 			 * releasing the freelist lock.
 			 */
-			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			if (error) {
+				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
+			}
 			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
 			continue;
 		}
-- 
cgit v1.2.3


From 53aa7915d67b9d0f5986c9f08e76846fedc520d4 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:31 +1000
Subject: [XFS] Clean up quotamount error handling.

xfs_qm_mount_quotas() returns an error status that is ignored. If we fail
to mount quotas, we continue with quota's turned off, which is all handled
inside xfs_qm_mount_quotas(). Mark it as void to indicate that errors need
not be returned to the callers.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30788a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 28 ++++++++++++++--------------
 fs/xfs/quota/xfs_qm.h |  2 +-
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 0ed3c8277fcd..e15ee7cf3ccd 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -304,8 +304,11 @@ xfs_qm_unmount_quotadestroy(
  * necessary data structures like quotainfo.  This is also responsible for
  * running a quotacheck as necessary.  We are guaranteed that the superblock
  * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to inidicate success or failure at all.
  */
-int
+void
 xfs_qm_mount_quotas(
 	xfs_mount_t	*mp,
 	int		mfsi_flags)
@@ -313,7 +316,6 @@ xfs_qm_mount_quotas(
 	int		error = 0;
 	uint		sbf;
 
-
 	/*
 	 * If quotas on realtime volumes is not supported, we disable
 	 * quotas immediately.
@@ -332,7 +334,8 @@ xfs_qm_mount_quotas(
 	 * Allocate the quotainfo structure inside the mount struct, and
 	 * create quotainode(s), and change/rev superblock if necessary.
 	 */
-	if ((error = xfs_qm_init_quotainfo(mp))) {
+	error = xfs_qm_init_quotainfo(mp);
+	if (error) {
 		/*
 		 * We must turn off quotas.
 		 */
@@ -344,12 +347,11 @@ xfs_qm_mount_quotas(
 	 * If any of the quotas are not consistent, do a quotacheck.
 	 */
 	if (XFS_QM_NEED_QUOTACHECK(mp) &&
-		!(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
-		if ((error = xfs_qm_quotacheck(mp))) {
-			/* Quotacheck has failed and quotas have
-			 * been disabled.
-			 */
-			return XFS_ERROR(error);
+	    !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
+		error = xfs_qm_quotacheck(mp);
+		if (error) {
+			/* Quotacheck failed and disabled quotas. */
+			return;
 		}
 	}
 	/* 
@@ -357,12 +359,10 @@ xfs_qm_mount_quotas(
 	 * quotachecked status, since we won't be doing accounting for
 	 * that type anymore.
 	 */
-	if (!XFS_IS_UQUOTA_ON(mp)) {
+	if (!XFS_IS_UQUOTA_ON(mp))
 		mp->m_qflags &= ~XFS_UQUOTA_CHKD;
-	}
-	if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) {
+	if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
 		mp->m_qflags &= ~XFS_OQUOTA_CHKD;
-	}
 
  write_changes:
 	/*
@@ -392,7 +392,7 @@ xfs_qm_mount_quotas(
 		xfs_fs_cmn_err(CE_WARN, mp,
 			"Failed to initialize disk quotas.");
 	}
-	return XFS_ERROR(error);
+	return;
 }
 
 /*
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index baf537c1c177..cd2300e374af 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
 
 extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int		xfs_qm_mount_quotas(xfs_mount_t *, int);
+extern void		xfs_qm_mount_quotas(xfs_mount_t *, int);
 extern int		xfs_qm_quotacheck(xfs_mount_t *);
 extern void		xfs_qm_unmount_quotadestroy(xfs_mount_t *);
 extern int		xfs_qm_unmount_quotas(xfs_mount_t *);
-- 
cgit v1.2.3


From 31d5577b35d8397dea19f2ba7550e9225605a785 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:38 +1000
Subject: [XFS] Catch errors resetting quota flags.

Warn to the syslog if we fail to reset the quota flags in the superblock
when a quota check fails.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30789a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index e15ee7cf3ccd..6aa3445cabad 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1927,7 +1927,10 @@ xfs_qm_quotacheck(
 		ASSERT(mp->m_quotainfo != NULL);
 		ASSERT(xfs_Gqm != NULL);
 		xfs_qm_destroy_quotainfo(mp);
-		(void)xfs_mount_reset_sbqflags(mp);
+		if (xfs_mount_reset_sbqflags(mp)) {
+			cmn_err(CE_WARN, "XFS quotacheck %s: "
+				"Failed to reset quota flags.", mp->m_fsname);
+		}
 	} else {
 		cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
 	}
-- 
cgit v1.2.3


From cb6edc26c386d2268dcf61bcdec02b6fb50b6ba2 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:45 +1000
Subject: [XFS] Catch errors when turning off quotas.

When turning off quota, we need to write various transactions to the log
to ensure that they are cleanly removed in the case of a crash. We need to
check that the transactions hit the disk correctly. If we fail to write
the final quota off transaction, we are corrupt in memory and so the only
option is to shut the filesystem down at this point.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30790a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_qm_syscalls.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 3dc161f39d13..61cf68df547e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -279,9 +279,12 @@ xfs_qm_scall_quotaoff(
 
 	/*
 	 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
-	 * and synchronously.
+	 * and synchronously. If we fail to write, we should abort the
+	 * operation as it cannot be recovered safely if we crash.
 	 */
-	xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+	error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+	if (error)
+		goto out_error;
 
 	/*
 	 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -337,7 +340,12 @@ xfs_qm_scall_quotaoff(
 	 * So, we have QUOTAOFF start and end logitems; the start
 	 * logitem won't get overwritten until the end logitem appears...
 	 */
-	xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+	error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+	if (error) {
+		/* We're screwed now. Shutdown is the only option. */
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		goto out_error;
+	}
 
 	/*
 	 * If quotas is completely disabled, close shop.
@@ -361,6 +369,7 @@ xfs_qm_scall_quotaoff(
 		XFS_PURGE_INODE(XFS_QI_GQIP(mp));
 		XFS_QI_GQIP(mp) = NULL;
 	}
+out_error:
 	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
 
 	return (error);
-- 
cgit v1.2.3


From 88ab02085363b7c45935d66ab3e969b4fec9a20c Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:51 +1000
Subject: [XFS] Propagate quota file truncation errors.

Truncating the quota files can silently fail. Ensure that truncation
errors are propagated to the callers.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30791a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_qm_syscalls.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 61cf68df547e..556018d24cad 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -380,12 +380,11 @@ xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int		error;
+	int		error = 0, error2 = 0;
 	xfs_inode_t	*qip;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return XFS_ERROR(EPERM);
-	error = 0;
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
@@ -393,22 +392,22 @@ xfs_qm_scall_trunc_qfiles(
 
 	if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
 		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
-		if (! error) {
-			(void) xfs_truncate_file(mp, qip);
+		if (!error) {
+			error = xfs_truncate_file(mp, qip);
 			IRELE(qip);
 		}
 	}
 
 	if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
 	    mp->m_sb.sb_gquotino != NULLFSINO) {
-		error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
-		if (! error) {
-			(void) xfs_truncate_file(mp, qip);
+		error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
+		if (!error2) {
+			error2 = xfs_truncate_file(mp, qip);
 			IRELE(qip);
 		}
 	}
 
-	return (error);
+	return error ? error : error2;
 }
 
 
-- 
cgit v1.2.3


From 0c928299676c8df2b00e75d5691cd4846e6c0868 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:20:58 +1000
Subject: [XFS] Catch errors from xfs_acl_setmode().

Propagate the error status from xfs_acl_setmode() so that callers know if
the ACl was set correctly or not.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30792a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_acl.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 7272fe39a92d..98b515d39187 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -307,12 +307,13 @@ xfs_acl_vset(
 
 	VN_HOLD(vp);
 	error = xfs_acl_allow_set(vp, kind);
-	if (error)
-		goto out;
 
 	/* Incoming ACL exists, set file mode based on its value */
-	if (kind == _ACL_TYPE_ACCESS)
-		xfs_acl_setmode(vp, xfs_acl, &basicperms);
+	if (!error && kind == _ACL_TYPE_ACCESS)
+		error = xfs_acl_setmode(vp, xfs_acl, &basicperms);
+
+	if (error)
+		goto out;
 
 	/*
 	 * If we have more than std unix permissions, set up the actual attr.
@@ -707,7 +708,9 @@ xfs_acl_inherit(
 
 	memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
 	xfs_acl_filter_mode(mode, cacl);
-	xfs_acl_setmode(vp, cacl, &basicperms);
+	error = xfs_acl_setmode(vp, cacl, &basicperms);
+	if (error)
+		goto out_error;
 
 	/*
 	 * Set the Default and Access ACL on the file.  The mode is already
@@ -720,6 +723,7 @@ xfs_acl_inherit(
 		xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
 	if (!error && !basicperms)
 		xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
+out_error:
 	_ACL_FREE(cacl);
 	return error;
 }
-- 
cgit v1.2.3


From 5ca1f261a08d5cff5f29eaa0887b59baae2ae7f7 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:04 +1000
Subject: [XFS] Catch errors from xfs_acl_vremove().

Removing an ACL can return an error. Propagate it.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30793a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_acl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 98b515d39187..8e130b9720ae 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -324,7 +324,7 @@ xfs_acl_vset(
 	if (!basicperms) {
 		xfs_acl_set_attr(vp, xfs_acl, kind, &error);
 	} else {
-		xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
+		error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
 	}
 
 out:
-- 
cgit v1.2.3


From 3c1e2bbe5bcdcd435510a05eb121fa74b848e24f Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:11 +1000
Subject: [XFS] Propagate xfs_trans_reserve() errors.

xfs_trans_reserve() reports errors that should not be ignored. For
example, a shutdown filesystem will report errors through
xfs_trans_reserve() to prevent further changes from being attempted on a
damaged filesystem. Catch and propagate all error conditions from
xfs_trans_reserve().

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30794a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c37521467fdc..957b8caddf1e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2965,7 +2965,7 @@ xlog_recover_process_data(
  * Process an extent free intent item that was recovered from
  * the log.  We need to free the extents that it describes.
  */
-STATIC void
+STATIC int
 xlog_recover_process_efi(
 	xfs_mount_t		*mp,
 	xfs_efi_log_item_t	*efip)
@@ -2973,6 +2973,7 @@ xlog_recover_process_efi(
 	xfs_efd_log_item_t	*efdp;
 	xfs_trans_t		*tp;
 	int			i;
+	int			error = 0;
 	xfs_extent_t		*extp;
 	xfs_fsblock_t		startblock_fsb;
 
@@ -2996,12 +2997,16 @@ xlog_recover_process_efi(
 			 * free the memory associated with it.
 			 */
 			xfs_efi_release(efip, efip->efi_format.efi_nextents);
-			return;
+			return XFS_ERROR(EIO);
 		}
 	}
 
 	tp = xfs_trans_alloc(mp, 0);
-	xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+		return error;
+	}
 	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
 
 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
@@ -3013,6 +3018,7 @@ xlog_recover_process_efi(
 
 	efip->efi_flags |= XFS_EFI_RECOVERED;
 	xfs_trans_commit(tp, 0);
+	return error;
 }
 
 /*
@@ -3060,7 +3066,7 @@ xlog_recover_check_ail(
  * everything already in the AIL, we stop processing as soon as
  * we see something other than an EFI in the AIL.
  */
-STATIC void
+STATIC int
 xlog_recover_process_efis(
 	xlog_t			*log)
 {
@@ -3068,6 +3074,7 @@ xlog_recover_process_efis(
 	xfs_efi_log_item_t	*efip;
 	int			gen;
 	xfs_mount_t		*mp;
+	int			error = 0;
 
 	mp = log->l_mp;
 	spin_lock(&mp->m_ail_lock);
@@ -3092,11 +3099,14 @@ xlog_recover_process_efis(
 		}
 
 		spin_unlock(&mp->m_ail_lock);
-		xlog_recover_process_efi(mp, efip);
+		error = xlog_recover_process_efi(mp, efip);
+		if (error)
+			return error;
 		spin_lock(&mp->m_ail_lock);
 		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
 	}
 	spin_unlock(&mp->m_ail_lock);
+	return error;
 }
 
 /*
@@ -3116,9 +3126,9 @@ xlog_recover_clear_agi_bucket(
 	int		error;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-	xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
-
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+	error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
+	if (!error)
+		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 				   XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
 				   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
 	if (error) {
@@ -3919,7 +3929,14 @@ xlog_recover_finish(
 	 * rather than accepting new requests.
 	 */
 	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
-		xlog_recover_process_efis(log);
+		int	error;
+		error = xlog_recover_process_efis(log);
+		if (error) {
+			cmn_err(CE_ALERT,
+				"Failed to recover EFIs on filesystem: %s",
+				log->l_mp->m_fsname);
+			return error;
+		}
 		/*
 		 * Sync the log to get all the EFIs out of the AIL.
 		 * This isn't absolutely necessary, but it helps in
-- 
cgit v1.2.3


From e5720eec0548c08943d759e39db0388d8fe59287 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:18 +1000
Subject: [XFS] Propagate errors from xfs_trans_commit().

xfs_trans_commit() can return errors when there are problems in the
transaction subsystem. They are indicative that the entire transaction may
be incomplete, and hence the error should be propagated as there is a good
possibility that there is something fatally wrong in the filesystem. Catch
and propagate or warn about commit errors in the places where they are
currently ignored.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30795a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_qm.c          |  4 ++--
 fs/xfs/quota/xfs_qm_syscalls.c |  4 ++--
 fs/xfs/xfs_inode.c             | 52 ++++++++++++++++++------------------------
 fs/xfs/xfs_log_recover.c       | 27 ++++++++++++++--------
 fs/xfs/xfs_mount.c             | 35 +++++++++++++++++-----------
 fs/xfs/xfs_rtalloc.c           | 38 ++++++++++++++++++------------
 fs/xfs/xfs_vfsops.c            | 15 +++++++++---
 fs/xfs/xfs_vnodeops.c          | 28 ++++++++++++-----------
 8 files changed, 115 insertions(+), 88 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 6aa3445cabad..40ea56409561 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -2392,9 +2392,9 @@ xfs_qm_write_sb_changes(
 	}
 
 	xfs_mod_sb(tp, flags);
-	(void) xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
 
-	return 0;
+	return error;
 }
 
 
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 556018d24cad..8342823dbdc3 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -734,12 +734,12 @@ xfs_qm_scall_setqlim(
 	xfs_trans_log_dquot(tp, dqp);
 
 	xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
-	xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
 	xfs_qm_dqprint(dqp);
 	xfs_qm_dqrele(dqp);
 	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
 
-	return (0);
+	return error;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d7514f8317df..63e66890f063 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1699,33 +1699,16 @@ xfs_itruncate_finish(
 			 * blocks in the file system, but oh well.
 			 */
 			xfs_bmap_cancel(&free_list);
-			if (committed) {
-				/*
-				 * If the passed in transaction committed
-				 * in xfs_bmap_finish(), then we want to
-				 * add the inode to this one before returning.
-				 * This keeps things simple for the higher
-				 * level code, because it always knows that
-				 * the inode is locked and held in the
-				 * transaction that returns to it whether
-				 * errors occur or not.  We don't mark the
-				 * inode dirty so that this transaction can
-				 * be easily aborted if possible.
-				 */
-				xfs_trans_ijoin(ntp, ip,
-					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-				xfs_trans_ihold(ntp, ip);
-			}
+			if (committed)
+				goto error_join;
 			return error;
 		}
 
 		if (committed) {
 			/*
-			 * The first xact was committed,
-			 * so add the inode to the new one.
-			 * Mark it dirty so it will be logged
-			 * and moved forward in the log as
-			 * part of every commit.
+			 * The first xact was committed, so add the inode to
+			 * the new one.  Mark it dirty so it will be logged and
+			 * moved forward in the log as part of every commit.
 			 */
 			xfs_trans_ijoin(ntp, ip,
 					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -1733,19 +1716,16 @@ xfs_itruncate_finish(
 			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
 		}
 		ntp = xfs_trans_dup(ntp);
-		(void) xfs_trans_commit(*tp, 0);
+		error = xfs_trans_commit(*tp, 0);
 		*tp = ntp;
+		if (error)
+			goto error_join;
 		error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
 					  XFS_TRANS_PERM_LOG_RES,
 					  XFS_ITRUNCATE_LOG_COUNT);
-		/*
-		 * Add the inode being truncated to the next chained
-		 * transaction.
-		 */
-		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ihold(ntp, ip);
 		if (error)
-			return (error);
+			goto error_join;
+
 	}
 	/*
 	 * Only update the size in the case of the data fork, but
@@ -1777,6 +1757,18 @@ xfs_itruncate_finish(
 	       (ip->i_d.di_nextents == 0));
 	xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
 	return 0;
+
+error_join:
+	/*
+	 * Add the inode being truncated to the next chained transaction.  This
+	 * keeps things simple for the higher level code, because it always
+	 * knows that the inode is locked and held in the transaction that
+	 * returns to it whether errors occur or not.  We don't mark the inode
+	 * dirty so that this transaction can be easily aborted if possible.
+	 */
+	xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ihold(ntp, ip);
+	return error;
 }
 
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 957b8caddf1e..418582b709eb 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3017,7 +3017,7 @@ xlog_recover_process_efi(
 	}
 
 	efip->efi_flags |= XFS_EFI_RECOVERED;
-	xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
 	return error;
 }
 
@@ -3131,16 +3131,13 @@ xlog_recover_clear_agi_bucket(
 		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 				   XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
 				   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
-	if (error) {
-		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-		return;
-	}
+	if (error)
+		goto out_abort;
 
+	error = EINVAL;
 	agi = XFS_BUF_TO_AGI(agibp);
-	if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) {
-		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-		return;
-	}
+	if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
+		goto out_abort;
 
 	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
 	offset = offsetof(xfs_agi_t, agi_unlinked) +
@@ -3148,7 +3145,17 @@ xlog_recover_clear_agi_bucket(
 	xfs_trans_log_buf(tp, agibp, offset,
 			  (offset + sizeof(xfs_agino_t) - 1));
 
-	(void) xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
+	if (error)
+		goto out_error;
+	return;
+
+out_abort:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+out_error:
+	xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
+			"failed to clear agi %d. Continuing.", agno);
+	return;
 }
 
 /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 244aa1b9f134..2d03fe194c2c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,7 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 
-STATIC void	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
+STATIC int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 STATIC int	xfs_uuid_mount(xfs_mount_t *);
 STATIC void	xfs_uuid_unmount(xfs_mount_t *mp);
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
@@ -1189,8 +1189,13 @@ xfs_mountfs(
 	/*
 	 * If fs is not mounted readonly, then update the superblock changes.
 	 */
-	if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY))
-		xfs_mount_log_sb(mp, update_flags);
+	if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		error = xfs_mount_log_sb(mp, update_flags);
+		if (error) {
+			cmn_err(CE_WARN, "XFS: failed to write sb changes");
+			goto error4;
+		}
+	}
 
 	/*
 	 * Initialise the XFS quota management subsystem for this mount
@@ -1320,8 +1325,10 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 		cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. "
 				"Freespace may not be correct on next mount.");
 
-
-	xfs_log_sbcount(mp, 1);
+	error = xfs_log_sbcount(mp, 1);
+	if (error)
+		cmn_err(CE_WARN, "XFS: Unable to update superblock counters. "
+				"Freespace may not be correct on next mount.");
 	xfs_unmountfs_writesb(mp);
 	xfs_unmountfs_wait(mp); 		/* wait for async bufs */
 	xfs_log_unmount(mp);			/* Done! No more fs ops. */
@@ -1413,9 +1420,8 @@ xfs_log_sbcount(
 	xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
 	if (sync)
 		xfs_trans_set_sync(tp);
-	xfs_trans_commit(tp, 0);
-
-	return 0;
+	error = xfs_trans_commit(tp, 0);
+	return error;
 }
 
 STATIC void
@@ -1913,24 +1919,27 @@ xfs_uuid_unmount(
  * be altered by the mount options, as well as any potential sb_features2
  * fixup. Only the first superblock is updated.
  */
-STATIC void
+STATIC int
 xfs_mount_log_sb(
 	xfs_mount_t	*mp,
 	__int64_t	fields)
 {
 	xfs_trans_t	*tp;
+	int		error;
 
 	ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
 			 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2));
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
-	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-				XFS_DEFAULT_LOG_COUNT)) {
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				XFS_DEFAULT_LOG_COUNT);
+	if (error) {
 		xfs_trans_cancel(tp, 0);
-		return;
+		return error;
 	}
 	xfs_mod_sb(tp, fields);
-	xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
+	return error;
 }
 
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 9cd6471cd60f..a0dc6e5bc5b9 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -124,14 +124,14 @@ xfs_growfs_rt_alloc(
 				XFS_GROWRTALLOC_LOG_RES(mp), 0,
 				XFS_TRANS_PERM_LOG_RES,
 				XFS_DEFAULT_PERM_LOG_COUNT)))
-			goto error_exit;
+			goto error_cancel;
 		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
 		/*
 		 * Lock the inode.
 		 */
 		if ((error = xfs_trans_iget(mp, tp, ino, 0,
 						XFS_ILOCK_EXCL, &ip)))
-			goto error_exit;
+			goto error_cancel;
 		XFS_BMAP_INIT(&flist, &firstblock);
 		/*
 		 * Allocate blocks to the bitmap file.
@@ -144,14 +144,16 @@ xfs_growfs_rt_alloc(
 		if (!error && nmap < 1)
 			error = XFS_ERROR(ENOSPC);
 		if (error)
-			goto error_exit;
+			goto error_cancel;
 		/*
 		 * Free any blocks freed up in the transaction, then commit.
 		 */
 		error = xfs_bmap_finish(&tp, &flist, &committed);
 		if (error)
-			goto error_exit;
-		xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+			goto error_cancel;
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		if (error)
+			goto error;
 		/*
 		 * Now we need to clear the allocated blocks.
 		 * Do this one block per transaction, to keep it simple.
@@ -166,13 +168,13 @@ xfs_growfs_rt_alloc(
 			 */
 			if ((error = xfs_trans_reserve(tp, 0,
 					XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
-				goto error_exit;
+				goto error_cancel;
 			/*
 			 * Lock the bitmap inode.
 			 */
 			if ((error = xfs_trans_iget(mp, tp, ino, 0,
 							XFS_ILOCK_EXCL, &ip)))
-				goto error_exit;
+				goto error_cancel;
 			/*
 			 * Get a buffer for the block.
 			 */
@@ -181,14 +183,16 @@ xfs_growfs_rt_alloc(
 				mp->m_bsize, 0);
 			if (bp == NULL) {
 				error = XFS_ERROR(EIO);
-				goto error_exit;
+				goto error_cancel;
 			}
 			memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
 			xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
 			/*
 			 * Commit the transaction.
 			 */
-			xfs_trans_commit(tp, 0);
+			error = xfs_trans_commit(tp, 0);
+			if (error)
+				goto error;
 		}
 		/*
 		 * Go on to the next extent, if any.
@@ -196,8 +200,9 @@ xfs_growfs_rt_alloc(
 		oblocks = map.br_startoff + map.br_blockcount;
 	}
 	return 0;
-error_exit:
+error_cancel:
 	xfs_trans_cancel(tp, cancelflags);
+error:
 	return error;
 }
 
@@ -1876,6 +1881,7 @@ xfs_growfs_rt(
 	xfs_trans_t	*tp;		/* transaction pointer */
 
 	sbp = &mp->m_sb;
+	cancelflags = 0;
 	/*
 	 * Initial error checking.
 	 */
@@ -2042,13 +2048,15 @@ xfs_growfs_rt(
 		 */
 		mp->m_rsumlevels = nrsumlevels;
 		mp->m_rsumsize = nrsumsize;
-		/*
-		 * Commit the transaction.
-		 */
-		xfs_trans_commit(tp, 0);
+
+		error = xfs_trans_commit(tp, 0);
+		if (error) {
+			tp = NULL;
+			break;
+		}
 	}
 
-	if (error)
+	if (error && tp)
 		xfs_trans_cancel(tp, cancelflags);
 
 	/*
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 6351efb569c7..09e186d02c11 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -672,6 +672,8 @@ void
 xfs_attr_quiesce(
 	xfs_mount_t	*mp)
 {
+	int	error = 0;
+
 	/* wait for all modifications to complete */
 	while (atomic_read(&mp->m_active_trans) > 0)
 		delay(100);
@@ -682,7 +684,11 @@ xfs_attr_quiesce(
 	ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
 
 	/* Push the superblock and write an unmount record */
-	xfs_log_sbcount(mp, 1);
+	error = xfs_log_sbcount(mp, 1);
+	if (error)
+		xfs_fs_cmn_err(CE_WARN, mp,
+				"xfs_attr_quiesce: failed to log sb changes. "
+				"Frozen image may not be consistent.");
 	xfs_log_unmount_write(mp);
 	xfs_unmountfs_writesb(mp);
 }
@@ -1316,8 +1322,11 @@ xfs_syncsub(
 	 * of sync if we crash or get a forced shutdown. We don't want to force
 	 * this to disk, just get a transaction into the iclogs....
 	 */
-	if (flags & SYNC_SUPER)
-		xfs_log_sbcount(mp, 0);
+	if (flags & SYNC_SUPER) {
+		error = xfs_log_sbcount(mp, 0);
+		if (error)
+			last_error = error;
+	}
 
 	/*
 	 * Now check to see if the log needs a "dummy" transaction.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d46f24c68498..bc0a4707189a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1447,28 +1447,22 @@ xfs_inactive_attrs(
 	tp = *tpp;
 	mp = ip->i_mount;
 	ASSERT(ip->i_d.di_forkoff != 0);
-	xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error)
+		goto error_unlock;
 
 	error = xfs_attr_inactive(ip);
-	if (error) {
-		*tpp = NULL;
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return error; /* goto out */
-	}
+	if (error)
+		goto error_unlock;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 	error = xfs_trans_reserve(tp, 0,
 				  XFS_IFREE_LOG_RES(mp),
 				  0, XFS_TRANS_PERM_LOG_RES,
 				  XFS_INACTIVE_LOG_COUNT);
-	if (error) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		xfs_trans_cancel(tp, 0);
-		*tpp = NULL;
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		return error;
-	}
+	if (error)
+		goto error_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
@@ -1479,6 +1473,14 @@ xfs_inactive_attrs(
 
 	*tpp = tp;
 	return 0;
+
+error_cancel:
+	ASSERT(XFS_FORCED_SHUTDOWN(mp));
+	xfs_trans_cancel(tp, 0);
+error_unlock:
+	*tpp = NULL;
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
 }
 
 int
-- 
cgit v1.2.3


From f4586e40613a9f8bb9f7f9c8a796062a9ab1614c Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:25 +1000
Subject: [XFS] Clean up xfs_alloc_search_busy() return values.

xfs_alloc_search_busy() returns an index into the busy array if the extent
was found in the array. This is never checked, and the
xfs_alloc_search_busy() does a log force to prevent reuse of the extent
before the free transaction hits the disk. Hence the return value is
useless. Declare the function void and remove the slot number from the
tracing as well.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30796a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_alloc.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bd5c01788eff..bd43f77daacd 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -45,7 +45,7 @@
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-STATIC int
+STATIC void
 xfs_alloc_search_busy(xfs_trans_t *tp,
 		    xfs_agnumber_t agno,
 		    xfs_agblock_t bno,
@@ -64,15 +64,15 @@ ktrace_t *xfs_alloc_trace_buf;
 	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
 #define	TRACE_UNBUSY(__func__,s,ag,sl,tp)	\
 	xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
-#define	TRACE_BUSYSEARCH(__func__,s,ag,agb,l,sl,tp)	\
-	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
+#define	TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp)	\
+	xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
 #else
 #define	TRACE_ALLOC(s,a)
 #define	TRACE_FREE(s,a,b,x,f)
 #define	TRACE_MODAGF(s,a,f)
 #define	TRACE_BUSY(s,a,ag,agb,l,sl,tp)
 #define	TRACE_UNBUSY(fname,s,ag,sl,tp)
-#define	TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
+#define	TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp)
 #endif	/* XFS_ALLOC_TRACE */
 
 /*
@@ -2562,9 +2562,10 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
 
 
 /*
- * returns non-zero if any of (agno,bno):len is in a busy list
+ * If we find the extent in the busy list, force the log out to get the
+ * extent out of the busy list so the caller can use it straight away.
  */
-STATIC int
+STATIC void
 xfs_alloc_search_busy(xfs_trans_t *tp,
 		    xfs_agnumber_t agno,
 		    xfs_agblock_t bno,
@@ -2572,7 +2573,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 {
 	xfs_mount_t		*mp;
 	xfs_perag_busy_t	*bsy;
-	int			n;
 	xfs_agblock_t		uend, bend;
 	xfs_lsn_t		lsn;
 	int			cnt;
@@ -2585,21 +2585,18 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	uend = bno + len - 1;
 
 	/* search pagb_list for this slot, skipping open slots */
-	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
-	     cnt; bsy++, n++) {
+	for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
 
 		/*
 		 * (start1,length1) within (start2, length2)
 		 */
 		if (bsy->busy_tp != NULL) {
 			bend = bsy->busy_start + bsy->busy_length - 1;
-			if ((bno > bend) ||
-			    (uend < bsy->busy_start)) {
+			if ((bno > bend) || (uend < bsy->busy_start)) {
 				cnt--;
 			} else {
 				TRACE_BUSYSEARCH("xfs_alloc_search_busy",
-						 "found1", agno, bno, len, n,
-						 tp);
+					 "found1", agno, bno, len, tp);
 				break;
 			}
 		}
@@ -2610,15 +2607,12 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	 * transaction that freed the block
 	 */
 	if (cnt) {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
+		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
 		lsn = bsy->busy_tp->t_commit_lsn;
 		spin_unlock(&mp->m_perag[agno].pagb_lock);
 		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
 	} else {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
-		n = -1;
+		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
 		spin_unlock(&mp->m_perag[agno].pagb_lock);
 	}
-
-	return n;
 }
-- 
cgit v1.2.3


From 12375c82375ec39ec948a3ad62e5e77533515e83 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:32 +1000
Subject: [XFS] Make xfs_alloc_compute_aligned() void.

xfs_alloc_compute_aligned() returns a value based on a comparison of the
computed extent length and the minimum length allowed. This is only used
by some callers - the other four return parameters are used more often.
Hence move the comparison to the code that actually needs to do it and
make xfs_alloc_compute_aligned() a void function.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30797a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_alloc.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bd43f77daacd..facdae14edd0 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -93,7 +93,7 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
  */
-STATIC int				/* success (>= minlen) */
+STATIC void
 xfs_alloc_compute_aligned(
 	xfs_agblock_t	foundbno,	/* starting block in found extent */
 	xfs_extlen_t	foundlen,	/* length in found extent */
@@ -116,7 +116,6 @@ xfs_alloc_compute_aligned(
 	}
 	*resbno = bno;
 	*reslen = len;
-	return len >= minlen;
 }
 
 /*
@@ -837,9 +836,9 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (!xfs_alloc_compute_aligned(ltbno, ltlen,
-					args->alignment, args->minlen,
-					&ltbnoa, &ltlena))
+			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
+					args->minlen, &ltbnoa, &ltlena);
+			if (ltlena >= args->minlen)
 				continue;
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
@@ -958,9 +957,9 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (xfs_alloc_compute_aligned(ltbno, ltlen,
-					args->alignment, args->minlen,
-					&ltbnoa, &ltlena))
+			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
+					args->minlen, &ltbnoa, &ltlena);
+			if (ltlena >= args->minlen)
 				break;
 			if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
 				goto error0;
@@ -974,9 +973,9 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (xfs_alloc_compute_aligned(gtbno, gtlen,
-					args->alignment, args->minlen,
-					&gtbnoa, &gtlena))
+			xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment,
+					args->minlen, &gtbnoa, &gtlena);
+			if (gtlena >= args->minlen)
 				break;
 			if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
 				goto error0;
-- 
cgit v1.2.3


From c2b1cba6833da77b1b478ac144f9cf5144d276ec Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:40 +1000
Subject: [XFS] xfs_bmap_adjacent() never returns an error.

Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30798a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_bmap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 65b8fa83e078..6d9b5448deb2 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2402,7 +2402,7 @@ xfs_bmap_extsize_align(
 
 #define XFS_ALLOC_GAP_UNITS	4
 
-STATIC int
+STATIC void
 xfs_bmap_adjacent(
 	xfs_bmalloca_t	*ap)		/* bmap alloc argument struct */
 {
@@ -2548,7 +2548,6 @@ xfs_bmap_adjacent(
 			ap->rval = gotbno;
 	}
 #undef ISVALID
-	return 0;
 }
 
 STATIC int
-- 
cgit v1.2.3


From d87dd6360dce86cad9099aed74f14b4dd0143301 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:46 +1000
Subject: [XFS] Warn if errors come from block_truncate_page().

block_truncate_page() can return errors that we currently ignore and
silently discard. We should not ever get errors reported here - an error
indicates a bug somewhere else. Hence catch the error and issue a stack
dump to the syslog because we cannot propagate the error any further up
the call chain.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30800a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 53f8feb28e58..41e7baabfd9f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -692,11 +692,19 @@ xfs_vn_setattr(
 	return -error;
 }
 
+/*
+ * block_truncate_page can return an error, but we can't propagate it
+ * at all here. Leave a complaint + stack trace in the syslog because
+ * this could be bad. If it is bad, we need to propagate the error further.
+ */
 STATIC void
 xfs_vn_truncate(
 	struct inode	*inode)
 {
-	block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks);
+	int	error;
+	error = block_truncate_page(inode->i_mapping, inode->i_size,
+							xfs_get_blocks);
+	WARN_ON(error);
 }
 
 STATIC int
-- 
cgit v1.2.3


From fc6149d8d9634814cdcd9283b8f2efd3359181df Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:53 +1000
Subject: [XFS] Check for xfs_free_extent() failing.

xfs_free_extent() can fail, but log recovery never bothers to check if it
successfully free the extent it was supposed to. This could lead to silent
corruption during log recovery. Abort log recovery if we fail to free an
extent.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30801a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 418582b709eb..3a8fe7bfa2af 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3003,15 +3003,15 @@ xlog_recover_process_efi(
 
 	tp = xfs_trans_alloc(mp, 0);
 	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-		return error;
-	}
+	if (error)
+		goto abort_error;
 	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
 
 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
 		extp = &(efip->efi_format.efi_extents[i]);
-		xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+		error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+		if (error)
+			goto abort_error;
 		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
 					 extp->ext_len);
 	}
@@ -3019,6 +3019,10 @@ xlog_recover_process_efi(
 	efip->efi_flags |= XFS_EFI_RECOVERED;
 	error = xfs_trans_commit(tp, 0);
 	return error;
+
+abort_error:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	return error;
 }
 
 /*
-- 
cgit v1.2.3


From 7c9ef85c5672ae316aafd7bbe0bbadebe90301e6 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:21:59 +1000
Subject: [XFS] Catch errors returned from xfs_bmap_last_offset().

xfs_bmap_last_offset() can fail and return an error.
xfs_iomap_write_allocate() fails to detect and propagate the error.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30802a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_iomap.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index fde37f87d52f..fb3cf1191419 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,8 +802,11 @@ xfs_iomap_write_allocate(
 			 */
 			nimaps = 1;
 			end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
-			xfs_bmap_last_offset(NULL, ip, &last_block,
-				XFS_DATA_FORK);
+			error = xfs_bmap_last_offset(NULL, ip, &last_block,
+							XFS_DATA_FORK);
+			if (error)
+				goto trans_cancel;
+
 			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
 			if ((map_start_fsb + count_fsb) > last_block) {
 				count_fsb = last_block - map_start_fsb;
-- 
cgit v1.2.3


From 556b8b166c9514b5f940047a41dad8fe8cd9a778 Mon Sep 17 00:00:00 2001
From: Barry Naujok <bnaujok@sgi.com>
Date: Thu, 10 Apr 2008 12:22:07 +1000
Subject: [XFS] remove bhv_vname_t and xfs_rename code

SGI-PV: 976035
SGI-Modid: xfs-linux-melb:xfs-kern:30804a

Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_export.c |   5 +-
 fs/xfs/linux-2.6/xfs_iops.c   |  55 ++++++++++++++-----
 fs/xfs/linux-2.6/xfs_vnode.h  |   9 ----
 fs/xfs/xfs_dir2.c             |  62 ++++++++++------------
 fs/xfs/xfs_dir2.h             |  12 +++--
 fs/xfs/xfs_mount.h            |   4 +-
 fs/xfs/xfs_rename.c           |  82 ++++++++++------------------
 fs/xfs/xfs_types.h            |   5 ++
 fs/xfs/xfs_utils.c            |   4 +-
 fs/xfs/xfs_utils.h            |   4 +-
 fs/xfs/xfs_vnodeops.c         | 121 ++++++++++++++++++------------------------
 fs/xfs/xfs_vnodeops.h         |  23 ++++----
 12 files changed, 183 insertions(+), 203 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 66a9a9e76cbe..265f0168ab76 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -22,6 +22,7 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
+#include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_export.h"
@@ -30,8 +31,6 @@
 #include "xfs_inode.h"
 #include "xfs_vfsops.h"
 
-static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, };
-
 /*
  * Note that we only accept fileids which are long enough rather than allow
  * the parent generation number to default to zero.  XFS considers zero a
@@ -216,7 +215,7 @@ xfs_fs_get_parent(
 	struct xfs_inode	*cip;
 	struct dentry		*parent;
 
-	error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cip);
+	error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip);
 	if (unlikely(error))
 		return ERR_PTR(-error);
 
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 41e7baabfd9f..0c958cf77758 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -239,6 +239,15 @@ xfs_init_security(
 	return error;
 }
 
+static void
+xfs_dentry_to_name(
+	struct xfs_name	*namep,
+	struct dentry	*dentry)
+{
+	namep->name = dentry->d_name.name;
+	namep->len = dentry->d_name.len;
+}
+
 STATIC void
 xfs_cleanup_inode(
 	struct inode	*dir,
@@ -246,20 +255,19 @@ xfs_cleanup_inode(
 	struct dentry	*dentry,
 	int		mode)
 {
-	struct dentry   teardown = {};
+	struct xfs_name	teardown;
 
 	/* Oh, the horror.
 	 * If we can't add the ACL or we fail in
 	 * xfs_init_security we must back out.
 	 * ENOSPC can hit here, among other things.
 	 */
-	teardown.d_inode = inode;
-	teardown.d_name = dentry->d_name;
+	xfs_dentry_to_name(&teardown, dentry);
 
 	if (S_ISDIR(mode))
-		xfs_rmdir(XFS_I(dir), &teardown);
+		xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode));
 	else
-		xfs_remove(XFS_I(dir), &teardown);
+		xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
 	iput(inode);
 }
 
@@ -273,6 +281,7 @@ xfs_vn_mknod(
 	struct inode	*inode;
 	struct xfs_inode *ip = NULL;
 	xfs_acl_t	*default_acl = NULL;
+	struct xfs_name	name;
 	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
 	int		error;
 
@@ -293,6 +302,8 @@ xfs_vn_mknod(
 		}
 	}
 
+	xfs_dentry_to_name(&name, dentry);
+
 	if (IS_POSIXACL(dir) && !default_acl)
 		mode &= ~current->fs->umask;
 
@@ -303,10 +314,10 @@ xfs_vn_mknod(
 	case S_IFSOCK:
 		rdev = sysv_encode_dev(rdev);
 	case S_IFREG:
-		error = xfs_create(XFS_I(dir), dentry, mode, rdev, &ip, NULL);
+		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
 		break;
 	case S_IFDIR:
-		error = xfs_mkdir(XFS_I(dir), dentry, mode, &ip, NULL);
+		error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
 		break;
 	default:
 		error = EINVAL;
@@ -371,12 +382,14 @@ xfs_vn_lookup(
 	struct nameidata *nd)
 {
 	struct xfs_inode *cip;
+	struct xfs_name	name;
 	int		error;
 
 	if (dentry->d_name.len >= MAXNAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	error = xfs_lookup(XFS_I(dir), dentry, &cip);
+	xfs_dentry_to_name(&name, dentry);
+	error = xfs_lookup(XFS_I(dir), &name, &cip);
 	if (unlikely(error)) {
 		if (unlikely(error != ENOENT))
 			return ERR_PTR(-error);
@@ -394,12 +407,14 @@ xfs_vn_link(
 	struct dentry	*dentry)
 {
 	struct inode	*inode;	/* inode of guy being linked to */
+	struct xfs_name	name;
 	int		error;
 
 	inode = old_dentry->d_inode;
+	xfs_dentry_to_name(&name, dentry);
 
 	igrab(inode);
-	error = xfs_link(XFS_I(dir), XFS_I(inode), dentry);
+	error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
 	if (unlikely(error)) {
 		iput(inode);
 		return -error;
@@ -417,11 +432,13 @@ xfs_vn_unlink(
 	struct dentry	*dentry)
 {
 	struct inode	*inode;
+	struct xfs_name	name;
 	int		error;
 
 	inode = dentry->d_inode;
+	xfs_dentry_to_name(&name, dentry);
 
-	error = xfs_remove(XFS_I(dir), dentry);
+	error = xfs_remove(XFS_I(dir), &name, XFS_I(inode));
 	if (likely(!error)) {
 		xfs_validate_fields(dir);	/* size needs update */
 		xfs_validate_fields(inode);
@@ -437,14 +454,15 @@ xfs_vn_symlink(
 {
 	struct inode	*inode;
 	struct xfs_inode *cip = NULL;
+	struct xfs_name	name;
 	int		error;
 	mode_t		mode;
 
 	mode = S_IFLNK |
 		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+	xfs_dentry_to_name(&name, dentry);
 
-	error = xfs_symlink(XFS_I(dir), dentry, (char *)symname, mode,
-			    &cip, NULL);
+	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
 	if (unlikely(error))
 		goto out;
 
@@ -471,9 +489,12 @@ xfs_vn_rmdir(
 	struct dentry	*dentry)
 {
 	struct inode	*inode = dentry->d_inode;
+	struct xfs_name	name;
 	int		error;
 
-	error = xfs_rmdir(XFS_I(dir), dentry);
+	xfs_dentry_to_name(&name, dentry);
+
+	error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode));
 	if (likely(!error)) {
 		xfs_validate_fields(inode);
 		xfs_validate_fields(dir);
@@ -489,9 +510,15 @@ xfs_vn_rename(
 	struct dentry	*ndentry)
 {
 	struct inode	*new_inode = ndentry->d_inode;
+	struct xfs_name	oname;
+	struct xfs_name	nname;
 	int		error;
 
-	error = xfs_rename(XFS_I(odir), odentry, XFS_I(ndir), ndentry);
+	xfs_dentry_to_name(&oname, odentry);
+	xfs_dentry_to_name(&nname, ndentry);
+
+	error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+							XFS_I(ndir), &nname);
 	if (likely(!error)) {
 		if (new_inode)
 			xfs_validate_fields(new_inode);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index dbb8a5d27f78..8b4d63ce8694 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -23,8 +23,6 @@ struct bhv_vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
 
-typedef struct dentry	bhv_vname_t;
-typedef __u64		bhv_vnumber_t;
 typedef struct inode	bhv_vnode_t;
 
 #define VN_ISLNK(vp)	S_ISLNK((vp)->i_mode)
@@ -210,13 +208,6 @@ static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
 	return inode ? vn_from_inode(inode) : NULL;
 }
 
-/*
- * Vname handling macros.
- */
-#define VNAME(dentry)		((char *) (dentry)->d_name.name)
-#define VNAMELEN(dentry)	((dentry)->d_name.len)
-#define VNAME_TO_INODE(dentry)	(XFS_I((dentry)->d_inode))
-
 /*
  * Dealing with bad inodes
  */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index e92e73f0e6af..7cb26529766b 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,6 +44,7 @@
 #include "xfs_error.h"
 #include "xfs_vnodeops.h"
 
+struct xfs_name xfs_name_dotdot = {"..", 2};
 
 void
 xfs_dir_mount(
@@ -146,8 +147,7 @@ int
 xfs_dir_createname(
 	xfs_trans_t		*tp,
 	xfs_inode_t		*dp,
-	char			*name,
-	int			namelen,
+	struct xfs_name		*name,
 	xfs_ino_t		inum,		/* new entry inode number */
 	xfs_fsblock_t		*first,		/* bmap's firstblock */
 	xfs_bmap_free_t		*flist,		/* bmap's freeblock list */
@@ -162,9 +162,9 @@ xfs_dir_createname(
 		return rval;
 	XFS_STATS_INC(xs_dir_create);
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.inumber = inum;
 	args.dp = dp;
 	args.firstblock = first;
@@ -197,8 +197,7 @@ int
 xfs_dir_lookup(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	char		*name,
-	int		namelen,
+	struct xfs_name	*name,
 	xfs_ino_t	*inum)		/* out: inode number */
 {
 	xfs_da_args_t	args;
@@ -207,18 +206,14 @@ xfs_dir_lookup(
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_lookup);
+	memset(&args, 0, sizeof(xfs_da_args_t));
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = 0;
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.dp = dp;
-	args.firstblock = NULL;
-	args.flist = NULL;
-	args.total = 0;
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
-	args.justcheck = args.addname = 0;
 	args.oknoent = 1;
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
@@ -247,8 +242,7 @@ int
 xfs_dir_removename(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	char		*name,
-	int		namelen,
+	struct xfs_name	*name,
 	xfs_ino_t	ino,
 	xfs_fsblock_t	*first,		/* bmap's firstblock */
 	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
@@ -261,9 +255,9 @@ xfs_dir_removename(
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_remove);
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.inumber = ino;
 	args.dp = dp;
 	args.firstblock = first;
@@ -329,8 +323,7 @@ int
 xfs_dir_replace(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	char		*name,		/* name of entry to replace */
-	int		namelen,
+	struct xfs_name	*name,		/* name of entry to replace */
 	xfs_ino_t	inum,		/* new inode number */
 	xfs_fsblock_t	*first,		/* bmap's firstblock */
 	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
@@ -345,9 +338,9 @@ xfs_dir_replace(
 	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
 		return rval;
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.inumber = inum;
 	args.dp = dp;
 	args.firstblock = first;
@@ -374,28 +367,29 @@ xfs_dir_replace(
 
 /*
  * See if this entry can be added to the directory without allocating space.
+ * First checks that the caller couldn't reserve enough space (resblks = 0).
  */
 int
 xfs_dir_canenter(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
-	char		*name,		/* name of entry to add */
-	int		namelen)
+	struct xfs_name	*name,		/* name of entry to add */
+	uint		resblks)
 {
 	xfs_da_args_t	args;
 	int		rval;
 	int		v;		/* type-checking value */
 
+	if (resblks)
+		return 0;
+
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+	memset(&args, 0, sizeof(xfs_da_args_t));
 
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = 0;
+	args.name = name->name;
+	args.namelen = name->len;
+	args.hashval = xfs_da_hashname(name->name, name->len);
 	args.dp = dp;
-	args.firstblock = NULL;
-	args.flist = NULL;
-	args.total = 0;
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
 	args.justcheck = args.addname = args.oknoent = 1;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index b265197e74cf..6392f939029f 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -59,6 +59,8 @@ typedef	__uint32_t	xfs_dir2_db_t;
  */
 typedef	xfs_off_t	xfs_dir2_off_t;
 
+extern struct xfs_name	xfs_name_dotdot;
+
 /*
  * Generic directory interface routines
  */
@@ -68,21 +70,21 @@ extern int xfs_dir_isempty(struct xfs_inode *dp);
 extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
 				struct xfs_inode *pdp);
 extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen, xfs_ino_t inum,
+				struct xfs_name *name, xfs_ino_t inum,
 				xfs_fsblock_t *first,
 				struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen, xfs_ino_t *inum);
+				struct xfs_name *name, xfs_ino_t *inum);
 extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen, xfs_ino_t ino,
+				struct xfs_name *name, xfs_ino_t ino,
 				xfs_fsblock_t *first,
 				struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen, xfs_ino_t inum,
+				struct xfs_name *name, xfs_ino_t inum,
 				xfs_fsblock_t *first,
 				struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
-				char *name, int namelen);
+				struct xfs_name *name, uint resblks);
 extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
 
 /*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 77b39f66cead..1ed575110ff0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -73,7 +73,7 @@ typedef int	(*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
 			struct xfs_inode *, dm_right_t,
 			struct xfs_inode *, dm_right_t,
-			char *, char *, mode_t, int, int);
+			const char *, const char *, mode_t, int, int);
 typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
 			char *, char *);
 typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
@@ -401,7 +401,7 @@ typedef struct xfs_mount {
 
 /*
  * Allow large block sizes to be reported to userspace programs if the
- * "largeio" mount option is used. 
+ * "largeio" mount option is used.
  *
  * If compatibility mode is specified, simply return the basic unit of caching
  * so that we don't get inefficient read/modify/write I/O from user apps.
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index c4d0bac56a5a..ee371890d85d 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -83,26 +83,23 @@ int xfs_rename_skip, xfs_rename_nskip;
  */
 STATIC int
 xfs_lock_for_rename(
-	xfs_inode_t	*dp1,	/* old (source) directory inode */
-	xfs_inode_t	*dp2,	/* new (target) directory inode */
-	bhv_vname_t	*vname1,/* old entry name */
-	bhv_vname_t	*vname2,/* new entry name */
-	xfs_inode_t	**ipp1,	/* inode of old entry */
-	xfs_inode_t	**ipp2,	/* inode of new entry, if it
+	xfs_inode_t	*dp1,	/* in: old (source) directory inode */
+	xfs_inode_t	*dp2,	/* in: new (target) directory inode */
+	xfs_inode_t	*ip1,	/* in: inode of old entry */
+	struct xfs_name	*name2,	/* in: new entry name */
+	xfs_inode_t	**ipp2,	/* out: inode of new entry, if it
 				   already exists, NULL otherwise. */
-	xfs_inode_t	**i_tab,/* array of inode returned, sorted */
-	int		*num_inodes)  /* number of inodes in array */
+	xfs_inode_t	**i_tab,/* out: array of inode returned, sorted */
+	int		*num_inodes)  /* out: number of inodes in array */
 {
-	xfs_inode_t		*ip1 = VNAME_TO_INODE(vname1);
-	xfs_inode_t		*ip2, *temp;
+	xfs_inode_t		*ip2 = NULL;
+	xfs_inode_t		*temp;
 	xfs_ino_t		inum1, inum2;
 	int			error;
 	int			i, j;
 	uint			lock_mode;
 	int			diff_dirs = (dp1 != dp2);
 
-	ip2 = NULL;
-
 	/*
 	 * First, find out the current inums of the entries so that we
 	 * can determine the initial locking order.  We'll have to
@@ -115,17 +112,15 @@ xfs_lock_for_rename(
 
 	inum1 = ip1->i_ino;
 
-
 	/*
 	 * Unlock dp1 and lock dp2 if they are different.
 	 */
-
 	if (diff_dirs) {
 		xfs_iunlock_map_shared(dp1, lock_mode);
 		lock_mode = xfs_ilock_map_shared(dp2);
 	}
 
-	error = xfs_dir_lookup_int(dp2, lock_mode, vname2, &inum2, &ip2);
+	error = xfs_dir_lookup_int(dp2, lock_mode, name2, &inum2, &ip2);
 	if (error == ENOENT) {		/* target does not need to exist. */
 		inum2 = 0;
 	} else if (error) {
@@ -157,6 +152,7 @@ xfs_lock_for_rename(
 		*num_inodes = 4;
 		i_tab[3] = ip2;
 	}
+	*ipp2 = i_tab[3];
 
 	/*
 	 * Sort the elements via bubble sort.  (Remember, there are at
@@ -194,21 +190,6 @@ xfs_lock_for_rename(
 		xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED);
 	}
 
-	/*
-	 * Set the return value. Null out any unused entries in i_tab.
-	 */
-	*ipp1 = *ipp2 = NULL;
-	for (i=0; i < *num_inodes; i++) {
-		if (i_tab[i]->i_ino == inum1) {
-			*ipp1 = i_tab[i];
-		}
-		if (i_tab[i]->i_ino == inum2) {
-			*ipp2 = i_tab[i];
-		}
-	}
-	for (;i < 4; i++) {
-		i_tab[i] = NULL;
-	}
 	return 0;
 }
 
@@ -218,12 +199,13 @@ xfs_lock_for_rename(
 int
 xfs_rename(
 	xfs_inode_t	*src_dp,
-	bhv_vname_t	*src_vname,
+	struct xfs_name	*src_name,
+	xfs_inode_t	*src_ip,
 	xfs_inode_t	*target_dp,
-	bhv_vname_t	*target_vname)
+	struct xfs_name	*target_name)
 {
 	xfs_trans_t	*tp;
-	xfs_inode_t	*src_ip, *target_ip;
+	xfs_inode_t	*target_ip;
 	xfs_mount_t	*mp = src_dp->i_mount;
 	int		new_parent;		/* moving to a new dir */
 	int		src_is_directory;	/* src_name is a directory */
@@ -237,10 +219,6 @@ xfs_rename(
 	int		spaceres;
 	int		target_link_zero = 0;
 	int		num_inodes;
-	char		*src_name = VNAME(src_vname);
-	char		*target_name = VNAME(target_vname);
-	int		src_namelen = VNAMELEN(src_vname);
-	int		target_namelen = VNAMELEN(target_vname);
 
 	xfs_itrace_entry(src_dp);
 	xfs_itrace_entry(target_dp);
@@ -250,7 +228,7 @@ xfs_rename(
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
 					src_dp, DM_RIGHT_NULL,
 					target_dp, DM_RIGHT_NULL,
-					src_name, target_name,
+					src_name->name, target_name->name,
 					0, 0, 0);
 		if (error) {
 			return error;
@@ -267,10 +245,8 @@ xfs_rename(
 	 * does not exist in the source directory.
 	 */
 	tp = NULL;
-	error = xfs_lock_for_rename(src_dp, target_dp, src_vname,
-			target_vname, &src_ip, &target_ip, inodes,
-			&num_inodes);
-
+	error = xfs_lock_for_rename(src_dp, target_dp, src_ip, target_name,
+					&target_ip, inodes, &num_inodes);
 	if (error) {
 		/*
 		 * We have nothing locked, no inode references, and
@@ -316,7 +292,7 @@ xfs_rename(
 	XFS_BMAP_INIT(&free_list, &first_block);
 	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen);
+	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
 	error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
 			XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
 	if (error == ENOSPC) {
@@ -374,9 +350,8 @@ xfs_rename(
 		 * If there's no space reservation, check the entry will
 		 * fit before actually inserting it.
 		 */
-		if (spaceres == 0 &&
-		    (error = xfs_dir_canenter(tp, target_dp, target_name,
-						target_namelen)))
+		error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
+		if (error)
 			goto error_return;
 		/*
 		 * If target does not exist and the rename crosses
@@ -384,8 +359,8 @@ xfs_rename(
 		 * to account for the ".." reference from the new entry.
 		 */
 		error = xfs_dir_createname(tp, target_dp, target_name,
-					   target_namelen, src_ip->i_ino,
-					   &first_block, &free_list, spaceres);
+						src_ip->i_ino, &first_block,
+						&free_list, spaceres);
 		if (error == ENOSPC)
 			goto error_return;
 		if (error)
@@ -424,7 +399,7 @@ xfs_rename(
 		 * name at the destination directory, remove it first.
 		 */
 		error = xfs_dir_replace(tp, target_dp, target_name,
-					target_namelen, src_ip->i_ino,
+					src_ip->i_ino,
 					&first_block, &free_list, spaceres);
 		if (error)
 			goto abort_return;
@@ -461,7 +436,8 @@ xfs_rename(
 		 * Rewrite the ".." entry to point to the new
 		 * directory.
 		 */
-		error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino,
+		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+					target_dp->i_ino,
 					&first_block, &free_list, spaceres);
 		ASSERT(error != EEXIST);
 		if (error)
@@ -497,8 +473,8 @@ xfs_rename(
 			goto abort_return;
 	}
 
-	error = xfs_dir_removename(tp, src_dp, src_name, src_namelen,
-			src_ip->i_ino, &first_block, &free_list, spaceres);
+	error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+					&first_block, &free_list, spaceres);
 	if (error)
 		goto abort_return;
 	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -583,7 +559,7 @@ std_return:
 		(void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
 					src_dp, DM_RIGHT_NULL,
 					target_dp, DM_RIGHT_NULL,
-					src_name, target_name,
+					src_name->name, target_name->name,
 					0, error, 0);
 	}
 	return error;
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 5c89be475464..0f5191644ab2 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -160,4 +160,9 @@ typedef enum {
 	XFS_BTNUM_MAX
 } xfs_btnum_t;
 
+struct xfs_name {
+	const char	*name;
+	int		len;
+};
+
 #endif	/* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 47c45ff4a067..2b8dc7e40772 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -45,7 +45,7 @@ int
 xfs_dir_lookup_int(
 	xfs_inode_t	*dp,
 	uint		lock_mode,
-	bhv_vname_t	*dentry,
+	struct xfs_name	*name,
 	xfs_ino_t	*inum,
 	xfs_inode_t	**ipp)
 {
@@ -53,7 +53,7 @@ xfs_dir_lookup_int(
 
 	xfs_itrace_entry(dp);
 
-	error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum);
+	error = xfs_dir_lookup(NULL, dp, name, inum);
 	if (!error) {
 		/*
 		 * Unlock the directory. We do this because we can't
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index 701accbbaea1..175b126d2cab 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -21,8 +21,8 @@
 #define IRELE(ip)	VN_RELE(XFS_ITOV(ip))
 #define IHOLD(ip)	VN_HOLD(XFS_ITOV(ip))
 
-extern int xfs_dir_lookup_int(xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
-				xfs_inode_t **);
+extern int xfs_dir_lookup_int(xfs_inode_t *, uint, struct xfs_name *,
+				xfs_ino_t *, xfs_inode_t **);
 extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
 				xfs_dev_t, cred_t *, prid_t, int,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index bc0a4707189a..ca38fb9a9937 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1764,7 +1764,7 @@ xfs_inactive(
 int
 xfs_lookup(
 	xfs_inode_t		*dp,
-	bhv_vname_t		*dentry,
+	struct xfs_name		*name,
 	xfs_inode_t		**ipp)
 {
 	xfs_inode_t		*ip;
@@ -1778,7 +1778,7 @@ xfs_lookup(
 		return XFS_ERROR(EIO);
 
 	lock_mode = xfs_ilock_map_shared(dp);
-	error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
+	error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip);
 	if (!error) {
 		*ipp = ip;
 		xfs_itrace_ref(ip);
@@ -1790,17 +1790,16 @@ xfs_lookup(
 int
 xfs_create(
 	xfs_inode_t		*dp,
-	bhv_vname_t		*dentry,
+	struct xfs_name		*name,
 	mode_t			mode,
 	xfs_dev_t		rdev,
 	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	char			*name = VNAME(dentry);
-	xfs_mount_t	        *mp = dp->i_mount;
+	xfs_mount_t		*mp = dp->i_mount;
 	xfs_inode_t		*ip;
 	xfs_trans_t		*tp;
-	int                     error;
+	int			error;
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	boolean_t		unlock_dp_on_error = B_FALSE;
@@ -1810,17 +1809,14 @@ xfs_create(
 	xfs_prid_t		prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
-	int			namelen;
 
 	ASSERT(!*ipp);
 	xfs_itrace_entry(dp);
 
-	namelen = VNAMELEN(dentry);
-
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
 				dp, DM_RIGHT_NULL, NULL,
-				DM_RIGHT_NULL, name, NULL,
+				DM_RIGHT_NULL, name->name, NULL,
 				mode, 0, 0);
 
 		if (error)
@@ -1852,7 +1848,7 @@ xfs_create(
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_CREATE_SPACE_RES(mp, namelen);
+	resblks = XFS_CREATE_SPACE_RES(mp, name->len);
 	/*
 	 * Initially assume that the file does not exist and
 	 * reserve the resources for that case.  If that is not
@@ -1885,7 +1881,8 @@ xfs_create(
 	if (error)
 		goto error_return;
 
-	if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
+	error = xfs_dir_canenter(tp, dp, name, resblks);
+	if (error)
 		goto error_return;
 	error = xfs_dir_ialloc(&tp, dp, mode, 1,
 			rdev, credp, prid, resblks > 0,
@@ -1915,7 +1912,7 @@ xfs_create(
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
-	error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
+	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
 					&first_block, &free_list, resblks ?
 					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
@@ -1976,7 +1973,7 @@ std_return:
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
 			dp, DM_RIGHT_NULL,
 			*ipp ? ip : NULL,
-			DM_RIGHT_NULL, name, NULL,
+			DM_RIGHT_NULL, name->name, NULL,
 			mode, error, 0);
 	}
 	return error;
@@ -2268,12 +2265,10 @@ int remove_which_error_return = 0;
 int
 xfs_remove(
 	xfs_inode_t             *dp,
-	bhv_vname_t		*dentry)
+	struct xfs_name		*name,
+	xfs_inode_t		*ip)
 {
-	char			*name = VNAME(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
-	xfs_inode_t             *ip = VNAME_TO_INODE(dentry);
-	int			namelen = VNAMELEN(dentry);
 	xfs_trans_t             *tp = NULL;
 	int                     error = 0;
 	xfs_bmap_free_t         free_list;
@@ -2289,9 +2284,9 @@ xfs_remove(
 		return XFS_ERROR(EIO);
 
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp,
-					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					name, NULL, ip->i_d.di_mode, 0, 0);
+		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
+					NULL, DM_RIGHT_NULL, name->name, NULL,
+					ip->i_d.di_mode, 0, 0);
 		if (error)
 			return error;
 	}
@@ -2376,7 +2371,7 @@ xfs_remove(
 	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
 	 */
 	XFS_BMAP_INIT(&free_list, &first_block);
-	error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
+	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
 					&first_block, &free_list, 0);
 	if (error) {
 		ASSERT(error != ENOENT);
@@ -2444,7 +2439,7 @@ xfs_remove(
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
 				dp, DM_RIGHT_NULL,
 				NULL, DM_RIGHT_NULL,
-				name, NULL, ip->i_d.di_mode, error, 0);
+				name->name, NULL, ip->i_d.di_mode, error, 0);
 	}
 	return error;
 
@@ -2474,7 +2469,7 @@ int
 xfs_link(
 	xfs_inode_t		*tdp,
 	xfs_inode_t		*sip,
-	bhv_vname_t		*dentry)
+	struct xfs_name		*target_name)
 {
 	xfs_mount_t		*mp = tdp->i_mount;
 	xfs_trans_t		*tp;
@@ -2485,13 +2480,10 @@ xfs_link(
 	int			cancel_flags;
 	int			committed;
 	int			resblks;
-	char			*target_name = VNAME(dentry);
-	int			target_namelen;
 
 	xfs_itrace_entry(tdp);
 	xfs_itrace_entry(sip);
 
-	target_namelen = VNAMELEN(dentry);
 	ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
 	if (XFS_FORCED_SHUTDOWN(mp))
@@ -2501,7 +2493,7 @@ xfs_link(
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
 					tdp, DM_RIGHT_NULL,
 					sip, DM_RIGHT_NULL,
-					target_name, NULL, 0, 0, 0);
+					target_name->name, NULL, 0, 0, 0);
 		if (error)
 			return error;
 	}
@@ -2516,7 +2508,7 @@ xfs_link(
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
+	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
 	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
 			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
 	if (error == ENOSPC) {
@@ -2568,15 +2560,14 @@ xfs_link(
 		goto error_return;
 	}
 
-	if (resblks == 0 &&
-	    (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
+	error = xfs_dir_canenter(tp, tdp, target_name, resblks);
+	if (error)
 		goto error_return;
 
 	XFS_BMAP_INIT(&free_list, &first_block);
 
-	error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
-				   sip->i_ino, &first_block, &free_list,
-				   resblks);
+	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+					&first_block, &free_list, resblks);
 	if (error)
 		goto abort_return;
 	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2612,7 +2603,7 @@ std_return:
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
 				tdp, DM_RIGHT_NULL,
 				sip, DM_RIGHT_NULL,
-				target_name, NULL, 0, error, 0);
+				target_name->name, NULL, 0, error, 0);
 	}
 	return error;
 
@@ -2629,13 +2620,11 @@ std_return:
 int
 xfs_mkdir(
 	xfs_inode_t             *dp,
-	bhv_vname_t		*dentry,
+	struct xfs_name		*dir_name,
 	mode_t			mode,
 	xfs_inode_t		**ipp,
 	cred_t			*credp)
 {
-	char			*dir_name = VNAME(dentry);
-	int			dir_namelen = VNAMELEN(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_inode_t		*cdp;	/* inode of created dir */
 	xfs_trans_t		*tp;
@@ -2659,7 +2648,7 @@ xfs_mkdir(
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
 					dp, DM_RIGHT_NULL, NULL,
-					DM_RIGHT_NULL, dir_name, NULL,
+					DM_RIGHT_NULL, dir_name->name, NULL,
 					mode, 0, 0);
 		if (error)
 			return error;
@@ -2688,7 +2677,7 @@ xfs_mkdir(
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
+	resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
 	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
 				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
 	if (error == ENOSPC) {
@@ -2720,8 +2709,8 @@ xfs_mkdir(
 	if (error)
 		goto error_return;
 
-	if (resblks == 0 &&
-	    (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
+	error = xfs_dir_canenter(tp, dp, dir_name, resblks);
+	if (error)
 		goto error_return;
 	/*
 	 * create the directory inode.
@@ -2750,9 +2739,9 @@ xfs_mkdir(
 
 	XFS_BMAP_INIT(&free_list, &first_block);
 
-	error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
-				   &first_block, &free_list, resblks ?
-				   resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
+					&first_block, &free_list, resblks ?
+					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != ENOSPC);
 		goto error1;
@@ -2817,7 +2806,7 @@ std_return:
 					dp, DM_RIGHT_NULL,
 					created ? cdp : NULL,
 					DM_RIGHT_NULL,
-					dir_name, NULL,
+					dir_name->name, NULL,
 					mode, error, 0);
 	}
 	return error;
@@ -2841,13 +2830,11 @@ std_return:
 int
 xfs_rmdir(
 	xfs_inode_t             *dp,
-	bhv_vname_t		*dentry)
+	struct xfs_name		*name,
+	xfs_inode_t		*cdp)
 {
 	bhv_vnode_t		*dir_vp = XFS_ITOV(dp);
-	char			*name = VNAME(dentry);
-	int			namelen = VNAMELEN(dentry);
 	xfs_mount_t		*mp = dp->i_mount;
-  	xfs_inode_t             *cdp = VNAME_TO_INODE(dentry);
 	xfs_trans_t             *tp;
 	int                     error;
 	xfs_bmap_free_t         free_list;
@@ -2865,8 +2852,8 @@ xfs_rmdir(
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
 					dp, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL,
-					name, NULL, cdp->i_d.di_mode, 0, 0);
+					NULL, DM_RIGHT_NULL, name->name,
+					NULL, cdp->i_d.di_mode, 0, 0);
 		if (error)
 			return XFS_ERROR(error);
 	}
@@ -2960,7 +2947,7 @@ xfs_rmdir(
 		goto error_return;
 	}
 
-	error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
+	error = xfs_dir_removename(tp, dp, name, cdp->i_ino,
 					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
@@ -3040,7 +3027,7 @@ xfs_rmdir(
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
 					dp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
-					name, NULL, cdp->i_d.di_mode,
+					name->name, NULL, cdp->i_d.di_mode,
 					error, 0);
 	}
 	return error;
@@ -3058,8 +3045,8 @@ xfs_rmdir(
 int
 xfs_symlink(
 	xfs_inode_t		*dp,
-	bhv_vname_t		*dentry,
-	char			*target_path,
+	struct xfs_name		*link_name,
+	const char		*target_path,
 	mode_t			mode,
 	xfs_inode_t		**ipp,
 	cred_t			*credp)
@@ -3079,15 +3066,13 @@ xfs_symlink(
 	int			nmaps;
 	xfs_bmbt_irec_t		mval[SYMLINK_MAPS];
 	xfs_daddr_t		d;
-	char			*cur_chunk;
+	const char		*cur_chunk;
 	int			byte_cnt;
 	int			n;
 	xfs_buf_t		*bp;
 	xfs_prid_t		prid;
 	struct xfs_dquot	*udqp, *gdqp;
 	uint			resblks;
-	char			*link_name = VNAME(dentry);
-	int			link_namelen;
 
 	*ipp = NULL;
 	error = 0;
@@ -3099,8 +3084,6 @@ xfs_symlink(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	link_namelen = VNAMELEN(dentry);
-
 	/*
 	 * Check component lengths of the target path name.
 	 */
@@ -3111,7 +3094,7 @@ xfs_symlink(
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					link_name, target_path, 0, 0, 0);
+					link_name->name, target_path, 0, 0, 0);
 		if (error)
 			return error;
 	}
@@ -3143,7 +3126,7 @@ xfs_symlink(
 		fs_blocks = 0;
 	else
 		fs_blocks = XFS_B_TO_FSB(mp, pathlen);
-	resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
+	resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
 	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
 			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
 	if (error == ENOSPC && fs_blocks == 0) {
@@ -3177,8 +3160,8 @@ xfs_symlink(
 	/*
 	 * Check for ability to enter directory entry, if no space reserved.
 	 */
-	if (resblks == 0 &&
-	    (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
+	error = xfs_dir_canenter(tp, dp, link_name, resblks);
+	if (error)
 		goto error_return;
 	/*
 	 * Initialize the bmap freelist prior to calling either
@@ -3270,8 +3253,8 @@ xfs_symlink(
 	/*
 	 * Create the directory entry for the symlink.
 	 */
-	error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
-				   &first_block, &free_list, resblks);
+	error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
+					&first_block, &free_list, resblks);
 	if (error)
 		goto error1;
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3315,8 +3298,8 @@ std_return:
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
 					dp, DM_RIGHT_NULL,
 					error ? NULL : ip,
-					DM_RIGHT_NULL, link_name, target_path,
-					0, error, 0);
+					DM_RIGHT_NULL, link_name->name,
+					target_path, 0, error, 0);
 	}
 
 	if (!error)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 12e581865bdf..24c53923dc2c 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -23,20 +23,22 @@ int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
 		xfs_off_t stop);
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
+int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode **ipp);
-int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
+int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
 		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
-int xfs_remove(struct xfs_inode *dp, bhv_vname_t	*dentry);
+int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
+		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
-		bhv_vname_t *dentry);
-int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry,
+		struct xfs_name *target_name);
+int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
 		mode_t mode, struct xfs_inode **ipp, struct cred *credp);
-int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry);
+int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name,
+		struct xfs_inode *cdp);
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
-int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry,
-		char *target_path, mode_t mode, struct xfs_inode **ipp,
+int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
+		const char *target_path, mode_t mode, struct xfs_inode **ipp,
 		struct cred *credp);
 int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
@@ -44,8 +46,9 @@ int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset,
 		struct cred *credp, int	attr_flags);
-int xfs_rename(struct xfs_inode *src_dp, bhv_vname_t *src_vname,
-		struct xfs_inode *target_dp, bhv_vname_t *target_vname);
+int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+		struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+		struct xfs_name *target_name);
 int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
 		int *valuelenp, int flags, cred_t *cred);
 int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
-- 
cgit v1.2.3


From d64e31a2f53cdcb2f95b782196faacb0995ca0c0 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:22:17 +1000
Subject: [XFS] Ensure errors from xfs_bdstrat() are correctly checked.

xfsbdstrat() is declared to return an error. That is never checked because
the error is propagated by the xfs_buf_t that is passed through the
function.

Mark xfsbdstrat() as returning void and comment the prototype on the
methods needed for error checking.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30823a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 19 ++++++-------------
 fs/xfs/linux-2.6/xfs_lrw.h |  3 ++-
 fs/xfs/xfs_log_recover.c   |  6 ++++--
 fs/xfs/xfs_mount.c         |  1 -
 fs/xfs/xfs_trans_buf.c     | 12 +++++-------
 fs/xfs/xfs_vnodeops.c      |  6 ++++--
 6 files changed, 21 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 1d95dca96cfe..f6dab5d8944e 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -875,28 +875,21 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
 }
 
 /*
- * Wrapper around bdstrat so that we can stop data
- * from going to disk in case we are shutting down the filesystem.
- * Typically user data goes thru this path; one of the exceptions
- * is the superblock.
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem.  Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
  */
-int
+void
 xfsbdstrat(
 	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
 	ASSERT(mp);
-	if (!XFS_FORCED_SHUTDOWN(mp)) {
-		/* Grio redirection would go here
-		 * if (XFS_BUF_IS_GRIO(bp)) {
-		 */
-
+	if (!XFS_FORCED_SHUTDOWN(mp))
 		xfs_buf_iorequest(bp);
-		return 0;
-	}
 
 	xfs_buftrace("XFSBDSTRAT IOERROR", bp);
-	return (xfs_bioerror_relse(bp));
+	xfs_bioerror_relse(bp);
 }
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index e200253139cf..e1d498b4ba7a 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -68,7 +68,8 @@ extern void xfs_inval_cached_trace(struct xfs_inode *,
 #define xfs_inval_cached_trace(ip, offset, len, first, last)
 #endif
 
-extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
+/* errors from xfsbdstrat() must be extracted from the buffer */
+extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
 extern int xfs_bdstrat_cb(struct xfs_buf *);
 extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 3a8fe7bfa2af..1f83298f90aa 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -121,7 +121,8 @@ xlog_bread(
 	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 
 	xfsbdstrat(log->l_mp, bp);
-	if ((error = xfs_iowait(bp)))
+	error = xfs_iowait(bp);
+	if (error)
 		xfs_ioerror_alert("xlog_bread", log->l_mp,
 				  bp, XFS_BUF_ADDR(bp));
 	return error;
@@ -3849,7 +3850,8 @@ xlog_do_recover(
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
 	xfsbdstrat(log->l_mp, bp);
-	if ((error = xfs_iowait(bp))) {
+	error = xfs_iowait(bp);
+	if (error) {
 		xfs_ioerror_alert("xlog_do_recover",
 				  log->l_mp, bp, XFS_BUF_ADDR(bp));
 		ASSERT(0);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2d03fe194c2c..2fec452afbcc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1470,7 +1470,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 		XFS_BUF_UNASYNC(sbp);
 		ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
 		xfsbdstrat(mp, sbp);
-		/* Nevermind errors we might get here. */
 		error = xfs_iowait(sbp);
 		if (error)
 			xfs_ioerror_alert("xfs_unmountfs_writesb",
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 4e5c010f5040..cb0c5839154b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -354,17 +354,15 @@ xfs_trans_read_buf(
 			ASSERT(!XFS_BUF_ISASYNC(bp));
 			XFS_BUF_READ(bp);
 			xfsbdstrat(tp->t_mountp, bp);
-			xfs_iowait(bp);
-			if (XFS_BUF_GETERROR(bp) != 0) {
+			error = xfs_iowait(bp);
+			if (error) {
 				xfs_ioerror_alert("xfs_trans_read_buf", mp,
 						  bp, blkno);
-				error = XFS_BUF_GETERROR(bp);
 				xfs_buf_relse(bp);
 				/*
-				 * We can gracefully recover from most
-				 * read errors. Ones we can't are those
-				 * that happen after the transaction's
-				 * already dirty.
+				 * We can gracefully recover from most read
+				 * errors. Ones we can't are those that happen
+				 * after the transaction's already dirty.
 				 */
 				if (tp->t_flags & XFS_TRANS_DIRTY)
 					xfs_force_shutdown(tp->t_mountp,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ca38fb9a9937..dd4621e0ab3b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3825,7 +3825,8 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
-		if ((error = xfs_iowait(bp))) {
+		error = xfs_iowait(bp);
+		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
 					  mp, bp, XFS_BUF_ADDR(bp));
 			break;
@@ -3837,7 +3838,8 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
 		xfsbdstrat(mp, bp);
-		if ((error = xfs_iowait(bp))) {
+		error = xfs_iowait(bp);
+		if (error) {
 			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
 					  mp, bp, XFS_BUF_ADDR(bp));
 			break;
-- 
cgit v1.2.3


From db7a19f2c89d99b66874a7e0c0dc681ff1f37b4e Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:22:24 +1000
Subject: [XFS] Ensure xfs_bawrite() errors are checked.

xfs_bawrite() can return immediate error status on async writes. Unlike
xfsbdstrat() we don't ever check the error on the buffer after the call,
so we currently do not catch errors at all here. Ensure we catch and
propagate or warn to the syslog about up-front async write errors.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30824a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_dquot.c      | 10 ++++++++--
 fs/xfs/quota/xfs_dquot_item.c |  7 ++++++-
 fs/xfs/xfs_buf_item.c         |  7 ++++++-
 fs/xfs/xfs_inode.c            |  2 +-
 fs/xfs/xfs_inode_item.c       |  8 +++++++-
 5 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 15214fbb9aa7..631ebb31b295 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1291,7 +1291,7 @@ xfs_qm_dqflush(
 	if (flags & XFS_QMOPT_DELWRI) {
 		xfs_bdwrite(mp, bp);
 	} else if (flags & XFS_QMOPT_ASYNC) {
-		xfs_bawrite(mp, bp);
+		error = xfs_bawrite(mp, bp);
 	} else {
 		error = xfs_bwrite(mp, bp);
 	}
@@ -1582,12 +1582,18 @@ xfs_qm_dqflock_pushbuf_wait(
 		    XFS_INCORE_TRYLOCK);
 	if (bp != NULL) {
 		if (XFS_BUF_ISDELAYWRITE(bp)) {
+			int	error;
 			if (XFS_BUF_ISPINNED(bp)) {
 				xfs_log_force(dqp->q_mount,
 					      (xfs_lsn_t)0,
 					      XFS_LOG_FORCE);
 			}
-			xfs_bawrite(dqp->q_mount, bp);
+			error = xfs_bawrite(dqp->q_mount, bp);
+			if (error)
+				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+					"xfs_qm_dqflock_pushbuf_wait: "
+					"pushbuf error %d on dqp %p, bp %p",
+					error, dqp, bp);
 		} else {
 			xfs_buf_relse(bp);
 		}
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 3dedce1d9cde..36e05ca78412 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -267,11 +267,16 @@ xfs_qm_dquot_logitem_pushbuf(
 					      XFS_LOG_FORCE);
 			}
 			if (dopush) {
+				int	error;
 #ifdef XFSRACEDEBUG
 				delay_for_intr();
 				delay(300);
 #endif
-				xfs_bawrite(mp, bp);
+				error = xfs_bawrite(mp, bp);
+				if (error)
+					xfs_fs_cmn_err(CE_WARN, mp,
+	"xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
+							error, qip, bp);
 			} else {
 				xfs_buf_relse(bp);
 			}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 63debd147eb5..53a71c62025d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -645,7 +645,12 @@ xfs_buf_item_push(
 	bp = bip->bli_buf;
 
 	if (XFS_BUF_ISDELAYWRITE(bp)) {
-		xfs_bawrite(bip->bli_item.li_mountp, bp);
+		int	error;
+		error = xfs_bawrite(bip->bli_item.li_mountp, bp);
+		if (error)
+			xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
+			"xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
+					error, bip, bp);
 	} else {
 		xfs_buf_relse(bp);
 	}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 63e66890f063..ca074ee01d06 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3276,7 +3276,7 @@ xfs_iflush(
 	if (flags & INT_DELWRI) {
 		xfs_bdwrite(mp, bp);
 	} else if (flags & INT_ASYNC) {
-		xfs_bawrite(mp, bp);
+		error = xfs_bawrite(mp, bp);
 	} else {
 		error = xfs_bwrite(mp, bp);
 	}
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 2c775b4ae9e6..93b5db453ea2 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -40,6 +40,7 @@
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_rw.h"
+#include "xfs_error.h"
 
 
 kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
@@ -813,7 +814,12 @@ xfs_inode_item_pushbuf(
 					      XFS_LOG_FORCE);
 			}
 			if (dopush) {
-				xfs_bawrite(mp, bp);
+				int	error;
+				error = xfs_bawrite(mp, bp);
+				if (error)
+					xfs_fs_cmn_err(CE_WARN, mp,
+		"xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
+							error, iip, bp);
 			} else {
 				xfs_buf_relse(bp);
 			}
-- 
cgit v1.2.3


From 958d4ec606d4af590f86a601a238613f21e878ee Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:23:46 +1000
Subject: [XFS] xfs_bdwrite() does not return errors.

xfs_bdwrite() cannot return an error; it only queues buffers to the
delayed write list and as such never encounters anything that can fail.
Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30825a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 2 +-
 fs/xfs/linux-2.6/xfs_buf.h | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 142ddbece374..52f6846101d5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1060,7 +1060,7 @@ xfs_buf_iostart(
 		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
 		bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
 		xfs_buf_delwri_queue(bp, 1);
-		return status;
+		return 0;
 	}
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a3d207de48b8..841d7883528d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -387,11 +387,15 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
 	return error;
 }
 
-static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
+/*
+ * No error can be returned from xfs_buf_iostart for delwri
+ * buffers as they are queued and no I/O is issued.
+ */
+static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
 {
 	bp->b_strat = xfs_bdstrat_cb;
 	bp->b_fspriv3 = mp;
-	return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
+	(void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
 }
 
 #define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
-- 
cgit v1.2.3


From cc88466f3f67bb16fc91b0b974e51c2a43a9e597 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:23:52 +1000
Subject: [XFS] Catch unwritten extent conversion errors.

On unwritten I/O completion, we fail to propagate an error when converting
the extent to a written extent. This means that the I/O silently fails.
propagate the error onto the ioend so that the inode is marked with an
error appropriately.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30826a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 169e6c062794..a55c3b26d840 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -243,8 +243,12 @@ xfs_end_bio_unwritten(
 	size_t			size = ioend->io_size;
 
 	if (likely(!ioend->io_error)) {
-		if (!XFS_FORCED_SHUTDOWN(ip->i_mount))
-			xfs_iomap_write_unwritten(ip, offset, size);
+		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+			int error;
+			error = xfs_iomap_write_unwritten(ip, offset, size);
+			if (error)
+				ioend->io_error = error;
+		}
 		xfs_setfilesize(ioend);
 	}
 	xfs_destroy_ioend(ioend);
-- 
cgit v1.2.3


From e4ac967b117c5780760abbd9ae996210c31cb398 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:23:58 +1000
Subject: [XFS] xfs_iflush_fork() never returns an error.

xfs_iflush_fork() never returns an error. Mark it void and clean up the
code calling it that checks for errors.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30827a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_inode.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ca074ee01d06..2bc22790d65a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2883,7 +2883,7 @@ xfs_iextents_copy(
  * format indicates the current state of the fork.
  */
 /*ARGSUSED*/
-STATIC int
+STATIC void
 xfs_iflush_fork(
 	xfs_inode_t		*ip,
 	xfs_dinode_t		*dip,
@@ -2904,16 +2904,16 @@ xfs_iflush_fork(
 	static const short	extflag[2] =
 		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
 
-	if (iip == NULL)
-		return 0;
+	if (!iip)
+		return;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	/*
 	 * This can happen if we gave up in iformat in an error path,
 	 * for the attribute fork.
 	 */
-	if (ifp == NULL) {
+	if (!ifp) {
 		ASSERT(whichfork == XFS_ATTR_FORK);
-		return 0;
+		return;
 	}
 	cp = XFS_DFORK_PTR(dip, whichfork);
 	mp = ip->i_mount;
@@ -2974,8 +2974,6 @@ xfs_iflush_fork(
 		ASSERT(0);
 		break;
 	}
-
-	return 0;
 }
 
 STATIC int
@@ -3452,16 +3450,9 @@ xfs_iflush_int(
 		}
 	}
 
-	if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
-		goto corrupt_out;
-	}
-
-	if (XFS_IFORK_Q(ip)) {
-		/*
-		 * The only error from xfs_iflush_fork is on the data fork.
-		 */
-		(void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
-	}
+	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
+	if (XFS_IFORK_Q(ip))
+		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
 	xfs_inobp_check(mp, bp);
 
 	/*
-- 
cgit v1.2.3


From 7b07339048f7b020575706b492c004b5664b67ab Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:04 +1000
Subject: [XFS] xfs_bulkstat_one_dinode() never returns an error.

Mark it void.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30828a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_itable.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 45d8776408ef..eb85bdedad0c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -129,7 +129,7 @@ xfs_bulkstat_one_iget(
 	return error;
 }
 
-STATIC int
+STATIC void
 xfs_bulkstat_one_dinode(
 	xfs_mount_t	*mp,		/* mount point for filesystem */
 	xfs_ino_t	ino,		/* inode number to get data for */
@@ -198,8 +198,6 @@ xfs_bulkstat_one_dinode(
 		buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
 		break;
 	}
-
-	return 0;
 }
 
 STATIC int
-- 
cgit v1.2.3


From 64bfe1bfae833e89ed77f72c61ded19f4b1976f8 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:10 +1000
Subject: [XFS] Catch errors from xfs_imap().

Catch errors from xfs_imap() in log recovery when we might be trying to
map an invalid inode number due to a corrupted log.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30829a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1f83298f90aa..a8039431b86f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2282,7 +2282,9 @@ xlog_recover_do_inode_trans(
 		 * invalidate the buffer when we write it out below.
 		 */
 		imap.im_blkno = 0;
-		xfs_imap(log->l_mp, NULL, ino, &imap, 0);
+		error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
+		if (error)
+			goto error;
 	}
 
 	/*
-- 
cgit v1.2.3


From 78e9da77f1bf265fe750b9223ec15707473fb6e8 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:17 +1000
Subject: [XFS] Don't allow silent errors in xfs_inactive().

xfs_inactive() fails to report errors when committing the inactive
transaction. Hence we can get silent failures either finishing off the
truncation or committing the transaction. Even if we get errors, we need
to continue, so simply warn loudly to the system if we get errors here.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30830a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index dd4621e0ab3b..6650601c64f7 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1743,11 +1743,18 @@ xfs_inactive(
 		XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 
 		/*
-		 * Just ignore errors at this point.  There is
-		 * nothing we can do except to try to keep going.
+		 * Just ignore errors at this point.  There is nothing we can
+		 * do except to try to keep going. Make sure it's not a silent
+		 * error.
 		 */
-		(void) xfs_bmap_finish(&tp,  &free_list, &committed);
-		(void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		error = xfs_bmap_finish(&tp,  &free_list, &committed);
+		if (error)
+			xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
+				"xfs_bmap_finish() returned error %d", error);
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+		if (error)
+			xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
+				"xfs_trans_commit() returned error %d", error);
 	}
 	/*
 	 * Release the dquots held by inode, if any.
-- 
cgit v1.2.3


From 234f56aca20a4f66b6ba3d3bf2787634dd9e0999 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:24 +1000
Subject: [XFS] Check for errors when changing buffer pointers.

xfs_buf_associate_memory() can fail, but the return is never checked.
Propagate the error through XFS_BUF_SET_PTR() so that failures are
detected.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30831a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a8039431b86f..e65ab4af0955 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1162,10 +1162,14 @@ xlog_write_log_records(
 		if (j == 0 && (start_block + endcount > ealign)) {
 			offset = XFS_BUF_PTR(bp);
 			balign = BBTOB(ealign - start_block);
-			XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb));
-			if ((error = xlog_bread(log, ealign, sectbb, bp)))
+			error = XFS_BUF_SET_PTR(bp, offset + balign,
+						BBTOB(sectbb));
+			if (!error)
+				error = xlog_bread(log, ealign, sectbb, bp);
+			if (!error)
+				error = XFS_BUF_SET_PTR(bp, offset, bufblks);
+			if (error)
 				break;
-			XFS_BUF_SET_PTR(bp, offset, bufblks);
 		}
 
 		offset = xlog_align(log, start_block, endcount, bp);
@@ -3630,15 +3634,19 @@ xlog_do_recovery_pass(
 				 *   _first_, then the log start (LR header end)
 				 *   - order is important.
 				 */
+				wrapped_hblks = hblks - split_hblks;
 				bufaddr = XFS_BUF_PTR(hbp);
-				XFS_BUF_SET_PTR(hbp,
+				error = XFS_BUF_SET_PTR(hbp,
 						bufaddr + BBTOB(split_hblks),
 						BBTOB(hblks - split_hblks));
-				wrapped_hblks = hblks - split_hblks;
-				error = xlog_bread(log, 0, wrapped_hblks, hbp);
+				if (!error)
+					error = xlog_bread(log, 0,
+							wrapped_hblks, hbp);
+				if (!error)
+					error = XFS_BUF_SET_PTR(hbp, bufaddr,
+							BBTOB(hblks));
 				if (error)
 					goto bread_err2;
-				XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
 				if (!offset)
 					offset = xlog_align(log, 0,
 							wrapped_hblks, hbp);
@@ -3690,13 +3698,18 @@ xlog_do_recovery_pass(
 				 *   - order is important.
 				 */
 				bufaddr = XFS_BUF_PTR(dbp);
-				XFS_BUF_SET_PTR(dbp,
+				error = XFS_BUF_SET_PTR(dbp,
 						bufaddr + BBTOB(split_bblks),
 						BBTOB(bblks - split_bblks));
-				if ((error = xlog_bread(log, wrapped_hblks,
-						bblks - split_bblks, dbp)))
+				if (!error)
+					error = xlog_bread(log, wrapped_hblks,
+							bblks - split_bblks,
+							dbp);
+				if (!error)
+					error = XFS_BUF_SET_PTR(dbp, bufaddr,
+							h_size);
+				if (error)
 					goto bread_err2;
-				XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
 				if (!offset)
 					offset = xlog_align(log, wrapped_hblks,
 						bblks - split_bblks, dbp);
-- 
cgit v1.2.3


From b911ca0472c3762d2bafc4d21e432a9056844064 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:30 +1000
Subject: [XFS] Sanitise xfs_log_force error checking.

xfs_log_force() is declared to return an error, but we almost never check
it. We don't need to check it in most cases; if there's a log I/O error
then we'll be shutting down the filesystem anyway and that means we'll
catch the error somewhere else.

However, on certain calls we should be returning an error - sync
transactions, fsync, sync writes, etc. so this isn't a pure black and
white distinction. Hence make xfs_log_force() a void function that issues
a warning to the syslog on error, and call _xfs_log_force() in all the
places where we actually care about the error status returned.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30832a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c | 25 +++++++++++++++++++++++--
 fs/xfs/xfs_log.h |  5 +++--
 fs/xfs/xfs_rw.c  |  8 ++++----
 3 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bece882f99ec..e29ea0a6d767 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -382,7 +382,27 @@ _xfs_log_force(
 		return xlog_state_sync_all(log, flags, log_flushed);
 	else
 		return xlog_state_sync(log, lsn, flags, log_flushed);
-}	/* xfs_log_force */
+}	/* _xfs_log_force */
+
+/*
+ * Wrapper for _xfs_log_force(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force(
+	xfs_mount_t	*mp,
+	xfs_lsn_t	lsn,
+	uint		flags)
+{
+	int	error;
+	error = _xfs_log_force(mp, lsn, flags, NULL);
+	if (error) {
+		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
+			"error %d returned.", error);
+	}
+}
+
 
 /*
  * Attaches a new iclog I/O completion callback routine during
@@ -634,7 +654,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return 0;
 
-	xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL);
+	ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
 
 #ifdef DEBUG
 	first_iclog = iclog = log->l_iclog;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 4cdac048df5e..d1d678ecb63e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -142,8 +142,9 @@ int	  _xfs_log_force(struct xfs_mount *mp,
 			 xfs_lsn_t	lsn,
 			 uint		flags,
 			 int		*log_forced);
-#define xfs_log_force(mp, lsn, flags) \
-	_xfs_log_force(mp, lsn, flags, NULL);
+void	  xfs_log_force(struct xfs_mount	*mp,
+			xfs_lsn_t		lsn,
+			uint			flags);
 int	  xfs_log_mount(struct xfs_mount	*mp,
 			struct xfs_buftarg	*log_target,
 			xfs_daddr_t		start_block,
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index cd3ece6cc918..b0f31c09a76d 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -126,11 +126,11 @@ xfs_write_sync_logforce(
 		 * when we return.
 		 */
 		if (iip && iip->ili_last_lsn) {
-			xfs_log_force(mp, iip->ili_last_lsn,
-					XFS_LOG_FORCE | XFS_LOG_SYNC);
+			error = _xfs_log_force(mp, iip->ili_last_lsn,
+					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
 		} else if (xfs_ipincount(ip) > 0) {
-			xfs_log_force(mp, (xfs_lsn_t)0,
-					XFS_LOG_FORCE | XFS_LOG_SYNC);
+			error = _xfs_log_force(mp, (xfs_lsn_t)0,
+					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
 		}
 
 	} else {
-- 
cgit v1.2.3


From 1bb7d6b5a82f1d9487fd44415484a368f7c87bed Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 10 Apr 2008 12:24:38 +1000
Subject: [XFS] Catch log unmount failures.

Unmounting the log can fail. unlikely, but it can. Catch all the error
conditions an make sure it's propagated upwards.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30833a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e29ea0a6d767..afaee301b0ee 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -697,7 +697,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		atomic_inc(&iclog->ic_refcnt);
 		spin_unlock(&log->l_icloglock);
 		xlog_state_want_sync(log, iclog);
-		(void) xlog_state_release_iclog(log, iclog);
+		error = xlog_state_release_iclog(log, iclog);
 
 		spin_lock(&log->l_icloglock);
 		if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
@@ -736,7 +736,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		spin_unlock(&log->l_icloglock);
 
 		xlog_state_want_sync(log, iclog);
-		(void) xlog_state_release_iclog(log, iclog);
+		error =  xlog_state_release_iclog(log, iclog);
 
 		spin_lock(&log->l_icloglock);
 
@@ -751,7 +751,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		}
 	}
 
-	return 0;
+	return error;
 }	/* xfs_log_unmount_write */
 
 /*
-- 
cgit v1.2.3


From d4055947bd0913864f4d8ac96bf1197338071622 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 17 Apr 2008 16:49:35 +1000
Subject: [XFS] Don't error out on good I/Os.

xfsbdstrat() made all I/Os error out, good or bad. Fix it.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30836a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index f6dab5d8944e..21c0dbc74093 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -885,8 +885,10 @@ xfsbdstrat(
 	struct xfs_buf		*bp)
 {
 	ASSERT(mp);
-	if (!XFS_FORCED_SHUTDOWN(mp))
+	if (!XFS_FORCED_SHUTDOWN(mp)) {
 		xfs_buf_iorequest(bp);
+		return;
+	}
 
 	xfs_buftrace("XFSBDSTRAT IOERROR", bp);
 	xfs_bioerror_relse(bp);
-- 
cgit v1.2.3


From e6430037e9fd0b3d02ceaf5ab99bfe3ccb763be7 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 17 Apr 2008 16:49:49 +1000
Subject: [XFS] fix logic error in xfs_alloc_ag_vextent_near()

Fix a logic error in xfs_alloc_ag_vextent_near(). This is a regression
introduced by the error handling changes.

SGI-PV: 890084
SGI-Modid: xfs-linux-melb:xfs-kern:30838a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index facdae14edd0..1956f83489f1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -838,7 +838,7 @@ xfs_alloc_ag_vextent_near(
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
 					args->minlen, &ltbnoa, &ltlena);
-			if (ltlena >= args->minlen)
+			if (ltlena < args->minlen)
 				continue;
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
-- 
cgit v1.2.3


From 7e20694d91f817f8e9f62404aca793ae0df4d98a Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 17 Apr 2008 16:49:55 +1000
Subject: [XFS] Remove periodic logging of in-core superblock counters.

xfssyncd triggers the logging of superblock counters every 30s if the
filesystem is made with lazy-count=1. This will prevent disks from idling
and spinning down as there will be a log write every 30s. With the way
counter recovery works for lazy-count=1, this code is unnecessary and
provides no real benefit, so just remove it.

SGI-PV: 980145
SGI-Modid: xfs-linux-melb:xfs-kern:30840a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c |  3 +--
 fs/xfs/linux-2.6/xfs_vfs.h   |  1 -
 fs/xfs/xfs_vfsops.c          | 13 -------------
 3 files changed, 1 insertion(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index fb561beea373..865eb708aa95 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1028,8 +1028,7 @@ xfs_sync_worker(
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY))
-		error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR |
-				     SYNC_REFCACHE | SYNC_SUPER);
+		error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
 	mp->m_sync_seq++;
 	wake_up(&mp->m_wait_single_sync_task);
 }
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 4da03a4e3520..7e60c7776b1c 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -49,7 +49,6 @@ typedef struct bhv_vfs_sync_work {
 #define SYNC_REFCACHE		0x0040  /* prune some of the nfs ref cache */
 #define SYNC_REMOUNT		0x0080  /* remount readonly, no dummy LRs */
 #define SYNC_IOWAIT		0x0100  /* wait for all I/O to complete */
-#define SYNC_SUPER		0x0200  /* flush superblock to disk */
 
 /*
  * When remounting a filesystem read-only or freezing the filesystem,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 09e186d02c11..fc48158fe479 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -1316,22 +1316,9 @@ xfs_syncsub(
 		}
 	}
 
-	/*
-	 * If asked, update the disk superblock with incore counter values if we
-	 * are using non-persistent counters so that they don't get too far out
-	 * of sync if we crash or get a forced shutdown. We don't want to force
-	 * this to disk, just get a transaction into the iclogs....
-	 */
-	if (flags & SYNC_SUPER) {
-		error = xfs_log_sbcount(mp, 0);
-		if (error)
-			last_error = error;
-	}
-
 	/*
 	 * Now check to see if the log needs a "dummy" transaction.
 	 */
-
 	if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
 		xfs_trans_t *tp;
 		xfs_inode_t *ip;
-- 
cgit v1.2.3


From f6485057c5cfbc84e5eff639ddea1ce0d668607b Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 17 Apr 2008 16:50:04 +1000
Subject: [XFS] Ensure the inode is joined in xfs_itruncate_finish

On success, we still need to join the inode to the current transaction in
xfs_itruncate_finish(). Fixes regression from error handling changes.

SGI-PV: 980084
SGI-Modid: xfs-linux-melb:xfs-kern:30845a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_inode.c | 137 +++++++++++++++++++++++++----------------------------
 1 file changed, 65 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2bc22790d65a..ca12acb90394 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1464,51 +1464,50 @@ xfs_itruncate_start(
 }
 
 /*
- * Shrink the file to the given new_size.  The new
- * size must be smaller than the current size.
- * This will free up the underlying blocks
- * in the removed range after a call to xfs_itruncate_start()
- * or xfs_atruncate_start().
+ * Shrink the file to the given new_size.  The new size must be smaller than
+ * the current size.  This will free up the underlying blocks in the removed
+ * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
  *
- * The transaction passed to this routine must have made
- * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
- * This routine may commit the given transaction and
- * start new ones, so make sure everything involved in
- * the transaction is tidy before calling here.
- * Some transaction will be returned to the caller to be
- * committed.  The incoming transaction must already include
- * the inode, and both inode locks must be held exclusively.
- * The inode must also be "held" within the transaction.  On
- * return the inode will be "held" within the returned transaction.
- * This routine does NOT require any disk space to be reserved
- * for it within the transaction.
+ * The transaction passed to this routine must have made a permanent log
+ * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
+ * given transaction and start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here.  Some transaction will be
+ * returned to the caller to be committed.  The incoming transaction must
+ * already include the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction.  On return the inode
+ * will be "held" within the returned transaction.  This routine does NOT
+ * require any disk space to be reserved for it within the transaction.
  *
- * The fork parameter must be either xfs_attr_fork or xfs_data_fork,
- * and it indicates the fork which is to be truncated.  For the
- * attribute fork we only support truncation to size 0.
+ * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
+ * indicates the fork which is to be truncated.  For the attribute fork we only
+ * support truncation to size 0.
  *
- * We use the sync parameter to indicate whether or not the first
- * transaction we perform might have to be synchronous.  For the attr fork,
- * it needs to be so if the unlink of the inode is not yet known to be
- * permanent in the log.  This keeps us from freeing and reusing the
- * blocks of the attribute fork before the unlink of the inode becomes
- * permanent.
+ * We use the sync parameter to indicate whether or not the first transaction
+ * we perform might have to be synchronous.  For the attr fork, it needs to be
+ * so if the unlink of the inode is not yet known to be permanent in the log.
+ * This keeps us from freeing and reusing the blocks of the attribute fork
+ * before the unlink of the inode becomes permanent.
  *
- * For the data fork, we normally have to run synchronously if we're
- * being called out of the inactive path or we're being called
- * out of the create path where we're truncating an existing file.
- * Either way, the truncate needs to be sync so blocks don't reappear
- * in the file with altered data in case of a crash.  wsync filesystems
- * can run the first case async because anything that shrinks the inode
- * has to run sync so by the time we're called here from inactive, the
- * inode size is permanently set to 0.
+ * For the data fork, we normally have to run synchronously if we're being
+ * called out of the inactive path or we're being called out of the create path
+ * where we're truncating an existing file.  Either way, the truncate needs to
+ * be sync so blocks don't reappear in the file with altered data in case of a
+ * crash.  wsync filesystems can run the first case async because anything that
+ * shrinks the inode has to run sync so by the time we're called here from
+ * inactive, the inode size is permanently set to 0.
  *
- * Calls from the truncate path always need to be sync unless we're
- * in a wsync filesystem and the file has already been unlinked.
+ * Calls from the truncate path always need to be sync unless we're in a wsync
+ * filesystem and the file has already been unlinked.
  *
- * The caller is responsible for correctly setting the sync parameter.
- * It gets too hard for us to guess here which path we're being called
- * out of just based on inode state.
+ * The caller is responsible for correctly setting the sync parameter.  It gets
+ * too hard for us to guess here which path we're being called out of just
+ * based on inode state.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not.  We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
  */
 int
 xfs_itruncate_finish(
@@ -1687,45 +1686,51 @@ xfs_itruncate_finish(
 		 */
 		error = xfs_bmap_finish(tp, &free_list, &committed);
 		ntp = *tp;
+		if (committed) {
+			/* link the inode into the next xact in the chain */
+			xfs_trans_ijoin(ntp, ip,
+					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+			xfs_trans_ihold(ntp, ip);
+		}
+
 		if (error) {
 			/*
-			 * If the bmap finish call encounters an error,
-			 * return to the caller where the transaction
-			 * can be properly aborted.  We just need to
-			 * make sure we're not holding any resources
-			 * that we were not when we came in.
+			 * If the bmap finish call encounters an error, return
+			 * to the caller where the transaction can be properly
+			 * aborted.  We just need to make sure we're not
+			 * holding any resources that we were not when we came
+			 * in.
 			 *
-			 * Aborting from this point might lose some
-			 * blocks in the file system, but oh well.
+			 * Aborting from this point might lose some blocks in
+			 * the file system, but oh well.
 			 */
 			xfs_bmap_cancel(&free_list);
-			if (committed)
-				goto error_join;
 			return error;
 		}
 
 		if (committed) {
 			/*
-			 * The first xact was committed, so add the inode to
-			 * the new one.  Mark it dirty so it will be logged and
+			 * Mark the inode dirty so it will be logged and
 			 * moved forward in the log as part of every commit.
 			 */
-			xfs_trans_ijoin(ntp, ip,
-					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-			xfs_trans_ihold(ntp, ip);
 			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
 		}
+
 		ntp = xfs_trans_dup(ntp);
 		error = xfs_trans_commit(*tp, 0);
 		*tp = ntp;
-		if (error)
-			goto error_join;
-		error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-					  XFS_TRANS_PERM_LOG_RES,
-					  XFS_ITRUNCATE_LOG_COUNT);
-		if (error)
-			goto error_join;
 
+		/* link the inode into the next transaction in the chain */
+		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+		xfs_trans_ihold(ntp, ip);
+
+		if (!error)
+			error = xfs_trans_reserve(ntp, 0,
+					XFS_ITRUNCATE_LOG_RES(mp), 0,
+					XFS_TRANS_PERM_LOG_RES,
+					XFS_ITRUNCATE_LOG_COUNT);
+		if (error)
+			return error;
 	}
 	/*
 	 * Only update the size in the case of the data fork, but
@@ -1757,18 +1762,6 @@ xfs_itruncate_finish(
 	       (ip->i_d.di_nextents == 0));
 	xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
 	return 0;
-
-error_join:
-	/*
-	 * Add the inode being truncated to the next chained transaction.  This
-	 * keeps things simple for the higher level code, because it always
-	 * knows that the inode is locked and held in the transaction that
-	 * returns to it whether errors occur or not.  We don't mark the inode
-	 * dirty so that this transaction can be easily aborted if possible.
-	 */
-	xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(ntp, ip);
-	return error;
 }
 
 
-- 
cgit v1.2.3


From cb49dbb130e17a6f9af4cb4714cf6976cf09afdf Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 17 Apr 2008 16:50:09 +1000
Subject: [XFS] Always use di_forkoff when checking for attr space.

In the case where we mount a filesystem which was previously using the
attr2 format as attr1, returning the default mp->m_attroffset instead of
the per-inode di_forkoff for inline attribute fit calculations, may result
in corruption, if for example, the data fork is already taking more space
than the default fork offset and we try to add an extended attribute. Fix
tested by xfstests/186.

SGI-PV: 979606
SGI-Modid: xfs-linux-melb:xfs-kern:30861a

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_attr_leaf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 96ba6aa4ed8c..303d41e4217b 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -166,7 +166,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
 
 	if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
 		if (bytes <= XFS_IFORK_ASIZE(dp))
-			return mp->m_attroffset >> 3;
+			return dp->i_d.di_forkoff;
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 6d1337b29bf09a97682d39db36ac2d0dfc6659c0 Mon Sep 17 00:00:00 2001
From: Tim Shimmin <tes@sgi.com>
Date: Thu, 17 Apr 2008 16:50:16 +1000
Subject: [XFS] xfs_bmap_compute_maxlevels should be based on di_forkoff

Fix up xfs_bmap_compute_maxlevels() to account for the case when we go
from using attr2 to using attr1. In that case attr1 will no longer
necessarily be at m_attr_offset>>3, but could be at a different value for
di_forkoff. Therefore, we return the worst case scenario using MINDBTPTRS
and MINABTPTRS, as this function is used for determining the maximum log
space.

SGI-PV: 979606
SGI-Modid: xfs-linux-melb:xfs-kern:30862a

Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_bmap.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 6d9b5448deb2..eb198c01c35d 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4153,16 +4153,21 @@ xfs_bmap_compute_maxlevels(
 	 * number of leaf entries, is controlled by the type of di_nextents
 	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
 	 * (a signed 16-bit number, xfs_aextnum_t).
+	 *
+	 * Note that we can no longer assume that if we are in ATTR1 that
+	 * the fork offset of all the inodes will be (m_attroffset >> 3)
+	 * because we could have mounted with ATTR2 and then mounted back
+	 * with ATTR1, keeping the di_forkoff's fixed but probably at
+	 * various positions. Therefore, for both ATTR1 and ATTR2
+	 * we have to assume the worst case scenario of a minimum size
+	 * available.
 	 */
 	if (whichfork == XFS_DATA_FORK) {
 		maxleafents = MAXEXTNUM;
-		sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
-			XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset;
+		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
 	} else {
 		maxleafents = MAXAEXTNUM;
-		sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
-			XFS_BMDR_SPACE_CALC(MINABTPTRS) :
-			mp->m_sb.sb_inodesize - mp->m_attroffset;
+		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
 	}
 	maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
 	minleafrecs = mp->m_bmap_dmnr[0];
-- 
cgit v1.2.3


From f7d3c34788696f5ba9ac9fa414ad80e2a91d4b2e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 17 Apr 2008 16:50:22 +1000
Subject: [XFS] Remove CONFIG_XFS_SECURITY.

There is no point to the CONFIG_XFS_SECURITY option; it disables the
ability to set security attributes at runtime, but it does not actually
slim down or remove any code for runtime. Just remove it and always allow
security attributes to be set.

SGI-PV: 980310
SGI-Modid: xfs-linux-melb:xfs-kern:30877a

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/Kconfig               | 12 ------------
 fs/xfs/linux-2.6/xfs_super.h |  8 +-------
 fs/xfs/xfs_attr.c            | 10 +---------
 3 files changed, 2 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 35115bca036e..524021ff5436 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -35,18 +35,6 @@ config XFS_QUOTA
 	  with or without the generic quota support enabled (CONFIG_QUOTA) -
 	  they are completely independent subsystems.
 
-config XFS_SECURITY
-	bool "XFS Security Label support"
-	depends on XFS_FS
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute namespace for inode security
-	  labels in the XFS filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for inode security labels, say N.
-
 config XFS_POSIX_ACL
 	bool "XFS POSIX ACL support"
 	depends on XFS_FS
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 3efcf45b14ab..3efb7c6d3303 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -50,13 +50,7 @@ extern void xfs_qm_exit(void);
 # define set_posix_acl_flag(sb)	do { } while (0)
 #endif
 
-#ifdef CONFIG_XFS_SECURITY
-# define XFS_SECURITY_STRING	"security attributes, "
-# define ENOSECURITY		0
-#else
-# define XFS_SECURITY_STRING
-# define ENOSECURITY		EOPNOTSUPP
-#endif
+#define XFS_SECURITY_STRING	"security attributes, "
 
 #ifdef CONFIG_XFS_RT
 # define XFS_REALTIME_STRING	"realtime, "
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e58f321fdae9..36d781ee5fcc 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2646,14 +2646,6 @@ attr_trusted_capable(
 	return 0;
 }
 
-STATIC int
-attr_secure_capable(
-	bhv_vnode_t	*vp,
-	cred_t		*cred)
-{
-	return -ENOSECURITY;
-}
-
 STATIC int
 attr_system_set(
 	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
@@ -2724,7 +2716,7 @@ struct attrnames attr_secure = {
 	.attr_get	= attr_generic_get,
 	.attr_set	= attr_generic_set,
 	.attr_remove	= attr_generic_remove,
-	.attr_capable	= attr_secure_capable,
+	.attr_capable	= (attrcapable_t)fs_noerr,
 };
 
 struct attrnames attr_user = {
-- 
cgit v1.2.3


From e687330b5ed1ea899fdaf0dea50aba196b6e019a Mon Sep 17 00:00:00 2001
From: Donald Douwsma <donaldd@sgi.com>
Date: Thu, 17 Apr 2008 16:50:28 +1000
Subject: [XFS] Remove unused HAVE_SPLICE macro.

HAVE_SPLICE was part of the infrastructure for building 2.4 and 2.6
kernels out of the same tree. Now we don't build 2.4 kernels this

SGI-PV: 971046
SGI-Modid: xfs-linux-melb:xfs-kern:30878a

Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c989825..765aaf65e2d3 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -22,7 +22,7 @@
 #define STATIC
 #define DEBUG 1
 #define XFS_BUF_LOCK_TRACKING 1
-/* #define QUOTADEBUG 1 */
+#define QUOTADEBUG 1
 #endif
 
 #ifdef CONFIG_XFS_TRACE
-- 
cgit v1.2.3


From 3b2816be271b8b364294a5b48721a3e68af46cfa Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>
Date: Fri, 18 Apr 2008 12:43:35 +1000
Subject: [XFS] The forward declarations for the xfs_ioctl() helpers and the
 associated comment about gcc behavior really aren't needed; all of these
 functions are marked STATIC which includes noinline, and the stack usage
 won't be a problem.

This effectively just removes the forward declarations and moves
xfs_ioctl() back to the end of the file.

SGI-PV: 971186
SGI-Modid: xfs-linux-melb:xfs-kern:30534a

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Niv Sardi <xaiki@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 314 +------------------------------------------
 1 file changed, 2 insertions(+), 312 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index c6399b2cf17c..640c8b6b1956 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -651,314 +651,6 @@ xfs_attrmulti_by_handle(
 	return -error;
 }
 
-/* prototypes for a few of the stack-hungry cases that have
- * their own functions.  Functions are defined after their use
- * so gcc doesn't get fancy and inline them with -03 */
-
-STATIC int
-xfs_ioc_space(
-	struct xfs_inode	*ip,
-	struct inode		*inode,
-	struct file		*filp,
-	int			flags,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_bulkstat(
-	xfs_mount_t		*mp,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry_v1(
-	xfs_mount_t		*mp,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry(
-	xfs_mount_t		*mp,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_xattr(
-	xfs_inode_t		*ip,
-	struct file		*filp,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_fsgetxattr(
-	xfs_inode_t		*ip,
-	int			attr,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_getbmap(
-	struct xfs_inode	*ip,
-	int			flags,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_getbmapx(
-	struct xfs_inode	*ip,
-	void			__user *arg);
-
-int
-xfs_ioctl(
-	xfs_inode_t		*ip,
-	struct file		*filp,
-	int			ioflags,
-	unsigned int		cmd,
-	void			__user *arg)
-{
-	struct inode		*inode = filp->f_path.dentry->d_inode;
-	xfs_mount_t		*mp = ip->i_mount;
-	int			error;
-
-	xfs_itrace_entry(XFS_I(inode));
-	switch (cmd) {
-
-	case XFS_IOC_ALLOCSP:
-	case XFS_IOC_FREESP:
-	case XFS_IOC_RESVSP:
-	case XFS_IOC_UNRESVSP:
-	case XFS_IOC_ALLOCSP64:
-	case XFS_IOC_FREESP64:
-	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP64:
-		/*
-		 * Only allow the sys admin to reserve space unless
-		 * unwritten extents are enabled.
-		 */
-		if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
-		    !capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
-
-	case XFS_IOC_DIOINFO: {
-		struct dioattr	da;
-		xfs_buftarg_t	*target =
-			XFS_IS_REALTIME_INODE(ip) ?
-			mp->m_rtdev_targp : mp->m_ddev_targp;
-
-		da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
-		da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-
-		if (copy_to_user(arg, &da, sizeof(da)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_FSBULKSTAT_SINGLE:
-	case XFS_IOC_FSBULKSTAT:
-	case XFS_IOC_FSINUMBERS:
-		return xfs_ioc_bulkstat(mp, cmd, arg);
-
-	case XFS_IOC_FSGEOMETRY_V1:
-		return xfs_ioc_fsgeometry_v1(mp, arg);
-
-	case XFS_IOC_FSGEOMETRY:
-		return xfs_ioc_fsgeometry(mp, arg);
-
-	case XFS_IOC_GETVERSION:
-		return put_user(inode->i_generation, (int __user *)arg);
-
-	case XFS_IOC_FSGETXATTR:
-		return xfs_ioc_fsgetxattr(ip, 0, arg);
-	case XFS_IOC_FSGETXATTRA:
-		return xfs_ioc_fsgetxattr(ip, 1, arg);
-	case XFS_IOC_GETXFLAGS:
-	case XFS_IOC_SETXFLAGS:
-	case XFS_IOC_FSSETXATTR:
-		return xfs_ioc_xattr(ip, filp, cmd, arg);
-
-	case XFS_IOC_FSSETDM: {
-		struct fsdmidata	dmi;
-
-		if (copy_from_user(&dmi, arg, sizeof(dmi)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
-				dmi.fsd_dmstate);
-		return -error;
-	}
-
-	case XFS_IOC_GETBMAP:
-	case XFS_IOC_GETBMAPA:
-		return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
-
-	case XFS_IOC_GETBMAPX:
-		return xfs_ioc_getbmapx(ip, arg);
-
-	case XFS_IOC_FD_TO_HANDLE:
-	case XFS_IOC_PATH_TO_HANDLE:
-	case XFS_IOC_PATH_TO_FSHANDLE:
-		return xfs_find_handle(cmd, arg);
-
-	case XFS_IOC_OPEN_BY_HANDLE:
-		return xfs_open_by_handle(mp, arg, filp, inode);
-
-	case XFS_IOC_FSSETDM_BY_HANDLE:
-		return xfs_fssetdm_by_handle(mp, arg, inode);
-
-	case XFS_IOC_READLINK_BY_HANDLE:
-		return xfs_readlink_by_handle(mp, arg, inode);
-
-	case XFS_IOC_ATTRLIST_BY_HANDLE:
-		return xfs_attrlist_by_handle(mp, arg, inode);
-
-	case XFS_IOC_ATTRMULTI_BY_HANDLE:
-		return xfs_attrmulti_by_handle(mp, arg, inode);
-
-	case XFS_IOC_SWAPEXT: {
-		error = xfs_swapext((struct xfs_swapext __user *)arg);
-		return -error;
-	}
-
-	case XFS_IOC_FSCOUNTS: {
-		xfs_fsop_counts_t out;
-
-		error = xfs_fs_counts(mp, &out);
-		if (error)
-			return -error;
-
-		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_SET_RESBLKS: {
-		xfs_fsop_resblks_t inout;
-		__uint64_t	   in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&inout, arg, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
-
-		/* input parameter is passed in resblks field of structure */
-		in = inout.resblks;
-		error = xfs_reserve_blocks(mp, &in, &inout);
-		if (error)
-			return -error;
-
-		if (copy_to_user(arg, &inout, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_GET_RESBLKS: {
-		xfs_fsop_resblks_t out;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		error = xfs_reserve_blocks(mp, NULL, &out);
-		if (error)
-			return -error;
-
-		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
-
-		return 0;
-	}
-
-	case XFS_IOC_FSGROWFSDATA: {
-		xfs_growfs_data_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_growfs_data(mp, &in);
-		return -error;
-	}
-
-	case XFS_IOC_FSGROWFSLOG: {
-		xfs_growfs_log_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_growfs_log(mp, &in);
-		return -error;
-	}
-
-	case XFS_IOC_FSGROWFSRT: {
-		xfs_growfs_rt_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_growfs_rt(mp, &in);
-		return -error;
-	}
-
-	case XFS_IOC_FREEZE:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (inode->i_sb->s_frozen == SB_UNFROZEN)
-			freeze_bdev(inode->i_sb->s_bdev);
-		return 0;
-
-	case XFS_IOC_THAW:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-		if (inode->i_sb->s_frozen != SB_UNFROZEN)
-			thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
-		return 0;
-
-	case XFS_IOC_GOINGDOWN: {
-		__uint32_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (get_user(in, (__uint32_t __user *)arg))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_fs_goingdown(mp, in);
-		return -error;
-	}
-
-	case XFS_IOC_ERROR_INJECTION: {
-		xfs_error_injection_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_errortag_add(in.errtag, mp);
-		return -error;
-	}
-
-	case XFS_IOC_ERROR_CLEARALL:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		error = xfs_errortag_clearall(mp, 1);
-		return -error;
-
-	default:
-		return -ENOTTY;
-	}
-}
-
 STATIC int
 xfs_ioc_space(
 	struct xfs_inode	*ip,
@@ -1398,12 +1090,10 @@ xfs_ioctl(
 		return xfs_ioc_fsgetxattr(ip, 0, arg);
 	case XFS_IOC_FSGETXATTRA:
 		return xfs_ioc_fsgetxattr(ip, 1, arg);
-	case XFS_IOC_FSSETXATTR:
-		return xfs_ioc_fssetxattr(ip, filp, arg);
 	case XFS_IOC_GETXFLAGS:
-		return xfs_ioc_getxflags(ip, arg);
 	case XFS_IOC_SETXFLAGS:
-		return xfs_ioc_setxflags(ip, filp, arg);
+	case XFS_IOC_FSSETXATTR:
+		return xfs_ioc_xattr(ip, filp, cmd, arg);
 
 	case XFS_IOC_FSSETDM: {
 		struct fsdmidata	dmi;
-- 
cgit v1.2.3


From 65e67f5165c8a156b34ee7adf65d5ed3b16a910d Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>
Date: Fri, 18 Apr 2008 12:59:45 +1000
Subject: [XFS] Fix merge failure

---
 fs/xfs/linux-2.6/xfs_ioctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 640c8b6b1956..bf7759793856 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1090,10 +1090,12 @@ xfs_ioctl(
 		return xfs_ioc_fsgetxattr(ip, 0, arg);
 	case XFS_IOC_FSGETXATTRA:
 		return xfs_ioc_fsgetxattr(ip, 1, arg);
+	case XFS_IOC_FSSETXATTR:
+		return xfs_ioc_fssetxattr(ip, filp, arg);
 	case XFS_IOC_GETXFLAGS:
+		return xfs_ioc_getxflags(ip, arg);
 	case XFS_IOC_SETXFLAGS:
-	case XFS_IOC_FSSETXATTR:
-		return xfs_ioc_xattr(ip, filp, cmd, arg);
+		return xfs_ioc_setxflags(ip, filp, arg);
 
 	case XFS_IOC_FSSETDM: {
 		struct fsdmidata	dmi;
-- 
cgit v1.2.3


From 62be1f71677c53d5e51223807a06ac9052f49b0f Mon Sep 17 00:00:00 2001
From: Roel Kluin <12o3l@tiscali.nl>
Date: Thu, 17 Apr 2008 17:25:37 +0200
Subject: [GFS2] fix assertion in log_refund()

since unsigned, unused >= 0 is always true.

Signed-off-by: Roel Kluin <12o3l@tiscali.nl>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b335304fc5d6..548264b1836d 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -769,8 +769,8 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
 	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
 	reserved = calc_reserved(sdp);
+	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
 	unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
-	gfs2_assert_withdraw(sdp, unused >= 0);
 	atomic_add(unused, &sdp->sd_log_blks_free);
 	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
 			     sdp->sd_jdesc->jd_blocks);
-- 
cgit v1.2.3


From 8e8a4603b5422c9145880e73b23bc4c2c4de0098 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Fri, 1 Feb 2008 11:59:09 -0800
Subject: ocfs2: Move slot map access into slot_map.c

journal.c and dlmglue.c would refresh the slot map by hand.  Instead, have
the update and clear functions do the work inside slot_map.c.  The eventual
result is to make ocfs2_slot_info defined privately in slot_map.c

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c  |  8 +------
 fs/ocfs2/journal.c  |  3 +--
 fs/ocfs2/slot_map.c | 62 +++++++++++++++++++++++++++++++++++++++++------------
 fs/ocfs2/slot_map.h | 11 ++++------
 fs/ocfs2/super.c    |  3 +--
 5 files changed, 55 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1f1873bf41fb..1a80fa9e7c9a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2132,8 +2132,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 	int status = 0;
 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
-	struct buffer_head *bh;
-	struct ocfs2_slot_info *si = osb->slot_info;
 
 	mlog_entry_void();
 
@@ -2159,11 +2157,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 		goto bail;
 	}
 	if (status) {
-		bh = si->si_bh;
-		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
-					  si->si_inode);
-		if (status == 0)
-			ocfs2_update_slot_info(si);
+		status = ocfs2_refresh_slot_info(osb);
 
 		ocfs2_complete_lock_res_refresh(lockres, status);
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f31c7e8c19c3..c2e654ee703a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1123,8 +1123,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 	/* Likewise, this would be a strange but ultimately not so
 	 * harmful place to get an error... */
-	ocfs2_clear_slot(si, slot_num);
-	status = ocfs2_update_disk_slots(osb, si);
+	status = ocfs2_clear_slot(osb, slot_num);
 	if (status < 0)
 		mlog_errno(status);
 
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 3a50ce555e64..f5727b8cc913 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -49,7 +49,7 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
 			      s16 node_num);
 
 /* post the slot information on disk into our slot_info struct. */
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 {
 	int i;
 	__le16 *disk_info;
@@ -65,10 +65,27 @@ void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 	spin_unlock(&si->si_lock);
 }
 
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
+{
+	int ret;
+	struct ocfs2_slot_info *si = osb->slot_info;
+	struct buffer_head *bh;
+
+	if (si == NULL)
+		return 0;
+
+	bh = si->si_bh;
+	ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode);
+	if (ret == 0)
+		ocfs2_update_slot_info(si);
+
+	return ret;
+}
+
 /* post the our slot info stuff into it's destination bh and write it
  * out. */
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
-			    struct ocfs2_slot_info *si)
+static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+				   struct ocfs2_slot_info *si)
 {
 	int status, i;
 	__le16 *disk_info = (__le16 *) si->si_bh->b_data;
@@ -135,6 +152,19 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
 	return ret;
 }
 
+static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+{
+	if (si == NULL)
+		return;
+
+	if (si->si_inode)
+		iput(si->si_inode);
+	if (si->si_bh)
+		brelse(si->si_bh);
+
+	kfree(si);
+}
+
 static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
 			      s16 slot_num,
 			      s16 node_num)
@@ -147,12 +177,18 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
 	si->si_global_node_nums[slot_num] = node_num;
 }
 
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
-		      s16 slot_num)
+int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
 {
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	if (si == NULL)
+		return 0;
+
 	spin_lock(&si->si_lock);
 	__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
 	spin_unlock(&si->si_lock);
+
+	return ocfs2_update_disk_slots(osb, osb->slot_info);
 }
 
 int ocfs2_init_slot_info(struct ocfs2_super *osb)
@@ -202,18 +238,17 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 	osb->slot_info = si;
 bail:
 	if (status < 0 && si)
-		ocfs2_free_slot_info(si);
+		__ocfs2_free_slot_info(si);
 
 	return status;
 }
 
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+void ocfs2_free_slot_info(struct ocfs2_super *osb)
 {
-	if (si->si_inode)
-		iput(si->si_inode);
-	if (si->si_bh)
-		brelse(si->si_bh);
-	kfree(si);
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	osb->slot_info = NULL;
+	__ocfs2_free_slot_info(si);
 }
 
 int ocfs2_find_slot(struct ocfs2_super *osb)
@@ -285,7 +320,6 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
 	}
 
 bail:
-	osb->slot_info = NULL;
-	ocfs2_free_slot_info(si);
+	ocfs2_free_slot_info(osb);
 }
 
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 1025872aaade..b029ffdc8ea5 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -30,7 +30,7 @@
 struct ocfs2_slot_info {
 	spinlock_t si_lock;
 
-       	struct inode *si_inode;
+	struct inode *si_inode;
 	struct buffer_head *si_bh;
 	unsigned int si_num_slots;
 	unsigned int si_size;
@@ -38,19 +38,16 @@ struct ocfs2_slot_info {
 };
 
 int ocfs2_init_slot_info(struct ocfs2_super *osb);
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+void ocfs2_free_slot_info(struct ocfs2_super *osb);
 
 int ocfs2_find_slot(struct ocfs2_super *osb);
 void ocfs2_put_slot(struct ocfs2_super *osb);
 
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
-			    struct ocfs2_slot_info *si);
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
 
 s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
 			   s16 global);
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
-		      s16 slot_num);
+int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num);
 
 static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
 				      int slot_num)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index bec75aff3d9f..fad37af2af9c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1724,8 +1724,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 
 	/* This function assumes that the caller has the main osb resource */
 
-	if (osb->slot_info)
-		ocfs2_free_slot_info(osb->slot_info);
+	ocfs2_free_slot_info(osb);
 
 	kfree(osb->osb_orphan_wipes);
 	/* FIXME
-- 
cgit v1.2.3


From d85b20e4b300edfd290f21fc2d790ba16d2f225b Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 12:01:05 -0800
Subject: ocfs2: Make ocfs2_slot_info private.

Just use osb_lock around the ocfs2_slot_info data.  This allows us to
take the ocfs2_slot_info structure private in slot_info.c.  All access
is now via accessors.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/journal.c  | 24 ++++++++--------
 fs/ocfs2/ocfs2.h    |  1 +
 fs/ocfs2/slot_map.c | 81 +++++++++++++++++++++++++++++++++++++----------------
 fs/ocfs2/slot_map.h | 25 +++--------------
 4 files changed, 74 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c2e654ee703a..ed0c6d0850d7 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1079,7 +1079,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 {
 	int status = 0;
 	int slot_num;
-	struct ocfs2_slot_info *si = osb->slot_info;
 	struct ocfs2_dinode *la_copy = NULL;
 	struct ocfs2_dinode *tl_copy = NULL;
 
@@ -1092,8 +1091,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 	 * case we should've called ocfs2_journal_load instead. */
 	BUG_ON(osb->node_num == node_num);
 
-	slot_num = ocfs2_node_num_to_slot(si, node_num);
-	if (slot_num == OCFS2_INVALID_SLOT) {
+	slot_num = ocfs2_node_num_to_slot(osb, node_num);
+	if (slot_num == -ENOENT) {
 		status = 0;
 		mlog(0, "no slot for this node, so no recovery required.\n");
 		goto done;
@@ -1183,23 +1182,24 @@ bail:
  * slot info struct has been updated from disk. */
 int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 {
-	int status, i, node_num;
-	struct ocfs2_slot_info *si = osb->slot_info;
+	unsigned int node_num;
+	int status, i;
 
 	/* This is called with the super block cluster lock, so we
 	 * know that the slot map can't change underneath us. */
 
-	spin_lock(&si->si_lock);
-	for(i = 0; i < si->si_num_slots; i++) {
+	spin_lock(&osb->osb_lock);
+	for (i = 0; i < osb->max_slots; i++) {
 		if (i == osb->slot_num)
 			continue;
-		if (ocfs2_is_empty_slot(si, i))
+
+		status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
+		if (status == -ENOENT)
 			continue;
 
-		node_num = si->si_global_node_nums[i];
 		if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
 			continue;
-		spin_unlock(&si->si_lock);
+		spin_unlock(&osb->osb_lock);
 
 		/* Ok, we have a slot occupied by another node which
 		 * is not in the recovery map. We trylock his journal
@@ -1215,9 +1215,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 			goto bail;
 		}
 
-		spin_lock(&si->si_lock);
+		spin_lock(&osb->osb_lock);
 	}
-	spin_unlock(&si->si_lock);
+	spin_unlock(&osb->osb_lock);
 
 	status = 0;
 bail:
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6546cef212e3..ee3f675a4210 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -179,6 +179,7 @@ enum ocfs2_mount_options
 #define OCFS2_DEFAULT_ATIME_QUANTUM	60
 
 struct ocfs2_journal;
+struct ocfs2_slot_info;
 struct ocfs2_super
 {
 	struct task_struct *commit_task;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index f5727b8cc913..762360d95e93 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -42,13 +42,25 @@
 
 #include "buffer_head_io.h"
 
+struct ocfs2_slot_info {
+	struct inode *si_inode;
+	struct buffer_head *si_bh;
+	unsigned int si_num_slots;
+	unsigned int si_size;
+	s16 si_global_node_nums[OCFS2_MAX_SLOTS];
+};
+
+
 static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
 				    s16 global);
 static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
 			      s16 slot_num,
 			      s16 node_num);
 
-/* post the slot information on disk into our slot_info struct. */
+/*
+ * Post the slot information on disk into our slot_info struct.
+ * Must be protected by osb_lock.
+ */
 static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 {
 	int i;
@@ -56,13 +68,10 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 
 	/* we don't read the slot block here as ocfs2_super_lock
 	 * should've made sure we have the most recent copy. */
-	spin_lock(&si->si_lock);
 	disk_info = (__le16 *) si->si_bh->b_data;
 
 	for (i = 0; i < si->si_size; i++)
 		si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
-
-	spin_unlock(&si->si_lock);
 }
 
 int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
@@ -76,8 +85,11 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 
 	bh = si->si_bh;
 	ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode);
-	if (ret == 0)
+	if (ret == 0) {
+		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
+		spin_unlock(&osb->osb_lock);
+	}
 
 	return ret;
 }
@@ -90,10 +102,10 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
 	int status, i;
 	__le16 *disk_info = (__le16 *) si->si_bh->b_data;
 
-	spin_lock(&si->si_lock);
+	spin_lock(&osb->osb_lock);
 	for (i = 0; i < si->si_size; i++)
 		disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
-	spin_unlock(&si->si_lock);
+	spin_unlock(&osb->osb_lock);
 
 	status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
 	if (status < 0)
@@ -119,7 +131,8 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
 	return ret;
 }
 
-static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
+static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
+				   s16 preferred)
 {
 	int i;
 	s16 ret = OCFS2_INVALID_SLOT;
@@ -141,15 +154,36 @@ out:
 	return ret;
 }
 
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-			   s16 global)
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
 {
-	s16 ret;
+	s16 slot;
+	struct ocfs2_slot_info *si = osb->slot_info;
 
-	spin_lock(&si->si_lock);
-	ret = __ocfs2_node_num_to_slot(si, global);
-	spin_unlock(&si->si_lock);
-	return ret;
+	spin_lock(&osb->osb_lock);
+	slot = __ocfs2_node_num_to_slot(si, node_num);
+	spin_unlock(&osb->osb_lock);
+
+	if (slot == OCFS2_INVALID_SLOT)
+		return -ENOENT;
+
+	return slot;
+}
+
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+				  unsigned int *node_num)
+{
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	assert_spin_locked(&osb->osb_lock);
+
+	BUG_ON(slot_num < 0);
+	BUG_ON(slot_num > osb->max_slots);
+
+	if (si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT)
+		return -ENOENT;
+
+	*node_num = si->si_global_node_nums[slot_num];
+	return 0;
 }
 
 static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
@@ -184,9 +218,9 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
 	if (si == NULL)
 		return 0;
 
-	spin_lock(&si->si_lock);
+	spin_lock(&osb->osb_lock);
 	__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
-	spin_unlock(&si->si_lock);
+	spin_unlock(&osb->osb_lock);
 
 	return ocfs2_update_disk_slots(osb, osb->slot_info);
 }
@@ -206,7 +240,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	spin_lock_init(&si->si_lock);
 	si->si_num_slots = osb->max_slots;
 	si->si_size = OCFS2_MAX_SLOTS;
 
@@ -235,7 +268,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 
 	si->si_inode = inode;
 	si->si_bh = bh;
-	osb->slot_info = si;
+	osb->slot_info = (struct ocfs2_slot_info *)si;
 bail:
 	if (status < 0 && si)
 		__ocfs2_free_slot_info(si);
@@ -261,9 +294,9 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 
 	si = osb->slot_info;
 
+	spin_lock(&osb->osb_lock);
 	ocfs2_update_slot_info(si);
 
-	spin_lock(&si->si_lock);
 	/* search for ourselves first and take the slot if it already
 	 * exists. Perhaps we need to mark this in a variable for our
 	 * own journal recovery? Possibly not, though we certainly
@@ -274,7 +307,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 		 * one. */
 		slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
 		if (slot == OCFS2_INVALID_SLOT) {
-			spin_unlock(&si->si_lock);
+			spin_unlock(&osb->osb_lock);
 			mlog(ML_ERROR, "no free slots available!\n");
 			status = -EINVAL;
 			goto bail;
@@ -285,7 +318,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 
 	__ocfs2_fill_slot(si, slot, osb->node_num);
 	osb->slot_num = slot;
-	spin_unlock(&si->si_lock);
+	spin_unlock(&osb->osb_lock);
 
 	mlog(0, "taking node slot %d\n", osb->slot_num);
 
@@ -306,12 +339,12 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
 	if (!si)
 		return;
 
+	spin_lock(&osb->osb_lock);
 	ocfs2_update_slot_info(si);
 
-	spin_lock(&si->si_lock);
 	__ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
 	osb->slot_num = OCFS2_INVALID_SLOT;
-	spin_unlock(&si->si_lock);
+	spin_unlock(&osb->osb_lock);
 
 	status = ocfs2_update_disk_slots(osb, si);
 	if (status < 0) {
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index b029ffdc8ea5..5118e89c84eb 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -27,16 +27,6 @@
 #ifndef SLOTMAP_H
 #define SLOTMAP_H
 
-struct ocfs2_slot_info {
-	spinlock_t si_lock;
-
-	struct inode *si_inode;
-	struct buffer_head *si_bh;
-	unsigned int si_num_slots;
-	unsigned int si_size;
-	s16 si_global_node_nums[OCFS2_MAX_SLOTS];
-};
-
 int ocfs2_init_slot_info(struct ocfs2_super *osb);
 void ocfs2_free_slot_info(struct ocfs2_super *osb);
 
@@ -45,17 +35,10 @@ void ocfs2_put_slot(struct ocfs2_super *osb);
 
 int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
 
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-			   s16 global);
-int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num);
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+				  unsigned int *node_num);
 
-static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
-				      int slot_num)
-{
-	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
-	assert_spin_locked(&si->si_lock);
-
-	return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
-}
+int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num);
 
 #endif
-- 
cgit v1.2.3


From 553abd046af609191a91af7289d87d477adc659f Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 12:03:57 -0800
Subject: ocfs2: Change the recovery map to an array of node numbers.

The old recovery map was a bitmap of node numbers.  This was sufficient
for the maximum node number of 254.  Going forward, we want node numbers
to be UINT32.  Thus, we need a new recovery map.

Note that we can't keep track of slots here.  We must write down the
node number to recovery *before* we get the locks needed to convert a
node number into a slot number.

The recovery map is now an array of unsigned ints, max_slots in size.
It moves to journal.c with the rest of recovery.

Because it needs to be initialized, we move all of recovery initialization
into a new function, ocfs2_recovery_init().  This actually cleans up
ocfs2_initialize_super() a little as well.  Following on, recovery cleaup
becomes part of ocfs2_recovery_exit().

A number of node map functions are rendered obsolete and are removed.

Finally, waiting on recovery is wrapped in a function rather than naked
checks on the recovery_event.  This is a cleanup from Mark.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   |   6 +-
 fs/ocfs2/heartbeat.c | 111 -------------------------------
 fs/ocfs2/heartbeat.h |  14 ----
 fs/ocfs2/journal.c   | 181 ++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ocfs2/journal.h   |   4 ++
 fs/ocfs2/ocfs2.h     |   3 +-
 fs/ocfs2/super.c     |  33 +++-------
 7 files changed, 182 insertions(+), 170 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1a80fa9e7c9a..15a5167e0513 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1950,8 +1950,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
 		goto local;
 
 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
-		wait_event(osb->recovery_event,
-			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+		ocfs2_wait_for_recovery(osb);
 
 	lockres = &OCFS2_I(inode)->ip_inode_lockres;
 	level = ex ? LKM_EXMODE : LKM_PRMODE;
@@ -1974,8 +1973,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
 	 * committed to owning this lock so we don't allow signals to
 	 * abort the operation. */
 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
-		wait_event(osb->recovery_event,
-			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+		ocfs2_wait_for_recovery(osb);
 
 local:
 	/*
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 0758daf64da0..80de2397c161 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -48,7 +48,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
 					    int bit);
 static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
 					      int bit);
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
 
 /* special case -1 for now
  * TODO: should *really* make sure the calling func never passes -1!!  */
@@ -62,7 +61,6 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map)
 void ocfs2_init_node_maps(struct ocfs2_super *osb)
 {
 	spin_lock_init(&osb->node_map_lock);
-	ocfs2_node_map_init(&osb->recovery_map);
 	ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
 
@@ -192,112 +190,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
 	return ret;
 }
 
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
-{
-	int bit;
-	bit = find_next_bit(map->map, map->num_nodes, 0);
-	if (bit < map->num_nodes)
-		return 0;
-	return 1;
-}
-
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
-			    struct ocfs2_node_map *map)
-{
-	int ret;
-	BUG_ON(map->num_nodes == 0);
-	spin_lock(&osb->node_map_lock);
-	ret = __ocfs2_node_map_is_empty(map);
-	spin_unlock(&osb->node_map_lock);
-	return ret;
-}
-
-#if 0
-
-static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
-				 struct ocfs2_node_map *from)
-{
-	BUG_ON(from->num_nodes == 0);
-	ocfs2_node_map_init(target);
-	__ocfs2_node_map_set(target, from);
-}
-
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs2_node_map_is_only(struct ocfs2_super *osb,
-			   struct ocfs2_node_map *target,
-			   int bit)
-{
-	struct ocfs2_node_map temp;
-	int ret;
-
-	spin_lock(&osb->node_map_lock);
-	__ocfs2_node_map_dup(&temp, target);
-	__ocfs2_node_map_clear_bit(&temp, bit);
-	ret = __ocfs2_node_map_is_empty(&temp);
-	spin_unlock(&osb->node_map_lock);
-
-	return ret;
-}
-
-static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
-				 struct ocfs2_node_map *from)
-{
-	int num_longs, i;
-
-	BUG_ON(target->num_nodes != from->num_nodes);
-	BUG_ON(target->num_nodes == 0);
-
-	num_longs = BITS_TO_LONGS(target->num_nodes);
-	for (i = 0; i < num_longs; i++)
-		target->map[i] = from->map[i];
-}
-
-#endif  /*  0  */
-
-/* Returns whether the recovery bit was actually set - it may not be
- * if a node is still marked as needing recovery */
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
-			   int num)
-{
-	int set = 0;
-
-	spin_lock(&osb->node_map_lock);
-
-	if (!test_bit(num, osb->recovery_map.map)) {
-	    __ocfs2_node_map_set_bit(&osb->recovery_map, num);
-	    set = 1;
-	}
-
-	spin_unlock(&osb->node_map_lock);
-
-	return set;
-}
-
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
-			      int num)
-{
-	ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
-}
-
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
-			   struct ocfs2_node_map *map,
-			   int idx)
-{
-	int i = idx;
-
-	idx = O2NM_INVALID_NODE_NUM;
-	spin_lock(&osb->node_map_lock);
-	if ((i != O2NM_INVALID_NODE_NUM) &&
-	    (i >= 0) &&
-	    (i < map->num_nodes)) {
-		while(i < map->num_nodes) {
-			if (test_bit(i, map->map)) {
-				idx = i;
-				break;
-			}
-			i++;
-		}
-	}
-	spin_unlock(&osb->node_map_lock);
-	return idx;
-}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index eac63aed7611..98d8ffc995b1 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -33,8 +33,6 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
 
 /* node map functions - used to keep track of mounted and in-recovery
  * nodes. */
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
-			    struct ocfs2_node_map *map);
 void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
 			    struct ocfs2_node_map *map,
 			    int bit);
@@ -44,17 +42,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
 int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
 			    struct ocfs2_node_map *map,
 			    int bit);
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
-			   struct ocfs2_node_map *map,
-			   int idx);
-static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
-					       struct ocfs2_node_map *map)
-{
-	return ocfs2_node_map_iterate(osb, map, 0);
-}
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
-			   int num);
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
-			      int num);
 
 #endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ed0c6d0850d7..ca4c0ea5a4cd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 				 int slot);
 static int ocfs2_commit_thread(void *arg);
 
+
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+
+struct ocfs2_recovery_map {
+	int rm_used;
+	unsigned int *rm_entries;
+};
+
+int ocfs2_recovery_init(struct ocfs2_super *osb)
+{
+	struct ocfs2_recovery_map *rm;
+
+	mutex_init(&osb->recovery_lock);
+	osb->disable_recovery = 0;
+	osb->recovery_thread_task = NULL;
+	init_waitqueue_head(&osb->recovery_event);
+
+	rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
+		     osb->max_slots * sizeof(unsigned int),
+		     GFP_KERNEL);
+	if (!rm) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	rm->rm_entries = (unsigned int *)((char *)rm +
+					  sizeof(struct ocfs2_recovery_map));
+	osb->recovery_map = rm;
+
+	return 0;
+}
+
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+	mb();
+	return osb->recovery_thread_task != NULL;
+}
+
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+	struct ocfs2_recovery_map *rm;
+
+	/* disable any new recovery threads and wait for any currently
+	 * running ones to exit. Do this before setting the vol_state. */
+	mutex_lock(&osb->recovery_lock);
+	osb->disable_recovery = 1;
+	mutex_unlock(&osb->recovery_lock);
+	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
+
+	/* At this point, we know that no more recovery threads can be
+	 * launched, so wait for any recovery completion work to
+	 * complete. */
+	flush_workqueue(ocfs2_wq);
+
+	/*
+	 * Now that recovery is shut down, and the osb is about to be
+	 * freed,  the osb_lock is not taken here.
+	 */
+	rm = osb->recovery_map;
+	/* XXX: Should we bug if there are dirty entries? */
+
+	kfree(rm);
+}
+
+static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
+				     unsigned int node_num)
+{
+	int i;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	assert_spin_locked(&osb->osb_lock);
+
+	for (i = 0; i < rm->rm_used; i++) {
+		if (rm->rm_entries[i] == node_num)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Behaves like test-and-set.  Returns the previous value */
+static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+				  unsigned int node_num)
+{
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	spin_lock(&osb->osb_lock);
+	if (__ocfs2_recovery_map_test(osb, node_num)) {
+		spin_unlock(&osb->osb_lock);
+		return 1;
+	}
+
+	/* XXX: Can this be exploited? Not from o2dlm... */
+	BUG_ON(rm->rm_used >= osb->max_slots);
+
+	rm->rm_entries[rm->rm_used] = node_num;
+	rm->rm_used++;
+	spin_unlock(&osb->osb_lock);
+
+	return 0;
+}
+
+static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+				     unsigned int node_num)
+{
+	int i;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	spin_lock(&osb->osb_lock);
+
+	for (i = 0; i < rm->rm_used; i++) {
+		if (rm->rm_entries[i] == node_num)
+			break;
+	}
+
+	if (i < rm->rm_used) {
+		/* XXX: be careful with the pointer math */
+		memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
+			(rm->rm_used - i - 1) * sizeof(unsigned int));
+		rm->rm_used--;
+	}
+
+	spin_unlock(&osb->osb_lock);
+}
+
 static int ocfs2_commit_cache(struct ocfs2_super *osb)
 {
 	int status = 0;
@@ -650,6 +781,23 @@ bail:
 	return status;
 }
 
+static int ocfs2_recovery_completed(struct ocfs2_super *osb)
+{
+	int empty;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+	spin_lock(&osb->osb_lock);
+	empty = (rm->rm_used == 0);
+	spin_unlock(&osb->osb_lock);
+
+	return empty;
+}
+
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
+{
+	wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
+}
+
 /*
  * JBD Might read a cached version of another nodes journal file. We
  * don't want this as this file changes often and we get no
@@ -848,6 +996,7 @@ static int __ocfs2_recovery_thread(void *arg)
 {
 	int status, node_num;
 	struct ocfs2_super *osb = arg;
+	struct ocfs2_recovery_map *rm = osb->recovery_map;
 
 	mlog_entry_void();
 
@@ -863,26 +1012,29 @@ restart:
 		goto bail;
 	}
 
-	while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
-		node_num = ocfs2_node_map_first_set_bit(osb,
-							&osb->recovery_map);
-		if (node_num == O2NM_INVALID_NODE_NUM) {
-			mlog(0, "Out of nodes to recover.\n");
-			break;
-		}
+	spin_lock(&osb->osb_lock);
+	while (rm->rm_used) {
+		/* It's always safe to remove entry zero, as we won't
+		 * clear it until ocfs2_recover_node() has succeeded. */
+		node_num = rm->rm_entries[0];
+		spin_unlock(&osb->osb_lock);
 
 		status = ocfs2_recover_node(osb, node_num);
-		if (status < 0) {
+		if (!status) {
+			ocfs2_recovery_map_clear(osb, node_num);
+		} else {
 			mlog(ML_ERROR,
 			     "Error %d recovering node %d on device (%u,%u)!\n",
 			     status, node_num,
 			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 			mlog(ML_ERROR, "Volume requires unmount.\n");
-			continue;
 		}
 
-		ocfs2_recovery_map_clear(osb, node_num);
+		spin_lock(&osb->osb_lock);
 	}
+	spin_unlock(&osb->osb_lock);
+	mlog(0, "All nodes recovered\n");
+
 	ocfs2_super_unlock(osb, 1);
 
 	/* We always run recovery on our own orphan dir - the dead
@@ -893,8 +1045,7 @@ restart:
 
 bail:
 	mutex_lock(&osb->recovery_lock);
-	if (!status &&
-	    !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+	if (!status && !ocfs2_recovery_completed(osb)) {
 		mutex_unlock(&osb->recovery_lock);
 		goto restart;
 	}
@@ -924,8 +1075,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 
 	/* People waiting on recovery will wait on
 	 * the recovery map to empty. */
-	if (!ocfs2_recovery_map_set(osb, node_num))
-		mlog(0, "node %d already be in recovery.\n", node_num);
+	if (ocfs2_recovery_map_set(osb, node_num))
+		mlog(0, "node %d already in recovery map.\n", node_num);
 
 	mlog(0, "starting recovery thread...\n");
 
@@ -1197,7 +1348,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 		if (status == -ENOENT)
 			continue;
 
-		if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+		if (__ocfs2_recovery_map_test(osb, node_num))
 			continue;
 		spin_unlock(&osb->osb_lock);
 
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 220f3e818e78..db82be2532ed 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
 
 /* Exported only for the journal struct init code in super.c. Do not call. */
 void ocfs2_complete_recovery(struct work_struct *work);
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
+
+int ocfs2_recovery_init(struct ocfs2_super *osb);
+void ocfs2_recovery_exit(struct ocfs2_super *osb);
 
 /*
  *  Journal Control:
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ee3f675a4210..c6ed8c35de0d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -180,6 +180,7 @@ enum ocfs2_mount_options
 
 struct ocfs2_journal;
 struct ocfs2_slot_info;
+struct ocfs2_recovery_map;
 struct ocfs2_super
 {
 	struct task_struct *commit_task;
@@ -191,7 +192,6 @@ struct ocfs2_super
 	struct ocfs2_slot_info *slot_info;
 
 	spinlock_t node_map_lock;
-	struct ocfs2_node_map recovery_map;
 
 	u64 root_blkno;
 	u64 system_dir_blkno;
@@ -226,6 +226,7 @@ struct ocfs2_super
 
 	atomic_t vol_state;
 	struct mutex recovery_lock;
+	struct ocfs2_recovery_map *recovery_map;
 	struct task_struct *recovery_thread_task;
 	int disable_recovery;
 	wait_queue_head_t checkpoint_event;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fad37af2af9c..1a4c7c7850f2 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1224,15 +1224,6 @@ leave:
 	return status;
 }
 
-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
-static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
-{
-	mb();
-	return osb->recovery_thread_task != NULL;
-}
-
 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 {
 	int tmp;
@@ -1249,17 +1240,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_truncate_log_shutdown(osb);
 
-	/* disable any new recovery threads and wait for any currently
-	 * running ones to exit. Do this before setting the vol_state. */
-	mutex_lock(&osb->recovery_lock);
-	osb->disable_recovery = 1;
-	mutex_unlock(&osb->recovery_lock);
-	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
-
-	/* At this point, we know that no more recovery threads can be
-	 * launched, so wait for any recovery completion work to
-	 * complete. */
-	flush_workqueue(ocfs2_wq);
+	/* This will disable recovery and flush any recovery work. */
+	ocfs2_recovery_exit(osb);
 
 	ocfs2_journal_shutdown(osb);
 
@@ -1368,7 +1350,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->s_sectsize_bits = blksize_bits(sector_size);
 	BUG_ON(!osb->s_sectsize_bits);
 
-	init_waitqueue_head(&osb->recovery_event);
 	spin_lock_init(&osb->dc_task_lock);
 	init_waitqueue_head(&osb->dc_event);
 	osb->dc_work_sequence = 0;
@@ -1388,10 +1369,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
-	mutex_init(&osb->recovery_lock);
-
-	osb->disable_recovery = 0;
-	osb->recovery_thread_task = NULL;
+	status = ocfs2_recovery_init(osb);
+	if (status) {
+		mlog(ML_ERROR, "Unable to initialize recovery state\n");
+		mlog_errno(status);
+		goto bail;
+	}
 
 	init_waitqueue_head(&osb->checkpoint_event);
 	atomic_set(&osb->needs_checkpoint, 0);
-- 
cgit v1.2.3


From 1c8d9a6a330f46b3a6ddd204a2580131d5f0d6b7 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 11:59:07 -0800
Subject: ocfs2: slot_map I/O based on max_slots.

The slot map code assumed a slot_map file has one block allocated.
This changes the code to I/O as many blocks as will cover max_slots.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/slot_map.c | 128 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 108 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 762360d95e93..5bddee110091 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -44,7 +44,8 @@
 
 struct ocfs2_slot_info {
 	struct inode *si_inode;
-	struct buffer_head *si_bh;
+	unsigned int si_blocks;
+	struct buffer_head **si_bh;
 	unsigned int si_num_slots;
 	unsigned int si_size;
 	s16 si_global_node_nums[OCFS2_MAX_SLOTS];
@@ -68,7 +69,7 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 
 	/* we don't read the slot block here as ocfs2_super_lock
 	 * should've made sure we have the most recent copy. */
-	disk_info = (__le16 *) si->si_bh->b_data;
+	disk_info = (__le16 *) si->si_bh[0]->b_data;
 
 	for (i = 0; i < si->si_size; i++)
 		si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
@@ -78,13 +79,23 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 {
 	int ret;
 	struct ocfs2_slot_info *si = osb->slot_info;
-	struct buffer_head *bh;
 
 	if (si == NULL)
 		return 0;
 
-	bh = si->si_bh;
-	ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode);
+	BUG_ON(si->si_blocks == 0);
+	BUG_ON(si->si_bh == NULL);
+
+	mlog(0, "Refreshing slot map, reading %u block(s)\n",
+	     si->si_blocks);
+
+	/*
+	 * We pass -1 as blocknr because we expect all of si->si_bh to
+	 * be !NULL.  Thus, ocfs2_read_blocks() will ignore blocknr.  If
+	 * this is not true, the read of -1 (UINT64_MAX) will fail.
+	 */
+	ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
+				si->si_inode);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -100,20 +111,42 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
 				   struct ocfs2_slot_info *si)
 {
 	int status, i;
-	__le16 *disk_info = (__le16 *) si->si_bh->b_data;
+	__le16 *disk_info = (__le16 *) si->si_bh[0]->b_data;
 
 	spin_lock(&osb->osb_lock);
 	for (i = 0; i < si->si_size; i++)
 		disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
 	spin_unlock(&osb->osb_lock);
 
-	status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+	status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode);
 	if (status < 0)
 		mlog_errno(status);
 
 	return status;
 }
 
+/*
+ * Calculate how many bytes are needed by the slot map.  Returns
+ * an error if the slot map file is too small.
+ */
+static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
+					struct inode *inode,
+					unsigned long long *bytes)
+{
+	unsigned long long bytes_needed;
+
+	bytes_needed = osb->max_slots * sizeof(__le16);
+	if (bytes_needed > i_size_read(inode)) {
+		mlog(ML_ERROR,
+		     "Slot map file is too small!  (size %llu, needed %llu)\n",
+		     i_size_read(inode), bytes_needed);
+		return -ENOSPC;
+	}
+
+	*bytes = bytes_needed;
+	return 0;
+}
+
 /* try to find global node in the slot info. Returns
  * OCFS2_INVALID_SLOT if nothing is found. */
 static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
@@ -188,13 +221,22 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
 
 static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
 {
+	unsigned int i;
+
 	if (si == NULL)
 		return;
 
 	if (si->si_inode)
 		iput(si->si_inode);
-	if (si->si_bh)
-		brelse(si->si_bh);
+	if (si->si_bh) {
+		for (i = 0; i < si->si_blocks; i++) {
+			if (si->si_bh[i]) {
+				brelse(si->si_bh[i]);
+				si->si_bh[i] = NULL;
+			}
+		}
+		kfree(si->si_bh);
+	}
 
 	kfree(si);
 }
@@ -225,12 +267,65 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
 	return ocfs2_update_disk_slots(osb, osb->slot_info);
 }
 
+static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
+				  struct ocfs2_slot_info *si)
+{
+	int status = 0;
+	u64 blkno;
+	unsigned long long blocks, bytes;
+	unsigned int i;
+	struct buffer_head *bh;
+
+	status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
+	if (status)
+		goto bail;
+
+	blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
+	BUG_ON(blocks > UINT_MAX);
+	si->si_blocks = blocks;
+	if (!si->si_blocks)
+		goto bail;
+
+	mlog(0, "Slot map needs %u buffers for %llu bytes\n",
+	     si->si_blocks, bytes);
+
+	si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
+			    GFP_KERNEL);
+	if (!si->si_bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	for (i = 0; i < si->si_blocks; i++) {
+		status = ocfs2_extent_map_get_blocks(si->si_inode, i,
+						     &blkno, NULL, NULL);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		mlog(0, "Reading slot map block %u at %llu\n", i,
+		     (unsigned long long)blkno);
+
+		bh = NULL;  /* Acquire a fresh bh */
+		status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		si->si_bh[i] = bh;
+	}
+
+bail:
+	return status;
+}
+
 int ocfs2_init_slot_info(struct ocfs2_super *osb)
 {
 	int status, i;
-	u64 blkno;
 	struct inode *inode = NULL;
-	struct buffer_head *bh = NULL;
 	struct ocfs2_slot_info *si;
 
 	si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL);
@@ -254,20 +349,13 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
+	si->si_inode = inode;
+	status = ocfs2_map_slot_buffers(osb, si);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	si->si_inode = inode;
-	si->si_bh = bh;
 	osb->slot_info = (struct ocfs2_slot_info *)si;
 bail:
 	if (status < 0 && si)
-- 
cgit v1.2.3


From fc881fa0d59596c02f8707b5572567c369d4789a Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 12:04:48 -0800
Subject: ocfs2: De-magic the in-memory slot map.

The in-memory slot map uses the same magic as the on-disk one.  There is
a special value to mark a slot as invalid.  It relies on the size of
certain types and so on.

Write a new in-memory map that keeps validity as a separate field.  Outside
of the I/O functions, OCFS2_INVALID_SLOT now means what it is supposed to.
It also is no longer tied to the type size.

This also means that only the I/O functions refer to 16bit quantities.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/journal.c  |   2 +-
 fs/ocfs2/ocfs2.h    |   6 +--
 fs/ocfs2/slot_map.c | 130 +++++++++++++++++++++++++++++-----------------------
 fs/ocfs2/slot_map.h |   2 +-
 4 files changed, 77 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ca4c0ea5a4cd..bffd2d714f67 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -71,7 +71,7 @@ static int ocfs2_commit_thread(void *arg);
  */
 
 struct ocfs2_recovery_map {
-	int rm_used;
+	unsigned int rm_used;
 	unsigned int *rm_entries;
 };
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c6ed8c35de0d..95f783dbe8b2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -216,10 +216,10 @@ struct ocfs2_super
 	unsigned long s_mount_opt;
 	unsigned int s_atime_quantum;
 
-	u16 max_slots;
+	unsigned int max_slots;
 	s16 node_num;
-	s16 slot_num;
-	s16 preferred_slot;
+	int slot_num;
+	int preferred_slot;
 	int s_sectsize_bits;
 	int s_clustersize;
 	int s_clustersize_bits;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 5bddee110091..65a61bfa3f2e 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -42,21 +42,41 @@
 
 #include "buffer_head_io.h"
 
+
+struct ocfs2_slot {
+	int sl_valid;
+	unsigned int sl_node_num;
+};
+
 struct ocfs2_slot_info {
 	struct inode *si_inode;
 	unsigned int si_blocks;
 	struct buffer_head **si_bh;
 	unsigned int si_num_slots;
-	unsigned int si_size;
-	s16 si_global_node_nums[OCFS2_MAX_SLOTS];
+	struct ocfs2_slot *si_slots;
 };
 
 
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-				    s16 global);
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
-			      s16 slot_num,
-			      s16 node_num);
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    unsigned int node_num);
+
+static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
+				  int slot_num)
+{
+	BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+	si->si_slots[slot_num].sl_valid = 0;
+}
+
+static void ocfs2_set_slot(struct ocfs2_slot_info *si,
+			   int slot_num, unsigned int node_num)
+{
+	BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+	BUG_ON((node_num == O2NM_INVALID_NODE_NUM) ||
+	       (node_num >= O2NM_MAX_NODES));
+
+	si->si_slots[slot_num].sl_valid = 1;
+	si->si_slots[slot_num].sl_node_num = node_num;
+}
 
 /*
  * Post the slot information on disk into our slot_info struct.
@@ -71,8 +91,12 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 	 * should've made sure we have the most recent copy. */
 	disk_info = (__le16 *) si->si_bh[0]->b_data;
 
-	for (i = 0; i < si->si_size; i++)
-		si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+	for (i = 0; i < si->si_num_slots; i++) {
+		if (le16_to_cpu(disk_info[i]) == (u16)OCFS2_INVALID_SLOT)
+			ocfs2_invalidate_slot(si, i);
+		else
+			ocfs2_set_slot(si, i, le16_to_cpu(disk_info[i]));
+	}
 }
 
 int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
@@ -114,8 +138,13 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
 	__le16 *disk_info = (__le16 *) si->si_bh[0]->b_data;
 
 	spin_lock(&osb->osb_lock);
-	for (i = 0; i < si->si_size; i++)
-		disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
+	for (i = 0; i < si->si_num_slots; i++) {
+		if (si->si_slots[i].sl_valid)
+			disk_info[i] =
+				cpu_to_le16(si->si_slots[i].sl_node_num);
+		else
+			disk_info[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
+	}
 	spin_unlock(&osb->osb_lock);
 
 	status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode);
@@ -147,39 +176,39 @@ static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
 	return 0;
 }
 
-/* try to find global node in the slot info. Returns
- * OCFS2_INVALID_SLOT if nothing is found. */
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-				    s16 global)
+/* try to find global node in the slot info. Returns -ENOENT
+ * if nothing is found. */
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    unsigned int node_num)
 {
-	int i;
-	s16 ret = OCFS2_INVALID_SLOT;
+	int i, ret = -ENOENT;
 
 	for(i = 0; i < si->si_num_slots; i++) {
-		if (global == si->si_global_node_nums[i]) {
-			ret = (s16) i;
+		if (si->si_slots[i].sl_valid &&
+		    (node_num == si->si_slots[i].sl_node_num)) {
+			ret = i;
 			break;
 		}
 	}
+
 	return ret;
 }
 
-static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
-				   s16 preferred)
+static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
+				   int preferred)
 {
-	int i;
-	s16 ret = OCFS2_INVALID_SLOT;
+	int i, ret = -ENOSPC;
 
-	if (preferred >= 0 && preferred < si->si_num_slots) {
-		if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
+	if ((preferred >= 0) && (preferred < si->si_num_slots)) {
+		if (!si->si_slots[preferred].sl_valid) {
 			ret = preferred;
 			goto out;
 		}
 	}
 
 	for(i = 0; i < si->si_num_slots; i++) {
-		if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
-			ret = (s16) i;
+		if (!si->si_slots[i].sl_valid) {
+			ret = i;
 			break;
 		}
 	}
@@ -189,16 +218,13 @@ out:
 
 int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
 {
-	s16 slot;
+	int slot;
 	struct ocfs2_slot_info *si = osb->slot_info;
 
 	spin_lock(&osb->osb_lock);
 	slot = __ocfs2_node_num_to_slot(si, node_num);
 	spin_unlock(&osb->osb_lock);
 
-	if (slot == OCFS2_INVALID_SLOT)
-		return -ENOENT;
-
 	return slot;
 }
 
@@ -212,10 +238,10 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
 	BUG_ON(slot_num < 0);
 	BUG_ON(slot_num > osb->max_slots);
 
-	if (si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT)
+	if (!si->si_slots[slot_num].sl_valid)
 		return -ENOENT;
 
-	*node_num = si->si_global_node_nums[slot_num];
+	*node_num = si->si_slots[slot_num].sl_node_num;
 	return 0;
 }
 
@@ -241,19 +267,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
 	kfree(si);
 }
 
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
-			      s16 slot_num,
-			      s16 node_num)
-{
-	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
-	BUG_ON(slot_num >= si->si_num_slots);
-	BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
-	       (node_num >= O2NM_MAX_NODES));
-
-	si->si_global_node_nums[slot_num] = node_num;
-}
-
-int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
 {
 	struct ocfs2_slot_info *si = osb->slot_info;
 
@@ -261,7 +275,7 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num)
 		return 0;
 
 	spin_lock(&osb->osb_lock);
-	__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
+	ocfs2_invalidate_slot(si, slot_num);
 	spin_unlock(&osb->osb_lock);
 
 	return ocfs2_update_disk_slots(osb, osb->slot_info);
@@ -324,11 +338,13 @@ bail:
 
 int ocfs2_init_slot_info(struct ocfs2_super *osb)
 {
-	int status, i;
+	int status;
 	struct inode *inode = NULL;
 	struct ocfs2_slot_info *si;
 
-	si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+	si = kzalloc(sizeof(struct ocfs2_slot_info) +
+		     (sizeof(struct ocfs2_slot) * osb->max_slots),
+		     GFP_KERNEL);
 	if (!si) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -336,10 +352,8 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 	}
 
 	si->si_num_slots = osb->max_slots;
-	si->si_size = OCFS2_MAX_SLOTS;
-
-	for(i = 0; i < si->si_num_slots; i++)
-		si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
+	si->si_slots = (struct ocfs2_slot *)((char *)si +
+					     sizeof(struct ocfs2_slot_info));
 
 	inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
 					    OCFS2_INVALID_SLOT);
@@ -375,7 +389,7 @@ void ocfs2_free_slot_info(struct ocfs2_super *osb)
 int ocfs2_find_slot(struct ocfs2_super *osb)
 {
 	int status;
-	s16 slot;
+	int slot;
 	struct ocfs2_slot_info *si;
 
 	mlog_entry_void();
@@ -390,11 +404,11 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 	 * own journal recovery? Possibly not, though we certainly
 	 * need to warn to the user */
 	slot = __ocfs2_node_num_to_slot(si, osb->node_num);
-	if (slot == OCFS2_INVALID_SLOT) {
+	if (slot < 0) {
 		/* if no slot yet, then just take 1st available
 		 * one. */
 		slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
-		if (slot == OCFS2_INVALID_SLOT) {
+		if (slot < 0) {
 			spin_unlock(&osb->osb_lock);
 			mlog(ML_ERROR, "no free slots available!\n");
 			status = -EINVAL;
@@ -404,7 +418,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 		mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
 		     slot);
 
-	__ocfs2_fill_slot(si, slot, osb->node_num);
+	ocfs2_set_slot(si, slot, osb->node_num);
 	osb->slot_num = slot;
 	spin_unlock(&osb->osb_lock);
 
@@ -430,7 +444,7 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
 	spin_lock(&osb->osb_lock);
 	ocfs2_update_slot_info(si);
 
-	__ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+	ocfs2_invalidate_slot(si, osb->slot_num);
 	osb->slot_num = OCFS2_INVALID_SLOT;
 	spin_unlock(&osb->osb_lock);
 
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 5118e89c84eb..601c95fd7003 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -39,6 +39,6 @@ int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
 int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
 				  unsigned int *node_num);
 
-int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num);
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
 
 #endif
-- 
cgit v1.2.3


From fb86b1f07120b66769a39c445da5c4300069dd44 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 11:59:05 -0800
Subject: ocfs2: Define the contents of the slot_map file.

The slot map file is merely an array of __le16.  Wrap it in a structure for
cleaner reference.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2_fs.h | 12 ++++++++++++
 fs/ocfs2/slot_map.c | 15 ++++++++-------
 2 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3633edd3982f..3299116b8021 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -474,6 +474,18 @@ struct ocfs2_extent_block
 /* Actual on-disk size is one block */
 };
 
+/*
+ * On disk slot map for OCFS2.  This defines the contents of the "slot_map"
+ * system file.
+ */
+struct ocfs2_slot_map {
+/*00*/	__le16 sm_slots[0];
+/*
+ * Actual on-disk size is one block.  OCFS2_MAX_SLOTS is 255,
+ * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize.
+ */
+};
+
 /*
  * On disk superblock for OCFS2
  * Note that it is contained inside an ocfs2_dinode, so all offsets
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 65a61bfa3f2e..e7e7a74156b1 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -85,17 +85,17 @@ static void ocfs2_set_slot(struct ocfs2_slot_info *si,
 static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 {
 	int i;
-	__le16 *disk_info;
+	struct ocfs2_slot_map *sm;
 
 	/* we don't read the slot block here as ocfs2_super_lock
 	 * should've made sure we have the most recent copy. */
-	disk_info = (__le16 *) si->si_bh[0]->b_data;
+	sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
 
 	for (i = 0; i < si->si_num_slots; i++) {
-		if (le16_to_cpu(disk_info[i]) == (u16)OCFS2_INVALID_SLOT)
+		if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
 			ocfs2_invalidate_slot(si, i);
 		else
-			ocfs2_set_slot(si, i, le16_to_cpu(disk_info[i]));
+			ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
 	}
 }
 
@@ -135,15 +135,16 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
 				   struct ocfs2_slot_info *si)
 {
 	int status, i;
-	__le16 *disk_info = (__le16 *) si->si_bh[0]->b_data;
+	struct ocfs2_slot_map *sm;
 
 	spin_lock(&osb->osb_lock);
+	sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
 	for (i = 0; i < si->si_num_slots; i++) {
 		if (si->si_slots[i].sl_valid)
-			disk_info[i] =
+			sm->sm_slots[i] =
 				cpu_to_le16(si->si_slots[i].sl_node_num);
 		else
-			disk_info[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
+			sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
 	}
 	spin_unlock(&osb->osb_lock);
 
-- 
cgit v1.2.3


From 386a2ef8576e966076c293f6496b9e3d7e3d9035 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 12:06:54 -0800
Subject: ocfs2: New slot map format

The old slot map had a few limitations:

- It was limited to one block, so the maximum slot count was 255.
- Each slot was signed 16bits, limiting node numbers to INT16_MAX.
- An empty slot was marked by the magic 0xFFFF (-1).

The new slot map format provides 32bit node numbers (UINT32_MAX), a
separate space to mark a slot in use, and extra room to grow.  The slot
map is now bounded by i_size, not a block.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2.h    |   7 ++++
 fs/ocfs2/ocfs2_fs.h |  31 ++++++++++++++-
 fs/ocfs2/slot_map.c | 110 +++++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 133 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 95f783dbe8b2..f78e9ed53249 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -374,6 +374,13 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 	return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
 }
 
+static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
+{
+	return (osb->s_feature_incompat &
+		OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
+}
+
+
 #define OCFS2_IS_VALID_DINODE(ptr)					\
 	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
 
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3299116b8021..c49502329ab0 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,8 @@
 #define OCFS2_FEATURE_COMPAT_SUPP	OCFS2_FEATURE_COMPAT_BACKUP_SB
 #define OCFS2_FEATURE_INCOMPAT_SUPP	(OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
 					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
-					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
+					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
+					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
 
 /*
@@ -125,6 +126,10 @@
 /* Support for data packed into inode blocks */
 #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA	0x0040
 
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -476,7 +481,8 @@ struct ocfs2_extent_block
 
 /*
  * On disk slot map for OCFS2.  This defines the contents of the "slot_map"
- * system file.
+ * system file.  A slot is valid if it contains a node number >= 0.  The
+ * value -1 (0xFFFF) is OCFS2_INVALID_SLOT.  This marks a slot empty.
  */
 struct ocfs2_slot_map {
 /*00*/	__le16 sm_slots[0];
@@ -486,6 +492,27 @@ struct ocfs2_slot_map {
  */
 };
 
+struct ocfs2_extended_slot {
+/*00*/	__u8	es_valid;
+	__u8	es_reserved1[3];
+	__le32	es_node_num;
+/*10*/
+};
+
+/*
+ * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
+ * is set.  It separates out the valid marker from the node number, and
+ * has room to grow.  Unlike the old slot map, this format is defined by
+ * i_size.
+ */
+struct ocfs2_slot_map_extended {
+/*00*/	struct ocfs2_extended_slot se_slots[0];
+/*
+ * Actual size is i_size of the slot_map system file.  It should
+ * match s_max_slots * sizeof(struct ocfs2_extended_slot)
+ */
+};
+
 /*
  * On disk superblock for OCFS2
  * Note that it is contained inside an ocfs2_dinode, so all offsets
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e7e7a74156b1..63fb1b26d964 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -49,6 +49,8 @@ struct ocfs2_slot {
 };
 
 struct ocfs2_slot_info {
+	int si_extended;
+	int si_slots_per_block;
 	struct inode *si_inode;
 	unsigned int si_blocks;
 	struct buffer_head **si_bh;
@@ -78,17 +80,37 @@ static void ocfs2_set_slot(struct ocfs2_slot_info *si,
 	si->si_slots[slot_num].sl_node_num = node_num;
 }
 
+/* This version is for the extended slot map */
+static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
+{
+	int b, i, slotno;
+	struct ocfs2_slot_map_extended *se;
+
+	slotno = 0;
+	for (b = 0; b < si->si_blocks; b++) {
+		se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
+		for (i = 0;
+		     (i < si->si_slots_per_block) &&
+		     (slotno < si->si_num_slots);
+		     i++, slotno++) {
+			if (se->se_slots[i].es_valid)
+				ocfs2_set_slot(si, slotno,
+					       le32_to_cpu(se->se_slots[i].es_node_num));
+			else
+				ocfs2_invalidate_slot(si, slotno);
+		}
+	}
+}
+
 /*
  * Post the slot information on disk into our slot_info struct.
  * Must be protected by osb_lock.
  */
-static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
 {
 	int i;
 	struct ocfs2_slot_map *sm;
 
-	/* we don't read the slot block here as ocfs2_super_lock
-	 * should've made sure we have the most recent copy. */
 	sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
 
 	for (i = 0; i < si->si_num_slots; i++) {
@@ -99,6 +121,18 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 	}
 }
 
+static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+	/*
+	 * The slot data will have been refreshed when ocfs2_super_lock
+	 * was taken.
+	 */
+	if (si->si_extended)
+		ocfs2_update_slot_info_extended(si);
+	else
+		ocfs2_update_slot_info_old(si);
+}
+
 int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 {
 	int ret;
@@ -131,13 +165,31 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 
 /* post the our slot info stuff into it's destination bh and write it
  * out. */
-static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
-				   struct ocfs2_slot_info *si)
+static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
+					    int slot_num,
+					    struct buffer_head **bh)
+{
+	int blkind = slot_num / si->si_slots_per_block;
+	int slotno = slot_num % si->si_slots_per_block;
+	struct ocfs2_slot_map_extended *se;
+
+	BUG_ON(blkind >= si->si_blocks);
+
+	se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
+	se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
+	if (si->si_slots[slot_num].sl_valid)
+		se->se_slots[slotno].es_node_num =
+			cpu_to_le32(si->si_slots[slot_num].sl_node_num);
+	*bh = si->si_bh[blkind];
+}
+
+static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
+				       int slot_num,
+				       struct buffer_head **bh)
 {
-	int status, i;
+	int i;
 	struct ocfs2_slot_map *sm;
 
-	spin_lock(&osb->osb_lock);
 	sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
 	for (i = 0; i < si->si_num_slots; i++) {
 		if (si->si_slots[i].sl_valid)
@@ -146,9 +198,24 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb,
 		else
 			sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
 	}
+	*bh = si->si_bh[0];
+}
+
+static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
+				  struct ocfs2_slot_info *si,
+				  int slot_num)
+{
+	int status;
+	struct buffer_head *bh;
+
+	spin_lock(&osb->osb_lock);
+	if (si->si_extended)
+		ocfs2_update_disk_slot_extended(si, slot_num, &bh);
+	else
+		ocfs2_update_disk_slot_old(si, slot_num, &bh);
 	spin_unlock(&osb->osb_lock);
 
-	status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode);
+	status = ocfs2_write_block(osb, bh, si->si_inode);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -165,7 +232,12 @@ static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
 {
 	unsigned long long bytes_needed;
 
-	bytes_needed = osb->max_slots * sizeof(__le16);
+	if (ocfs2_uses_extended_slot_map(osb)) {
+		bytes_needed = osb->max_slots *
+			sizeof(struct ocfs2_extended_slot);
+	} else {
+		bytes_needed = osb->max_slots * sizeof(__le16);
+	}
 	if (bytes_needed > i_size_read(inode)) {
 		mlog(ML_ERROR,
 		     "Slot map file is too small!  (size %llu, needed %llu)\n",
@@ -279,7 +351,7 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
 	ocfs2_invalidate_slot(si, slot_num);
 	spin_unlock(&osb->osb_lock);
 
-	return ocfs2_update_disk_slots(osb, osb->slot_info);
+	return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
 }
 
 static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
@@ -301,6 +373,16 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 	if (!si->si_blocks)
 		goto bail;
 
+	if (si->si_extended)
+		si->si_slots_per_block =
+			(osb->sb->s_blocksize /
+			 sizeof(struct ocfs2_extended_slot));
+	else
+		si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
+
+	/* The size checks above should ensure this */
+	BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
+
 	mlog(0, "Slot map needs %u buffers for %llu bytes\n",
 	     si->si_blocks, bytes);
 
@@ -352,6 +434,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 		goto bail;
 	}
 
+	si->si_extended = ocfs2_uses_extended_slot_map(osb);
 	si->si_num_slots = osb->max_slots;
 	si->si_slots = (struct ocfs2_slot *)((char *)si +
 					     sizeof(struct ocfs2_slot_info));
@@ -425,7 +508,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 
 	mlog(0, "taking node slot %d\n", osb->slot_num);
 
-	status = ocfs2_update_disk_slots(osb, si);
+	status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -436,7 +519,7 @@ bail:
 
 void ocfs2_put_slot(struct ocfs2_super *osb)
 {
-	int status;
+	int status, slot_num;
 	struct ocfs2_slot_info *si = osb->slot_info;
 
 	if (!si)
@@ -445,11 +528,12 @@ void ocfs2_put_slot(struct ocfs2_super *osb)
 	spin_lock(&osb->osb_lock);
 	ocfs2_update_slot_info(si);
 
+	slot_num = osb->slot_num;
 	ocfs2_invalidate_slot(si, osb->slot_num);
 	osb->slot_num = OCFS2_INVALID_SLOT;
 	spin_unlock(&osb->osb_lock);
 
-	status = ocfs2_update_disk_slots(osb, si);
+	status = ocfs2_update_disk_slot(osb, si, slot_num);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
-- 
cgit v1.2.3


From 24ef1815e5e13e50196eb1ab8ddc0d783443bdf8 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 29 Jan 2008 17:37:32 -0800
Subject: ocfs2: Separate out dlm lock functions.

This is the first in a series of patches to isolate ocfs2 from the
underlying cluster stack. Here we wrap the dlm locking functions with
ocfs2-specific calls. Because ocfs2 always uses the same dlm lock status
callbacks, we can eliminate the callbacks from the filesystem visible
functions.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/Makefile    |   1 +
 fs/ocfs2/dlmglue.c   | 110 ++++++++++++++++++++++++++++-----------------------
 fs/ocfs2/dlmglue.h   |   3 ++
 fs/ocfs2/stackglue.c |  65 ++++++++++++++++++++++++++++++
 fs/ocfs2/stackglue.h |  45 +++++++++++++++++++++
 fs/ocfs2/super.c     |   4 ++
 6 files changed, 179 insertions(+), 49 deletions(-)
 create mode 100644 fs/ocfs2/stackglue.c
 create mode 100644 fs/ocfs2/stackglue.h

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 4d4ce48bb42c..3ba64af26951 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -24,6 +24,7 @@ ocfs2-objs := \
 	namei.o 		\
 	resize.o		\
 	slot_map.o 		\
+	stackglue.o		\
 	suballoc.o 		\
 	super.o 		\
 	symlink.o 		\
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 15a5167e0513..aea3bef19171 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -53,6 +53,7 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
+#include "stackglue.h"
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
@@ -888,22 +889,21 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-	status = dlmlock(osb->dlm,
-			 level,
-			 &lockres->l_lksb,
-			 dlm_flags,
-			 lockres->l_name,
-			 OCFS2_LOCK_ID_MAX_LEN - 1,
-			 ocfs2_locking_ast,
-			 lockres,
-			 ocfs2_blocking_ast);
+	status = ocfs2_dlm_lock(osb->dlm,
+				level,
+				&lockres->l_lksb,
+				dlm_flags,
+				lockres->l_name,
+				OCFS2_LOCK_ID_MAX_LEN - 1,
+				lockres);
 	if (status != DLM_NORMAL) {
-		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ocfs2_log_dlm_error("ocfs2_dlm_lock", status, lockres);
 		ret = -EINVAL;
 		ocfs2_recover_from_dlm_error(lockres, 1);
 	}
 
-	mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
+	mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
+	     lockres->l_name);
 
 bail:
 	mlog_exit(ret);
@@ -1091,29 +1091,27 @@ again:
 		     lockres->l_name, lockres->l_level, level);
 
 		/* call dlm_lock to upgrade lock now */
-		status = dlmlock(osb->dlm,
-				 level,
-				 &lockres->l_lksb,
-				 lkm_flags,
-				 lockres->l_name,
-				 OCFS2_LOCK_ID_MAX_LEN - 1,
-				 ocfs2_locking_ast,
-				 lockres,
-				 ocfs2_blocking_ast);
+		status = ocfs2_dlm_lock(osb->dlm,
+					level,
+					&lockres->l_lksb,
+					lkm_flags,
+					lockres->l_name,
+					OCFS2_LOCK_ID_MAX_LEN - 1,
+					lockres);
 		if (status != DLM_NORMAL) {
 			if ((lkm_flags & LKM_NOQUEUE) &&
 			    (status == DLM_NOTQUEUED))
 				ret = -EAGAIN;
 			else {
-				ocfs2_log_dlm_error("dlmlock", status,
-						    lockres);
+				ocfs2_log_dlm_error("ocfs2_dlm_lock",
+						    status, lockres);
 				ret = -EINVAL;
 			}
 			ocfs2_recover_from_dlm_error(lockres, 1);
 			goto out;
 		}
 
-		mlog(0, "lock %s, successfull return from dlmlock\n",
+		mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
 		     lockres->l_name);
 
 		/* At this point we've gone inside the dlm and need to
@@ -1503,14 +1501,14 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
 	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-	ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
-		      lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
-		      ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
+	ret = ocfs2_dlm_lock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
+			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
+			     lockres);
 	if (ret != DLM_NORMAL) {
 		if (trylock && ret == DLM_NOTQUEUED)
 			ret = -EAGAIN;
 		else {
-			ocfs2_log_dlm_error("dlmlock", ret, lockres);
+			ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
 			ret = -EINVAL;
 		}
 
@@ -2699,15 +2697,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 
 	mlog(0, "lock %s\n", lockres->l_name);
 
-	status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags,
-			   ocfs2_unlock_ast, lockres);
+	status = ocfs2_dlm_unlock(osb->dlm, &lockres->l_lksb, lkm_flags,
+				  lockres);
 	if (status != DLM_NORMAL) {
-		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		ocfs2_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
 		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
 		dlm_print_one_lock(lockres->l_lksb.lockid);
 		BUG();
 	}
-	mlog(0, "lock %s, successfull return from dlmunlock\n",
+	mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
 	     lockres->l_name);
 
 	ocfs2_wait_on_busy_lock(lockres);
@@ -2832,17 +2830,15 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 	if (lvb)
 		dlm_flags |= LKM_VALBLK;
 
-	status = dlmlock(osb->dlm,
-			 new_level,
-			 &lockres->l_lksb,
-			 dlm_flags,
-			 lockres->l_name,
-			 OCFS2_LOCK_ID_MAX_LEN - 1,
-			 ocfs2_locking_ast,
-			 lockres,
-			 ocfs2_blocking_ast);
+	status = ocfs2_dlm_lock(osb->dlm,
+				new_level,
+				&lockres->l_lksb,
+				dlm_flags,
+				lockres->l_name,
+				OCFS2_LOCK_ID_MAX_LEN - 1,
+				lockres);
 	if (status != DLM_NORMAL) {
-		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ocfs2_log_dlm_error("ocfs2_dlm_lock", status, lockres);
 		ret = -EINVAL;
 		ocfs2_recover_from_dlm_error(lockres, 1);
 		goto bail;
@@ -2854,7 +2850,7 @@ bail:
 	return ret;
 }
 
-/* returns 1 when the caller should unlock and call dlmunlock */
+/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
 static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
 				        struct ocfs2_lock_res *lockres)
 {
@@ -2896,18 +2892,17 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 	mlog(0, "lock %s\n", lockres->l_name);
 
 	ret = 0;
-	status = dlmunlock(osb->dlm,
-			   &lockres->l_lksb,
-			   LKM_CANCEL,
-			   ocfs2_unlock_ast,
-			   lockres);
+	status = ocfs2_dlm_unlock(osb->dlm,
+				  &lockres->l_lksb,
+				  LKM_CANCEL,
+				  lockres);
 	if (status != DLM_NORMAL) {
-		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		ocfs2_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
 		ret = -EINVAL;
 		ocfs2_recover_from_dlm_error(lockres, 0);
 	}
 
-	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+	mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
 
 	mlog_exit(ret);
 	return ret;
@@ -3211,6 +3206,23 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 	return UNBLOCK_CONTINUE_POST;
 }
 
+static struct ocfs2_locking_protocol lproto = {
+	.lp_lock_ast		= ocfs2_locking_ast,
+	.lp_blocking_ast	= ocfs2_blocking_ast,
+	.lp_unlock_ast		= ocfs2_unlock_ast,
+};
+
+/* This interface isn't the final one, hence the less-than-perfect names */
+void dlmglue_init_stack(void)
+{
+	o2cb_get_stack(&lproto);
+}
+
+void dlmglue_exit_stack(void)
+{
+	o2cb_put_stack();
+}
+
 static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 				       struct ocfs2_lock_res *lockres)
 {
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e3cf902404b4..32380439401f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -114,5 +114,8 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
 
+void dlmglue_init_stack(void);
+void dlmglue_exit_stack(void);
+
 extern const struct dlm_protocol_version ocfs2_locking_protocol;
 #endif	/* DLMGLUE_H */
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
new file mode 100644
index 000000000000..4f44f23795f0
--- /dev/null
+++ b/fs/ocfs2/stackglue.c
@@ -0,0 +1,65 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.c
+ *
+ * Code which implements an OCFS2 specific interface to underlying
+ * cluster stacks.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+#include "dlm/dlmapi.h"
+
+#include "stackglue.h"
+
+static struct ocfs2_locking_protocol *lproto;
+
+enum dlm_status ocfs2_dlm_lock(struct dlm_ctxt *dlm,
+		   int mode,
+		   struct dlm_lockstatus *lksb,
+		   u32 flags,
+		   void *name,
+		   unsigned int namelen,
+		   void *astarg)
+{
+	BUG_ON(lproto == NULL);
+	return dlmlock(dlm, mode, lksb, flags, name, namelen,
+		       lproto->lp_lock_ast, astarg,
+		       lproto->lp_blocking_ast);
+}
+
+enum dlm_status ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
+		     struct dlm_lockstatus *lksb,
+		     u32 flags,
+		     void *astarg)
+{
+	BUG_ON(lproto == NULL);
+
+	return dlmunlock(dlm, lksb, flags, lproto->lp_unlock_ast, astarg);
+}
+
+
+void o2cb_get_stack(struct ocfs2_locking_protocol *proto)
+{
+	BUG_ON(proto == NULL);
+
+	lproto = proto;
+}
+
+void o2cb_put_stack(void)
+{
+	lproto = NULL;
+}
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
new file mode 100644
index 000000000000..40a002413404
--- /dev/null
+++ b/fs/ocfs2/stackglue.h
@@ -0,0 +1,45 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.h
+ *
+ * Glue to the underlying cluster stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef STACKGLUE_H
+#define STACKGLUE_H
+
+struct ocfs2_locking_protocol {
+	void (*lp_lock_ast)(void *astarg);
+	void (*lp_blocking_ast)(void *astarg, int level);
+	void (*lp_unlock_ast)(void *astarg, enum dlm_status status);
+};
+
+enum dlm_status ocfs2_dlm_lock(struct dlm_ctxt *dlm,
+		   int mode,
+		   struct dlm_lockstatus *lksb,
+		   u32 flags,
+		   void *name,
+		   unsigned int namelen,
+		   void *astarg);
+enum dlm_status ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
+		     struct dlm_lockstatus *lksb,
+		     u32 flags,
+		     void *astarg);
+
+void o2cb_get_stack(struct ocfs2_locking_protocol *proto);
+void o2cb_put_stack(void);
+
+#endif  /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 1a4c7c7850f2..c8675464e299 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -933,6 +933,8 @@ static int __init ocfs2_init(void)
 
 	ocfs2_print_version();
 
+	dlmglue_init_stack();
+
 	status = init_ocfs2_uptodate_cache();
 	if (status < 0) {
 		mlog_errno(status);
@@ -988,6 +990,8 @@ static void __exit ocfs2_exit(void)
 
 	exit_ocfs2_uptodate_cache();
 
+	dlmglue_exit_stack();
+
 	mlog_exit_void();
 }
 
-- 
cgit v1.2.3


From bd3e76105d4478ab89951a52d1a35250d24a9f16 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 12:14:57 -0800
Subject: ocfs2: Use global DLM_ constants in generic code.

The ocfs2 generic code should use the values in <linux/dlmconstants.h>.
stackglue.c will convert them to o2dlm values.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   | 140 +++++++++++++++++++++++++--------------------------
 fs/ocfs2/stackglue.c |  71 +++++++++++++++++++++++---
 fs/ocfs2/stackglue.h |  13 +++++
 3 files changed, 147 insertions(+), 77 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index aea3bef19171..b8ac903ef8ef 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -37,8 +37,6 @@
 #include <cluster/nodemanager.h>
 #include <cluster/tcp.h>
 
-#include <dlm/dlmapi.h>
-
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
 
@@ -317,7 +315,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l
 static int ocfs2_lock_create(struct ocfs2_super *osb,
 			     struct ocfs2_lock_res *lockres,
 			     int level,
-			     int dlm_flags);
+			     u32 dlm_flags);
 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
 						     int wanted);
 static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
@@ -407,9 +405,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
 	res->l_ops           = ops;
 	res->l_priv          = priv;
 
-	res->l_level         = LKM_IVMODE;
-	res->l_requested     = LKM_IVMODE;
-	res->l_blocking      = LKM_IVMODE;
+	res->l_level         = DLM_LOCK_IV;
+	res->l_requested     = DLM_LOCK_IV;
+	res->l_blocking      = DLM_LOCK_IV;
 	res->l_action        = OCFS2_AST_INVALID;
 	res->l_unlock_action = OCFS2_UNLOCK_INVALID;
 
@@ -605,10 +603,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
 	BUG_ON(!lockres);
 
 	switch(level) {
-	case LKM_EXMODE:
+	case DLM_LOCK_EX:
 		lockres->l_ex_holders++;
 		break;
-	case LKM_PRMODE:
+	case DLM_LOCK_PR:
 		lockres->l_ro_holders++;
 		break;
 	default:
@@ -626,11 +624,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
 	BUG_ON(!lockres);
 
 	switch(level) {
-	case LKM_EXMODE:
+	case DLM_LOCK_EX:
 		BUG_ON(!lockres->l_ex_holders);
 		lockres->l_ex_holders--;
 		break;
-	case LKM_PRMODE:
+	case DLM_LOCK_PR:
 		BUG_ON(!lockres->l_ro_holders);
 		lockres->l_ro_holders--;
 		break;
@@ -645,12 +643,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
  * lock types are added. */
 static inline int ocfs2_highest_compat_lock_level(int level)
 {
-	int new_level = LKM_EXMODE;
+	int new_level = DLM_LOCK_EX;
 
-	if (level == LKM_EXMODE)
-		new_level = LKM_NLMODE;
-	else if (level == LKM_PRMODE)
-		new_level = LKM_PRMODE;
+	if (level == DLM_LOCK_EX)
+		new_level = DLM_LOCK_NL;
+	else if (level == DLM_LOCK_PR)
+		new_level = DLM_LOCK_PR;
 	return new_level;
 }
 
@@ -689,12 +687,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
-	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
 
 	lockres->l_level = lockres->l_requested;
 	if (lockres->l_level <=
 	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
-		lockres->l_blocking = LKM_NLMODE;
+		lockres->l_blocking = DLM_LOCK_NL;
 		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
 	}
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
@@ -713,7 +711,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
 	 * information is already up to data. Convert from NL to
 	 * *anything* however should mark ourselves as needing an
 	 * update */
-	if (lockres->l_level == LKM_NLMODE &&
+	if (lockres->l_level == DLM_LOCK_NL &&
 	    lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
 		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
 
@@ -730,7 +728,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
 	BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
 
-	if (lockres->l_requested > LKM_NLMODE &&
+	if (lockres->l_requested > DLM_LOCK_NL &&
 	    !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
 	    lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
 		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -775,7 +773,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
 	int needs_downconvert;
 	unsigned long flags;
 
-	BUG_ON(level <= LKM_NLMODE);
+	BUG_ON(level <= DLM_LOCK_NL);
 
 	mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
 	     lockres->l_name, level, lockres->l_level,
@@ -866,7 +864,7 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 static int ocfs2_lock_create(struct ocfs2_super *osb,
 			     struct ocfs2_lock_res *lockres,
 			     int level,
-			     int dlm_flags)
+			     u32 dlm_flags)
 {
 	int ret = 0;
 	enum dlm_status status = DLM_NORMAL;
@@ -874,7 +872,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
-	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
+	mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
 	     dlm_flags);
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
@@ -1016,7 +1014,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
 static int ocfs2_cluster_lock(struct ocfs2_super *osb,
 			      struct ocfs2_lock_res *lockres,
 			      int level,
-			      int lkm_flags,
+			      u32 lkm_flags,
 			      int arg_flags)
 {
 	struct ocfs2_mask_waiter mw;
@@ -1030,7 +1028,7 @@ static int ocfs2_cluster_lock(struct ocfs2_super *osb,
 	ocfs2_init_mask_waiter(&mw);
 
 	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
-		lkm_flags |= LKM_VALBLK;
+		lkm_flags |= DLM_LKF_VALBLK;
 
 again:
 	wait = 0;
@@ -1074,18 +1072,18 @@ again:
 
 		if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
 			lockres->l_action = OCFS2_AST_ATTACH;
-			lkm_flags &= ~LKM_CONVERT;
+			lkm_flags &= ~DLM_LKF_CONVERT;
 		} else {
 			lockres->l_action = OCFS2_AST_CONVERT;
-			lkm_flags |= LKM_CONVERT;
+			lkm_flags |= DLM_LKF_CONVERT;
 		}
 
 		lockres->l_requested = level;
 		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-		BUG_ON(level == LKM_IVMODE);
-		BUG_ON(level == LKM_NLMODE);
+		BUG_ON(level == DLM_LOCK_IV);
+		BUG_ON(level == DLM_LOCK_NL);
 
 		mlog(0, "lock %s, convert from %d to level = %d\n",
 		     lockres->l_name, lockres->l_level, level);
@@ -1099,7 +1097,7 @@ again:
 					OCFS2_LOCK_ID_MAX_LEN - 1,
 					lockres);
 		if (status != DLM_NORMAL) {
-			if ((lkm_flags & LKM_NOQUEUE) &&
+			if ((lkm_flags & DLM_LKF_NOQUEUE) &&
 			    (status == DLM_NOTQUEUED))
 				ret = -EAGAIN;
 			else {
@@ -1175,9 +1173,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb,
 				 int ex,
 				 int local)
 {
-	int level =  ex ? LKM_EXMODE : LKM_PRMODE;
+	int level =  ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	unsigned long flags;
-	int lkm_flags = local ? LKM_LOCAL : 0;
+	u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -1220,7 +1218,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
 	}
 
 	/*
-	 * We don't want to use LKM_LOCAL on a meta data lock as they
+	 * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
 	 * don't use a generation in their lock names.
 	 */
 	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
@@ -1259,7 +1257,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 
 	lockres = &OCFS2_I(inode)->ip_rw_lockres;
 
-	level = write ? LKM_EXMODE : LKM_PRMODE;
+	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
 
 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
 				    0);
@@ -1272,7 +1270,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 
 void ocfs2_rw_unlock(struct inode *inode, int write)
 {
-	int level = write ? LKM_EXMODE : LKM_PRMODE;
+	int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
@@ -1310,7 +1308,7 @@ int ocfs2_open_lock(struct inode *inode)
 	lockres = &OCFS2_I(inode)->ip_open_lockres;
 
 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
-				    LKM_PRMODE, 0, 0);
+				    DLM_LOCK_PR, 0, 0);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -1338,16 +1336,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
 
 	lockres = &OCFS2_I(inode)->ip_open_lockres;
 
-	level = write ? LKM_EXMODE : LKM_PRMODE;
+	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
 
 	/*
 	 * The file system may already holding a PRMODE/EXMODE open lock.
-	 * Since we pass LKM_NOQUEUE, the request won't block waiting on
+	 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
 	 * other nodes and the -EAGAIN will indicate to the caller that
 	 * this inode is still in use.
 	 */
 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
-				    level, LKM_NOQUEUE, 0);
+				    level, DLM_LKF_NOQUEUE, 0);
 
 out:
 	mlog_exit(status);
@@ -1372,10 +1370,10 @@ void ocfs2_open_unlock(struct inode *inode)
 
 	if(lockres->l_ro_holders)
 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
-				     LKM_PRMODE);
+				     DLM_LOCK_PR);
 	if(lockres->l_ex_holders)
 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
-				     LKM_EXMODE);
+				     DLM_LOCK_EX);
 
 out:
 	mlog_exit_void();
@@ -1462,7 +1460,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
 	ocfs2_init_mask_waiter(&mw);
 
 	if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
-	    (lockres->l_level > LKM_NLMODE)) {
+	    (lockres->l_level > DLM_LOCK_NL)) {
 		mlog(ML_ERROR,
 		     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
 		     "level: %u\n", lockres->l_name, lockres->l_flags,
@@ -1570,7 +1568,7 @@ void ocfs2_file_unlock(struct file *file)
 	 * Fake a blocking ast for the downconvert code.
 	 */
 	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
-	lockres->l_blocking = LKM_EXMODE;
+	lockres->l_blocking = DLM_LOCK_EX;
 
 	ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
 	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
@@ -1599,11 +1597,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
 	 * condition. */
 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
 		switch(lockres->l_blocking) {
-		case LKM_EXMODE:
+		case DLM_LOCK_EX:
 			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
 				kick = 1;
 			break;
-		case LKM_PRMODE:
+		case DLM_LOCK_PR:
 			if (!lockres->l_ex_holders)
 				kick = 1;
 			break;
@@ -1921,7 +1919,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
 			 int ex,
 			 int arg_flags)
 {
-	int status, level, dlm_flags, acquired;
+	int status, level, acquired;
+	u32 dlm_flags;
 	struct ocfs2_lock_res *lockres = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *local_bh = NULL;
@@ -1951,10 +1950,10 @@ int ocfs2_inode_lock_full(struct inode *inode,
 		ocfs2_wait_for_recovery(osb);
 
 	lockres = &OCFS2_I(inode)->ip_inode_lockres;
-	level = ex ? LKM_EXMODE : LKM_PRMODE;
+	level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	dlm_flags = 0;
 	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
-		dlm_flags |= LKM_NOQUEUE;
+		dlm_flags |= DLM_LKF_NOQUEUE;
 
 	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
 	if (status < 0) {
@@ -2105,7 +2104,7 @@ int ocfs2_inode_lock_atime(struct inode *inode,
 void ocfs2_inode_unlock(struct inode *inode,
 		       int ex)
 {
-	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
@@ -2126,7 +2125,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 		     int ex)
 {
 	int status = 0;
-	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
 
 	mlog_entry_void();
@@ -2168,7 +2167,7 @@ bail:
 void ocfs2_super_unlock(struct ocfs2_super *osb,
 			int ex)
 {
-	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
 
 	if (!ocfs2_mount_local(osb))
@@ -2186,7 +2185,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb)
 	if (ocfs2_mount_local(osb))
 		return 0;
 
-	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
+	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -2198,13 +2197,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
 	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
 
 	if (!ocfs2_mount_local(osb))
-		ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+		ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
 }
 
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
 	int ret;
-	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
 	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
 
@@ -2225,7 +2224,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
 {
-	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
 	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
 
@@ -2614,7 +2613,7 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
 		lockres->l_action = OCFS2_AST_INVALID;
 		break;
 	case OCFS2_UNLOCK_DROP_LOCK:
-		lockres->l_level = LKM_IVMODE;
+		lockres->l_level = DLM_LOCK_IV;
 		break;
 	default:
 		BUG();
@@ -2635,14 +2634,14 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 {
 	enum dlm_status status;
 	unsigned long flags;
-	int lkm_flags = 0;
+	u32 lkm_flags = 0;
 
 	/* We didn't get anywhere near actually using this lockres. */
 	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
 		goto out;
 
 	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
-		lkm_flags |= LKM_VALBLK;
+		lkm_flags |= DLM_LKF_VALBLK;
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 
@@ -2668,7 +2667,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 
 	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
 		if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
-		    lockres->l_level == LKM_EXMODE &&
+		    lockres->l_level == DLM_LOCK_EX &&
 		    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
 			lockres->l_ops->set_lvb(lockres);
 	}
@@ -2801,10 +2800,10 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
 {
 	assert_spin_locked(&lockres->l_lock);
 
-	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
 
 	if (lockres->l_level <= new_level) {
-		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
+		mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
 		     lockres->l_level, new_level);
 		BUG();
 	}
@@ -2822,13 +2821,14 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 				  int new_level,
 				  int lvb)
 {
-	int ret, dlm_flags = LKM_CONVERT;
+	int ret;
+	u32 dlm_flags = DLM_LKF_CONVERT;
 	enum dlm_status status;
 
 	mlog_entry_void();
 
 	if (lvb)
-		dlm_flags |= LKM_VALBLK;
+		dlm_flags |= DLM_LKF_VALBLK;
 
 	status = ocfs2_dlm_lock(osb->dlm,
 				new_level,
@@ -2894,7 +2894,7 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 	ret = 0;
 	status = ocfs2_dlm_unlock(osb->dlm,
 				  &lockres->l_lksb,
-				  LKM_CANCEL,
+				  DLM_LKF_CANCEL,
 				  lockres);
 	if (status != DLM_NORMAL) {
 		ocfs2_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
@@ -2939,13 +2939,13 @@ recheck:
 
 	/* if we're blocking an exclusive and we have *any* holders,
 	 * then requeue. */
-	if ((lockres->l_blocking == LKM_EXMODE)
+	if ((lockres->l_blocking == DLM_LOCK_EX)
 	    && (lockres->l_ex_holders || lockres->l_ro_holders))
 		goto leave_requeue;
 
 	/* If it's a PR we're blocking, then only
 	 * requeue if we've got any EX holders */
-	if (lockres->l_blocking == LKM_PRMODE &&
+	if (lockres->l_blocking == DLM_LOCK_PR &&
 	    lockres->l_ex_holders)
 		goto leave_requeue;
 
@@ -2992,7 +2992,7 @@ downconvert:
 	ctl->requeue = 0;
 
 	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
-		if (lockres->l_level == LKM_EXMODE)
+		if (lockres->l_level == DLM_LOCK_EX)
 			set_lvb = 1;
 
 		/*
@@ -3046,7 +3046,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	}
 	sync_mapping_buffers(mapping);
-	if (blocking == LKM_EXMODE) {
+	if (blocking == DLM_LOCK_EX) {
 		truncate_inode_pages(mapping, 0);
 	} else {
 		/* We only need to wait on the I/O if we're not also
@@ -3067,8 +3067,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
 	struct inode *inode = ocfs2_lock_res_inode(lockres);
 	int checkpointed = ocfs2_inode_fully_checkpointed(inode);
 
-	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
-	BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed);
+	BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
+	BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
 
 	if (checkpointed)
 		return 1;
@@ -3132,7 +3132,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 	 * valid. The downconvert code will retain a PR for this node,
 	 * so there's no further work to do.
 	 */
-	if (blocking == LKM_PRMODE)
+	if (blocking == DLM_LOCK_PR)
 		return UNBLOCK_CONTINUE;
 
 	/*
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 4f44f23795f0..99538043fc17 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -18,15 +18,65 @@
  * General Public License for more details.
  */
 
-#include <linux/types.h>
-#include <linux/list.h>
-
-#include "dlm/dlmapi.h"
-
 #include "stackglue.h"
 
 static struct ocfs2_locking_protocol *lproto;
 
+/* These should be identical */
+#if (DLM_LOCK_IV != LKM_IVMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_NL != LKM_NLMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CR != LKM_CRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CW != LKM_CWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PR != LKM_PRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PW != LKM_PWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_EX != LKM_EXMODE)
+# error Lock modes do not match
+#endif
+static inline int mode_to_o2dlm(int mode)
+{
+	BUG_ON(mode > LKM_MAXMODE);
+
+	return mode;
+}
+
+#define map_flag(_generic, _o2dlm)		\
+	if (flags & (_generic)) {		\
+		flags &= ~(_generic);		\
+		o2dlm_flags |= (_o2dlm);	\
+	}
+static int flags_to_o2dlm(u32 flags)
+{
+	int o2dlm_flags = 0;
+
+	map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
+	map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
+	map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
+	map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
+	map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
+	map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
+	map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
+	map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
+	map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
+
+	/* map_flag() should have cleared every flag passed in */
+	BUG_ON(flags != 0);
+
+	return o2dlm_flags;
+}
+#undef map_flag
+
 enum dlm_status ocfs2_dlm_lock(struct dlm_ctxt *dlm,
 		   int mode,
 		   struct dlm_lockstatus *lksb,
@@ -35,8 +85,12 @@ enum dlm_status ocfs2_dlm_lock(struct dlm_ctxt *dlm,
 		   unsigned int namelen,
 		   void *astarg)
 {
+	int o2dlm_mode = mode_to_o2dlm(mode);
+	int o2dlm_flags = flags_to_o2dlm(flags);
+
 	BUG_ON(lproto == NULL);
-	return dlmlock(dlm, mode, lksb, flags, name, namelen,
+
+	return dlmlock(dlm, o2dlm_mode, lksb, o2dlm_flags, name, namelen,
 		       lproto->lp_lock_ast, astarg,
 		       lproto->lp_blocking_ast);
 }
@@ -46,9 +100,12 @@ enum dlm_status ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
 		     u32 flags,
 		     void *astarg)
 {
+	int o2dlm_flags = flags_to_o2dlm(flags);
+
 	BUG_ON(lproto == NULL);
 
-	return dlmunlock(dlm, lksb, flags, lproto->lp_unlock_ast, astarg);
+	return dlmunlock(dlm, lksb, o2dlm_flags,
+			 lproto->lp_unlock_ast, astarg);
 }
 
 
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 40a002413404..986d059ed1e0 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -21,6 +21,19 @@
 #ifndef STACKGLUE_H
 #define STACKGLUE_H
 
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/dlmconstants.h>
+
+/*
+ * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
+ * some day, but right now we need it.  Let's fake it.  This value is larger
+ * than any flag in dlmconstants.h.
+ */
+#define DLM_LKF_LOCAL		0x00100000
+
+#include "dlm/dlmapi.h"
+
 struct ocfs2_locking_protocol {
 	void (*lp_lock_ast)(void *astarg);
 	void (*lp_blocking_ast)(void *astarg, int level);
-- 
cgit v1.2.3


From 7431cd7e8dd0e46e9b12bd6a1ac1286f4b420371 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 12:15:37 -0800
Subject: ocfs2: Use -errno instead of dlm_status for ocfs2_dlm_lock/unlock()
 API.

Change the ocfs2_dlm_lock/unlock() functions to return -errno values.
This is the first step towards elminiating dlm_status in
fs/ocfs2/dlmglue.c.  The change also passes -errno values to
->unlock_ast().

[ Fix a return code in dlmglue.c and change the error translation table into
  an array of ints. --Mark ]

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   | 116 ++++++++++++++++++-----------------------
 fs/ocfs2/stackglue.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/stackglue.h |   6 +--
 3 files changed, 188 insertions(+), 76 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b8ac903ef8ef..6a222a5c81da 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -329,10 +329,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres);
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 						int convert);
-#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
-	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
-		"resource %s: %s\n", dlm_errname(_stat), _func,	\
-		_lockres->l_name, dlm_errmsg(_stat));		\
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do {			\
+	mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
+	     _err, _func, _lockres->l_name);				\
 } while (0)
 static int ocfs2_downconvert_thread(void *arg);
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -867,7 +866,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 			     u32 dlm_flags)
 {
 	int ret = 0;
-	enum dlm_status status = DLM_NORMAL;
 	unsigned long flags;
 
 	mlog_entry_void();
@@ -887,21 +885,19 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-	status = ocfs2_dlm_lock(osb->dlm,
-				level,
-				&lockres->l_lksb,
-				dlm_flags,
-				lockres->l_name,
-				OCFS2_LOCK_ID_MAX_LEN - 1,
-				lockres);
-	if (status != DLM_NORMAL) {
-		ocfs2_log_dlm_error("ocfs2_dlm_lock", status, lockres);
-		ret = -EINVAL;
+	ret = ocfs2_dlm_lock(osb->dlm,
+			     level,
+			     &lockres->l_lksb,
+			     dlm_flags,
+			     lockres->l_name,
+			     OCFS2_LOCK_ID_MAX_LEN - 1,
+			     lockres);
+	if (ret) {
+		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
 		ocfs2_recover_from_dlm_error(lockres, 1);
 	}
 
-	mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
-	     lockres->l_name);
+	mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
 
 bail:
 	mlog_exit(ret);
@@ -1018,7 +1014,6 @@ static int ocfs2_cluster_lock(struct ocfs2_super *osb,
 			      int arg_flags)
 {
 	struct ocfs2_mask_waiter mw;
-	enum dlm_status status;
 	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
 	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
 	unsigned long flags;
@@ -1089,21 +1084,18 @@ again:
 		     lockres->l_name, lockres->l_level, level);
 
 		/* call dlm_lock to upgrade lock now */
-		status = ocfs2_dlm_lock(osb->dlm,
-					level,
-					&lockres->l_lksb,
-					lkm_flags,
-					lockres->l_name,
-					OCFS2_LOCK_ID_MAX_LEN - 1,
-					lockres);
-		if (status != DLM_NORMAL) {
-			if ((lkm_flags & DLM_LKF_NOQUEUE) &&
-			    (status == DLM_NOTQUEUED))
-				ret = -EAGAIN;
-			else {
+		ret = ocfs2_dlm_lock(osb->dlm,
+				     level,
+				     &lockres->l_lksb,
+				     lkm_flags,
+				     lockres->l_name,
+				     OCFS2_LOCK_ID_MAX_LEN - 1,
+				     lockres);
+		if (ret) {
+			if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
+			    (ret != -EAGAIN)) {
 				ocfs2_log_dlm_error("ocfs2_dlm_lock",
-						    status, lockres);
-				ret = -EINVAL;
+						    ret, lockres);
 			}
 			ocfs2_recover_from_dlm_error(lockres, 1);
 			goto out;
@@ -1502,10 +1494,8 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
 	ret = ocfs2_dlm_lock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
 			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
 			     lockres);
-	if (ret != DLM_NORMAL) {
-		if (trylock && ret == DLM_NOTQUEUED)
-			ret = -EAGAIN;
-		else {
+	if (ret) {
+		if (!trylock || (ret != -EAGAIN)) {
 			ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
 			ret = -EINVAL;
 		}
@@ -2573,7 +2563,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
 	mlog_exit_void();
 }
 
-static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
+static void ocfs2_unlock_ast(void *opaque, int error)
 {
 	struct ocfs2_lock_res *lockres = opaque;
 	unsigned long flags;
@@ -2589,7 +2579,7 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
 	 * state. The wake_up call done at the bottom is redundant
 	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
 	 * hurt anything anyway */
-	if (status == DLM_CANCELGRANT &&
+	if (error == -DLM_ECANCEL &&
 	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
 		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
 
@@ -2599,9 +2589,10 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
 		goto complete_unlock;
 	}
 
-	if (status != DLM_NORMAL) {
-		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
-		     "unlock_action %d\n", status, lockres->l_name,
+	/* DLM_EUNLOCK is the success code for unlock */
+	if (error != -DLM_EUNLOCK) {
+		mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
+		     "unlock_action %d\n", error, lockres->l_name,
 		     lockres->l_unlock_action);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		return;
@@ -2632,7 +2623,7 @@ complete_unlock:
 static int ocfs2_drop_lock(struct ocfs2_super *osb,
 			   struct ocfs2_lock_res *lockres)
 {
-	enum dlm_status status;
+	int ret;
 	unsigned long flags;
 	u32 lkm_flags = 0;
 
@@ -2696,10 +2687,10 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 
 	mlog(0, "lock %s\n", lockres->l_name);
 
-	status = ocfs2_dlm_unlock(osb->dlm, &lockres->l_lksb, lkm_flags,
-				  lockres);
-	if (status != DLM_NORMAL) {
-		ocfs2_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
+	ret = ocfs2_dlm_unlock(osb->dlm, &lockres->l_lksb, lkm_flags,
+			       lockres);
+	if (ret) {
+		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
 		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
 		dlm_print_one_lock(lockres->l_lksb.lockid);
 		BUG();
@@ -2823,23 +2814,21 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 {
 	int ret;
 	u32 dlm_flags = DLM_LKF_CONVERT;
-	enum dlm_status status;
 
 	mlog_entry_void();
 
 	if (lvb)
 		dlm_flags |= DLM_LKF_VALBLK;
 
-	status = ocfs2_dlm_lock(osb->dlm,
-				new_level,
-				&lockres->l_lksb,
-				dlm_flags,
-				lockres->l_name,
-				OCFS2_LOCK_ID_MAX_LEN - 1,
-				lockres);
-	if (status != DLM_NORMAL) {
-		ocfs2_log_dlm_error("ocfs2_dlm_lock", status, lockres);
-		ret = -EINVAL;
+	ret = ocfs2_dlm_lock(osb->dlm,
+			     new_level,
+			     &lockres->l_lksb,
+			     dlm_flags,
+			     lockres->l_name,
+			     OCFS2_LOCK_ID_MAX_LEN - 1,
+			     lockres);
+	if (ret) {
+		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
 		ocfs2_recover_from_dlm_error(lockres, 1);
 		goto bail;
 	}
@@ -2886,19 +2875,14 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 				struct ocfs2_lock_res *lockres)
 {
 	int ret;
-	enum dlm_status status;
 
 	mlog_entry_void();
 	mlog(0, "lock %s\n", lockres->l_name);
 
-	ret = 0;
-	status = ocfs2_dlm_unlock(osb->dlm,
-				  &lockres->l_lksb,
-				  DLM_LKF_CANCEL,
-				  lockres);
-	if (status != DLM_NORMAL) {
-		ocfs2_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
-		ret = -EINVAL;
+	ret = ocfs2_dlm_unlock(osb->dlm, &lockres->l_lksb,
+			       DLM_LKF_CANCEL, lockres);
+	if (ret) {
+		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
 		ocfs2_recover_from_dlm_error(lockres, 0);
 	}
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 99538043fc17..0aec2fcf2175 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -18,6 +18,7 @@
  * General Public License for more details.
  */
 
+#include "cluster/masklog.h"
 #include "stackglue.h"
 
 static struct ocfs2_locking_protocol *lproto;
@@ -77,7 +78,126 @@ static int flags_to_o2dlm(u32 flags)
 }
 #undef map_flag
 
-enum dlm_status ocfs2_dlm_lock(struct dlm_ctxt *dlm,
+/*
+ * Map an o2dlm status to standard errno values.
+ *
+ * o2dlm only uses a handful of these, and returns even fewer to the
+ * caller. Still, we try to assign sane values to each error.
+ *
+ * The following value pairs have special meanings to dlmglue, thus
+ * the right hand side needs to stay unique - never duplicate the
+ * mapping elsewhere in the table!
+ *
+ * DLM_NORMAL:		0
+ * DLM_NOTQUEUED:	-EAGAIN
+ * DLM_CANCELGRANT:	-DLM_ECANCEL
+ * DLM_CANCEL:		-DLM_EUNLOCK
+ */
+/* Keep in sync with dlmapi.h */
+static int status_map[] = {
+	[DLM_NORMAL]			= 0,		/* Success */
+	[DLM_GRANTED]			= -EINVAL,
+	[DLM_DENIED]			= -EACCES,
+	[DLM_DENIED_NOLOCKS]		= -EACCES,
+	[DLM_WORKING]			= -EBUSY,
+	[DLM_BLOCKED]			= -EINVAL,
+	[DLM_BLOCKED_ORPHAN]		= -EINVAL,
+	[DLM_DENIED_GRACE_PERIOD]	= -EACCES,
+	[DLM_SYSERR]			= -ENOMEM,	/* It is what it is */
+	[DLM_NOSUPPORT]			= -EPROTO,
+	[DLM_CANCELGRANT]		= -DLM_ECANCEL, /* Cancel after grant */
+	[DLM_IVLOCKID]			= -EINVAL,
+	[DLM_SYNC]			= -EINVAL,
+	[DLM_BADTYPE]			= -EINVAL,
+	[DLM_BADRESOURCE]		= -EINVAL,
+	[DLM_MAXHANDLES]		= -ENOMEM,
+	[DLM_NOCLINFO]			= -EINVAL,
+	[DLM_NOLOCKMGR]			= -EINVAL,
+	[DLM_NOPURGED]			= -EINVAL,
+	[DLM_BADARGS]			= -EINVAL,
+	[DLM_VOID]			= -EINVAL,
+	[DLM_NOTQUEUED]			= -EAGAIN,	/* Trylock failed */
+	[DLM_IVBUFLEN]			= -EINVAL,
+	[DLM_CVTUNGRANT]		= -EPERM,
+	[DLM_BADPARAM]			= -EINVAL,
+	[DLM_VALNOTVALID]		= -EINVAL,
+	[DLM_REJECTED]			= -EPERM,
+	[DLM_ABORT]			= -EINVAL,
+	[DLM_CANCEL]			= -DLM_EUNLOCK,	/* Successful cancel */
+	[DLM_IVRESHANDLE]		= -EINVAL,
+	[DLM_DEADLOCK]			= -EDEADLK,
+	[DLM_DENIED_NOASTS]		= -EINVAL,
+	[DLM_FORWARD]			= -EINVAL,
+	[DLM_TIMEOUT]			= -ETIMEDOUT,
+	[DLM_IVGROUPID]			= -EINVAL,
+	[DLM_VERS_CONFLICT]		= -EOPNOTSUPP,
+	[DLM_BAD_DEVICE_PATH]		= -ENOENT,
+	[DLM_NO_DEVICE_PERMISSION]	= -EPERM,
+	[DLM_NO_CONTROL_DEVICE]		= -ENOENT,
+	[DLM_RECOVERING]		= -ENOTCONN,
+	[DLM_MIGRATING]			= -ERESTART,
+	[DLM_MAXSTATS]			= -EINVAL,
+};
+static int dlm_status_to_errno(enum dlm_status status)
+{
+	BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
+
+	return status_map[status];
+}
+
+static void o2dlm_lock_ast_wrapper(void *astarg)
+{
+	BUG_ON(lproto == NULL);
+
+	lproto->lp_lock_ast(astarg);
+}
+
+static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
+{
+	BUG_ON(lproto == NULL);
+
+	lproto->lp_blocking_ast(astarg, level);
+}
+
+static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
+{
+	int error;
+
+	BUG_ON(lproto == NULL);
+
+	/*
+	 * XXX: CANCEL values are sketchy.
+	 *
+	 * Currently we have preserved the o2dlm paradigm.  You can get
+	 * unlock_ast() whether the cancel succeded or not.
+	 *
+	 * First, we're going to pass DLM_EUNLOCK just like fs/dlm does for
+	 * successful unlocks.  That is a clean behavior.
+	 *
+	 * In o2dlm, you can get both the lock_ast() for the lock being
+	 * granted and the unlock_ast() for the CANCEL failing.  A
+	 * successful cancel sends DLM_NORMAL here.  If the
+	 * lock grant happened before the cancel arrived, you get
+	 * DLM_CANCELGRANT.  For now, we'll use DLM_ECANCEL to signify
+	 * CANCELGRANT - the CANCEL was supposed to happen but didn't.  We
+	 * can then use DLM_EUNLOCK to signify a successful CANCEL -
+	 * effectively, the CANCEL caused the lock to roll back.
+	 *
+	 * In the future, we will likely move the o2dlm to send only one
+	 * ast - either unlock_ast() for a successful CANCEL or lock_ast()
+	 * when the grant succeeds.  At that point, we'll send DLM_ECANCEL
+	 * for all cancel results (CANCELGRANT will no longer exist).
+	 */
+	error = dlm_status_to_errno(status);
+
+	/* Successful unlock is DLM_EUNLOCK */
+	if (!error)
+		error = -DLM_EUNLOCK;
+
+	lproto->lp_unlock_ast(astarg, error);
+}
+
+int ocfs2_dlm_lock(struct dlm_ctxt *dlm,
 		   int mode,
 		   struct dlm_lockstatus *lksb,
 		   u32 flags,
@@ -85,27 +205,35 @@ enum dlm_status ocfs2_dlm_lock(struct dlm_ctxt *dlm,
 		   unsigned int namelen,
 		   void *astarg)
 {
+	enum dlm_status status;
 	int o2dlm_mode = mode_to_o2dlm(mode);
 	int o2dlm_flags = flags_to_o2dlm(flags);
+	int ret;
 
 	BUG_ON(lproto == NULL);
 
-	return dlmlock(dlm, o2dlm_mode, lksb, o2dlm_flags, name, namelen,
-		       lproto->lp_lock_ast, astarg,
-		       lproto->lp_blocking_ast);
+	status = dlmlock(dlm, o2dlm_mode, lksb, o2dlm_flags, name, namelen,
+		       o2dlm_lock_ast_wrapper, astarg,
+		       o2dlm_blocking_ast_wrapper);
+	ret = dlm_status_to_errno(status);
+	return ret;
 }
 
-enum dlm_status ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
+int ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
 		     struct dlm_lockstatus *lksb,
 		     u32 flags,
 		     void *astarg)
 {
+	enum dlm_status status;
 	int o2dlm_flags = flags_to_o2dlm(flags);
+	int ret;
 
 	BUG_ON(lproto == NULL);
 
-	return dlmunlock(dlm, lksb, o2dlm_flags,
-			 lproto->lp_unlock_ast, astarg);
+	status = dlmunlock(dlm, lksb, o2dlm_flags,
+			 o2dlm_unlock_ast_wrapper, astarg);
+	ret = dlm_status_to_errno(status);
+	return ret;
 }
 
 
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 986d059ed1e0..8ebcfba62c7e 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -37,17 +37,17 @@
 struct ocfs2_locking_protocol {
 	void (*lp_lock_ast)(void *astarg);
 	void (*lp_blocking_ast)(void *astarg, int level);
-	void (*lp_unlock_ast)(void *astarg, enum dlm_status status);
+	void (*lp_unlock_ast)(void *astarg, int error);
 };
 
-enum dlm_status ocfs2_dlm_lock(struct dlm_ctxt *dlm,
+int ocfs2_dlm_lock(struct dlm_ctxt *dlm,
 		   int mode,
 		   struct dlm_lockstatus *lksb,
 		   u32 flags,
 		   void *name,
 		   unsigned int namelen,
 		   void *astarg);
-enum dlm_status ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
+int ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
 		     struct dlm_lockstatus *lksb,
 		     u32 flags,
 		     void *astarg);
-- 
cgit v1.2.3


From 8f2c9c1b16bf6ed0903b29c49d56fa0109a390e4 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 12:16:57 -0800
Subject: ocfs2: Create the lock status block union.

Wrap the lock status block (lksb) in a union.  Later we will add a union
element for the fs/dlm lksb.  Create accessors for the status and lvb
fields.

Other than a debugging function, dlmglue.c does not directly reference
the o2dlm locking path anymore.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   | 23 +++++++++++++----------
 fs/ocfs2/ocfs2.h     |  5 +++--
 fs/ocfs2/stackglue.c | 29 ++++++++++++++++++++++-------
 fs/ocfs2/stackglue.h | 11 +++++++++--
 4 files changed, 47 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6a222a5c81da..459037653e5a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -112,7 +112,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
 				     unsigned int line,
 				     struct ocfs2_lock_res *lockres)
 {
-	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+	struct ocfs2_meta_lvb *lvb =
+		(struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	mlog(level, "LVB information for %s (called from %s:%u):\n",
 	     lockres->l_name, function, line);
@@ -799,14 +800,14 @@ static void ocfs2_blocking_ast(void *opaque, int level)
 static void ocfs2_locking_ast(void *opaque)
 {
 	struct ocfs2_lock_res *lockres = opaque;
-	struct dlm_lockstatus *lksb = &lockres->l_lksb;
 	unsigned long flags;
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 
-	if (lksb->status != DLM_NORMAL) {
-		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
-		     lockres->l_name, lksb->status);
+	if (ocfs2_dlm_lock_status(&lockres->l_lksb)) {
+		mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
+		     lockres->l_name,
+		     ocfs2_dlm_lock_status(&lockres->l_lksb));
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		return;
 	}
@@ -1634,7 +1635,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 
 	mlog_entry_void();
 
-	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	/*
 	 * Invalidate the LVB of a deleted inode - this way other
@@ -1686,7 +1687,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 
 	mlog_meta_lvb(0, lockres);
 
-	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	/* We're safe here without the lockres lock... */
 	spin_lock(&oi->ip_lock);
@@ -1721,7 +1722,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
 					      struct ocfs2_lock_res *lockres)
 {
-	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+	struct ocfs2_meta_lvb *lvb =
+		(struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	if (lvb->lvb_version == OCFS2_LVB_VERSION
 	    && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2379,7 +2381,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
 		   lockres->l_blocking);
 
 	/* Dump the raw LVB */
-	lvb = lockres->l_lksb.lvb;
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 	for(i = 0; i < DLM_LVB_LEN; i++)
 		seq_printf(m, "0x%x\t", lvb[i]);
 
@@ -2692,7 +2694,8 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
 		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
-		dlm_print_one_lock(lockres->l_lksb.lockid);
+		/* XXX Need to abstract this */
+		dlm_print_one_lock(lockres->l_lksb.lksb_o2dlm.lockid);
 		BUG();
 	}
 	mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index f78e9ed53249..6d7c6d2d0c23 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -40,7 +40,8 @@
 #include "cluster/heartbeat.h"
 #include "cluster/tcp.h"
 
-#include "dlm/dlmapi.h"
+/* For union ocfs2_dlm_lksb */
+#include "stackglue.h"
 
 #include "ocfs2_fs.h"
 #include "ocfs2_lockid.h"
@@ -120,7 +121,7 @@ struct ocfs2_lock_res {
 	int                      l_level;
 	unsigned int             l_ro_holders;
 	unsigned int             l_ex_holders;
-	struct dlm_lockstatus    l_lksb;
+	union ocfs2_dlm_lksb     l_lksb;
 
 	/* used from AST/BAST funcs. */
 	enum ocfs2_ast_action    l_action;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 0aec2fcf2175..eb88854cb976 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -199,7 +199,7 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 
 int ocfs2_dlm_lock(struct dlm_ctxt *dlm,
 		   int mode,
-		   struct dlm_lockstatus *lksb,
+		   union ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
 		   unsigned int namelen,
@@ -212,15 +212,16 @@ int ocfs2_dlm_lock(struct dlm_ctxt *dlm,
 
 	BUG_ON(lproto == NULL);
 
-	status = dlmlock(dlm, o2dlm_mode, lksb, o2dlm_flags, name, namelen,
-		       o2dlm_lock_ast_wrapper, astarg,
-		       o2dlm_blocking_ast_wrapper);
+	status = dlmlock(dlm, o2dlm_mode, &lksb->lksb_o2dlm, o2dlm_flags,
+			 name, namelen,
+			 o2dlm_lock_ast_wrapper, astarg,
+			 o2dlm_blocking_ast_wrapper);
 	ret = dlm_status_to_errno(status);
 	return ret;
 }
 
 int ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
-		     struct dlm_lockstatus *lksb,
+		     union ocfs2_dlm_lksb *lksb,
 		     u32 flags,
 		     void *astarg)
 {
@@ -230,12 +231,26 @@ int ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
 
 	BUG_ON(lproto == NULL);
 
-	status = dlmunlock(dlm, lksb, o2dlm_flags,
-			 o2dlm_unlock_ast_wrapper, astarg);
+	status = dlmunlock(dlm, &lksb->lksb_o2dlm, o2dlm_flags,
+			   o2dlm_unlock_ast_wrapper, astarg);
 	ret = dlm_status_to_errno(status);
 	return ret;
 }
 
+int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+	return dlm_status_to_errno(lksb->lksb_o2dlm.status);
+}
+
+/*
+ * Why don't we cast to ocfs2_meta_lvb?  The "clean" answer is that we
+ * don't cast at the glue level.  The real answer is that the header
+ * ordering is nigh impossible.
+ */
+void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+	return (void *)(lksb->lksb_o2dlm.lvb);
+}
 
 void o2cb_get_stack(struct ocfs2_locking_protocol *proto)
 {
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 8ebcfba62c7e..3c91e241892b 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -40,18 +40,25 @@ struct ocfs2_locking_protocol {
 	void (*lp_unlock_ast)(void *astarg, int error);
 };
 
+union ocfs2_dlm_lksb {
+	struct dlm_lockstatus lksb_o2dlm;
+};
+
 int ocfs2_dlm_lock(struct dlm_ctxt *dlm,
 		   int mode,
-		   struct dlm_lockstatus *lksb,
+		   union ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
 		   unsigned int namelen,
 		   void *astarg);
 int ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
-		     struct dlm_lockstatus *lksb,
+		     union ocfs2_dlm_lksb *lksb,
 		     u32 flags,
 		     void *astarg);
 
+int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
+void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
+
 void o2cb_get_stack(struct ocfs2_locking_protocol *proto);
 void o2cb_put_stack(void);
 
-- 
cgit v1.2.3


From 4670c46ded9a18268d1265417ff4ac72145a7917 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 14:39:35 -0800
Subject: ocfs2: Introduce the new ocfs2_cluster_connect/disconnect() API.

This step introduces a cluster stack agnostic API for initializing and
exiting.  fs/ocfs2/dlmglue.c no longer uses o2cb/o2dlm knowledge to
connect to the stack.  It is all handled in stackglue.c.

heartbeat.c no longer needs to know how it gets called.
ocfs2_do_node_down() is now a clean recovery trigger.

The big gotcha is the ordering of initializations and de-initializations done
underneath ocfs2_cluster_connect().  ocfs2_dlm_init() used to do all
o2dlm initialization in one block.  Thus, the o2dlm functionality of
ocfs2_cluster_connect() is very straightforward.  ocfs2_dlm_shutdown(),
however, did a few things between de-registration of the eviction
callback and actually shutting down the domain.  Now de-registration and
shutdown of the domain are wrapped within the single
ocfs2_cluster_disconnect() call.  I've checked the code paths to make
sure we can safely tear down things in ocfs2_dlm_shutdown() before
calling ocfs2_cluster_disconnect().  The filesystem has already set
itself to ignore the callback.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   |  97 +++++++++++++++++++-------------------
 fs/ocfs2/dlmglue.h   |   1 -
 fs/ocfs2/heartbeat.c |  40 ++++------------
 fs/ocfs2/heartbeat.h |   2 +-
 fs/ocfs2/ocfs2.h     |   4 +-
 fs/ocfs2/stackglue.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/stackglue.h |  35 +++++++++++++-
 fs/ocfs2/super.c     |  13 ++---
 8 files changed, 221 insertions(+), 102 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 459037653e5a..6652a48cbe0f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -27,7 +27,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
-#include <linux/crc32.h>
 #include <linux/kthread.h>
 #include <linux/pagemap.h>
 #include <linux/debugfs.h>
@@ -259,31 +258,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
 	.flags		= 0,
 };
 
-/*
- * This is the filesystem locking protocol version.
- *
- * Whenever the filesystem does new things with locks (adds or removes a
- * lock, orders them differently, does different things underneath a lock),
- * the version must be changed.  The protocol is negotiated when joining
- * the dlm domain.  A node may join the domain if its major version is
- * identical to all other nodes and its minor version is greater than
- * or equal to all other nodes.  When its minor version is greater than
- * the other nodes, it will run at the minor version specified by the
- * other nodes.
- *
- * If a locking change is made that will not be compatible with older
- * versions, the major number must be increased and the minor version set
- * to zero.  If a change merely adds a behavior that can be disabled when
- * speaking to older versions, the minor version must be increased.  If a
- * change adds a fully backwards compatible change (eg, LVB changes that
- * are just ignored by older versions), the version does not need to be
- * updated.
- */
-const struct dlm_protocol_version ocfs2_locking_protocol = {
-	.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
-	.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-};
-
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -886,7 +860,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-	ret = ocfs2_dlm_lock(osb->dlm,
+	ret = ocfs2_dlm_lock(osb->cconn,
 			     level,
 			     &lockres->l_lksb,
 			     dlm_flags,
@@ -1085,7 +1059,7 @@ again:
 		     lockres->l_name, lockres->l_level, level);
 
 		/* call dlm_lock to upgrade lock now */
-		ret = ocfs2_dlm_lock(osb->dlm,
+		ret = ocfs2_dlm_lock(osb->cconn,
 				     level,
 				     &lockres->l_lksb,
 				     lkm_flags,
@@ -1492,7 +1466,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
 	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-	ret = ocfs2_dlm_lock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
+	ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
 			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
 			     lockres);
 	if (ret) {
@@ -2485,8 +2459,7 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
 int ocfs2_dlm_init(struct ocfs2_super *osb)
 {
 	int status = 0;
-	u32 dlm_key;
-	struct dlm_ctxt *dlm = NULL;
+	struct ocfs2_cluster_connection *conn = NULL;
 
 	mlog_entry_void();
 
@@ -2508,26 +2481,21 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	/* used by the dlm code to make message headers unique, each
-	 * node in this domain must agree on this. */
-	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
-
 	/* for now, uuid == domain */
-	dlm = dlm_register_domain(osb->uuid_str, dlm_key,
-				  &osb->osb_locking_proto);
-	if (IS_ERR(dlm)) {
-		status = PTR_ERR(dlm);
+	status = ocfs2_cluster_connect(osb->uuid_str,
+				       strlen(osb->uuid_str),
+				       ocfs2_do_node_down, osb,
+				       &conn);
+	if (status) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
-
 local:
 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
 
-	osb->dlm = dlm;
+	osb->cconn = conn;
 
 	status = 0;
 bail:
@@ -2545,10 +2513,14 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
 {
 	mlog_entry_void();
 
-	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
-
 	ocfs2_drop_osb_locks(osb);
 
+	/*
+	 * Now that we have dropped all locks and ocfs2_dismount_volume()
+	 * has disabled recovery, the DLM won't be talking to us.  It's
+	 * safe to tear things down before disconnecting the cluster.
+	 */
+
 	if (osb->dc_task) {
 		kthread_stop(osb->dc_task);
 		osb->dc_task = NULL;
@@ -2557,8 +2529,8 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
 	ocfs2_lock_res_free(&osb->osb_super_lockres);
 	ocfs2_lock_res_free(&osb->osb_rename_lockres);
 
-	dlm_unregister_domain(osb->dlm);
-	osb->dlm = NULL;
+	ocfs2_cluster_disconnect(osb->cconn);
+	osb->cconn = NULL;
 
 	ocfs2_dlm_shutdown_debug(osb);
 
@@ -2689,7 +2661,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 
 	mlog(0, "lock %s\n", lockres->l_name);
 
-	ret = ocfs2_dlm_unlock(osb->dlm, &lockres->l_lksb, lkm_flags,
+	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
 			       lockres);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
@@ -2823,7 +2795,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 	if (lvb)
 		dlm_flags |= DLM_LKF_VALBLK;
 
-	ret = ocfs2_dlm_lock(osb->dlm,
+	ret = ocfs2_dlm_lock(osb->cconn,
 			     new_level,
 			     &lockres->l_lksb,
 			     dlm_flags,
@@ -2882,7 +2854,7 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 	mlog_entry_void();
 	mlog(0, "lock %s\n", lockres->l_name);
 
-	ret = ocfs2_dlm_unlock(osb->dlm, &lockres->l_lksb,
+	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
 			       DLM_LKF_CANCEL, lockres);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
@@ -3193,7 +3165,34 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 	return UNBLOCK_CONTINUE_POST;
 }
 
+/*
+ * This is the filesystem locking protocol.  It provides the lock handling
+ * hooks for the underlying DLM.  It has a maximum version number.
+ * The version number allows interoperability with systems running at
+ * the same major number and an equal or smaller minor number.
+ *
+ * Whenever the filesystem does new things with locks (adds or removes a
+ * lock, orders them differently, does different things underneath a lock),
+ * the version must be changed.  The protocol is negotiated when joining
+ * the dlm domain.  A node may join the domain if its major version is
+ * identical to all other nodes and its minor version is greater than
+ * or equal to all other nodes.  When its minor version is greater than
+ * the other nodes, it will run at the minor version specified by the
+ * other nodes.
+ *
+ * If a locking change is made that will not be compatible with older
+ * versions, the major number must be increased and the minor version set
+ * to zero.  If a change merely adds a behavior that can be disabled when
+ * speaking to older versions, the minor version must be increased.  If a
+ * change adds a fully backwards compatible change (eg, LVB changes that
+ * are just ignored by older versions), the version does not need to be
+ * updated.
+ */
 static struct ocfs2_locking_protocol lproto = {
+	.lp_max_version = {
+		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+	},
 	.lp_lock_ast		= ocfs2_locking_ast,
 	.lp_blocking_ast	= ocfs2_blocking_ast,
 	.lp_unlock_ast		= ocfs2_unlock_ast,
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 32380439401f..2d0a8a03c431 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -117,5 +117,4 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
 void dlmglue_init_stack(void);
 void dlmglue_exit_stack(void);
 
-extern const struct dlm_protocol_version ocfs2_locking_protocol;
 #endif	/* DLMGLUE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 80de2397c161..dcac1a487288 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,8 +30,6 @@
 #include <linux/highmem.h>
 #include <linux/kmod.h>
 
-#include <dlm/dlmapi.h>
-
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
 
@@ -64,19 +62,20 @@ void ocfs2_init_node_maps(struct ocfs2_super *osb)
 	ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
 
-static void ocfs2_do_node_down(int node_num,
-			       struct ocfs2_super *osb)
+void ocfs2_do_node_down(int node_num, void *data)
 {
+	struct ocfs2_super *osb = data;
+
 	BUG_ON(osb->node_num == node_num);
 
 	mlog(0, "ocfs2: node down event for %d\n", node_num);
 
-	if (!osb->dlm) {
+	if (!osb->cconn) {
 		/*
-		 * No DLM means we're not even ready to participate yet.
-		 * We check the slots after the DLM comes up, so we will
-		 * notice the node death then.  We can safely ignore it
-		 * here.
+		 * No cluster connection means we're not even ready to
+		 * participate yet.  We check the slots after the cluster
+		 * comes up, so we will notice the node death then.  We
+		 * can safely ignore it here.
 		 */
 		return;
 	}
@@ -84,29 +83,6 @@ static void ocfs2_do_node_down(int node_num,
 	ocfs2_recovery_thread(osb, node_num);
 }
 
-/* Called from the dlm when it's about to evict a node. We may also
- * get a heartbeat callback later. */
-static void ocfs2_dlm_eviction_cb(int node_num,
-				  void *data)
-{
-	struct ocfs2_super *osb = (struct ocfs2_super *) data;
-	struct super_block *sb = osb->sb;
-
-	mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
-	     MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
-
-	ocfs2_do_node_down(node_num, osb);
-}
-
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
-{
-	/* Not exactly a heartbeat callback, but leads to essentially
-	 * the same path so we set it up here. */
-	dlm_setup_eviction_cb(&osb->osb_eviction_cb,
-			      ocfs2_dlm_eviction_cb,
-			      osb);
-}
-
 void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
 {
 	int ret;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index 98d8ffc995b1..38e2450bf14c 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -28,7 +28,7 @@
 
 void ocfs2_init_node_maps(struct ocfs2_super *osb);
 
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_do_node_down(int node_num, void *data);
 void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
 
 /* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6d7c6d2d0c23..664e4fe3eab5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -248,12 +248,10 @@ struct ocfs2_super
 	struct ocfs2_alloc_stats alloc_stats;
 	char dev_str[20];		/* "major,minor" of the device */
 
-	struct dlm_ctxt *dlm;
+	struct ocfs2_cluster_connection *cconn;
 	struct ocfs2_lock_res osb_super_lockres;
 	struct ocfs2_lock_res osb_rename_lockres;
-	struct dlm_eviction_cb osb_eviction_cb;
 	struct ocfs2_dlm_debug *osb_dlm_debug;
-	struct dlm_protocol_version osb_locking_proto;
 
 	struct dentry *osb_debug_root;
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index eb88854cb976..f6f309a08344 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -18,11 +18,21 @@
  * General Public License for more details.
  */
 
+#include <linux/slab.h>
+#include <linux/crc32.h>
+
+/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
+#include <linux/fs.h>
+
 #include "cluster/masklog.h"
 #include "stackglue.h"
 
 static struct ocfs2_locking_protocol *lproto;
 
+struct o2dlm_private {
+	struct dlm_eviction_cb op_eviction_cb;
+};
+
 /* These should be identical */
 #if (DLM_LOCK_IV != LKM_IVMODE)
 # error Lock modes do not match
@@ -197,7 +207,7 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 	lproto->lp_unlock_ast(astarg, error);
 }
 
-int ocfs2_dlm_lock(struct dlm_ctxt *dlm,
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
 		   union ocfs2_dlm_lksb *lksb,
 		   u32 flags,
@@ -212,15 +222,15 @@ int ocfs2_dlm_lock(struct dlm_ctxt *dlm,
 
 	BUG_ON(lproto == NULL);
 
-	status = dlmlock(dlm, o2dlm_mode, &lksb->lksb_o2dlm, o2dlm_flags,
-			 name, namelen,
+	status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
+			 o2dlm_flags, name, namelen,
 			 o2dlm_lock_ast_wrapper, astarg,
 			 o2dlm_blocking_ast_wrapper);
 	ret = dlm_status_to_errno(status);
 	return ret;
 }
 
-int ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 		     union ocfs2_dlm_lksb *lksb,
 		     u32 flags,
 		     void *astarg)
@@ -231,8 +241,8 @@ int ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
 
 	BUG_ON(lproto == NULL);
 
-	status = dlmunlock(dlm, &lksb->lksb_o2dlm, o2dlm_flags,
-			   o2dlm_unlock_ast_wrapper, astarg);
+	status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
+			   o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
 	ret = dlm_status_to_errno(status);
 	return ret;
 }
@@ -252,6 +262,115 @@ void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 	return (void *)(lksb->lksb_o2dlm.lvb);
 }
 
+/*
+ * Called from the dlm when it's about to evict a node. This is how the
+ * classic stack signals node death.
+ */
+static void o2dlm_eviction_cb(int node_num, void *data)
+{
+	struct ocfs2_cluster_connection *conn = data;
+
+	mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
+	     node_num, conn->cc_namelen, conn->cc_name);
+
+	conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
+}
+
+int ocfs2_cluster_connect(const char *group,
+			  int grouplen,
+			  void (*recovery_handler)(int node_num,
+						   void *recovery_data),
+			  void *recovery_data,
+			  struct ocfs2_cluster_connection **conn)
+{
+	int rc = 0;
+	struct ocfs2_cluster_connection *new_conn;
+	u32 dlm_key;
+	struct dlm_ctxt *dlm;
+	struct o2dlm_private *priv;
+	struct dlm_protocol_version dlm_version;
+
+	BUG_ON(group == NULL);
+	BUG_ON(conn == NULL);
+	BUG_ON(recovery_handler == NULL);
+
+	if (grouplen > GROUP_NAME_MAX) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
+			   GFP_KERNEL);
+	if (!new_conn) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(new_conn->cc_name, group, grouplen);
+	new_conn->cc_namelen = grouplen;
+	new_conn->cc_recovery_handler = recovery_handler;
+	new_conn->cc_recovery_data = recovery_data;
+
+	/* Start the new connection at our maximum compatibility level */
+	new_conn->cc_version = lproto->lp_max_version;
+
+	priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
+	if (!priv) {
+		rc = -ENOMEM;
+		goto out_free;
+	}
+
+	/* This just fills the structure in.  It is safe to use new_conn. */
+	dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
+			      new_conn);
+
+	new_conn->cc_private = priv;
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. */
+	dlm_key = crc32_le(0, group, grouplen);
+	dlm_version.pv_major = new_conn->cc_version.pv_major;
+	dlm_version.pv_minor = new_conn->cc_version.pv_minor;
+
+	dlm = dlm_register_domain(group, dlm_key, &dlm_version);
+	if (IS_ERR(dlm)) {
+		rc = PTR_ERR(dlm);
+		mlog_errno(rc);
+		goto out_free;
+	}
+
+	new_conn->cc_version.pv_major = dlm_version.pv_major;
+	new_conn->cc_version.pv_minor = dlm_version.pv_minor;
+	new_conn->cc_lockspace = dlm;
+
+	dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
+
+	*conn = new_conn;
+
+out_free:
+	if (rc) {
+		kfree(new_conn->cc_private);
+		kfree(new_conn);
+	}
+
+out:
+	return rc;
+}
+
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+	struct dlm_ctxt *dlm = conn->cc_lockspace;
+	struct o2dlm_private *priv = conn->cc_private;
+
+	dlm_unregister_eviction_cb(&priv->op_eviction_cb);
+	dlm_unregister_domain(dlm);
+
+	kfree(priv);
+	kfree(conn);
+
+	return 0;
+}
+
 void o2cb_get_stack(struct ocfs2_locking_protocol *proto)
 {
 	BUG_ON(proto == NULL);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 3c91e241892b..3900b5c3933c 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -32,9 +32,22 @@
  */
 #define DLM_LKF_LOCAL		0x00100000
 
+/*
+ * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h.  That probably
+ * wants to be in a public header.
+ */
+#define GROUP_NAME_MAX		64
+
+
 #include "dlm/dlmapi.h"
 
+struct ocfs2_protocol_version {
+	u8 pv_major;
+	u8 pv_minor;
+};
+
 struct ocfs2_locking_protocol {
+	struct ocfs2_protocol_version lp_max_version;
 	void (*lp_lock_ast)(void *astarg);
 	void (*lp_blocking_ast)(void *astarg, int level);
 	void (*lp_unlock_ast)(void *astarg, int error);
@@ -44,14 +57,32 @@ union ocfs2_dlm_lksb {
 	struct dlm_lockstatus lksb_o2dlm;
 };
 
-int ocfs2_dlm_lock(struct dlm_ctxt *dlm,
+struct ocfs2_cluster_connection {
+	char cc_name[GROUP_NAME_MAX];
+	int cc_namelen;
+	struct ocfs2_protocol_version cc_version;
+	void (*cc_recovery_handler)(int node_num, void *recovery_data);
+	void *cc_recovery_data;
+	void *cc_lockspace;
+	void *cc_private;
+};
+
+int ocfs2_cluster_connect(const char *group,
+			  int grouplen,
+			  void (*recovery_handler)(int node_num,
+						   void *recovery_data),
+			  void *recovery_data,
+			  struct ocfs2_cluster_connection **conn);
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn);
+
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
 		   union ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
 		   unsigned int namelen,
 		   void *astarg);
-int ocfs2_dlm_unlock(struct dlm_ctxt *dlm,
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 		     union ocfs2_dlm_lksb *lksb,
 		     u32 flags,
 		     void *astarg);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c8675464e299..0ee49757467d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1251,9 +1251,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_sync_blockdev(sb);
 
-	/* No dlm means we've failed during mount, so skip all the
-	 * steps which depended on that to complete. */
-	if (osb->dlm) {
+	/* No cluster connection means we've failed during mount, so skip
+	 * all the steps which depended on that to complete. */
+	if (osb->cconn) {
 		tmp = ocfs2_super_lock(osb, 1);
 		if (tmp < 0) {
 			mlog_errno(tmp);
@@ -1264,12 +1264,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	if (osb->slot_num != OCFS2_INVALID_SLOT)
 		ocfs2_put_slot(osb);
 
-	if (osb->dlm)
+	if (osb->cconn)
 		ocfs2_super_unlock(osb, 1);
 
 	ocfs2_release_system_inodes(osb);
 
-	if (osb->dlm)
+	if (osb->cconn)
 		ocfs2_dlm_shutdown(osb);
 
 	debugfs_remove(osb->osb_debug_root);
@@ -1341,7 +1341,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
 	sb->s_export_op = &ocfs2_export_ops;
-	osb->osb_locking_proto = ocfs2_locking_protocol;
 	sb->s_time_gran = 1;
 	sb->s_flags |= MS_NOATIME;
 	/* this is needed to support O_LARGEFILE */
@@ -1391,8 +1390,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->local_alloc_state = OCFS2_LA_UNUSED;
 	osb->local_alloc_bh = NULL;
 
-	ocfs2_setup_hb_callbacks(osb);
-
 	init_waitqueue_head(&osb->osb_mount_event);
 
 	osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
-- 
cgit v1.2.3


From 19fdb624dc8ccb663f6e48b3a3a3fa4e4e567fc1 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 30 Jan 2008 15:38:24 -0800
Subject: ocfs2: Abstract out node number queries.

ocfs2 asks the cluster stack for the local node's node number for two
reasons; to fill the slot map and to print it. While the slot map isn't
necessary for userspace cluster stacks, the printing is very nice for
debugging. Thus we add ocfs2_cluster_this_node() as a generic API to get
this value. It is anticipated that the slot map will not be used under a
userspace cluster stack, so validity checks of the node num only need to
exist in the slot map code. Otherwise, it just gets used and printed as an
opaque value.

[ Fixed up some "int" versus "unsigned int" issues and made osb->node_num
  truly opaque. --Mark ]

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2.h     |  2 +-
 fs/ocfs2/slot_map.c  |  2 --
 fs/ocfs2/stackglue.c | 17 +++++++++++++++++
 fs/ocfs2/stackglue.h |  1 +
 fs/ocfs2/super.c     | 22 +++++++++++-----------
 5 files changed, 30 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 664e4fe3eab5..7006abaffd47 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -218,7 +218,7 @@ struct ocfs2_super
 	unsigned int s_atime_quantum;
 
 	unsigned int max_slots;
-	s16 node_num;
+	unsigned int node_num;
 	int slot_num;
 	int preferred_slot;
 	int s_sectsize_bits;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 63fb1b26d964..bb5ff8939bf1 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -73,8 +73,6 @@ static void ocfs2_set_slot(struct ocfs2_slot_info *si,
 			   int slot_num, unsigned int node_num)
 {
 	BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
-	BUG_ON((node_num == O2NM_INVALID_NODE_NUM) ||
-	       (node_num >= O2NM_MAX_NODES));
 
 	si->si_slots[slot_num].sl_valid = 1;
 	si->si_slots[slot_num].sl_node_num = node_num;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index f6f309a08344..814686356cc6 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -25,6 +25,8 @@
 #include <linux/fs.h>
 
 #include "cluster/masklog.h"
+#include "cluster/nodemanager.h"
+
 #include "stackglue.h"
 
 static struct ocfs2_locking_protocol *lproto;
@@ -371,6 +373,21 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn)
 	return 0;
 }
 
+int ocfs2_cluster_this_node(unsigned int *node)
+{
+	int node_num;
+
+	node_num = o2nm_this_node();
+	if (node_num == O2NM_INVALID_NODE_NUM)
+		return -ENOENT;
+
+	if (node_num >= O2NM_MAX_NODES)
+		return -EOVERFLOW;
+
+	*node = node_num;
+	return 0;
+}
+
 void o2cb_get_stack(struct ocfs2_locking_protocol *proto)
 {
 	BUG_ON(proto == NULL);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 3900b5c3933c..ccb03991b514 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -74,6 +74,7 @@ int ocfs2_cluster_connect(const char *group,
 			  void *recovery_data,
 			  struct ocfs2_cluster_connection **conn);
 int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn);
+int ocfs2_cluster_this_node(unsigned int *node);
 
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0ee49757467d..d3c4d323fab5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -694,7 +694,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	if (ocfs2_mount_local(osb))
 		snprintf(nodestr, sizeof(nodestr), "local");
 	else
-		snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+		snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
 
 	printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
 	       "with %s data mode.\n",
@@ -1145,16 +1145,17 @@ static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
 	 * desirable. */
 	if (ocfs2_mount_local(osb))
 		osb->node_num = 0;
-	else
-		osb->node_num = o2nm_this_node();
-
-	if (osb->node_num == O2NM_MAX_NODES) {
-		mlog(ML_ERROR, "could not find this host's node number\n");
-		status = -ENOENT;
-		goto bail;
+	else {
+		status = ocfs2_cluster_this_node(&osb->node_num);
+		if (status < 0) {
+			mlog_errno(status);
+			mlog(ML_ERROR,
+			     "could not find this host's node number\n");
+			goto bail;
+		}
 	}
 
-	mlog(0, "I am node %d\n", osb->node_num);
+	mlog(0, "I am node %u\n", osb->node_num);
 
 	status = 0;
 bail:
@@ -1282,7 +1283,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	if (ocfs2_mount_local(osb))
 		snprintf(nodestr, sizeof(nodestr), "local");
 	else
-		snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+		snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
 
 	printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
 	       osb->dev_str, nodestr);
@@ -1384,7 +1385,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 
-	osb->node_num = O2NM_INVALID_NODE_NUM;
 	osb->slot_num = OCFS2_INVALID_SLOT;
 
 	osb->local_alloc_state = OCFS2_LA_UNUSED;
-- 
cgit v1.2.3


From 6953b4c008628b945bfe0cee97f6e78a98773859 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 29 Jan 2008 16:59:56 -0800
Subject: ocfs2: Move o2hb functionality into the stack glue.

The last bit of classic stack used directly in ocfs2 code is o2hb.
Specifically, the check for heartbeat during mount and the call to
ocfs2_hb_ctl during unmount.

We create an extra API, ocfs2_cluster_hangup(), to encapsulate the call
to ocfs2_hb_ctl.  Other stacks will just leave hangup() empty.

The check for heartbeat is moved into ocfs2_cluster_connect().  It will
be matched by a similar check for other stacks.

With this change, only stackglue.c includes cluster/ headers.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   |  4 ----
 fs/ocfs2/heartbeat.c | 33 ---------------------------------
 fs/ocfs2/heartbeat.h |  1 -
 fs/ocfs2/ioctl.c     |  1 +
 fs/ocfs2/ocfs2.h     |  4 ----
 fs/ocfs2/stackglue.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/stackglue.h |  1 +
 fs/ocfs2/super.c     | 23 ++++++++++-------------
 8 files changed, 62 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6652a48cbe0f..05fd016ba4bf 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,10 +32,6 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
 
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index dcac1a487288..c6e7213db868 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/kmod.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -83,38 +82,6 @@ void ocfs2_do_node_down(int node_num, void *data)
 	ocfs2_recovery_thread(osb, node_num);
 }
 
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
-{
-	int ret;
-	char *argv[5], *envp[3];
-
-	if (ocfs2_mount_local(osb))
-		return;
-
-	if (!osb->uuid_str) {
-		/* This can happen if we don't get far enough in mount... */
-		mlog(0, "No UUID with which to stop heartbeat!\n\n");
-		return;
-	}
-
-	argv[0] = (char *)o2nm_get_hb_ctl_path();
-	argv[1] = "-K";
-	argv[2] = "-u";
-	argv[3] = osb->uuid_str;
-	argv[4] = NULL;
-
-	mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
-
-	/* minimal command environment taken from cpu_run_sbin_hotplug */
-	envp[0] = "HOME=/";
-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-	envp[2] = NULL;
-
-	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
-	if (ret < 0)
-		mlog_errno(ret);
-}
-
 static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
 					    int bit)
 {
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index 38e2450bf14c..74b9c5dda28d 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,7 +29,6 @@
 void ocfs2_init_node_maps(struct ocfs2_super *osb);
 
 void ocfs2_do_node_down(int node_num, void *data);
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
 
 /* node map functions - used to keep track of mounted and in-recovery
  * nodes. */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 5177fba5162b..ab1c2167d7f4 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/smp_lock.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7006abaffd47..31dc28b48392 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -36,10 +36,6 @@
 #include <linux/mutex.h>
 #include <linux/jbd.h>
 
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
 /* For union ocfs2_dlm_lksb */
 #include "stackglue.h"
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 814686356cc6..670fa945c212 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -20,12 +20,14 @@
 
 #include <linux/slab.h>
 #include <linux/crc32.h>
+#include <linux/kmod.h>
 
 /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
 #include <linux/fs.h>
 
 #include "cluster/masklog.h"
 #include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
 
 #include "stackglue.h"
 
@@ -301,6 +303,13 @@ int ocfs2_cluster_connect(const char *group,
 		goto out;
 	}
 
+	/* for now we only have one cluster/node, make sure we see it
+	 * in the heartbeat universe */
+	if (!o2hb_check_local_node_heartbeating()) {
+		rc = -EINVAL;
+		goto out;
+	}
+
 	new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
 			   GFP_KERNEL);
 	if (!new_conn) {
@@ -359,6 +368,7 @@ out:
 	return rc;
 }
 
+
 int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn)
 {
 	struct dlm_ctxt *dlm = conn->cc_lockspace;
@@ -373,6 +383,46 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn)
 	return 0;
 }
 
+static void o2hb_stop(const char *group)
+{
+	int ret;
+	char *argv[5], *envp[3];
+
+	argv[0] = (char *)o2nm_get_hb_ctl_path();
+	argv[1] = "-K";
+	argv[2] = "-u";
+	argv[3] = (char *)group;
+	argv[4] = NULL;
+
+	mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+
+	/* minimal command environment taken from cpu_run_sbin_hotplug */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	if (ret < 0)
+		mlog_errno(ret);
+}
+
+/*
+ * Hangup is a hack for tools compatibility.  Older ocfs2-tools software
+ * expects the filesystem to call "ocfs2_hb_ctl" during unmount.  This
+ * happens regardless of whether the DLM got started, so we can't do it
+ * in ocfs2_cluster_disconnect().  We bring the o2hb_stop() function into
+ * the glue and provide a "hangup" API for super.c to call.
+ *
+ * Other stacks will eventually provide a NULL ->hangup() pointer.
+ */
+void ocfs2_cluster_hangup(const char *group, int grouplen)
+{
+	BUG_ON(group == NULL);
+	BUG_ON(group[grouplen] != '\0');
+
+	o2hb_stop(group);
+}
+
 int ocfs2_cluster_this_node(unsigned int *node)
 {
 	int node_num;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index ccb03991b514..22af77b9a690 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -74,6 +74,7 @@ int ocfs2_cluster_connect(const char *group,
 			  void *recovery_data,
 			  struct ocfs2_cluster_connection **conn);
 int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn);
+void ocfs2_cluster_hangup(const char *group, int grouplen);
 int ocfs2_cluster_this_node(unsigned int *node);
 
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index d3c4d323fab5..8f536b39ce5b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -40,8 +40,7 @@
 #include <linux/crc32.h>
 #include <linux/debugfs.h>
 #include <linux/mount.h>
-
-#include <cluster/nodemanager.h>
+#include <linux/seq_file.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -579,15 +578,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 		goto read_super_error;
 	}
 
-	/* for now we only have one cluster/node, make sure we see it
-	 * in the heartbeat universe */
-	if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) {
-		if (!o2hb_check_local_node_heartbeating()) {
-			status = -EINVAL;
-			goto read_super_error;
-		}
-	}
-
 	/* probe for superblock */
 	status = ocfs2_sb_probe(sb, &bh, &sector_size);
 	if (status < 0) {
@@ -1275,8 +1265,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	debugfs_remove(osb->osb_debug_root);
 
-	if (!mnt_err)
-		ocfs2_stop_heartbeat(osb);
+	/*
+	 * This is a small hack to move ocfs2_hb_ctl into stackglue.
+	 * If we're dismounting due to mount error, mount.ocfs2 will clean
+	 * up heartbeat.  If we're a local mount, there is no heartbeat.
+	 * If we failed before we got a uuid_str yet, we can't stop
+	 * heartbeat.  Otherwise, do it.
+	 */
+	if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+		ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
 
 	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
 
-- 
cgit v1.2.3


From 0abd6d1803b01c741430af270026d1d95a103d9c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 29 Jan 2008 16:59:56 -0800
Subject: ocfs2: Fill node number during cluster stack init

It doesn't make sense to query for a node number before connecting to the
cluster stack. This should be safe to do because node_num is only just
printed,
and we're actually only moving the setting of node num a small amount
further in the mount process.

[ Disconnect when node query fails -- Joel ]

Reviewed-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c | 13 ++++++++++++-
 fs/ocfs2/super.c   | 33 ---------------------------------
 2 files changed, 12 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 05fd016ba4bf..c7653bb343e1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2459,8 +2459,10 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	if (ocfs2_mount_local(osb))
+	if (ocfs2_mount_local(osb)) {
+		osb->node_num = 0;
 		goto local;
+	}
 
 	status = ocfs2_dlm_init_debug(osb);
 	if (status < 0) {
@@ -2487,6 +2489,15 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 		goto bail;
 	}
 
+	status = ocfs2_cluster_this_node(&osb->node_num);
+	if (status < 0) {
+		mlog_errno(status);
+		mlog(ML_ERROR,
+		     "could not find this host's node number\n");
+		ocfs2_cluster_disconnect(conn);
+		goto bail;
+	}
+
 local:
 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8f536b39ce5b..fa9c46e2eab8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -108,7 +108,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
 static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
 static int ocfs2_check_volume(struct ocfs2_super *osb);
 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 			       struct buffer_head *bh,
@@ -1126,32 +1125,6 @@ static int ocfs2_get_sector(struct super_block *sb,
 	return 0;
 }
 
-/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
-{
-	int status;
-
-	/* XXX hold a ref on the node while mounte?  easy enough, if
-	 * desirable. */
-	if (ocfs2_mount_local(osb))
-		osb->node_num = 0;
-	else {
-		status = ocfs2_cluster_this_node(&osb->node_num);
-		if (status < 0) {
-			mlog_errno(status);
-			mlog(ML_ERROR,
-			     "could not find this host's node number\n");
-			goto bail;
-		}
-	}
-
-	mlog(0, "I am node %u\n", osb->node_num);
-
-	status = 0;
-bail:
-	return status;
-}
-
 static int ocfs2_mount_volume(struct super_block *sb)
 {
 	int status = 0;
@@ -1163,12 +1136,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
 	if (ocfs2_is_hard_readonly(osb))
 		goto leave;
 
-	status = ocfs2_fill_local_node_info(osb);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 	status = ocfs2_dlm_init(osb);
 	if (status < 0) {
 		mlog_errno(status);
-- 
cgit v1.2.3


From de551246e7bc5558371c3427889a8db1b8cc60f4 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 14:45:08 -0800
Subject: ocfs2: Remove CANCELGRANT from the view of dlmglue.

o2dlm has the non-standard behavior of providing a cancel callback
(unlock_ast) even when the cancel has failed (the locking operation
succeeded without canceling).  This is called CANCELGRANT after the
status code sent to the callback.  fs/dlm does not provide this
callback, so dlmglue must be changed to live without it.
o2dlm_unlock_ast_wrapper() in stackglue now ignores CANCELGRANT calls.

Because dlmglue no longer sees CANCELGRANT, ocfs2_unlock_ast() no longer
needs to check for it.  ocfs2_locking_ast() must catch that a cancel was
tried and clear the cancel state.

Making these changes opens up a locking race.  dlmglue uses the the
OCFS2_LOCK_BUSY flag to ensure only one thread is calling the dlm at any
one time.  But dlmglue must unlock the lockres before calling into the
dlm.  In the small window of time between unlocking the lockres and
calling the dlm, the downconvert thread can try to cancel the lock.  The
downconvert thread is checking the OCFS2_LOCK_BUSY flag - it doesn't
know that ocfs2_dlm_lock() has not yet been called.

Because ocfs2_dlm_lock() has not yet been called, the cancel operation
will just be a no-op.  There's nothing to cancel.  With CANCELGRANT,
dlmglue uses the CANCELGRANT callback to clear up the cancel state.
When it comes around again, it will retry the cancel.  Eventually, the
first thread will have called into ocfs2_dlm_lock(), and either the
lock or the cancel will succeed.  The downconvert thread can then do its
downconvert.

Without CANCELGRANT, there is nothing to clean up the cancellation
state.  The downconvert thread does not know to retry its operations.
More importantly, the original lock may be blocking on the other node
that is trying to cancel us.  With neither able to make progress, the
ast is never called and the cancellation state is never cleaned up that
way.  dlmglue is deadlocked.

The OCFS2_LOCK_PENDING flag is introduced to remedy this window.  It is
set at the same time OCFS2_LOCK_BUSY is.  Thus, the downconvert thread
can check whether the lock is cancelable.  If not, it just loops around
to try again.  Once ocfs2_dlm_lock() is called, the thread then clears
OCFS2_LOCK_PENDING and wakes the downconvert thread.  Now, if the
downconvert thread finds the lock BUSY, it can safely try to cancel it.
Whether the cancel works or not, the state will be properly set and the
lock processing can continue.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   | 199 +++++++++++++++++++++++++++++++++++++++++++--------
 fs/ocfs2/ocfs2.h     |   4 ++
 fs/ocfs2/stackglue.c |  40 ++++-------
 3 files changed, 188 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c7653bb343e1..295c47f7aba2 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -311,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 				  struct buffer_head **bh);
 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
 static inline int ocfs2_highest_compat_lock_level(int level);
-static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
-				      int new_level);
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+					      int new_level);
 static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 				  struct ocfs2_lock_res *lockres,
 				  int new_level,
-				  int lvb);
+				  int lvb,
+				  unsigned int generation);
 static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
 				        struct ocfs2_lock_res *lockres);
 static int ocfs2_cancel_convert(struct ocfs2_super *osb,
@@ -736,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
 	return needs_downconvert;
 }
 
+/*
+ * OCFS2_LOCK_PENDING and l_pending_gen.
+ *
+ * Why does OCFS2_LOCK_PENDING exist?  To close a race between setting
+ * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock().  See ocfs2_unblock_lock()
+ * for more details on the race.
+ *
+ * OCFS2_LOCK_PENDING closes the race quite nicely.  However, it introduces
+ * a race on itself.  In o2dlm, we can get the ast before ocfs2_dlm_lock()
+ * returns.  The ast clears OCFS2_LOCK_BUSY, and must therefore clear
+ * OCFS2_LOCK_PENDING at the same time.  When ocfs2_dlm_lock() returns,
+ * the caller is going to try to clear PENDING again.  If nothing else is
+ * happening, __lockres_clear_pending() sees PENDING is unset and does
+ * nothing.
+ *
+ * But what if another path (eg downconvert thread) has just started a
+ * new locking action?  The other path has re-set PENDING.  Our path
+ * cannot clear PENDING, because that will re-open the original race
+ * window.
+ *
+ * [Example]
+ *
+ * ocfs2_meta_lock()
+ *  ocfs2_cluster_lock()
+ *   set BUSY
+ *   set PENDING
+ *   drop l_lock
+ *   ocfs2_dlm_lock()
+ *    ocfs2_locking_ast()		ocfs2_downconvert_thread()
+ *     clear PENDING			 ocfs2_unblock_lock()
+ *					  take_l_lock
+ *					  !BUSY
+ *					  ocfs2_prepare_downconvert()
+ *					   set BUSY
+ *					   set PENDING
+ *					  drop l_lock
+ *   take l_lock
+ *   clear PENDING
+ *   drop l_lock
+ *			<window>
+ *					  ocfs2_dlm_lock()
+ *
+ * So as you can see, we now have a window where l_lock is not held,
+ * PENDING is not set, and ocfs2_dlm_lock() has not been called.
+ *
+ * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
+ * set by ocfs2_prepare_downconvert().  That wasn't nice.
+ *
+ * To solve this we introduce l_pending_gen.  A call to
+ * lockres_clear_pending() will only do so when it is passed a generation
+ * number that matches the lockres.  lockres_set_pending() will return the
+ * current generation number.  When ocfs2_cluster_lock() goes to clear
+ * PENDING, it passes the generation it got from set_pending().  In our
+ * example above, the generation numbers will *not* match.  Thus,
+ * ocfs2_cluster_lock() will not clear the PENDING set by
+ * ocfs2_prepare_downconvert().
+ */
+
+/* Unlocked version for ocfs2_locking_ast() */
+static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
+				    unsigned int generation,
+				    struct ocfs2_super *osb)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	/*
+	 * The ast and locking functions can race us here.  The winner
+	 * will clear pending, the loser will not.
+	 */
+	if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
+	    (lockres->l_pending_gen != generation))
+		return;
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
+	lockres->l_pending_gen++;
+
+	/*
+	 * The downconvert thread may have skipped us because we
+	 * were PENDING.  Wake it up.
+	 */
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		ocfs2_wake_downconvert_thread(osb);
+}
+
+/* Locked version for callers of ocfs2_dlm_lock() */
+static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
+				  unsigned int generation,
+				  struct ocfs2_super *osb)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	__lockres_clear_pending(lockres, generation, osb);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
+{
+	assert_spin_locked(&lockres->l_lock);
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+
+	lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
+
+	return lockres->l_pending_gen;
+}
+
+
 static void ocfs2_blocking_ast(void *opaque, int level)
 {
 	struct ocfs2_lock_res *lockres = opaque;
@@ -770,6 +878,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
 static void ocfs2_locking_ast(void *opaque)
 {
 	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
 	unsigned long flags;
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
@@ -805,6 +914,18 @@ static void ocfs2_locking_ast(void *opaque)
 	 * can catch it. */
 	lockres->l_action = OCFS2_AST_INVALID;
 
+	/* Did we try to cancel this lock?  Clear that state */
+	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+	/*
+	 * We may have beaten the locking functions here.  We certainly
+	 * know that dlm_lock() has been called :-)
+	 * Because we can't have two lock calls in flight at once, we
+	 * can use lockres->l_pending_gen.
+	 */
+	__lockres_clear_pending(lockres, lockres->l_pending_gen,  osb);
+
 	wake_up(&lockres->l_event);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 }
@@ -838,6 +959,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 {
 	int ret = 0;
 	unsigned long flags;
+	unsigned int gen;
 
 	mlog_entry_void();
 
@@ -854,6 +976,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 	lockres->l_action = OCFS2_AST_ATTACH;
 	lockres->l_requested = level;
 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	gen = lockres_set_pending(lockres);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	ret = ocfs2_dlm_lock(osb->cconn,
@@ -863,6 +986,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 			     lockres->l_name,
 			     OCFS2_LOCK_ID_MAX_LEN - 1,
 			     lockres);
+	lockres_clear_pending(lockres, gen, osb);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
 		ocfs2_recover_from_dlm_error(lockres, 1);
@@ -988,6 +1112,7 @@ static int ocfs2_cluster_lock(struct ocfs2_super *osb,
 	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
 	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
 	unsigned long flags;
+	unsigned int gen;
 
 	mlog_entry_void();
 
@@ -1046,6 +1171,7 @@ again:
 
 		lockres->l_requested = level;
 		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+		gen = lockres_set_pending(lockres);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 		BUG_ON(level == DLM_LOCK_IV);
@@ -1062,6 +1188,7 @@ again:
 				     lockres->l_name,
 				     OCFS2_LOCK_ID_MAX_LEN - 1,
 				     lockres);
+		lockres_clear_pending(lockres, gen, osb);
 		if (ret) {
 			if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
 			    (ret != -EAGAIN)) {
@@ -1506,6 +1633,7 @@ out:
 void ocfs2_file_unlock(struct file *file)
 {
 	int ret;
+	unsigned int gen;
 	unsigned long flags;
 	struct ocfs2_file_private *fp = file->private_data;
 	struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1531,11 +1659,11 @@ void ocfs2_file_unlock(struct file *file)
 	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
 	lockres->l_blocking = DLM_LOCK_EX;
 
-	ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+	gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
 	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-	ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
+	ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
 	if (ret) {
 		mlog_errno(ret);
 		return;
@@ -2555,23 +2683,7 @@ static void ocfs2_unlock_ast(void *opaque, int error)
 	     lockres->l_unlock_action);
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
-	/* We tried to cancel a convert request, but it was already
-	 * granted. All we want to do here is clear our unlock
-	 * state. The wake_up call done at the bottom is redundant
-	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
-	 * hurt anything anyway */
-	if (error == -DLM_ECANCEL &&
-	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
-		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
-
-		/* We don't clear the busy flag in this case as it
-		 * should have been cleared by the ast which the dlm
-		 * has called. */
-		goto complete_unlock;
-	}
-
-	/* DLM_EUNLOCK is the success code for unlock */
-	if (error != -DLM_EUNLOCK) {
+	if (error) {
 		mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
 		     "unlock_action %d\n", error, lockres->l_name,
 		     lockres->l_unlock_action);
@@ -2592,7 +2704,6 @@ static void ocfs2_unlock_ast(void *opaque, int error)
 	}
 
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-complete_unlock:
 	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
@@ -2768,8 +2879,8 @@ int ocfs2_drop_inode_locks(struct inode *inode)
 	return status;
 }
 
-static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
-				      int new_level)
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+					      int new_level)
 {
 	assert_spin_locked(&lockres->l_lock);
 
@@ -2787,12 +2898,14 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
 	lockres->l_action = OCFS2_AST_DOWNCONVERT;
 	lockres->l_requested = new_level;
 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	return lockres_set_pending(lockres);
 }
 
 static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 				  struct ocfs2_lock_res *lockres,
 				  int new_level,
-				  int lvb)
+				  int lvb,
+				  unsigned int generation)
 {
 	int ret;
 	u32 dlm_flags = DLM_LKF_CONVERT;
@@ -2809,6 +2922,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 			     lockres->l_name,
 			     OCFS2_LOCK_ID_MAX_LEN - 1,
 			     lockres);
+	lockres_clear_pending(lockres, generation, osb);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
 		ocfs2_recover_from_dlm_error(lockres, 1);
@@ -2883,6 +2997,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
 	int new_level;
 	int ret = 0;
 	int set_lvb = 0;
+	unsigned int gen;
 
 	mlog_entry_void();
 
@@ -2892,6 +3007,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
 
 recheck:
 	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		/* XXX
+		 * This is a *big* race.  The OCFS2_LOCK_PENDING flag
+		 * exists entirely for one reason - another thread has set
+		 * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
+		 *
+		 * If we do ocfs2_cancel_convert() before the other thread
+		 * calls dlm_lock(), our cancel will do nothing.  We will
+		 * get no ast, and we will have no way of knowing the
+		 * cancel failed.  Meanwhile, the other thread will call
+		 * into dlm_lock() and wait...forever.
+		 *
+		 * Why forever?  Because another node has asked for the
+		 * lock first; that's why we're here in unblock_lock().
+		 *
+		 * The solution is OCFS2_LOCK_PENDING.  When PENDING is
+		 * set, we just requeue the unblock.  Only when the other
+		 * thread has called dlm_lock() and cleared PENDING will
+		 * we then cancel their request.
+		 *
+		 * All callers of dlm_lock() must set OCFS2_DLM_PENDING
+		 * at the same time they set OCFS2_DLM_BUSY.  They must
+		 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
+		 */
+		if (lockres->l_flags & OCFS2_LOCK_PENDING)
+			goto leave_requeue;
+
 		ctl->requeue = 1;
 		ret = ocfs2_prepare_cancel_convert(osb, lockres);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2971,9 +3112,11 @@ downconvert:
 			lockres->l_ops->set_lvb(lockres);
 	}
 
-	ocfs2_prepare_downconvert(lockres, new_level);
+	gen = ocfs2_prepare_downconvert(lockres, new_level);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
-	ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+	ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
+				     gen);
+
 leave:
 	mlog_exit(ret);
 	return ret;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 31dc28b48392..af929eca5412 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -98,6 +98,9 @@ enum ocfs2_unlock_action {
 					       * dropped. */
 #define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
 #define OCFS2_LOCK_NOCACHE       (0x00000200) /* don't use a holder count */
+#define OCFS2_LOCK_PENDING       (0x00000400) /* This lockres is pending a
+						 call to dlm_lock.  Only
+						 exists with BUSY set. */
 
 struct ocfs2_lock_res_ops;
 
@@ -124,6 +127,7 @@ struct ocfs2_lock_res {
 	enum ocfs2_unlock_action l_unlock_action;
 	int                      l_requested;
 	int                      l_blocking;
+	unsigned int             l_pending_gen;
 
 	wait_queue_head_t        l_event;
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 670fa945c212..abdb9f6f4cc9 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -104,8 +104,8 @@ static int flags_to_o2dlm(u32 flags)
  *
  * DLM_NORMAL:		0
  * DLM_NOTQUEUED:	-EAGAIN
- * DLM_CANCELGRANT:	-DLM_ECANCEL
- * DLM_CANCEL:		-DLM_EUNLOCK
+ * DLM_CANCELGRANT:	-EBUSY
+ * DLM_CANCEL:		-DLM_ECANCEL
  */
 /* Keep in sync with dlmapi.h */
 static int status_map[] = {
@@ -113,13 +113,13 @@ static int status_map[] = {
 	[DLM_GRANTED]			= -EINVAL,
 	[DLM_DENIED]			= -EACCES,
 	[DLM_DENIED_NOLOCKS]		= -EACCES,
-	[DLM_WORKING]			= -EBUSY,
+	[DLM_WORKING]			= -EACCES,
 	[DLM_BLOCKED]			= -EINVAL,
 	[DLM_BLOCKED_ORPHAN]		= -EINVAL,
 	[DLM_DENIED_GRACE_PERIOD]	= -EACCES,
 	[DLM_SYSERR]			= -ENOMEM,	/* It is what it is */
 	[DLM_NOSUPPORT]			= -EPROTO,
-	[DLM_CANCELGRANT]		= -DLM_ECANCEL, /* Cancel after grant */
+	[DLM_CANCELGRANT]		= -EBUSY,	/* Cancel after grant */
 	[DLM_IVLOCKID]			= -EINVAL,
 	[DLM_SYNC]			= -EINVAL,
 	[DLM_BADTYPE]			= -EINVAL,
@@ -137,7 +137,7 @@ static int status_map[] = {
 	[DLM_VALNOTVALID]		= -EINVAL,
 	[DLM_REJECTED]			= -EPERM,
 	[DLM_ABORT]			= -EINVAL,
-	[DLM_CANCEL]			= -DLM_EUNLOCK,	/* Successful cancel */
+	[DLM_CANCEL]			= -DLM_ECANCEL,	/* Successful cancel */
 	[DLM_IVRESHANDLE]		= -EINVAL,
 	[DLM_DEADLOCK]			= -EDEADLK,
 	[DLM_DENIED_NOASTS]		= -EINVAL,
@@ -152,6 +152,7 @@ static int status_map[] = {
 	[DLM_MIGRATING]			= -ERESTART,
 	[DLM_MAXSTATS]			= -EINVAL,
 };
+
 static int dlm_status_to_errno(enum dlm_status status)
 {
 	BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
@@ -175,38 +176,23 @@ static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
 
 static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 {
-	int error;
+	int error = dlm_status_to_errno(status);
 
 	BUG_ON(lproto == NULL);
 
 	/*
-	 * XXX: CANCEL values are sketchy.
-	 *
-	 * Currently we have preserved the o2dlm paradigm.  You can get
-	 * unlock_ast() whether the cancel succeded or not.
-	 *
-	 * First, we're going to pass DLM_EUNLOCK just like fs/dlm does for
-	 * successful unlocks.  That is a clean behavior.
-	 *
 	 * In o2dlm, you can get both the lock_ast() for the lock being
 	 * granted and the unlock_ast() for the CANCEL failing.  A
 	 * successful cancel sends DLM_NORMAL here.  If the
 	 * lock grant happened before the cancel arrived, you get
-	 * DLM_CANCELGRANT.  For now, we'll use DLM_ECANCEL to signify
-	 * CANCELGRANT - the CANCEL was supposed to happen but didn't.  We
-	 * can then use DLM_EUNLOCK to signify a successful CANCEL -
-	 * effectively, the CANCEL caused the lock to roll back.
+	 * DLM_CANCELGRANT.
 	 *
-	 * In the future, we will likely move the o2dlm to send only one
-	 * ast - either unlock_ast() for a successful CANCEL or lock_ast()
-	 * when the grant succeeds.  At that point, we'll send DLM_ECANCEL
-	 * for all cancel results (CANCELGRANT will no longer exist).
+	 * There's no need for the double-ast.  If we see DLM_CANCELGRANT,
+	 * we just ignore it.  We expect the lock_ast() to handle the
+	 * granted lock.
 	 */
-	error = dlm_status_to_errno(status);
-
-	/* Successful unlock is DLM_EUNLOCK */
-	if (!error)
-		error = -DLM_EUNLOCK;
+	if (status == DLM_CANCELGRANT)
+		return;
 
 	lproto->lp_unlock_ast(astarg, error);
 }
-- 
cgit v1.2.3


From 1693a5c0117f8ccd010a666f97aaf0f14fb0a0e4 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 30 Jan 2008 16:52:53 -0800
Subject: ocfs2: handle async EAGAIN from NOQUEUE request

When using fsdlm, -EAGAIN is returned in the async callback for NOQUEUE
requests. Fix up dlmglue to expect this.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 295c47f7aba2..b640423b936a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -880,13 +880,20 @@ static void ocfs2_locking_ast(void *opaque)
 	struct ocfs2_lock_res *lockres = opaque;
 	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
 	unsigned long flags;
+	int status;
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 
-	if (ocfs2_dlm_lock_status(&lockres->l_lksb)) {
+	status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+
+	if (status == -EAGAIN) {
+		lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+		goto out;
+	}
+
+	if (status) {
 		mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
-		     lockres->l_name,
-		     ocfs2_dlm_lock_status(&lockres->l_lksb));
+		     lockres->l_name, status);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
 		return;
 	}
@@ -909,7 +916,7 @@ static void ocfs2_locking_ast(void *opaque)
 		     lockres->l_unlock_action);
 		BUG();
 	}
-
+out:
 	/* set it to something invalid so if we get called again we
 	 * can catch it. */
 	lockres->l_action = OCFS2_AST_INVALID;
@@ -1113,6 +1120,7 @@ static int ocfs2_cluster_lock(struct ocfs2_super *osb,
 	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
 	unsigned long flags;
 	unsigned int gen;
+	int noqueue_attempted = 0;
 
 	mlog_entry_void();
 
@@ -1157,6 +1165,13 @@ again:
 	}
 
 	if (level > lockres->l_level) {
+		if (noqueue_attempted > 0) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
+		if (lkm_flags & DLM_LKF_NOQUEUE)
+			noqueue_attempted = 1;
+
 		if (lockres->l_action != OCFS2_AST_INVALID)
 			mlog(ML_ERROR, "lockres %s has action %u pending\n",
 			     lockres->l_name, lockres->l_action);
@@ -1621,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
 		 * to just bubble sucess back up to the user.
 		 */
 		ret = ocfs2_flock_handle_signal(lockres, level);
+	} else if (!ret && (level > lockres->l_level)) {
+		/* Trylock failed asynchronously */
+		BUG_ON(!trylock);
+		ret = -EAGAIN;
 	}
 
 out:
-- 
cgit v1.2.3


From cf0acdcd640e9466059e69951c557e90b4bee45a Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 29 Jan 2008 16:59:55 -0800
Subject: ocfs2: Abstract out a debugging function for underlying dlms.

dlmglue.c was still referencing a raw o2dlm lksb in one instance.  Let's
create a generic ocfs2_dlm_dump_lksb() function.  This allows underlying
DLMs to print whatever they want about their lock.

We then move the o2dlm dump into stackglue.c where it belongs.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   | 3 +--
 fs/ocfs2/stackglue.c | 5 +++++
 fs/ocfs2/stackglue.h | 1 +
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b640423b936a..f41ff1c10ae8 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2803,8 +2803,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
 		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
-		/* XXX Need to abstract this */
-		dlm_print_one_lock(lockres->l_lksb.lksb_o2dlm.lockid);
+		ocfs2_dlm_dump_lksb(&lockres->l_lksb);
 		BUG();
 	}
 	mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index abdb9f6f4cc9..bd805411a856 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -252,6 +252,11 @@ void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 	return (void *)(lksb->lksb_o2dlm.lvb);
 }
 
+void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+	dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
+}
+
 /*
  * Called from the dlm when it's about to evict a node. This is how the
  * classic stack signals node death.
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 22af77b9a690..01e3c9b9192a 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -91,6 +91,7 @@ int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 
 int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
+void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
 
 void o2cb_get_stack(struct ocfs2_locking_protocol *proto);
 void o2cb_put_stack(void);
-- 
cgit v1.2.3


From 63e0c48ae6986a5bbb8e8dd9210c0e6ca79f2e50 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 30 Jan 2008 16:58:36 -0800
Subject: ocfs2: Clean up stackglue initialization

The stack glue initialization function needs a better name so that it can be
used cleanly when stackglue becomes a module.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   | 9 ++-------
 fs/ocfs2/dlmglue.h   | 5 ++---
 fs/ocfs2/stackglue.c | 8 ++------
 fs/ocfs2/stackglue.h | 3 +--
 fs/ocfs2/super.c     | 6 ++----
 5 files changed, 9 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f41ff1c10ae8..8a9c84909be3 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3366,16 +3366,11 @@ static struct ocfs2_locking_protocol lproto = {
 	.lp_unlock_ast		= ocfs2_unlock_ast,
 };
 
-/* This interface isn't the final one, hence the less-than-perfect names */
-void dlmglue_init_stack(void)
+void ocfs2_set_locking_protocol(void)
 {
-	o2cb_get_stack(&lproto);
+	ocfs2_stack_glue_set_locking_protocol(&lproto);
 }
 
-void dlmglue_exit_stack(void)
-{
-	o2cb_put_stack();
-}
 
 static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 				       struct ocfs2_lock_res *lockres)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2d0a8a03c431..34b7598a0dc6 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -114,7 +114,6 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
 
-void dlmglue_init_stack(void);
-void dlmglue_exit_stack(void);
-
+/* To set the locking protocol on module initialization */
+void ocfs2_set_locking_protocol(void);
 #endif	/* DLMGLUE_H */
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index bd805411a856..51c2546b328d 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -429,14 +429,10 @@ int ocfs2_cluster_this_node(unsigned int *node)
 	return 0;
 }
 
-void o2cb_get_stack(struct ocfs2_locking_protocol *proto)
+void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
 {
-	BUG_ON(proto == NULL);
+	BUG_ON(proto != NULL);
 
 	lproto = proto;
 }
 
-void o2cb_put_stack(void)
-{
-	lproto = NULL;
-}
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 01e3c9b9192a..decb147106fd 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -93,7 +93,6 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
 void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
 
-void o2cb_get_stack(struct ocfs2_locking_protocol *proto);
-void o2cb_put_stack(void);
+void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
 
 #endif  /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fa9c46e2eab8..b4a02a00665d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -922,8 +922,6 @@ static int __init ocfs2_init(void)
 
 	ocfs2_print_version();
 
-	dlmglue_init_stack();
-
 	status = init_ocfs2_uptodate_cache();
 	if (status < 0) {
 		mlog_errno(status);
@@ -948,6 +946,8 @@ static int __init ocfs2_init(void)
 		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
 	}
 
+	ocfs2_set_locking_protocol();
+
 leave:
 	if (status < 0) {
 		ocfs2_free_mem_caches();
@@ -979,8 +979,6 @@ static void __exit ocfs2_exit(void)
 
 	exit_ocfs2_uptodate_cache();
 
-	dlmglue_exit_stack();
-
 	mlog_exit_void();
 }
 
-- 
cgit v1.2.3


From 553aa7e408eac402c00b67ddfa7aec13fe1f3a33 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 14:51:03 -0800
Subject: ocfs2: Split o2cb code from generic stack functions.

Split off the o2cb-specific funtionality from the generic stack glue
calls.  This is a precurser to wrapping the o2cb functionality in an
operations vector.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stackglue.c | 209 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 144 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 51c2546b328d..e35dde6217f5 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -197,21 +197,19 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 	lproto->lp_unlock_ast(astarg, error);
 }
 
-int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
-		   int mode,
-		   union ocfs2_dlm_lksb *lksb,
-		   u32 flags,
-		   void *name,
-		   unsigned int namelen,
-		   void *astarg)
+static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
+			 int mode,
+			 union ocfs2_dlm_lksb *lksb,
+			 u32 flags,
+			 void *name,
+			 unsigned int namelen,
+			 void *astarg)
 {
 	enum dlm_status status;
 	int o2dlm_mode = mode_to_o2dlm(mode);
 	int o2dlm_flags = flags_to_o2dlm(flags);
 	int ret;
 
-	BUG_ON(lproto == NULL);
-
 	status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
 			 o2dlm_flags, name, namelen,
 			 o2dlm_lock_ast_wrapper, astarg,
@@ -220,43 +218,80 @@ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 	return ret;
 }
 
-int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
-		     union ocfs2_dlm_lksb *lksb,
-		     u32 flags,
-		     void *astarg)
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+		   int mode,
+		   union ocfs2_dlm_lksb *lksb,
+		   u32 flags,
+		   void *name,
+		   unsigned int namelen,
+		   void *astarg)
+{
+	BUG_ON(lproto == NULL);
+
+	return o2cb_dlm_lock(conn, mode, lksb, flags,
+			     name, namelen, astarg);
+}
+
+static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
+			   union ocfs2_dlm_lksb *lksb,
+			   u32 flags,
+			   void *astarg)
 {
 	enum dlm_status status;
 	int o2dlm_flags = flags_to_o2dlm(flags);
 	int ret;
 
-	BUG_ON(lproto == NULL);
-
 	status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
 			   o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
 	ret = dlm_status_to_errno(status);
 	return ret;
 }
 
-int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+		     union ocfs2_dlm_lksb *lksb,
+		     u32 flags,
+		     void *astarg)
+{
+	BUG_ON(lproto == NULL);
+
+	return o2cb_dlm_unlock(conn, lksb, flags, astarg);
+}
+
+static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
 {
 	return dlm_status_to_errno(lksb->lksb_o2dlm.status);
 }
 
+int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+	return o2cb_dlm_lock_status(lksb);
+}
+
 /*
  * Why don't we cast to ocfs2_meta_lvb?  The "clean" answer is that we
  * don't cast at the glue level.  The real answer is that the header
  * ordering is nigh impossible.
  */
-void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 {
 	return (void *)(lksb->lksb_o2dlm.lvb);
 }
 
-void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+	return o2cb_dlm_lvb(lksb);
+}
+
+static void o2cb_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 {
 	dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
 }
 
+void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+	o2cb_dlm_dump_lksb(lksb);
+}
+
 /*
  * Called from the dlm when it's about to evict a node. This is how the
  * classic stack signals node death.
@@ -271,6 +306,62 @@ static void o2dlm_eviction_cb(int node_num, void *data)
 	conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
 }
 
+static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+	int rc = 0;
+	u32 dlm_key;
+	struct dlm_ctxt *dlm;
+	struct o2dlm_private *priv;
+	struct dlm_protocol_version dlm_version;
+
+	BUG_ON(conn == NULL);
+
+	/* for now we only have one cluster/node, make sure we see it
+	 * in the heartbeat universe */
+	if (!o2hb_check_local_node_heartbeating()) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
+	if (!priv) {
+		rc = -ENOMEM;
+		goto out_free;
+	}
+
+	/* This just fills the structure in.  It is safe to pass conn. */
+	dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
+			      conn);
+
+	conn->cc_private = priv;
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. */
+	dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
+	dlm_version.pv_major = conn->cc_version.pv_major;
+	dlm_version.pv_minor = conn->cc_version.pv_minor;
+
+	dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
+	if (IS_ERR(dlm)) {
+		rc = PTR_ERR(dlm);
+		mlog_errno(rc);
+		goto out_free;
+	}
+
+	conn->cc_version.pv_major = dlm_version.pv_major;
+	conn->cc_version.pv_minor = dlm_version.pv_minor;
+	conn->cc_lockspace = dlm;
+
+	dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
+
+out_free:
+	if (rc && conn->cc_private)
+		kfree(conn->cc_private);
+
+out:
+	return rc;
+}
+
 int ocfs2_cluster_connect(const char *group,
 			  int grouplen,
 			  void (*recovery_handler)(int node_num,
@@ -280,10 +371,6 @@ int ocfs2_cluster_connect(const char *group,
 {
 	int rc = 0;
 	struct ocfs2_cluster_connection *new_conn;
-	u32 dlm_key;
-	struct dlm_ctxt *dlm;
-	struct o2dlm_private *priv;
-	struct dlm_protocol_version dlm_version;
 
 	BUG_ON(group == NULL);
 	BUG_ON(conn == NULL);
@@ -294,13 +381,6 @@ int ocfs2_cluster_connect(const char *group,
 		goto out;
 	}
 
-	/* for now we only have one cluster/node, make sure we see it
-	 * in the heartbeat universe */
-	if (!o2hb_check_local_node_heartbeating()) {
-		rc = -EINVAL;
-		goto out;
-	}
-
 	new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
 			   GFP_KERNEL);
 	if (!new_conn) {
@@ -316,64 +396,53 @@ int ocfs2_cluster_connect(const char *group,
 	/* Start the new connection at our maximum compatibility level */
 	new_conn->cc_version = lproto->lp_max_version;
 
-	priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
-	if (!priv) {
-		rc = -ENOMEM;
-		goto out_free;
-	}
-
-	/* This just fills the structure in.  It is safe to use new_conn. */
-	dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
-			      new_conn);
-
-	new_conn->cc_private = priv;
-
-	/* used by the dlm code to make message headers unique, each
-	 * node in this domain must agree on this. */
-	dlm_key = crc32_le(0, group, grouplen);
-	dlm_version.pv_major = new_conn->cc_version.pv_major;
-	dlm_version.pv_minor = new_conn->cc_version.pv_minor;
-
-	dlm = dlm_register_domain(group, dlm_key, &dlm_version);
-	if (IS_ERR(dlm)) {
-		rc = PTR_ERR(dlm);
+	rc = o2cb_cluster_connect(new_conn);
+	if (rc) {
 		mlog_errno(rc);
 		goto out_free;
 	}
 
-	new_conn->cc_version.pv_major = dlm_version.pv_major;
-	new_conn->cc_version.pv_minor = dlm_version.pv_minor;
-	new_conn->cc_lockspace = dlm;
-
-	dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
-
 	*conn = new_conn;
 
 out_free:
-	if (rc) {
-		kfree(new_conn->cc_private);
+	if (rc)
 		kfree(new_conn);
-	}
 
 out:
 	return rc;
 }
 
 
-int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
 {
 	struct dlm_ctxt *dlm = conn->cc_lockspace;
 	struct o2dlm_private *priv = conn->cc_private;
 
 	dlm_unregister_eviction_cb(&priv->op_eviction_cb);
-	dlm_unregister_domain(dlm);
-
+	conn->cc_private = NULL;
 	kfree(priv);
-	kfree(conn);
+
+	dlm_unregister_domain(dlm);
+	conn->cc_lockspace = NULL;
 
 	return 0;
 }
 
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+	int ret;
+
+	BUG_ON(conn == NULL);
+
+	ret = o2cb_cluster_disconnect(conn);
+
+	/* XXX Should we free it anyway? */
+	if (!ret)
+		kfree(conn);
+
+	return ret;
+}
+
 static void o2hb_stop(const char *group)
 {
 	int ret;
@@ -406,15 +475,20 @@ static void o2hb_stop(const char *group)
  *
  * Other stacks will eventually provide a NULL ->hangup() pointer.
  */
+static void o2cb_cluster_hangup(const char *group, int grouplen)
+{
+	o2hb_stop(group);
+}
+
 void ocfs2_cluster_hangup(const char *group, int grouplen)
 {
 	BUG_ON(group == NULL);
 	BUG_ON(group[grouplen] != '\0');
 
-	o2hb_stop(group);
+	o2cb_cluster_hangup(group, grouplen);
 }
 
-int ocfs2_cluster_this_node(unsigned int *node)
+static int o2cb_cluster_this_node(unsigned int *node)
 {
 	int node_num;
 
@@ -429,6 +503,11 @@ int ocfs2_cluster_this_node(unsigned int *node)
 	return 0;
 }
 
+int ocfs2_cluster_this_node(unsigned int *node)
+{
+	return o2cb_cluster_this_node(node);
+}
+
 void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
 {
 	BUG_ON(proto != NULL);
-- 
cgit v1.2.3


From e3dad42bf993a0f24eb6e46152356c9b119c15e8 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 15:02:36 -0800
Subject: ocfs2: Create ocfs2_stack_operations and split out the o2cb stack.

Define the ocfs2_stack_operations structure.  Build o2cb_stack_ops from
all of the o2cb-specific stack functions.  Change the generic stack glue
functions to call the stack_ops instead of the o2cb functions directly.

The o2cb functions are moved to stack_o2cb.c.  The headers are cleaned up
to where only needed headers are included.

In this code, stackglue.c and stack_o2cb.c refer to some shared
extern variables.  When they become modules, that will change.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/Makefile     |   1 +
 fs/ocfs2/stack_o2cb.c | 395 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/stackglue.c  | 385 ++----------------------------------------------
 fs/ocfs2/stackglue.h  | 123 +++++++++++++++-
 4 files changed, 532 insertions(+), 372 deletions(-)
 create mode 100644 fs/ocfs2/stack_o2cb.c

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 3ba64af26951..8e86195fedbf 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -25,6 +25,7 @@ ocfs2-objs := \
 	resize.o		\
 	slot_map.o 		\
 	stackglue.o		\
+	stack_o2cb.o		\
 	suballoc.o 		\
 	super.o 		\
 	symlink.o 		\
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
new file mode 100644
index 000000000000..c9bc3541a33f
--- /dev/null
+++ b/fs/ocfs2/stack_o2cb.c
@@ -0,0 +1,395 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_o2cb.c
+ *
+ * Code which interfaces ocfs2 with the o2cb stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/crc32.h>
+#include <linux/kmod.h>
+
+/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
+#include <linux/fs.h>
+
+#include "cluster/masklog.h"
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+
+#include "stackglue.h"
+
+struct o2dlm_private {
+	struct dlm_eviction_cb op_eviction_cb;
+};
+
+/* These should be identical */
+#if (DLM_LOCK_IV != LKM_IVMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_NL != LKM_NLMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CR != LKM_CRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CW != LKM_CWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PR != LKM_PRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PW != LKM_PWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_EX != LKM_EXMODE)
+# error Lock modes do not match
+#endif
+static inline int mode_to_o2dlm(int mode)
+{
+	BUG_ON(mode > LKM_MAXMODE);
+
+	return mode;
+}
+
+#define map_flag(_generic, _o2dlm)		\
+	if (flags & (_generic)) {		\
+		flags &= ~(_generic);		\
+		o2dlm_flags |= (_o2dlm);	\
+	}
+static int flags_to_o2dlm(u32 flags)
+{
+	int o2dlm_flags = 0;
+
+	map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
+	map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
+	map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
+	map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
+	map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
+	map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
+	map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
+	map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
+	map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
+
+	/* map_flag() should have cleared every flag passed in */
+	BUG_ON(flags != 0);
+
+	return o2dlm_flags;
+}
+#undef map_flag
+
+/*
+ * Map an o2dlm status to standard errno values.
+ *
+ * o2dlm only uses a handful of these, and returns even fewer to the
+ * caller. Still, we try to assign sane values to each error.
+ *
+ * The following value pairs have special meanings to dlmglue, thus
+ * the right hand side needs to stay unique - never duplicate the
+ * mapping elsewhere in the table!
+ *
+ * DLM_NORMAL:		0
+ * DLM_NOTQUEUED:	-EAGAIN
+ * DLM_CANCELGRANT:	-EBUSY
+ * DLM_CANCEL:		-DLM_ECANCEL
+ */
+/* Keep in sync with dlmapi.h */
+static int status_map[] = {
+	[DLM_NORMAL]			= 0,		/* Success */
+	[DLM_GRANTED]			= -EINVAL,
+	[DLM_DENIED]			= -EACCES,
+	[DLM_DENIED_NOLOCKS]		= -EACCES,
+	[DLM_WORKING]			= -EACCES,
+	[DLM_BLOCKED]			= -EINVAL,
+	[DLM_BLOCKED_ORPHAN]		= -EINVAL,
+	[DLM_DENIED_GRACE_PERIOD]	= -EACCES,
+	[DLM_SYSERR]			= -ENOMEM,	/* It is what it is */
+	[DLM_NOSUPPORT]			= -EPROTO,
+	[DLM_CANCELGRANT]		= -EBUSY,	/* Cancel after grant */
+	[DLM_IVLOCKID]			= -EINVAL,
+	[DLM_SYNC]			= -EINVAL,
+	[DLM_BADTYPE]			= -EINVAL,
+	[DLM_BADRESOURCE]		= -EINVAL,
+	[DLM_MAXHANDLES]		= -ENOMEM,
+	[DLM_NOCLINFO]			= -EINVAL,
+	[DLM_NOLOCKMGR]			= -EINVAL,
+	[DLM_NOPURGED]			= -EINVAL,
+	[DLM_BADARGS]			= -EINVAL,
+	[DLM_VOID]			= -EINVAL,
+	[DLM_NOTQUEUED]			= -EAGAIN,	/* Trylock failed */
+	[DLM_IVBUFLEN]			= -EINVAL,
+	[DLM_CVTUNGRANT]		= -EPERM,
+	[DLM_BADPARAM]			= -EINVAL,
+	[DLM_VALNOTVALID]		= -EINVAL,
+	[DLM_REJECTED]			= -EPERM,
+	[DLM_ABORT]			= -EINVAL,
+	[DLM_CANCEL]			= -DLM_ECANCEL,	/* Successful cancel */
+	[DLM_IVRESHANDLE]		= -EINVAL,
+	[DLM_DEADLOCK]			= -EDEADLK,
+	[DLM_DENIED_NOASTS]		= -EINVAL,
+	[DLM_FORWARD]			= -EINVAL,
+	[DLM_TIMEOUT]			= -ETIMEDOUT,
+	[DLM_IVGROUPID]			= -EINVAL,
+	[DLM_VERS_CONFLICT]		= -EOPNOTSUPP,
+	[DLM_BAD_DEVICE_PATH]		= -ENOENT,
+	[DLM_NO_DEVICE_PERMISSION]	= -EPERM,
+	[DLM_NO_CONTROL_DEVICE]		= -ENOENT,
+	[DLM_RECOVERING]		= -ENOTCONN,
+	[DLM_MIGRATING]			= -ERESTART,
+	[DLM_MAXSTATS]			= -EINVAL,
+};
+
+static int dlm_status_to_errno(enum dlm_status status)
+{
+	BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
+
+	return status_map[status];
+}
+
+static void o2dlm_lock_ast_wrapper(void *astarg)
+{
+	BUG_ON(stack_glue_lproto == NULL);
+
+	stack_glue_lproto->lp_lock_ast(astarg);
+}
+
+static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
+{
+	BUG_ON(stack_glue_lproto == NULL);
+
+	stack_glue_lproto->lp_blocking_ast(astarg, level);
+}
+
+static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
+{
+	int error = dlm_status_to_errno(status);
+
+	BUG_ON(stack_glue_lproto == NULL);
+
+	/*
+	 * In o2dlm, you can get both the lock_ast() for the lock being
+	 * granted and the unlock_ast() for the CANCEL failing.  A
+	 * successful cancel sends DLM_NORMAL here.  If the
+	 * lock grant happened before the cancel arrived, you get
+	 * DLM_CANCELGRANT.
+	 *
+	 * There's no need for the double-ast.  If we see DLM_CANCELGRANT,
+	 * we just ignore it.  We expect the lock_ast() to handle the
+	 * granted lock.
+	 */
+	if (status == DLM_CANCELGRANT)
+		return;
+
+	stack_glue_lproto->lp_unlock_ast(astarg, error);
+}
+
+static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
+			 int mode,
+			 union ocfs2_dlm_lksb *lksb,
+			 u32 flags,
+			 void *name,
+			 unsigned int namelen,
+			 void *astarg)
+{
+	enum dlm_status status;
+	int o2dlm_mode = mode_to_o2dlm(mode);
+	int o2dlm_flags = flags_to_o2dlm(flags);
+	int ret;
+
+	status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
+			 o2dlm_flags, name, namelen,
+			 o2dlm_lock_ast_wrapper, astarg,
+			 o2dlm_blocking_ast_wrapper);
+	ret = dlm_status_to_errno(status);
+	return ret;
+}
+
+static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
+			   union ocfs2_dlm_lksb *lksb,
+			   u32 flags,
+			   void *astarg)
+{
+	enum dlm_status status;
+	int o2dlm_flags = flags_to_o2dlm(flags);
+	int ret;
+
+	status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
+			   o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
+	ret = dlm_status_to_errno(status);
+	return ret;
+}
+
+static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+	return dlm_status_to_errno(lksb->lksb_o2dlm.status);
+}
+
+static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+	return (void *)(lksb->lksb_o2dlm.lvb);
+}
+
+static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+	dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
+}
+
+/*
+ * Called from the dlm when it's about to evict a node. This is how the
+ * classic stack signals node death.
+ */
+static void o2dlm_eviction_cb(int node_num, void *data)
+{
+	struct ocfs2_cluster_connection *conn = data;
+
+	mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
+	     node_num, conn->cc_namelen, conn->cc_name);
+
+	conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
+}
+
+static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+	int rc = 0;
+	u32 dlm_key;
+	struct dlm_ctxt *dlm;
+	struct o2dlm_private *priv;
+	struct dlm_protocol_version dlm_version;
+
+	BUG_ON(conn == NULL);
+
+	/* for now we only have one cluster/node, make sure we see it
+	 * in the heartbeat universe */
+	if (!o2hb_check_local_node_heartbeating()) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
+	if (!priv) {
+		rc = -ENOMEM;
+		goto out_free;
+	}
+
+	/* This just fills the structure in.  It is safe to pass conn. */
+	dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
+			      conn);
+
+	conn->cc_private = priv;
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. */
+	dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
+	dlm_version.pv_major = conn->cc_version.pv_major;
+	dlm_version.pv_minor = conn->cc_version.pv_minor;
+
+	dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
+	if (IS_ERR(dlm)) {
+		rc = PTR_ERR(dlm);
+		mlog_errno(rc);
+		goto out_free;
+	}
+
+	conn->cc_version.pv_major = dlm_version.pv_major;
+	conn->cc_version.pv_minor = dlm_version.pv_minor;
+	conn->cc_lockspace = dlm;
+
+	dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
+
+out_free:
+	if (rc && conn->cc_private)
+		kfree(conn->cc_private);
+
+out:
+	return rc;
+}
+
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+	struct dlm_ctxt *dlm = conn->cc_lockspace;
+	struct o2dlm_private *priv = conn->cc_private;
+
+	dlm_unregister_eviction_cb(&priv->op_eviction_cb);
+	conn->cc_private = NULL;
+	kfree(priv);
+
+	dlm_unregister_domain(dlm);
+	conn->cc_lockspace = NULL;
+
+	return 0;
+}
+
+static void o2hb_stop(const char *group)
+{
+	int ret;
+	char *argv[5], *envp[3];
+
+	argv[0] = (char *)o2nm_get_hb_ctl_path();
+	argv[1] = "-K";
+	argv[2] = "-u";
+	argv[3] = (char *)group;
+	argv[4] = NULL;
+
+	mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+
+	/* minimal command environment taken from cpu_run_sbin_hotplug */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	if (ret < 0)
+		mlog_errno(ret);
+}
+
+/*
+ * Hangup is a hack for tools compatibility.  Older ocfs2-tools software
+ * expects the filesystem to call "ocfs2_hb_ctl" during unmount.  This
+ * happens regardless of whether the DLM got started, so we can't do it
+ * in ocfs2_cluster_disconnect().  We bring the o2hb_stop() function into
+ * the glue and provide a "hangup" API for super.c to call.
+ *
+ * Other stacks will eventually provide a NULL ->hangup() pointer.
+ */
+static void o2cb_cluster_hangup(const char *group, int grouplen)
+{
+	o2hb_stop(group);
+}
+
+static int o2cb_cluster_this_node(unsigned int *node)
+{
+	int node_num;
+
+	node_num = o2nm_this_node();
+	if (node_num == O2NM_INVALID_NODE_NUM)
+		return -ENOENT;
+
+	if (node_num >= O2NM_MAX_NODES)
+		return -EOVERFLOW;
+
+	*node = node_num;
+	return 0;
+}
+
+struct ocfs2_stack_operations o2cb_stack_ops = {
+	.connect	= o2cb_cluster_connect,
+	.disconnect	= o2cb_cluster_disconnect,
+	.hangup		= o2cb_cluster_hangup,
+	.this_node	= o2cb_cluster_this_node,
+	.dlm_lock	= o2cb_dlm_lock,
+	.dlm_unlock	= o2cb_dlm_unlock,
+	.lock_status	= o2cb_dlm_lock_status,
+	.lock_lvb	= o2cb_dlm_lvb,
+	.dump_lksb	= o2cb_dump_lksb,
+};
+
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index e35dde6217f5..e197367b6bd6 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -19,204 +19,17 @@
  */
 
 #include <linux/slab.h>
-#include <linux/crc32.h>
 #include <linux/kmod.h>
 
 /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
 #include <linux/fs.h>
 
 #include "cluster/masklog.h"
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
 
 #include "stackglue.h"
 
-static struct ocfs2_locking_protocol *lproto;
-
-struct o2dlm_private {
-	struct dlm_eviction_cb op_eviction_cb;
-};
-
-/* These should be identical */
-#if (DLM_LOCK_IV != LKM_IVMODE)
-# error Lock modes do not match
-#endif
-#if (DLM_LOCK_NL != LKM_NLMODE)
-# error Lock modes do not match
-#endif
-#if (DLM_LOCK_CR != LKM_CRMODE)
-# error Lock modes do not match
-#endif
-#if (DLM_LOCK_CW != LKM_CWMODE)
-# error Lock modes do not match
-#endif
-#if (DLM_LOCK_PR != LKM_PRMODE)
-# error Lock modes do not match
-#endif
-#if (DLM_LOCK_PW != LKM_PWMODE)
-# error Lock modes do not match
-#endif
-#if (DLM_LOCK_EX != LKM_EXMODE)
-# error Lock modes do not match
-#endif
-static inline int mode_to_o2dlm(int mode)
-{
-	BUG_ON(mode > LKM_MAXMODE);
-
-	return mode;
-}
-
-#define map_flag(_generic, _o2dlm)		\
-	if (flags & (_generic)) {		\
-		flags &= ~(_generic);		\
-		o2dlm_flags |= (_o2dlm);	\
-	}
-static int flags_to_o2dlm(u32 flags)
-{
-	int o2dlm_flags = 0;
-
-	map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
-	map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
-	map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
-	map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
-	map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
-	map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
-	map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
-	map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
-	map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
-
-	/* map_flag() should have cleared every flag passed in */
-	BUG_ON(flags != 0);
-
-	return o2dlm_flags;
-}
-#undef map_flag
-
-/*
- * Map an o2dlm status to standard errno values.
- *
- * o2dlm only uses a handful of these, and returns even fewer to the
- * caller. Still, we try to assign sane values to each error.
- *
- * The following value pairs have special meanings to dlmglue, thus
- * the right hand side needs to stay unique - never duplicate the
- * mapping elsewhere in the table!
- *
- * DLM_NORMAL:		0
- * DLM_NOTQUEUED:	-EAGAIN
- * DLM_CANCELGRANT:	-EBUSY
- * DLM_CANCEL:		-DLM_ECANCEL
- */
-/* Keep in sync with dlmapi.h */
-static int status_map[] = {
-	[DLM_NORMAL]			= 0,		/* Success */
-	[DLM_GRANTED]			= -EINVAL,
-	[DLM_DENIED]			= -EACCES,
-	[DLM_DENIED_NOLOCKS]		= -EACCES,
-	[DLM_WORKING]			= -EACCES,
-	[DLM_BLOCKED]			= -EINVAL,
-	[DLM_BLOCKED_ORPHAN]		= -EINVAL,
-	[DLM_DENIED_GRACE_PERIOD]	= -EACCES,
-	[DLM_SYSERR]			= -ENOMEM,	/* It is what it is */
-	[DLM_NOSUPPORT]			= -EPROTO,
-	[DLM_CANCELGRANT]		= -EBUSY,	/* Cancel after grant */
-	[DLM_IVLOCKID]			= -EINVAL,
-	[DLM_SYNC]			= -EINVAL,
-	[DLM_BADTYPE]			= -EINVAL,
-	[DLM_BADRESOURCE]		= -EINVAL,
-	[DLM_MAXHANDLES]		= -ENOMEM,
-	[DLM_NOCLINFO]			= -EINVAL,
-	[DLM_NOLOCKMGR]			= -EINVAL,
-	[DLM_NOPURGED]			= -EINVAL,
-	[DLM_BADARGS]			= -EINVAL,
-	[DLM_VOID]			= -EINVAL,
-	[DLM_NOTQUEUED]			= -EAGAIN,	/* Trylock failed */
-	[DLM_IVBUFLEN]			= -EINVAL,
-	[DLM_CVTUNGRANT]		= -EPERM,
-	[DLM_BADPARAM]			= -EINVAL,
-	[DLM_VALNOTVALID]		= -EINVAL,
-	[DLM_REJECTED]			= -EPERM,
-	[DLM_ABORT]			= -EINVAL,
-	[DLM_CANCEL]			= -DLM_ECANCEL,	/* Successful cancel */
-	[DLM_IVRESHANDLE]		= -EINVAL,
-	[DLM_DEADLOCK]			= -EDEADLK,
-	[DLM_DENIED_NOASTS]		= -EINVAL,
-	[DLM_FORWARD]			= -EINVAL,
-	[DLM_TIMEOUT]			= -ETIMEDOUT,
-	[DLM_IVGROUPID]			= -EINVAL,
-	[DLM_VERS_CONFLICT]		= -EOPNOTSUPP,
-	[DLM_BAD_DEVICE_PATH]		= -ENOENT,
-	[DLM_NO_DEVICE_PERMISSION]	= -EPERM,
-	[DLM_NO_CONTROL_DEVICE]		= -ENOENT,
-	[DLM_RECOVERING]		= -ENOTCONN,
-	[DLM_MIGRATING]			= -ERESTART,
-	[DLM_MAXSTATS]			= -EINVAL,
-};
-
-static int dlm_status_to_errno(enum dlm_status status)
-{
-	BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
+struct ocfs2_locking_protocol *stack_glue_lproto;
 
-	return status_map[status];
-}
-
-static void o2dlm_lock_ast_wrapper(void *astarg)
-{
-	BUG_ON(lproto == NULL);
-
-	lproto->lp_lock_ast(astarg);
-}
-
-static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
-{
-	BUG_ON(lproto == NULL);
-
-	lproto->lp_blocking_ast(astarg, level);
-}
-
-static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
-{
-	int error = dlm_status_to_errno(status);
-
-	BUG_ON(lproto == NULL);
-
-	/*
-	 * In o2dlm, you can get both the lock_ast() for the lock being
-	 * granted and the unlock_ast() for the CANCEL failing.  A
-	 * successful cancel sends DLM_NORMAL here.  If the
-	 * lock grant happened before the cancel arrived, you get
-	 * DLM_CANCELGRANT.
-	 *
-	 * There's no need for the double-ast.  If we see DLM_CANCELGRANT,
-	 * we just ignore it.  We expect the lock_ast() to handle the
-	 * granted lock.
-	 */
-	if (status == DLM_CANCELGRANT)
-		return;
-
-	lproto->lp_unlock_ast(astarg, error);
-}
-
-static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
-			 int mode,
-			 union ocfs2_dlm_lksb *lksb,
-			 u32 flags,
-			 void *name,
-			 unsigned int namelen,
-			 void *astarg)
-{
-	enum dlm_status status;
-	int o2dlm_mode = mode_to_o2dlm(mode);
-	int o2dlm_flags = flags_to_o2dlm(flags);
-	int ret;
-
-	status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
-			 o2dlm_flags, name, namelen,
-			 o2dlm_lock_ast_wrapper, astarg,
-			 o2dlm_blocking_ast_wrapper);
-	ret = dlm_status_to_errno(status);
-	return ret;
-}
 
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
@@ -226,25 +39,10 @@ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   unsigned int namelen,
 		   void *astarg)
 {
-	BUG_ON(lproto == NULL);
-
-	return o2cb_dlm_lock(conn, mode, lksb, flags,
-			     name, namelen, astarg);
-}
-
-static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
-			   union ocfs2_dlm_lksb *lksb,
-			   u32 flags,
-			   void *astarg)
-{
-	enum dlm_status status;
-	int o2dlm_flags = flags_to_o2dlm(flags);
-	int ret;
+	BUG_ON(stack_glue_lproto == NULL);
 
-	status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
-			   o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
-	ret = dlm_status_to_errno(status);
-	return ret;
+	return o2cb_stack_ops.dlm_lock(conn, mode, lksb, flags,
+				       name, namelen, astarg);
 }
 
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
@@ -252,19 +50,14 @@ int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 		     u32 flags,
 		     void *astarg)
 {
-	BUG_ON(lproto == NULL);
+	BUG_ON(stack_glue_lproto == NULL);
 
-	return o2cb_dlm_unlock(conn, lksb, flags, astarg);
-}
-
-static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
-{
-	return dlm_status_to_errno(lksb->lksb_o2dlm.status);
+	return o2cb_stack_ops.dlm_unlock(conn, lksb, flags, astarg);
 }
 
 int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
 {
-	return o2cb_dlm_lock_status(lksb);
+	return o2cb_stack_ops.lock_status(lksb);
 }
 
 /*
@@ -272,94 +65,14 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
  * don't cast at the glue level.  The real answer is that the header
  * ordering is nigh impossible.
  */
-static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
-{
-	return (void *)(lksb->lksb_o2dlm.lvb);
-}
-
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 {
-	return o2cb_dlm_lvb(lksb);
-}
-
-static void o2cb_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
-{
-	dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
+	return o2cb_stack_ops.lock_lvb(lksb);
 }
 
 void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 {
-	o2cb_dlm_dump_lksb(lksb);
-}
-
-/*
- * Called from the dlm when it's about to evict a node. This is how the
- * classic stack signals node death.
- */
-static void o2dlm_eviction_cb(int node_num, void *data)
-{
-	struct ocfs2_cluster_connection *conn = data;
-
-	mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
-	     node_num, conn->cc_namelen, conn->cc_name);
-
-	conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
-}
-
-static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
-{
-	int rc = 0;
-	u32 dlm_key;
-	struct dlm_ctxt *dlm;
-	struct o2dlm_private *priv;
-	struct dlm_protocol_version dlm_version;
-
-	BUG_ON(conn == NULL);
-
-	/* for now we only have one cluster/node, make sure we see it
-	 * in the heartbeat universe */
-	if (!o2hb_check_local_node_heartbeating()) {
-		rc = -EINVAL;
-		goto out;
-	}
-
-	priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
-	if (!priv) {
-		rc = -ENOMEM;
-		goto out_free;
-	}
-
-	/* This just fills the structure in.  It is safe to pass conn. */
-	dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
-			      conn);
-
-	conn->cc_private = priv;
-
-	/* used by the dlm code to make message headers unique, each
-	 * node in this domain must agree on this. */
-	dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
-	dlm_version.pv_major = conn->cc_version.pv_major;
-	dlm_version.pv_minor = conn->cc_version.pv_minor;
-
-	dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
-	if (IS_ERR(dlm)) {
-		rc = PTR_ERR(dlm);
-		mlog_errno(rc);
-		goto out_free;
-	}
-
-	conn->cc_version.pv_major = dlm_version.pv_major;
-	conn->cc_version.pv_minor = dlm_version.pv_minor;
-	conn->cc_lockspace = dlm;
-
-	dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
-
-out_free:
-	if (rc && conn->cc_private)
-		kfree(conn->cc_private);
-
-out:
-	return rc;
+	o2cb_stack_ops.dump_lksb(lksb);
 }
 
 int ocfs2_cluster_connect(const char *group,
@@ -394,9 +107,9 @@ int ocfs2_cluster_connect(const char *group,
 	new_conn->cc_recovery_data = recovery_data;
 
 	/* Start the new connection at our maximum compatibility level */
-	new_conn->cc_version = lproto->lp_max_version;
+	new_conn->cc_version = stack_glue_lproto->lp_max_version;
 
-	rc = o2cb_cluster_connect(new_conn);
+	rc = o2cb_stack_ops.connect(new_conn);
 	if (rc) {
 		mlog_errno(rc);
 		goto out_free;
@@ -412,29 +125,13 @@ out:
 	return rc;
 }
 
-
-static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
-{
-	struct dlm_ctxt *dlm = conn->cc_lockspace;
-	struct o2dlm_private *priv = conn->cc_private;
-
-	dlm_unregister_eviction_cb(&priv->op_eviction_cb);
-	conn->cc_private = NULL;
-	kfree(priv);
-
-	dlm_unregister_domain(dlm);
-	conn->cc_lockspace = NULL;
-
-	return 0;
-}
-
 int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn)
 {
 	int ret;
 
 	BUG_ON(conn == NULL);
 
-	ret = o2cb_cluster_disconnect(conn);
+	ret = o2cb_stack_ops.disconnect(conn);
 
 	/* XXX Should we free it anyway? */
 	if (!ret)
@@ -443,75 +140,23 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn)
 	return ret;
 }
 
-static void o2hb_stop(const char *group)
-{
-	int ret;
-	char *argv[5], *envp[3];
-
-	argv[0] = (char *)o2nm_get_hb_ctl_path();
-	argv[1] = "-K";
-	argv[2] = "-u";
-	argv[3] = (char *)group;
-	argv[4] = NULL;
-
-	mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
-
-	/* minimal command environment taken from cpu_run_sbin_hotplug */
-	envp[0] = "HOME=/";
-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-	envp[2] = NULL;
-
-	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
-	if (ret < 0)
-		mlog_errno(ret);
-}
-
-/*
- * Hangup is a hack for tools compatibility.  Older ocfs2-tools software
- * expects the filesystem to call "ocfs2_hb_ctl" during unmount.  This
- * happens regardless of whether the DLM got started, so we can't do it
- * in ocfs2_cluster_disconnect().  We bring the o2hb_stop() function into
- * the glue and provide a "hangup" API for super.c to call.
- *
- * Other stacks will eventually provide a NULL ->hangup() pointer.
- */
-static void o2cb_cluster_hangup(const char *group, int grouplen)
-{
-	o2hb_stop(group);
-}
-
 void ocfs2_cluster_hangup(const char *group, int grouplen)
 {
 	BUG_ON(group == NULL);
 	BUG_ON(group[grouplen] != '\0');
 
-	o2cb_cluster_hangup(group, grouplen);
-}
-
-static int o2cb_cluster_this_node(unsigned int *node)
-{
-	int node_num;
-
-	node_num = o2nm_this_node();
-	if (node_num == O2NM_INVALID_NODE_NUM)
-		return -ENOENT;
-
-	if (node_num >= O2NM_MAX_NODES)
-		return -EOVERFLOW;
-
-	*node = node_num;
-	return 0;
+	o2cb_stack_ops.hangup(group, grouplen);
 }
 
 int ocfs2_cluster_this_node(unsigned int *node)
 {
-	return o2cb_cluster_this_node(node);
+	return o2cb_stack_ops.this_node(node);
 }
 
 void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
 {
 	BUG_ON(proto != NULL);
 
-	lproto = proto;
+	stack_glue_lproto = proto;
 }
 
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index decb147106fd..083632215dc5 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -25,6 +25,8 @@
 #include <linux/list.h>
 #include <linux/dlmconstants.h>
 
+#include "dlm/dlmapi.h"
+
 /*
  * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
  * some day, but right now we need it.  Let's fake it.  This value is larger
@@ -39,13 +41,18 @@
 #define GROUP_NAME_MAX		64
 
 
-#include "dlm/dlmapi.h"
-
+/*
+ * ocfs2_protocol_version changes when ocfs2 does something different in
+ * its inter-node behavior.  See dlmglue.c for more information.
+ */
 struct ocfs2_protocol_version {
 	u8 pv_major;
 	u8 pv_minor;
 };
 
+/*
+ * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
+ */
 struct ocfs2_locking_protocol {
 	struct ocfs2_protocol_version lp_max_version;
 	void (*lp_lock_ast)(void *astarg);
@@ -53,10 +60,20 @@ struct ocfs2_locking_protocol {
 	void (*lp_unlock_ast)(void *astarg, int error);
 };
 
+/*
+ * A union of all lock status structures.  We define it here so that the
+ * size of the union is known.  Lock status structures are embedded in
+ * ocfs2 inodes.
+ */
 union ocfs2_dlm_lksb {
 	struct dlm_lockstatus lksb_o2dlm;
 };
 
+/*
+ * A cluster connection.  Mostly opaque to ocfs2, the connection holds
+ * state for the underlying stack.  ocfs2 does use cc_version to determine
+ * locking compatibility.
+ */
 struct ocfs2_cluster_connection {
 	char cc_name[GROUP_NAME_MAX];
 	int cc_namelen;
@@ -67,6 +84,106 @@ struct ocfs2_cluster_connection {
 	void *cc_private;
 };
 
+/*
+ * Each cluster stack implements the stack operations structure.  Not used
+ * in the ocfs2 code, the stackglue code translates generic cluster calls
+ * into stack operations.
+ */
+struct ocfs2_stack_operations {
+	/*
+	 * The fs code calls ocfs2_cluster_connect() to attach a new
+	 * filesystem to the cluster stack.  The ->connect() op is passed
+	 * an ocfs2_cluster_connection with the name and recovery field
+	 * filled in.
+	 *
+	 * The stack must set up any notification mechanisms and create
+	 * the filesystem lockspace in the DLM.  The lockspace should be
+	 * stored on cc_lockspace.  Any other information can be stored on
+	 * cc_private.
+	 *
+	 * ->connect() must not return until it is guaranteed that
+	 *
+	 *  - Node down notifications for the filesystem will be recieved
+	 *    and passed to conn->cc_recovery_handler().
+	 *  - Locking requests for the filesystem will be processed.
+	 */
+	int (*connect)(struct ocfs2_cluster_connection *conn);
+
+	/*
+	 * The fs code calls ocfs2_cluster_disconnect() when a filesystem
+	 * no longer needs cluster services.  All DLM locks have been
+	 * dropped, and recovery notification is being ignored by the
+	 * fs code.  The stack must disengage from the DLM and discontinue
+	 * recovery notification.
+	 *
+	 * Once ->disconnect() has returned, the connection structure will
+	 * be freed.  Thus, a stack must not return from ->disconnect()
+	 * until it will no longer reference the conn pointer.
+	 */
+	int (*disconnect)(struct ocfs2_cluster_connection *conn);
+
+	/*
+	 * ocfs2_cluster_hangup() exists for compatibility with older
+	 * ocfs2 tools.  Only the classic stack really needs it.  As such
+	 * ->hangup() is not required of all stacks.  See the comment by
+	 * ocfs2_cluster_hangup() for more details.
+	 */
+	void (*hangup)(const char *group, int grouplen);
+
+	/*
+	 * ->this_node() returns the cluster's unique identifier for the
+	 * local node.
+	 */
+	int (*this_node)(unsigned int *node);
+
+	/*
+	 * Call the underlying dlm lock function.  The ->dlm_lock()
+	 * callback should convert the flags and mode as appropriate.
+	 *
+	 * ast and bast functions are not part of the call because the
+	 * stack will likely want to wrap ast and bast calls before passing
+	 * them to stack->sp_proto.
+	 */
+	int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
+			int mode,
+			union ocfs2_dlm_lksb *lksb,
+			u32 flags,
+			void *name,
+			unsigned int namelen,
+			void *astarg);
+
+	/*
+	 * Call the underlying dlm unlock function.  The ->dlm_unlock()
+	 * function should convert the flags as appropriate.
+	 *
+	 * The unlock ast is not passed, as the stack will want to wrap
+	 * it before calling stack->sp_proto->lp_unlock_ast().
+	 */
+	int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
+			  union ocfs2_dlm_lksb *lksb,
+			  u32 flags,
+			  void *astarg);
+
+	/*
+	 * Return the status of the current lock status block.  The fs
+	 * code should never dereference the union.  The ->lock_status()
+	 * callback pulls out the stack-specific lksb, converts the status
+	 * to a proper errno, and returns it.
+	 */
+	int (*lock_status)(union ocfs2_dlm_lksb *lksb);
+
+	/*
+	 * Pull the lvb pointer off of the stack-specific lksb.
+	 */
+	void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
+
+	/*
+	 * This is an optoinal debugging hook.  If provided, the
+	 * stack can dump debugging information about this lock.
+	 */
+	void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
+};
+
 int ocfs2_cluster_connect(const char *group,
 			  int grouplen,
 			  void (*recovery_handler)(int node_num,
@@ -95,4 +212,6 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
 
 void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
 
+extern struct ocfs2_locking_protocol *stack_glue_lproto;
+extern struct ocfs2_stack_operations o2cb_stack_ops;
 #endif  /* STACKGLUE_H */
-- 
cgit v1.2.3


From 286eaa95c5c5915a6b72cc3f0a2534161fd7928b Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 15:03:57 -0800
Subject: ocfs2: Break out stackglue into modules.

We define the ocfs2_stack_plugin structure to represent a stack driver.
The o2cb stack code is split into stack_o2cb.c.  This becomes the
ocfs2_stack_o2cb.ko module.

The stackglue generic functions are similarly split into the
ocfs2_stackglue.ko module.  This module now provides an interface to
register drivers.  The ocfs2_stack_o2cb driver registers itself.  As
part of this interface, ocfs2_stackglue can load drivers on demand.
This is accomplished in ocfs2_cluster_connect().

ocfs2_cluster_disconnect() is now notified when a _hangup() is pending.
If a hangup is pending, it will not release the driver module and will
let _hangup() do that.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/Makefile     |   7 +-
 fs/ocfs2/dlmglue.c    |   7 +-
 fs/ocfs2/dlmglue.h    |   2 +-
 fs/ocfs2/stack_o2cb.c |  41 +++++++--
 fs/ocfs2/stackglue.c  | 238 +++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ocfs2/stackglue.h  |  36 +++++++-
 fs/ocfs2/super.c      |  16 ++--
 7 files changed, 297 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 8e86195fedbf..b734254329b2 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -2,7 +2,7 @@ EXTRA_CFLAGS += -Ifs/ocfs2
 
 EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
 
-obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+obj-$(CONFIG_OCFS2_FS) += ocfs2.o ocfs2_stackglue.o ocfs2_stack_o2cb.o
 
 ocfs2-objs := \
 	alloc.o 		\
@@ -24,8 +24,6 @@ ocfs2-objs := \
 	namei.o 		\
 	resize.o		\
 	slot_map.o 		\
-	stackglue.o		\
-	stack_o2cb.o		\
 	suballoc.o 		\
 	super.o 		\
 	symlink.o 		\
@@ -33,5 +31,8 @@ ocfs2-objs := \
 	uptodate.o		\
 	ver.o
 
+ocfs2_stackglue-objs := stackglue.o
+ocfs2_stack_o2cb-objs := stack_o2cb.o
+
 obj-$(CONFIG_OCFS2_FS) += cluster/
 obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8a9c84909be3..f62a9e4fc315 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2641,7 +2641,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 		mlog_errno(status);
 		mlog(ML_ERROR,
 		     "could not find this host's node number\n");
-		ocfs2_cluster_disconnect(conn);
+		ocfs2_cluster_disconnect(conn, 0);
 		goto bail;
 	}
 
@@ -2663,7 +2663,8 @@ bail:
 	return status;
 }
 
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
+			int hangup_pending)
 {
 	mlog_entry_void();
 
@@ -2683,7 +2684,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
 	ocfs2_lock_res_free(&osb->osb_super_lockres);
 	ocfs2_lock_res_free(&osb->osb_rename_lockres);
 
-	ocfs2_cluster_disconnect(osb->cconn);
+	ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
 	osb->cconn = NULL;
 
 	ocfs2_dlm_shutdown_debug(osb);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 34b7598a0dc6..2bb01f09c1b1 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -58,7 +58,7 @@ struct ocfs2_meta_lvb {
 #define OCFS2_LOCK_NONBLOCK		(0x04)
 
 int ocfs2_dlm_init(struct ocfs2_super *osb);
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
 			       enum ocfs2_lock_type type,
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index c9bc3541a33f..ac1d74c63bf5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -18,7 +18,7 @@
  */
 
 #include <linux/crc32.h>
-#include <linux/kmod.h>
+#include <linux/module.h>
 
 /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
 #include <linux/fs.h>
@@ -33,6 +33,8 @@ struct o2dlm_private {
 	struct dlm_eviction_cb op_eviction_cb;
 };
 
+static struct ocfs2_stack_plugin o2cb_stack;
+
 /* These should be identical */
 #if (DLM_LOCK_IV != LKM_IVMODE)
 # error Lock modes do not match
@@ -158,23 +160,23 @@ static int dlm_status_to_errno(enum dlm_status status)
 
 static void o2dlm_lock_ast_wrapper(void *astarg)
 {
-	BUG_ON(stack_glue_lproto == NULL);
+	BUG_ON(o2cb_stack.sp_proto == NULL);
 
-	stack_glue_lproto->lp_lock_ast(astarg);
+	o2cb_stack.sp_proto->lp_lock_ast(astarg);
 }
 
 static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
 {
-	BUG_ON(stack_glue_lproto == NULL);
+	BUG_ON(o2cb_stack.sp_proto == NULL);
 
-	stack_glue_lproto->lp_blocking_ast(astarg, level);
+	o2cb_stack.sp_proto->lp_blocking_ast(astarg, level);
 }
 
 static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 {
 	int error = dlm_status_to_errno(status);
 
-	BUG_ON(stack_glue_lproto == NULL);
+	BUG_ON(o2cb_stack.sp_proto == NULL);
 
 	/*
 	 * In o2dlm, you can get both the lock_ast() for the lock being
@@ -190,7 +192,7 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 	if (status == DLM_CANCELGRANT)
 		return;
 
-	stack_glue_lproto->lp_unlock_ast(astarg, error);
+	o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
 }
 
 static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
@@ -267,6 +269,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
 	struct dlm_protocol_version dlm_version;
 
 	BUG_ON(conn == NULL);
+	BUG_ON(o2cb_stack.sp_proto == NULL);
 
 	/* for now we only have one cluster/node, make sure we see it
 	 * in the heartbeat universe */
@@ -314,7 +317,8 @@ out:
 	return rc;
 }
 
-static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+				   int hangup_pending)
 {
 	struct dlm_ctxt *dlm = conn->cc_lockspace;
 	struct o2dlm_private *priv = conn->cc_private;
@@ -393,3 +397,24 @@ struct ocfs2_stack_operations o2cb_stack_ops = {
 	.dump_lksb	= o2cb_dump_lksb,
 };
 
+static struct ocfs2_stack_plugin o2cb_stack = {
+	.sp_name	= "o2cb",
+	.sp_ops		= &o2cb_stack_ops,
+	.sp_owner	= THIS_MODULE,
+};
+
+static int __init o2cb_stack_init(void)
+{
+	return ocfs2_stack_glue_register(&o2cb_stack);
+}
+
+static void __exit o2cb_stack_exit(void)
+{
+	ocfs2_stack_glue_unregister(&o2cb_stack);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
+MODULE_LICENSE("GPL");
+module_init(o2cb_stack_init);
+module_exit(o2cb_stack_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index e197367b6bd6..1978c9cff0e9 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -18,17 +18,176 @@
  * General Public License for more details.
  */
 
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
 
-/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
-#include <linux/fs.h>
+#include "stackglue.h"
 
-#include "cluster/masklog.h"
+static struct ocfs2_locking_protocol *lproto;
+static DEFINE_SPINLOCK(ocfs2_stack_lock);
+static LIST_HEAD(ocfs2_stack_list);
 
-#include "stackglue.h"
+/*
+ * The stack currently in use.  If not null, active_stack->sp_count > 0,
+ * the module is pinned, and the locking protocol cannot be changed.
+ */
+static struct ocfs2_stack_plugin *active_stack;
+
+static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
+{
+	struct ocfs2_stack_plugin *p;
+
+	assert_spin_locked(&ocfs2_stack_lock);
+
+	list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+		if (!strcmp(p->sp_name, name))
+			return p;
+	}
+
+	return NULL;
+}
+
+static int ocfs2_stack_driver_request(const char *name)
+{
+	int rc;
+	struct ocfs2_stack_plugin *p;
+
+	spin_lock(&ocfs2_stack_lock);
+
+	if (active_stack) {
+		/*
+		 * If the active stack isn't the one we want, it cannot
+		 * be selected right now.
+		 */
+		if (!strcmp(active_stack->sp_name, name))
+			rc = 0;
+		else
+			rc = -EBUSY;
+		goto out;
+	}
+
+	p = ocfs2_stack_lookup(name);
+	if (!p || !try_module_get(p->sp_owner)) {
+		rc = -ENOENT;
+		goto out;
+	}
+
+	/* Ok, the stack is pinned */
+	p->sp_count++;
+	active_stack = p;
+
+	rc = 0;
+
+out:
+	spin_unlock(&ocfs2_stack_lock);
+	return rc;
+}
+
+/*
+ * This function looks up the appropriate stack and makes it active.  If
+ * there is no stack, it tries to load it.  It will fail if the stack still
+ * cannot be found.  It will also fail if a different stack is in use.
+ */
+static int ocfs2_stack_driver_get(const char *name)
+{
+	int rc;
+
+	rc = ocfs2_stack_driver_request(name);
+	if (rc == -ENOENT) {
+		request_module("ocfs2_stack_%s", name);
+		rc = ocfs2_stack_driver_request(name);
+	}
+
+	if (rc == -ENOENT) {
+		printk(KERN_ERR
+		       "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
+		       name);
+	} else if (rc == -EBUSY) {
+		printk(KERN_ERR
+		       "ocfs2: A different cluster stack driver is in use\n");
+	}
+
+	return rc;
+}
 
-struct ocfs2_locking_protocol *stack_glue_lproto;
+static void ocfs2_stack_driver_put(void)
+{
+	spin_lock(&ocfs2_stack_lock);
+	BUG_ON(active_stack == NULL);
+	BUG_ON(active_stack->sp_count == 0);
+
+	active_stack->sp_count--;
+	if (!active_stack->sp_count) {
+		module_put(active_stack->sp_owner);
+		active_stack = NULL;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+}
+
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
+{
+	int rc;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (!ocfs2_stack_lookup(plugin->sp_name)) {
+		plugin->sp_count = 0;
+		plugin->sp_proto = lproto;
+		list_add(&plugin->sp_list, &ocfs2_stack_list);
+		printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
+		       plugin->sp_name);
+		rc = 0;
+	} else {
+		printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
+		       plugin->sp_name);
+		rc = -EEXIST;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
+
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
+{
+	struct ocfs2_stack_plugin *p;
+
+	spin_lock(&ocfs2_stack_lock);
+	p = ocfs2_stack_lookup(plugin->sp_name);
+	if (p) {
+		BUG_ON(p != plugin);
+		BUG_ON(plugin == active_stack);
+		BUG_ON(plugin->sp_count != 0);
+		list_del_init(&plugin->sp_list);
+		printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
+		       plugin->sp_name);
+	} else {
+		printk(KERN_ERR "Stack \"%s\" is not registered\n",
+		       plugin->sp_name);
+	}
+	spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
+
+void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
+{
+	struct ocfs2_stack_plugin *p;
+
+	BUG_ON(proto == NULL);
+
+	spin_lock(&ocfs2_stack_lock);
+	BUG_ON(active_stack != NULL);
+
+	lproto = proto;
+	list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+		p->sp_proto = lproto;
+	}
+
+	spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
 
 
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
@@ -39,26 +198,29 @@ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   unsigned int namelen,
 		   void *astarg)
 {
-	BUG_ON(stack_glue_lproto == NULL);
+	BUG_ON(lproto == NULL);
 
-	return o2cb_stack_ops.dlm_lock(conn, mode, lksb, flags,
-				       name, namelen, astarg);
+	return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
+					      name, namelen, astarg);
 }
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
 
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 		     union ocfs2_dlm_lksb *lksb,
 		     u32 flags,
 		     void *astarg)
 {
-	BUG_ON(stack_glue_lproto == NULL);
+	BUG_ON(lproto == NULL);
 
-	return o2cb_stack_ops.dlm_unlock(conn, lksb, flags, astarg);
+	return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
 }
+EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
 
 int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
 {
-	return o2cb_stack_ops.lock_status(lksb);
+	return active_stack->sp_ops->lock_status(lksb);
 }
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
 
 /*
  * Why don't we cast to ocfs2_meta_lvb?  The "clean" answer is that we
@@ -67,13 +229,15 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
  */
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 {
-	return o2cb_stack_ops.lock_lvb(lksb);
+	return active_stack->sp_ops->lock_lvb(lksb);
 }
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
 
 void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 {
-	o2cb_stack_ops.dump_lksb(lksb);
+	active_stack->sp_ops->dump_lksb(lksb);
 }
+EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
 
 int ocfs2_cluster_connect(const char *group,
 			  int grouplen,
@@ -107,11 +271,16 @@ int ocfs2_cluster_connect(const char *group,
 	new_conn->cc_recovery_data = recovery_data;
 
 	/* Start the new connection at our maximum compatibility level */
-	new_conn->cc_version = stack_glue_lproto->lp_max_version;
+	new_conn->cc_version = lproto->lp_max_version;
+
+	/* This will pin the stack driver if successful */
+	rc = ocfs2_stack_driver_get("o2cb");
+	if (rc)
+		goto out_free;
 
-	rc = o2cb_stack_ops.connect(new_conn);
+	rc = active_stack->sp_ops->connect(new_conn);
 	if (rc) {
-		mlog_errno(rc);
+		ocfs2_stack_driver_put();
 		goto out_free;
 	}
 
@@ -124,39 +293,60 @@ out_free:
 out:
 	return rc;
 }
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
 
-int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+/* If hangup_pending is 0, the stack driver will be dropped */
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+			     int hangup_pending)
 {
 	int ret;
 
 	BUG_ON(conn == NULL);
 
-	ret = o2cb_stack_ops.disconnect(conn);
+	ret = active_stack->sp_ops->disconnect(conn, hangup_pending);
 
 	/* XXX Should we free it anyway? */
-	if (!ret)
+	if (!ret) {
 		kfree(conn);
+		if (!hangup_pending)
+			ocfs2_stack_driver_put();
+	}
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
 
 void ocfs2_cluster_hangup(const char *group, int grouplen)
 {
 	BUG_ON(group == NULL);
 	BUG_ON(group[grouplen] != '\0');
 
-	o2cb_stack_ops.hangup(group, grouplen);
+	active_stack->sp_ops->hangup(group, grouplen);
+
+	/* cluster_disconnect() was called with hangup_pending==1 */
+	ocfs2_stack_driver_put();
 }
+EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
 
 int ocfs2_cluster_this_node(unsigned int *node)
 {
-	return o2cb_stack_ops.this_node(node);
+	return active_stack->sp_ops->this_node(node);
 }
+EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
 
-void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
+
+static int __init ocfs2_stack_glue_init(void)
 {
-	BUG_ON(proto != NULL);
+	return 0;
+}
 
-	stack_glue_lproto = proto;
+static void __exit ocfs2_stack_glue_exit(void)
+{
+	lproto = NULL;
 }
 
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 cluter stack glue layer");
+MODULE_LICENSE("GPL");
+module_init(ocfs2_stack_glue_init);
+module_exit(ocfs2_stack_glue_exit);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 083632215dc5..c96c8bb76863 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -119,14 +119,21 @@ struct ocfs2_stack_operations {
 	 * Once ->disconnect() has returned, the connection structure will
 	 * be freed.  Thus, a stack must not return from ->disconnect()
 	 * until it will no longer reference the conn pointer.
+	 *
+	 * If hangup_pending is zero, ocfs2_cluster_disconnect() will also
+	 * be dropping the reference on the module.
 	 */
-	int (*disconnect)(struct ocfs2_cluster_connection *conn);
+	int (*disconnect)(struct ocfs2_cluster_connection *conn,
+			  int hangup_pending);
 
 	/*
 	 * ocfs2_cluster_hangup() exists for compatibility with older
 	 * ocfs2 tools.  Only the classic stack really needs it.  As such
 	 * ->hangup() is not required of all stacks.  See the comment by
 	 * ocfs2_cluster_hangup() for more details.
+	 *
+	 * Note that ocfs2_cluster_hangup() can only be called if
+	 * hangup_pending was passed to ocfs2_cluster_disconnect().
 	 */
 	void (*hangup)(const char *group, int grouplen);
 
@@ -184,13 +191,32 @@ struct ocfs2_stack_operations {
 	void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
 };
 
+/*
+ * Each stack plugin must describe itself by registering a
+ * ocfs2_stack_plugin structure.  This is only seen by stackglue and the
+ * stack driver.
+ */
+struct ocfs2_stack_plugin {
+	char *sp_name;
+	struct ocfs2_stack_operations *sp_ops;
+	struct module *sp_owner;
+
+	/* These are managed by the stackglue code. */
+	struct list_head sp_list;
+	unsigned int sp_count;
+	struct ocfs2_locking_protocol *sp_proto;
+};
+
+
+/* Used by the filesystem */
 int ocfs2_cluster_connect(const char *group,
 			  int grouplen,
 			  void (*recovery_handler)(int node_num,
 						   void *recovery_data),
 			  void *recovery_data,
 			  struct ocfs2_cluster_connection **conn);
-int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn);
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+			     int hangup_pending);
 void ocfs2_cluster_hangup(const char *group, int grouplen);
 int ocfs2_cluster_this_node(unsigned int *node);
 
@@ -212,6 +238,8 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
 
 void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
 
-extern struct ocfs2_locking_protocol *stack_glue_lproto;
-extern struct ocfs2_stack_operations o2cb_stack_ops;
+
+/* Used by stack plugins */
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
 #endif  /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b4a02a00665d..e27a0d47ea2b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1186,7 +1186,7 @@ leave:
 
 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 {
-	int tmp;
+	int tmp, hangup_needed = 0;
 	struct ocfs2_super *osb = NULL;
 	char nodestr[8];
 
@@ -1225,19 +1225,21 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_release_system_inodes(osb);
 
-	if (osb->cconn)
-		ocfs2_dlm_shutdown(osb);
-
-	debugfs_remove(osb->osb_debug_root);
-
 	/*
-	 * This is a small hack to move ocfs2_hb_ctl into stackglue.
 	 * If we're dismounting due to mount error, mount.ocfs2 will clean
 	 * up heartbeat.  If we're a local mount, there is no heartbeat.
 	 * If we failed before we got a uuid_str yet, we can't stop
 	 * heartbeat.  Otherwise, do it.
 	 */
 	if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+		hangup_needed = 1;
+
+	if (osb->cconn)
+		ocfs2_dlm_shutdown(osb, hangup_needed);
+
+	debugfs_remove(osb->osb_debug_root);
+
+	if (hangup_needed)
 		ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
 
 	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
-- 
cgit v1.2.3


From 74ae4e104dfc57017783fc07d5f2f9129062207f Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 31 Jan 2008 23:56:17 -0800
Subject: ocfs2: Create stack glue sysfs files.

Introduce a set of sysfs files that describe the current stack glue
state.  The files live under /sys/fs/ocfs2.  The locking_protocol file
displays the version of ocfs2's locking code.  The
loaded_cluster_plugins file displays all of the currently loaded stack
plugins.  When filesystems are mounted, the active_cluster_plugin file
will display the plugin in use.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stackglue.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 120 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 1978c9cff0e9..76ae4fcebcbd 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -23,6 +23,9 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
 
 #include "stackglue.h"
 
@@ -335,14 +338,130 @@ int ocfs2_cluster_this_node(unsigned int *node)
 EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
 
 
-static int __init ocfs2_stack_glue_init(void)
+/*
+ * Sysfs bits
+ */
+
+static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
+					       struct kobj_attribute *attr,
+					       char *buf)
+{
+	ssize_t ret = 0;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (lproto)
+		ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
+			       lproto->lp_max_version.pv_major,
+			       lproto->lp_max_version.pv_minor);
+	spin_unlock(&ocfs2_stack_lock);
+
+	return ret;
+}
+
+static struct kobj_attribute ocfs2_attr_max_locking_protocol =
+	__ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
+	       ocfs2_max_locking_protocol_show, NULL);
+
+static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
+						 struct kobj_attribute *attr,
+						 char *buf)
 {
+	ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+	struct ocfs2_stack_plugin *p;
+
+	spin_lock(&ocfs2_stack_lock);
+	list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+		ret = snprintf(buf, remain, "%s\n",
+			       p->sp_name);
+		if (ret < 0) {
+			total = ret;
+			break;
+		}
+		if (ret == remain) {
+			/* snprintf() didn't fit */
+			total = -E2BIG;
+			break;
+		}
+		total += ret;
+		remain -= ret;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return total;
+}
+
+static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
+	__ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
+	       ocfs2_loaded_cluster_plugins_show, NULL);
+
+static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
+						struct kobj_attribute *attr,
+						char *buf)
+{
+	ssize_t ret = 0;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (active_stack) {
+		ret = snprintf(buf, PAGE_SIZE, "%s\n",
+			       active_stack->sp_name);
+		if (ret == PAGE_SIZE)
+			ret = -E2BIG;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return ret;
+}
+
+static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
+	__ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
+	       ocfs2_active_cluster_plugin_show, NULL);
+
+static struct attribute *ocfs2_attrs[] = {
+	&ocfs2_attr_max_locking_protocol.attr,
+	&ocfs2_attr_loaded_cluster_plugins.attr,
+	&ocfs2_attr_active_cluster_plugin.attr,
+	NULL,
+};
+
+static struct attribute_group ocfs2_attr_group = {
+	.attrs = ocfs2_attrs,
+};
+
+static struct kset *ocfs2_kset;
+
+static void ocfs2_sysfs_exit(void)
+{
+	kset_unregister(ocfs2_kset);
+}
+
+static int ocfs2_sysfs_init(void)
+{
+	int ret;
+
+	ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
+	if (!ocfs2_kset)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
+	if (ret)
+		goto error;
+
 	return 0;
+
+error:
+	kset_unregister(ocfs2_kset);
+	return ret;
+}
+
+static int __init ocfs2_stack_glue_init(void)
+{
+	return ocfs2_sysfs_init();
 }
 
 static void __exit ocfs2_stack_glue_exit(void)
 {
 	lproto = NULL;
+	ocfs2_sysfs_exit();
 }
 
 MODULE_AUTHOR("Oracle");
-- 
cgit v1.2.3


From b61817e1166c5e19c08baf05196477cc345e1b1a Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 15:08:23 -0800
Subject: ocfs2: Add the USERSPACE_STACK incompat bit.

The filesystem gains the USERSPACE_STACK incomat bit and the
s_cluster_info field on the superblock.  When a userspace stack is in
use, the name of the stack is stored on-disk for mount-time
verification.

The "cluster_stack" option is added to mount(2) processing.  The mount
process needs to pass the matching stack name.  If the passed name and
the on-disk name do not match, the mount is failed.

When using the classic o2cb stack, the incompat bit is *not* set and no
mount option is used other than the usual heartbeat=local.  Thus, the
filesystem is compatible with older tools.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2.h    |  7 +++++
 fs/ocfs2/ocfs2_fs.h | 40 ++++++++++++++++++++++--
 fs/ocfs2/super.c    | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 134 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index af929eca5412..9ff5811345a9 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -248,6 +248,7 @@ struct ocfs2_super
 	struct ocfs2_alloc_stats alloc_stats;
 	char dev_str[20];		/* "major,minor" of the device */
 
+	char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
 	struct ocfs2_cluster_connection *cconn;
 	struct ocfs2_lock_res osb_super_lockres;
 	struct ocfs2_lock_res osb_rename_lockres;
@@ -368,6 +369,12 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
 	return ret;
 }
 
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+	return (osb->s_feature_incompat &
+		OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+}
+
 static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 {
 	return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c49502329ab0..52c426665154 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -89,7 +89,8 @@
 #define OCFS2_FEATURE_INCOMPAT_SUPP	(OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
 					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
 					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
-					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP)
+					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
+					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
 
 /*
@@ -130,6 +131,17 @@
 #define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
 
 
+/*
+ * Support for alternate, userspace cluster stacks.  If set, the superblock
+ * field s_cluster_info contains a tag for the alternate stack in use as
+ * well as the name of the cluster being joined.
+ * mount.ocfs2 must pass in a matching stack name.
+ *
+ * If not set, the classic stack will be used.  This is compatbile with
+ * all older versions.
+ */
+#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK	0x0080
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -272,6 +284,10 @@ struct ocfs2_new_group_input {
 #define OCFS2_VOL_UUID_LEN		16
 #define OCFS2_MAX_VOL_LABEL_LEN		64
 
+/* The alternate, userspace stack fields */
+#define OCFS2_STACK_LABEL_LEN		4
+#define OCFS2_CLUSTER_NAME_LEN		16
+
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
 
@@ -513,6 +529,13 @@ struct ocfs2_slot_map_extended {
  */
 };
 
+struct ocfs2_cluster_info {
+/*00*/	__u8   ci_stack[OCFS2_STACK_LABEL_LEN];
+	__le32 ci_reserved;
+/*08*/	__u8   ci_cluster[OCFS2_CLUSTER_NAME_LEN];
+/*18*/
+};
+
 /*
  * On disk superblock for OCFS2
  * Note that it is contained inside an ocfs2_dinode, so all offsets
@@ -545,7 +568,20 @@ struct ocfs2_super_block {
 					 * group header */
 /*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
 /*90*/	__u8  s_uuid[OCFS2_VOL_UUID_LEN];	/* 128-bit uuid */
-/*A0*/
+/*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
+						     stack.  Only valid
+						     with INCOMPAT flag. */
+/*B8*/  __le64 s_reserved2[17];		/* Fill out superblock */
+/*140*/
+
+	/*
+	 * NOTE: As stated above, all offsets are relative to
+	 * ocfs2_dinode.id2, which is at 0xC0 in the inode.
+	 * 0xC0 + 0x140 = 0x200 or 512 bytes.  A superblock must fit within
+	 * our smallest blocksize, which is 512 bytes.  To ensure this,
+	 * we reserve the space in s_reserved2.  Anything past s_reserved2
+	 * will not be available on the smallest blocksize.
+	 */
 };
 
 /*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e27a0d47ea2b..96ebe36d5d77 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -87,6 +87,7 @@ struct mount_options
 	unsigned int	atime_quantum;
 	signed short	slot;
 	unsigned int	localalloc_opt;
+	char		cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
 };
 
 static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -152,6 +153,7 @@ enum {
 	Opt_commit,
 	Opt_localalloc,
 	Opt_localflocks,
+	Opt_stack,
 	Opt_err,
 };
 
@@ -170,6 +172,7 @@ static match_table_t tokens = {
 	{Opt_commit, "commit=%u"},
 	{Opt_localalloc, "localalloc=%d"},
 	{Opt_localflocks, "localflocks"},
+	{Opt_stack, "cluster_stack=%s"},
 	{Opt_err, NULL}
 };
 
@@ -549,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
 		}
 	}
 
+	if (ocfs2_userspace_stack(osb)) {
+		if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+			mlog(ML_ERROR, "Userspace stack expected, but "
+			     "o2cb heartbeat arguments passed to mount\n");
+			return -EINVAL;
+		}
+	}
+
 	if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
-		if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) {
+		if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
+		    !ocfs2_userspace_stack(osb)) {
 			mlog(ML_ERROR, "Heartbeat has to be started to mount "
 			     "a read-write clustered device.\n");
 			return -EINVAL;
@@ -560,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
 	return 0;
 }
 
+/*
+ * If we're using a userspace stack, mount should have passed
+ * a name that matches the disk.  If not, mount should not
+ * have passed a stack.
+ */
+static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
+					struct mount_options *mopt)
+{
+	if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
+		mlog(ML_ERROR,
+		     "cluster stack passed to mount, but this filesystem "
+		     "does not support it\n");
+		return -EINVAL;
+	}
+
+	if (ocfs2_userspace_stack(osb) &&
+	    strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
+		    OCFS2_STACK_LABEL_LEN)) {
+		mlog(ML_ERROR,
+		     "cluster stack passed to mount (\"%s\") does not "
+		     "match the filesystem (\"%s\")\n",
+		     mopt->cluster_stack,
+		     osb->osb_cluster_stack);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct dentry *root;
@@ -598,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	osb->osb_commit_interval = parsed_options.commit_interval;
 	osb->local_alloc_size = parsed_options.localalloc_opt;
 
+	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
+	if (status)
+		goto read_super_error;
+
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 
 	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -752,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 	mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 	mopt->slot = OCFS2_INVALID_SLOT;
 	mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+	mopt->cluster_stack[0] = '\0';
 
 	if (!options) {
 		status = 1;
@@ -853,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb,
 			if (!is_remount)
 				mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
 			break;
+		case Opt_stack:
+			/* Check both that the option we were passed
+			 * is of the right length and that it is a proper
+			 * string of the right length.
+			 */
+			if (((args[0].to - args[0].from) !=
+			     OCFS2_STACK_LABEL_LEN) ||
+			    (strnlen(args[0].from,
+				     OCFS2_STACK_LABEL_LEN) !=
+			     OCFS2_STACK_LABEL_LEN)) {
+				mlog(ML_ERROR,
+				     "Invalid cluster_stack option\n");
+				status = 0;
+				goto bail;
+			}
+			memcpy(mopt->cluster_stack, args[0].from,
+			       OCFS2_STACK_LABEL_LEN);
+			mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -911,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (opts & OCFS2_MOUNT_LOCALFLOCKS)
 		seq_printf(s, ",localflocks,");
 
+	if (osb->osb_cluster_stack[0])
+		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
+			   osb->osb_cluster_stack);
+
 	return 0;
 }
 
@@ -1403,6 +1472,25 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
+	if (ocfs2_userspace_stack(osb)) {
+		memcpy(osb->osb_cluster_stack,
+		       OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
+		       OCFS2_STACK_LABEL_LEN);
+		osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+		if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
+			mlog(ML_ERROR,
+			     "couldn't mount because of an invalid "
+			     "cluster stack label (%s) \n",
+			     osb->osb_cluster_stack);
+			status = -EINVAL;
+			goto bail;
+		}
+	} else {
+		/* The empty string is identical with classic tools that
+		 * don't know about s_cluster_info. */
+		osb->osb_cluster_stack[0] = '\0';
+	}
+
 	get_random_bytes(&osb->s_next_generation, sizeof(u32));
 
 	/* FIXME
-- 
cgit v1.2.3


From 9c6c877c04ce17d76a35d2173d3a3840d6b796a2 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 1 Feb 2008 15:17:30 -0800
Subject: ocfs2: Add the 'cluster_stack' sysfs file.

Userspace can now query and specify the cluster stack in use via the
/sys/fs/ocfs2/cluster_stack file.  By default, it is 'o2cb', which is
the classic stack.  Thus, old tools that do not know how to modify this
file will work just fine.  The stack cannot be modified if there is a
live filesystem.

ocfs2_cluster_connect() now takes the expected cluster stack as an
argument.  This way, the filesystem and the stack glue ensure they are
speaking to the same backend.

If the stack is 'o2cb', the o2cb stack plugin is used.  For any other
value, the fsdlm stack plugin is selected.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c   |   3 +-
 fs/ocfs2/stackglue.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ocfs2/stackglue.h |   3 +-
 3 files changed, 104 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f62a9e4fc315..394d25a131a5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2627,7 +2627,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 	}
 
 	/* for now, uuid == domain */
-	status = ocfs2_cluster_connect(osb->uuid_str,
+	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+				       osb->uuid_str,
 				       strlen(osb->uuid_str),
 				       ocfs2_do_node_down, osb,
 				       &conn);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 76ae4fcebcbd..bf45d9bff8a7 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -27,11 +27,17 @@
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 
+#include "ocfs2_fs.h"
+
 #include "stackglue.h"
 
+#define OCFS2_STACK_PLUGIN_O2CB		"o2cb"
+#define OCFS2_STACK_PLUGIN_USER		"user"
+
 static struct ocfs2_locking_protocol *lproto;
 static DEFINE_SPINLOCK(ocfs2_stack_lock);
 static LIST_HEAD(ocfs2_stack_list);
+static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
 
 /*
  * The stack currently in use.  If not null, active_stack->sp_count > 0,
@@ -53,26 +59,36 @@ static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
 	return NULL;
 }
 
-static int ocfs2_stack_driver_request(const char *name)
+static int ocfs2_stack_driver_request(const char *stack_name,
+				      const char *plugin_name)
 {
 	int rc;
 	struct ocfs2_stack_plugin *p;
 
 	spin_lock(&ocfs2_stack_lock);
 
+	/*
+	 * If the stack passed by the filesystem isn't the selected one,
+	 * we can't continue.
+	 */
+	if (strcmp(stack_name, cluster_stack_name)) {
+		rc = -EBUSY;
+		goto out;
+	}
+
 	if (active_stack) {
 		/*
 		 * If the active stack isn't the one we want, it cannot
 		 * be selected right now.
 		 */
-		if (!strcmp(active_stack->sp_name, name))
+		if (!strcmp(active_stack->sp_name, plugin_name))
 			rc = 0;
 		else
 			rc = -EBUSY;
 		goto out;
 	}
 
-	p = ocfs2_stack_lookup(name);
+	p = ocfs2_stack_lookup(plugin_name);
 	if (!p || !try_module_get(p->sp_owner)) {
 		rc = -ENOENT;
 		goto out;
@@ -94,23 +110,42 @@ out:
  * there is no stack, it tries to load it.  It will fail if the stack still
  * cannot be found.  It will also fail if a different stack is in use.
  */
-static int ocfs2_stack_driver_get(const char *name)
+static int ocfs2_stack_driver_get(const char *stack_name)
 {
 	int rc;
+	char *plugin_name = OCFS2_STACK_PLUGIN_O2CB;
+
+	/*
+	 * Classic stack does not pass in a stack name.  This is
+	 * compatible with older tools as well.
+	 */
+	if (!stack_name || !*stack_name)
+		stack_name = OCFS2_STACK_PLUGIN_O2CB;
+
+	if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
+		printk(KERN_ERR
+		       "ocfs2 passed an invalid cluster stack label: \"%s\"\n",
+		       stack_name);
+		return -EINVAL;
+	}
 
-	rc = ocfs2_stack_driver_request(name);
+	/* Anything that isn't the classic stack is a user stack */
+	if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB))
+		plugin_name = OCFS2_STACK_PLUGIN_USER;
+
+	rc = ocfs2_stack_driver_request(stack_name, plugin_name);
 	if (rc == -ENOENT) {
-		request_module("ocfs2_stack_%s", name);
-		rc = ocfs2_stack_driver_request(name);
+		request_module("ocfs2_stack_%s", plugin_name);
+		rc = ocfs2_stack_driver_request(stack_name, plugin_name);
 	}
 
 	if (rc == -ENOENT) {
 		printk(KERN_ERR
 		       "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
-		       name);
+		       plugin_name);
 	} else if (rc == -EBUSY) {
 		printk(KERN_ERR
-		       "ocfs2: A different cluster stack driver is in use\n");
+		       "ocfs2: A different cluster stack is in use\n");
 	}
 
 	return rc;
@@ -242,7 +277,8 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
 
-int ocfs2_cluster_connect(const char *group,
+int ocfs2_cluster_connect(const char *stack_name,
+			  const char *group,
 			  int grouplen,
 			  void (*recovery_handler)(int node_num,
 						   void *recovery_data),
@@ -277,7 +313,7 @@ int ocfs2_cluster_connect(const char *group,
 	new_conn->cc_version = lproto->lp_max_version;
 
 	/* This will pin the stack driver if successful */
-	rc = ocfs2_stack_driver_get("o2cb");
+	rc = ocfs2_stack_driver_get(stack_name);
 	if (rc)
 		goto out_free;
 
@@ -416,10 +452,61 @@ static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
 	__ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
 	       ocfs2_active_cluster_plugin_show, NULL);
 
+static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					char *buf)
+{
+	ssize_t ret;
+	spin_lock(&ocfs2_stack_lock);
+	ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
+	spin_unlock(&ocfs2_stack_lock);
+
+	return ret;
+}
+
+static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	size_t len = count;
+	ssize_t ret;
+
+	if (len == 0)
+		return len;
+
+	if (buf[len - 1] == '\n')
+		len--;
+
+	if ((len != OCFS2_STACK_LABEL_LEN) ||
+	    (strnlen(buf, len) != len))
+		return -EINVAL;
+
+	spin_lock(&ocfs2_stack_lock);
+	if (active_stack) {
+		if (!strncmp(buf, cluster_stack_name, len))
+			ret = count;
+		else
+			ret = -EBUSY;
+	} else {
+		memcpy(cluster_stack_name, buf, len);
+		ret = count;
+	}
+	spin_unlock(&ocfs2_stack_lock);
+
+	return ret;
+}
+
+
+static struct kobj_attribute ocfs2_attr_cluster_stack =
+	__ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
+	       ocfs2_cluster_stack_show,
+	       ocfs2_cluster_stack_store);
+
 static struct attribute *ocfs2_attrs[] = {
 	&ocfs2_attr_max_locking_protocol.attr,
 	&ocfs2_attr_loaded_cluster_plugins.attr,
 	&ocfs2_attr_active_cluster_plugin.attr,
+	&ocfs2_attr_cluster_stack.attr,
 	NULL,
 };
 
@@ -455,6 +542,8 @@ error:
 
 static int __init ocfs2_stack_glue_init(void)
 {
+	strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
+
 	return ocfs2_sysfs_init();
 }
 
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index c96c8bb76863..d88bc655644b 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -209,7 +209,8 @@ struct ocfs2_stack_plugin {
 
 
 /* Used by the filesystem */
-int ocfs2_cluster_connect(const char *group,
+int ocfs2_cluster_connect(const char *stack_name,
+			  const char *group,
 			  int grouplen,
 			  void (*recovery_handler)(int node_num,
 						   void *recovery_data),
-- 
cgit v1.2.3


From 8adf0536c9fb578a8542dcf81104d3438a5287e4 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 28 Nov 2007 14:38:40 -0800
Subject: ocfs2: Add the user stack module.

Add a skeleton for the stack_user module.  It's just the barebones module
code.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stack_user.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 fs/ocfs2/stack_user.c

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
new file mode 100644
index 000000000000..920eb1111b66
--- /dev/null
+++ b/fs/ocfs2/stack_user.c
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_user.c
+ *
+ * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+
+#include "stackglue.h"
+
+
+static int __init user_stack_init(void)
+{
+	return 0;
+}
+
+static void __exit user_stack_exit(void)
+{
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
+MODULE_LICENSE("GPL");
+module_init(user_stack_init);
+module_exit(user_stack_exit);
-- 
cgit v1.2.3


From 6427a727557d9c964b7b162ae11bb156e2c501d5 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 18 Feb 2008 19:23:28 -0800
Subject: ocfs2: Add the ocfs2_control misc device.

The ocfs2_control misc device is how a userspace control daemon (controld)
talks to the filesystem.  Introduce the bare-bones filesystem ops.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stack_user.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 183 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 920eb1111b66..fdca5d3c7668 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -18,17 +18,199 @@
  */
 
 #include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/reboot.h>
 
 #include "stackglue.h"
 
 
-static int __init user_stack_init(void)
+/*
+ * The control protocol starts with a handshake.  Until the handshake
+ * is complete, the control device will fail all write(2)s.
+ *
+ * The handshake is simple.  First, the client reads until EOF.  Each line
+ * of output is a supported protocol tag.  All protocol tags are a single
+ * character followed by a two hex digit version number.  Currently the
+ * only things supported is T01, for "Text-base version 0x01".  Next, the
+ * client writes the version they would like to use.  If the version tag
+ * written is unknown, -EINVAL is returned.  Once the negotiation is
+ * complete, the client can start sending messages.
+ */
+
+/*
+ * ocfs2_live_connection is refcounted because the filesystem and
+ * miscdevice sides can detach in different order.  Let's just be safe.
+ */
+struct ocfs2_live_connection {
+	struct list_head		oc_list;
+	struct ocfs2_cluster_connection	*oc_conn;
+};
+
+static atomic_t ocfs2_control_opened;
+
+static LIST_HEAD(ocfs2_live_connection_list);
+static DEFINE_MUTEX(ocfs2_control_lock);
+
+static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
+{
+	size_t len = strlen(name);
+	struct ocfs2_live_connection *c;
+
+	BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
+
+	list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
+		if ((c->oc_conn->cc_namelen == len) &&
+		    !strncmp(c->oc_conn->cc_name, name, len))
+			return c;
+	}
+
+	return c;
+}
+
+/*
+ * ocfs2_live_connection structures are created underneath the ocfs2
+ * mount path.  Since the VFS prevents multiple calls to
+ * fill_super(), we can't get dupes here.
+ */
+static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
+				     struct ocfs2_live_connection **c_ret)
+{
+	int rc = 0;
+	struct ocfs2_live_connection *c;
+
+	c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+
+	mutex_lock(&ocfs2_control_lock);
+	c->oc_conn = conn;
+
+	if (atomic_read(&ocfs2_control_opened))
+		list_add(&c->oc_list, &ocfs2_live_connection_list);
+	else {
+		printk(KERN_ERR
+		       "ocfs2: Userspace control daemon is not present\n");
+		rc = -ESRCH;
+	}
+
+	mutex_unlock(&ocfs2_control_lock);
+
+	if (!rc)
+		*c_ret = c;
+	else
+		kfree(c);
+
+	return rc;
+}
+
+/*
+ * This function disconnects the cluster connection from ocfs2_control.
+ * Afterwards, userspace can't affect the cluster connection.
+ */
+static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
+{
+	mutex_lock(&ocfs2_control_lock);
+	list_del_init(&c->oc_list);
+	c->oc_conn = NULL;
+	mutex_unlock(&ocfs2_control_lock);
+
+	kfree(c);
+}
+
+
+static ssize_t ocfs2_control_write(struct file *file,
+				   const char __user *buf,
+				   size_t count,
+				   loff_t *ppos)
 {
 	return 0;
 }
 
+static ssize_t ocfs2_control_read(struct file *file,
+				  char __user *buf,
+				  size_t count,
+				  loff_t *ppos)
+{
+	return 0;
+}
+
+static int ocfs2_control_release(struct inode *inode, struct file *file)
+{
+	if (atomic_dec_and_test(&ocfs2_control_opened)) {
+		mutex_lock(&ocfs2_control_lock);
+		if (!list_empty(&ocfs2_live_connection_list)) {
+			/* XXX: Do bad things! */
+			printk(KERN_ERR
+			       "ocfs2: Unexpected release of ocfs2_control!\n"
+			       "       Loss of cluster connection requires "
+			       "an emergency restart!\n");
+			emergency_restart();
+		}
+		mutex_unlock(&ocfs2_control_lock);
+	}
+
+	return 0;
+}
+
+static int ocfs2_control_open(struct inode *inode, struct file *file)
+{
+	atomic_inc(&ocfs2_control_opened);
+
+	return 0;
+}
+
+static const struct file_operations ocfs2_control_fops = {
+	.open    = ocfs2_control_open,
+	.release = ocfs2_control_release,
+	.read    = ocfs2_control_read,
+	.write   = ocfs2_control_write,
+	.owner   = THIS_MODULE,
+};
+
+struct miscdevice ocfs2_control_device = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "ocfs2_control",
+	.fops		= &ocfs2_control_fops,
+};
+
+static int ocfs2_control_init(void)
+{
+	int rc;
+
+	atomic_set(&ocfs2_control_opened, 0);
+
+	rc = misc_register(&ocfs2_control_device);
+	if (rc)
+		printk(KERN_ERR
+		       "ocfs2: Unable to register ocfs2_control device "
+		       "(errno %d)\n",
+		       -rc);
+
+	return rc;
+}
+
+static void ocfs2_control_exit(void)
+{
+	int rc;
+
+	rc = misc_deregister(&ocfs2_control_device);
+	if (rc)
+		printk(KERN_ERR
+		       "ocfs2: Unable to deregister ocfs2_control device "
+		       "(errno %d)\n",
+		       -rc);
+}
+
+static int __init user_stack_init(void)
+{
+	return ocfs2_control_init();
+}
+
 static void __exit user_stack_exit(void)
 {
+	ocfs2_control_exit();
 }
 
 MODULE_AUTHOR("Oracle");
-- 
cgit v1.2.3


From 462c7e6a257e547eebe1648396cf7c45e684091b Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 18 Feb 2008 19:40:12 -0800
Subject: ocfs2: Start the ocfs2_control handshake.

When a control daemon opens the ocfs2_control device, it must perform a
handshake to tell the filesystem it is something capable of monitoring
cluster status.  Only after the handshake is complete will the filesystem
allow mounts.

This is the first part of the handshake.  The daemon reads all supported
ocfs2_control protocols, then writes in the protocol it will use.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stack_user.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 139 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index fdca5d3c7668..ff8d30757924 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,6 +22,7 @@
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
 #include <linux/reboot.h>
+#include <asm/uaccess.h>
 
 #include "stackglue.h"
 
@@ -39,6 +40,16 @@
  * complete, the client can start sending messages.
  */
 
+/*
+ * Whether or not the client has done the handshake.
+ * For now, we have just one protocol version.
+ */
+#define OCFS2_CONTROL_PROTO			"T01\n"
+#define OCFS2_CONTROL_PROTO_LEN			4
+#define OCFS2_CONTROL_HANDSHAKE_INVALID		(0)
+#define OCFS2_CONTROL_HANDSHAKE_READ		(1)
+#define OCFS2_CONTROL_HANDSHAKE_VALID		(2)
+
 /*
  * ocfs2_live_connection is refcounted because the filesystem and
  * miscdevice sides can detach in different order.  Let's just be safe.
@@ -48,11 +59,30 @@ struct ocfs2_live_connection {
 	struct ocfs2_cluster_connection	*oc_conn;
 };
 
+struct ocfs2_control_private {
+	struct list_head op_list;
+	int op_state;
+};
+
 static atomic_t ocfs2_control_opened;
 
 static LIST_HEAD(ocfs2_live_connection_list);
+static LIST_HEAD(ocfs2_control_private_list);
 static DEFINE_MUTEX(ocfs2_control_lock);
 
+static inline void ocfs2_control_set_handshake_state(struct file *file,
+						     int state)
+{
+	struct ocfs2_control_private *p = file->private_data;
+	p->op_state = state;
+}
+
+static inline int ocfs2_control_get_handshake_state(struct file *file)
+{
+	struct ocfs2_control_private *p = file->private_data;
+	return p->op_state;
+}
+
 static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
 {
 	size_t len = strlen(name);
@@ -119,27 +149,115 @@ static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
 	kfree(c);
 }
 
+static ssize_t ocfs2_control_cfu(char *target, size_t target_len,
+				 const char __user *buf, size_t count)
+{
+	/* The T01 expects write(2) calls to have exactly one command */
+	if (count != target_len)
+		return -EINVAL;
+
+	if (copy_from_user(target, buf, target_len))
+		return -EFAULT;
+
+	return count;
+}
+
+static ssize_t ocfs2_control_validate_handshake(struct file *file,
+						const char __user *buf,
+						size_t count)
+{
+	ssize_t ret;
+	char kbuf[OCFS2_CONTROL_PROTO_LEN];
+
+	ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN,
+				buf, count);
+	if (ret != count)
+		return ret;
+
+	if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
+		return -EINVAL;
+
+	atomic_inc(&ocfs2_control_opened);
+	ocfs2_control_set_handshake_state(file,
+					  OCFS2_CONTROL_HANDSHAKE_VALID);
+
+
+	return count;
+}
+
 
 static ssize_t ocfs2_control_write(struct file *file,
 				   const char __user *buf,
 				   size_t count,
 				   loff_t *ppos)
 {
-	return 0;
+	ssize_t ret;
+
+	switch (ocfs2_control_get_handshake_state(file)) {
+		case OCFS2_CONTROL_HANDSHAKE_INVALID:
+			ret = -EINVAL;
+			break;
+
+		case OCFS2_CONTROL_HANDSHAKE_READ:
+			ret = ocfs2_control_validate_handshake(file, buf,
+							       count);
+			break;
+
+		case OCFS2_CONTROL_HANDSHAKE_VALID:
+			ret = count;  /* XXX */
+			break;
+
+		default:
+			BUG();
+			ret = -EIO;
+			break;
+	}
+
+	return ret;
 }
 
+/*
+ * This is a naive version.  If we ever have a new protocol, we'll expand
+ * it.  Probably using seq_file.
+ */
 static ssize_t ocfs2_control_read(struct file *file,
 				  char __user *buf,
 				  size_t count,
 				  loff_t *ppos)
 {
-	return 0;
+	char *proto_string = OCFS2_CONTROL_PROTO;
+	size_t to_write = 0;
+
+	if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+		return 0;
+
+	to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
+	if (to_write > count)
+		to_write = count;
+	if (copy_to_user(buf, proto_string + *ppos, to_write))
+		return -EFAULT;
+
+	*ppos += to_write;
+
+	/* Have we read the whole protocol list? */
+	if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+		ocfs2_control_set_handshake_state(file,
+						  OCFS2_CONTROL_HANDSHAKE_READ);
+
+	return to_write;
 }
 
 static int ocfs2_control_release(struct inode *inode, struct file *file)
 {
+	struct ocfs2_control_private *p = file->private_data;
+
+	mutex_lock(&ocfs2_control_lock);
+
+	if (ocfs2_control_get_handshake_state(file) !=
+	    OCFS2_CONTROL_HANDSHAKE_VALID)
+		goto out;
+
 	if (atomic_dec_and_test(&ocfs2_control_opened)) {
-		mutex_lock(&ocfs2_control_lock);
 		if (!list_empty(&ocfs2_live_connection_list)) {
 			/* XXX: Do bad things! */
 			printk(KERN_ERR
@@ -148,15 +266,31 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
 			       "an emergency restart!\n");
 			emergency_restart();
 		}
-		mutex_unlock(&ocfs2_control_lock);
 	}
 
+out:
+	list_del_init(&p->op_list);
+	file->private_data = NULL;
+
+	mutex_unlock(&ocfs2_control_lock);
+
+	kfree(p);
+
 	return 0;
 }
 
 static int ocfs2_control_open(struct inode *inode, struct file *file)
 {
-	atomic_inc(&ocfs2_control_opened);
+	struct ocfs2_control_private *p;
+
+	p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	mutex_lock(&ocfs2_control_lock);
+	file->private_data = p;
+	list_add(&p->op_list, &ocfs2_control_private_list);
+	mutex_unlock(&ocfs2_control_lock);
 
 	return 0;
 }
-- 
cgit v1.2.3


From de870ef02295c9f5601dbf2efdc1be6df44b187b Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 18 Feb 2008 17:07:09 -0800
Subject: ocfs2: Introduce the DOWN message to ocfs2_control

When the control daemon sees a node go down, it sends a DOWN message
through the ocfs2_control device.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stack_user.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 89 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index ff8d30757924..a5e58e2b9d59 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -35,9 +35,21 @@
  * of output is a supported protocol tag.  All protocol tags are a single
  * character followed by a two hex digit version number.  Currently the
  * only things supported is T01, for "Text-base version 0x01".  Next, the
- * client writes the version they would like to use.  If the version tag
- * written is unknown, -EINVAL is returned.  Once the negotiation is
- * complete, the client can start sending messages.
+ * client writes the version they would like to use, including the newline.
+ * Thus, the protocol tag is 'T01\n'.  If the version tag written is
+ * unknown, -EINVAL is returned.  Once the negotiation is complete, the
+ * client can start sending messages.
+ *
+ * The T01 protocol only has one message, "DOWN".  It has the following
+ * syntax:
+ *
+ *  DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
+ *
+ * eg:
+ *
+ *  DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
+ *
+ * This is 47 characters.
  */
 
 /*
@@ -49,6 +61,11 @@
 #define OCFS2_CONTROL_HANDSHAKE_INVALID		(0)
 #define OCFS2_CONTROL_HANDSHAKE_READ		(1)
 #define OCFS2_CONTROL_HANDSHAKE_VALID		(2)
+#define OCFS2_CONTROL_MESSAGE_DOWN		"DOWN"
+#define OCFS2_CONTROL_MESSAGE_DOWN_LEN		4
+#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN	47
+#define OCFS2_TEXT_UUID_LEN			32
+#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN	8
 
 /*
  * ocfs2_live_connection is refcounted because the filesystem and
@@ -149,7 +166,7 @@ static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
 	kfree(c);
 }
 
-static ssize_t ocfs2_control_cfu(char *target, size_t target_len,
+static ssize_t ocfs2_control_cfu(void *target, size_t target_len,
 				 const char __user *buf, size_t count)
 {
 	/* The T01 expects write(2) calls to have exactly one command */
@@ -185,6 +202,73 @@ static ssize_t ocfs2_control_validate_handshake(struct file *file,
 	return count;
 }
 
+static void ocfs2_control_send_down(const char *uuid,
+				    int nodenum)
+{
+	struct ocfs2_live_connection *c;
+
+	mutex_lock(&ocfs2_control_lock);
+
+	c = ocfs2_connection_find(uuid);
+	if (c) {
+		BUG_ON(c->oc_conn == NULL);
+		c->oc_conn->cc_recovery_handler(nodenum,
+						c->oc_conn->cc_recovery_data);
+	}
+
+	mutex_unlock(&ocfs2_control_lock);
+}
+
+/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_down {
+	char	tag[OCFS2_CONTROL_MESSAGE_DOWN_LEN];
+	char	space1;
+	char	uuid[OCFS2_TEXT_UUID_LEN];
+	char	space2;
+	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+	char	newline;
+};
+
+static ssize_t ocfs2_control_message(struct file *file,
+				     const char __user *buf,
+				     size_t count)
+{
+	ssize_t ret;
+	char *p = NULL;
+	long nodenum;
+	struct ocfs2_control_message_down msg;
+
+	/* Try to catch padding issues */
+	WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
+		(sizeof(msg.tag) + sizeof(msg.space1)));
+
+	memset(&msg, 0, sizeof(struct ocfs2_control_message_down));
+	ret = ocfs2_control_cfu(&msg, OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN,
+				buf, count);
+	if (ret != count)
+		return ret;
+
+	if (strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN,
+		    strlen(OCFS2_CONTROL_MESSAGE_DOWN)))
+		return -EINVAL;
+
+	if ((msg.space1 != ' ') || (msg.space2 != ' ') ||
+	    (msg.newline != '\n'))
+		return -EINVAL;
+	msg.space1 = msg.space2 = msg.newline = '\0';
+
+	nodenum = simple_strtol(msg.nodestr, &p, 16);
+	if (!p || *p)
+		return -EINVAL;
+
+	if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
+	    (nodenum > INT_MAX) || (nodenum < 0))
+		return -ERANGE;
+
+	ocfs2_control_send_down(msg.uuid, nodenum);
+
+	return count;
+}
 
 static ssize_t ocfs2_control_write(struct file *file,
 				   const char __user *buf,
@@ -204,7 +288,7 @@ static ssize_t ocfs2_control_write(struct file *file,
 			break;
 
 		case OCFS2_CONTROL_HANDSHAKE_VALID:
-			ret = count;  /* XXX */
+			ret = ocfs2_control_message(file, buf, count);
 			break;
 
 		default:
-- 
cgit v1.2.3


From 3cfd4ab6b6b4bee2035b62e1c293801c3d257502 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Feb 2008 14:44:34 -0800
Subject: ocfs2: Add the local node id to the handshake.

This is the second part of the ocfs2_control handshake.  After
negotiating the ocfs2_control protocol, the daemon tells the filesystem
what the local node id is via the SETN message.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stack_user.c | 222 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 173 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index a5e58e2b9d59..43e6105369c6 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -40,8 +40,18 @@
  * unknown, -EINVAL is returned.  Once the negotiation is complete, the
  * client can start sending messages.
  *
- * The T01 protocol only has one message, "DOWN".  It has the following
- * syntax:
+ * The T01 protocol only has two messages.  First is the "SETN" message.
+ * It has the following syntax:
+ *
+ *  SETN<space><8-char-hex-nodenum><newline>
+ *
+ * This is 14 characters.
+ *
+ * The "SETN" message must be the first message following the protocol.
+ * It tells ocfs2_control the local node number.
+ *
+ * Once the local node number has been set, the "DOWN" message can be
+ * sent for node down notification.  It has the following syntax:
  *
  *  DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
  *
@@ -58,11 +68,18 @@
  */
 #define OCFS2_CONTROL_PROTO			"T01\n"
 #define OCFS2_CONTROL_PROTO_LEN			4
+
+/* Handshake states */
 #define OCFS2_CONTROL_HANDSHAKE_INVALID		(0)
 #define OCFS2_CONTROL_HANDSHAKE_READ		(1)
-#define OCFS2_CONTROL_HANDSHAKE_VALID		(2)
-#define OCFS2_CONTROL_MESSAGE_DOWN		"DOWN"
-#define OCFS2_CONTROL_MESSAGE_DOWN_LEN		4
+#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL	(2)
+#define OCFS2_CONTROL_HANDSHAKE_VALID		(3)
+
+/* Messages */
+#define OCFS2_CONTROL_MESSAGE_OP_LEN		4
+#define OCFS2_CONTROL_MESSAGE_SETNODE_OP	"SETN"
+#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN	14
+#define OCFS2_CONTROL_MESSAGE_DOWN_OP		"DOWN"
 #define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN	47
 #define OCFS2_TEXT_UUID_LEN			32
 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN	8
@@ -79,9 +96,35 @@ struct ocfs2_live_connection {
 struct ocfs2_control_private {
 	struct list_head op_list;
 	int op_state;
+	int op_this_node;
+};
+
+/* SETN<space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_setn {
+	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+	char	space;
+	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+	char	newline;
+};
+
+/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_down {
+	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+	char	space1;
+	char	uuid[OCFS2_TEXT_UUID_LEN];
+	char	space2;
+	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+	char	newline;
+};
+
+union ocfs2_control_message {
+	char					tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+	struct ocfs2_control_message_setn	u_setn;
+	struct ocfs2_control_message_down	u_down;
 };
 
 static atomic_t ocfs2_control_opened;
+static int ocfs2_control_this_node = -1;
 
 static LIST_HEAD(ocfs2_live_connection_list);
 static LIST_HEAD(ocfs2_control_private_list);
@@ -166,38 +209,37 @@ static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
 	kfree(c);
 }
 
-static ssize_t ocfs2_control_cfu(void *target, size_t target_len,
-				 const char __user *buf, size_t count)
+static int ocfs2_control_cfu(void *target, size_t target_len,
+			     const char __user *buf, size_t count)
 {
 	/* The T01 expects write(2) calls to have exactly one command */
-	if (count != target_len)
+	if ((count != target_len) ||
+	    (count > sizeof(union ocfs2_control_message)))
 		return -EINVAL;
 
 	if (copy_from_user(target, buf, target_len))
 		return -EFAULT;
 
-	return count;
+	return 0;
 }
 
-static ssize_t ocfs2_control_validate_handshake(struct file *file,
-						const char __user *buf,
-						size_t count)
+static ssize_t ocfs2_control_validate_protocol(struct file *file,
+					       const char __user *buf,
+					       size_t count)
 {
 	ssize_t ret;
 	char kbuf[OCFS2_CONTROL_PROTO_LEN];
 
 	ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN,
 				buf, count);
-	if (ret != count)
+	if (ret)
 		return ret;
 
 	if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
 		return -EINVAL;
 
-	atomic_inc(&ocfs2_control_opened);
 	ocfs2_control_set_handshake_state(file,
-					  OCFS2_CONTROL_HANDSHAKE_VALID);
-
+					  OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
 
 	return count;
 }
@@ -219,45 +261,92 @@ static void ocfs2_control_send_down(const char *uuid,
 	mutex_unlock(&ocfs2_control_lock);
 }
 
-/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
-struct ocfs2_control_message_down {
-	char	tag[OCFS2_CONTROL_MESSAGE_DOWN_LEN];
-	char	space1;
-	char	uuid[OCFS2_TEXT_UUID_LEN];
-	char	space2;
-	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
-	char	newline;
-};
+/*
+ * Called whenever configuration elements are sent to /dev/ocfs2_control.
+ * If all configuration elements are present, try to set the global
+ * values.  If not, return -EAGAIN.  If there is a problem, return a
+ * different error.
+ */
+static int ocfs2_control_install_private(struct file *file)
+{
+	int rc = 0;
+	int set_p = 1;
+	struct ocfs2_control_private *p = file->private_data;
 
-static ssize_t ocfs2_control_message(struct file *file,
-				     const char __user *buf,
-				     size_t count)
+	BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+
+	if (p->op_this_node < 0)
+		set_p = 0;
+
+	mutex_lock(&ocfs2_control_lock);
+	if (ocfs2_control_this_node < 0) {
+		if (set_p)
+			ocfs2_control_this_node = p->op_this_node;
+	} else if (ocfs2_control_this_node != p->op_this_node)
+		rc = -EINVAL;
+	mutex_unlock(&ocfs2_control_lock);
+
+	if (!rc && set_p) {
+		/* We set the global values successfully */
+		atomic_inc(&ocfs2_control_opened);
+		ocfs2_control_set_handshake_state(file,
+					OCFS2_CONTROL_HANDSHAKE_VALID);
+	}
+
+	return rc;
+}
+
+static int ocfs2_control_do_setnode_msg(struct file *file,
+					struct ocfs2_control_message_setn *msg)
 {
-	ssize_t ret;
-	char *p = NULL;
 	long nodenum;
-	struct ocfs2_control_message_down msg;
+	char *ptr = NULL;
+	struct ocfs2_control_private *p = file->private_data;
 
-	/* Try to catch padding issues */
-	WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
-		(sizeof(msg.tag) + sizeof(msg.space1)));
+	if (ocfs2_control_get_handshake_state(file) !=
+	    OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+		return -EINVAL;
 
-	memset(&msg, 0, sizeof(struct ocfs2_control_message_down));
-	ret = ocfs2_control_cfu(&msg, OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN,
-				buf, count);
-	if (ret != count)
-		return ret;
+	if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+		    OCFS2_CONTROL_MESSAGE_OP_LEN))
+		return -EINVAL;
+
+	if ((msg->space != ' ') || (msg->newline != '\n'))
+		return -EINVAL;
+	msg->space = msg->newline = '\0';
+
+	nodenum = simple_strtol(msg->nodestr, &ptr, 16);
+	if (!ptr || *ptr)
+		return -EINVAL;
+
+	if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
+	    (nodenum > INT_MAX) || (nodenum < 0))
+		return -ERANGE;
+	p->op_this_node = nodenum;
+
+	return ocfs2_control_install_private(file);
+}
+
+static int ocfs2_control_do_down_msg(struct file *file,
+				     struct ocfs2_control_message_down *msg)
+{
+	long nodenum;
+	char *p = NULL;
+
+	if (ocfs2_control_get_handshake_state(file) !=
+	    OCFS2_CONTROL_HANDSHAKE_VALID)
+		return -EINVAL;
 
-	if (strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN,
-		    strlen(OCFS2_CONTROL_MESSAGE_DOWN)))
+	if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+		    OCFS2_CONTROL_MESSAGE_OP_LEN))
 		return -EINVAL;
 
-	if ((msg.space1 != ' ') || (msg.space2 != ' ') ||
-	    (msg.newline != '\n'))
+	if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
+	    (msg->newline != '\n'))
 		return -EINVAL;
-	msg.space1 = msg.space2 = msg.newline = '\0';
+	msg->space1 = msg->space2 = msg->newline = '\0';
 
-	nodenum = simple_strtol(msg.nodestr, &p, 16);
+	nodenum = simple_strtol(msg->nodestr, &p, 16);
 	if (!p || *p)
 		return -EINVAL;
 
@@ -265,9 +354,40 @@ static ssize_t ocfs2_control_message(struct file *file,
 	    (nodenum > INT_MAX) || (nodenum < 0))
 		return -ERANGE;
 
-	ocfs2_control_send_down(msg.uuid, nodenum);
+	ocfs2_control_send_down(msg->uuid, nodenum);
 
-	return count;
+	return 0;
+}
+
+static ssize_t ocfs2_control_message(struct file *file,
+				     const char __user *buf,
+				     size_t count)
+{
+	ssize_t ret;
+	union ocfs2_control_message msg;
+
+	/* Try to catch padding issues */
+	WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
+		(sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
+
+	memset(&msg, 0, sizeof(union ocfs2_control_message));
+	ret = ocfs2_control_cfu(&msg, count, buf, count);
+	if (ret)
+		goto out;
+
+	if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) &&
+	    !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+		     OCFS2_CONTROL_MESSAGE_OP_LEN))
+		ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
+	else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
+		 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+			  OCFS2_CONTROL_MESSAGE_OP_LEN))
+		ret = ocfs2_control_do_down_msg(file, &msg.u_down);
+	else
+		ret = -EINVAL;
+
+out:
+	return ret ? ret : count;
 }
 
 static ssize_t ocfs2_control_write(struct file *file,
@@ -283,10 +403,11 @@ static ssize_t ocfs2_control_write(struct file *file,
 			break;
 
 		case OCFS2_CONTROL_HANDSHAKE_READ:
-			ret = ocfs2_control_validate_handshake(file, buf,
-							       count);
+			ret = ocfs2_control_validate_protocol(file, buf,
+							      count);
 			break;
 
+		case OCFS2_CONTROL_HANDSHAKE_PROTOCOL:
 		case OCFS2_CONTROL_HANDSHAKE_VALID:
 			ret = ocfs2_control_message(file, buf, count);
 			break;
@@ -350,6 +471,8 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
 			       "an emergency restart!\n");
 			emergency_restart();
 		}
+		/* Last valid close clears the node number */
+		ocfs2_control_this_node = -1;
 	}
 
 out:
@@ -370,6 +493,7 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
 	p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
+	p->op_this_node = -1;
 
 	mutex_lock(&ocfs2_control_lock);
 	file->private_data = p;
-- 
cgit v1.2.3


From d4b95eef4dc4a59bcd42bdf783638a2eaa57b4c8 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 20 Feb 2008 15:39:44 -0800
Subject: ocfs2: Add the 'set version' message to the ocfs2_control device.

The "SETV" message sets the filesystem locking protocol version as
negotiated by the client.  The client negotiates based on the maximum
version advertised in /sys/fs/ocfs2/max_locking_protocol.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stack_user.c | 131 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 119 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 43e6105369c6..de982c11e69b 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -40,7 +40,7 @@
  * unknown, -EINVAL is returned.  Once the negotiation is complete, the
  * client can start sending messages.
  *
- * The T01 protocol only has two messages.  First is the "SETN" message.
+ * The T01 protocol has three messages.  First is the "SETN" message.
  * It has the following syntax:
  *
  *  SETN<space><8-char-hex-nodenum><newline>
@@ -50,8 +50,22 @@
  * The "SETN" message must be the first message following the protocol.
  * It tells ocfs2_control the local node number.
  *
- * Once the local node number has been set, the "DOWN" message can be
- * sent for node down notification.  It has the following syntax:
+ * Next comes the "SETV" message.  It has the following syntax:
+ *
+ *  SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
+ *
+ * This is 11 characters.
+ *
+ * The "SETV" message sets the filesystem locking protocol version as
+ * negotiated by the client.  The client negotiates based on the maximum
+ * version advertised in /sys/fs/ocfs2/max_locking_protocol.  The major
+ * number from the "SETV" message must match
+ * user_stack.sp_proto->lp_max_version.pv_major, and the minor number
+ * must be less than or equal to ...->lp_max_version.pv_minor.
+ *
+ * Once this information has been set, mounts will be allowed.  From this
+ * point on, the "DOWN" message can be sent for node down notification.
+ * It has the following syntax:
  *
  *  DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
  *
@@ -79,9 +93,12 @@
 #define OCFS2_CONTROL_MESSAGE_OP_LEN		4
 #define OCFS2_CONTROL_MESSAGE_SETNODE_OP	"SETN"
 #define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN	14
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP	"SETV"
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN	11
 #define OCFS2_CONTROL_MESSAGE_DOWN_OP		"DOWN"
 #define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN	47
 #define OCFS2_TEXT_UUID_LEN			32
+#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN	2
 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN	8
 
 /*
@@ -97,6 +114,7 @@ struct ocfs2_control_private {
 	struct list_head op_list;
 	int op_state;
 	int op_this_node;
+	struct ocfs2_protocol_version op_proto;
 };
 
 /* SETN<space><8-char-hex-nodenum><newline> */
@@ -107,6 +125,16 @@ struct ocfs2_control_message_setn {
 	char	newline;
 };
 
+/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */
+struct ocfs2_control_message_setv {
+	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+	char	space1;
+	char	major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+	char	space2;
+	char	minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+	char	newline;
+};
+
 /* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
 struct ocfs2_control_message_down {
 	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
@@ -120,11 +148,13 @@ struct ocfs2_control_message_down {
 union ocfs2_control_message {
 	char					tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
 	struct ocfs2_control_message_setn	u_setn;
+	struct ocfs2_control_message_setv	u_setv;
 	struct ocfs2_control_message_down	u_down;
 };
 
 static atomic_t ocfs2_control_opened;
 static int ocfs2_control_this_node = -1;
+static struct ocfs2_protocol_version running_proto;
 
 static LIST_HEAD(ocfs2_live_connection_list);
 static LIST_HEAD(ocfs2_control_private_list);
@@ -264,8 +294,9 @@ static void ocfs2_control_send_down(const char *uuid,
 /*
  * Called whenever configuration elements are sent to /dev/ocfs2_control.
  * If all configuration elements are present, try to set the global
- * values.  If not, return -EAGAIN.  If there is a problem, return a
- * different error.
+ * values.  If there is a problem, return an error.  Skip any missing
+ * elements, and only bump ocfs2_control_opened when we have all elements
+ * and are successful.
  */
 static int ocfs2_control_install_private(struct file *file)
 {
@@ -275,15 +306,32 @@ static int ocfs2_control_install_private(struct file *file)
 
 	BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
 
-	if (p->op_this_node < 0)
+	mutex_lock(&ocfs2_control_lock);
+
+	if (p->op_this_node < 0) {
 		set_p = 0;
+	} else if ((ocfs2_control_this_node >= 0) &&
+		   (ocfs2_control_this_node != p->op_this_node)) {
+		rc = -EINVAL;
+		goto out_unlock;
+	}
 
-	mutex_lock(&ocfs2_control_lock);
-	if (ocfs2_control_this_node < 0) {
-		if (set_p)
-			ocfs2_control_this_node = p->op_this_node;
-	} else if (ocfs2_control_this_node != p->op_this_node)
+	if (!p->op_proto.pv_major) {
+		set_p = 0;
+	} else if (!list_empty(&ocfs2_live_connection_list) &&
+		   ((running_proto.pv_major != p->op_proto.pv_major) ||
+		    (running_proto.pv_minor != p->op_proto.pv_minor))) {
 		rc = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (set_p) {
+		ocfs2_control_this_node = p->op_this_node;
+		running_proto.pv_major = p->op_proto.pv_major;
+		running_proto.pv_minor = p->op_proto.pv_minor;
+	}
+
+out_unlock:
 	mutex_unlock(&ocfs2_control_lock);
 
 	if (!rc && set_p) {
@@ -327,6 +375,56 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
 	return ocfs2_control_install_private(file);
 }
 
+static int ocfs2_control_do_setversion_msg(struct file *file,
+					   struct ocfs2_control_message_setv *msg)
+ {
+	long major, minor;
+	char *ptr = NULL;
+	struct ocfs2_control_private *p = file->private_data;
+	struct ocfs2_protocol_version *max =
+		&user_stack.sp_proto->lp_max_version;
+
+	if (ocfs2_control_get_handshake_state(file) !=
+	    OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+		return -EINVAL;
+
+	if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+		    OCFS2_CONTROL_MESSAGE_OP_LEN))
+		return -EINVAL;
+
+	if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
+	    (msg->newline != '\n'))
+		return -EINVAL;
+	msg->space1 = msg->space2 = msg->newline = '\0';
+
+	major = simple_strtol(msg->major, &ptr, 16);
+	if (!ptr || *ptr)
+		return -EINVAL;
+	minor = simple_strtol(msg->minor, &ptr, 16);
+	if (!ptr || *ptr)
+		return -EINVAL;
+
+	/*
+	 * The major must be between 1 and 255, inclusive.  The minor
+	 * must be between 0 and 255, inclusive.  The version passed in
+	 * must be within the maximum version supported by the filesystem.
+	 */
+	if ((major == LONG_MIN) || (major == LONG_MAX) ||
+	    (major > (u8)-1) || (major < 1))
+		return -ERANGE;
+	if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
+	    (minor > (u8)-1) || (minor < 0))
+		return -ERANGE;
+	if ((major != max->pv_major) ||
+	    (minor > max->pv_minor))
+		return -EINVAL;
+
+	p->op_proto.pv_major = major;
+	p->op_proto.pv_minor = minor;
+
+	return ocfs2_control_install_private(file);
+}
+
 static int ocfs2_control_do_down_msg(struct file *file,
 				     struct ocfs2_control_message_down *msg)
 {
@@ -379,6 +477,10 @@ static ssize_t ocfs2_control_message(struct file *file,
 	    !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
 		     OCFS2_CONTROL_MESSAGE_OP_LEN))
 		ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
+	else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) &&
+		 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+			  OCFS2_CONTROL_MESSAGE_OP_LEN))
+		ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv);
 	else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
 		 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
 			  OCFS2_CONTROL_MESSAGE_OP_LEN))
@@ -471,8 +573,13 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
 			       "an emergency restart!\n");
 			emergency_restart();
 		}
-		/* Last valid close clears the node number */
+		/*
+		 * Last valid close clears the node number and resets
+		 * the locking protocol version
+		 */
 		ocfs2_control_this_node = -1;
+		running_proto.pv_major = 0;
+		running_proto.pv_major = 0;
 	}
 
 out:
-- 
cgit v1.2.3


From cf4d8d75d8aba537a19b313a9364fd08ddbd5622 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 20 Feb 2008 14:29:27 -0800
Subject: ocfs2: add fsdlm to stackglue

Add code to use fs/dlm.

[ Modified to be part of the stack_user module -- Joel ]

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stack_user.c | 216 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/stackglue.c  |  14 +++-
 fs/ocfs2/stackglue.h  |  19 ++++-
 3 files changed, 243 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index de982c11e69b..7428663f9cbb 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -24,6 +24,7 @@
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
 
+#include "ocfs2.h"  /* For struct ocfs2_lock_res */
 #include "stackglue.h"
 
 
@@ -152,6 +153,8 @@ union ocfs2_control_message {
 	struct ocfs2_control_message_down	u_down;
 };
 
+static struct ocfs2_stack_plugin user_stack;
+
 static atomic_t ocfs2_control_opened;
 static int ocfs2_control_this_node = -1;
 static struct ocfs2_protocol_version running_proto;
@@ -344,6 +347,20 @@ out_unlock:
 	return rc;
 }
 
+static int ocfs2_control_get_this_node(void)
+{
+	int rc;
+
+	mutex_lock(&ocfs2_control_lock);
+	if (ocfs2_control_this_node < 0)
+		rc = -EINVAL;
+	else
+		rc = ocfs2_control_this_node;
+	mutex_unlock(&ocfs2_control_lock);
+
+	return rc;
+}
+
 static int ocfs2_control_do_setnode_msg(struct file *file,
 					struct ocfs2_control_message_setn *msg)
 {
@@ -652,13 +669,210 @@ static void ocfs2_control_exit(void)
 		       -rc);
 }
 
+static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
+{
+	struct ocfs2_lock_res *res = astarg;
+	return &res->l_lksb.lksb_fsdlm;
+}
+
+static void fsdlm_lock_ast_wrapper(void *astarg)
+{
+	struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
+	int status = lksb->sb_status;
+
+	BUG_ON(user_stack.sp_proto == NULL);
+
+	/*
+	 * For now we're punting on the issue of other non-standard errors
+	 * where we can't tell if the unlock_ast or lock_ast should be called.
+	 * The main "other error" that's possible is EINVAL which means the
+	 * function was called with invalid args, which shouldn't be possible
+	 * since the caller here is under our control.  Other non-standard
+	 * errors probably fall into the same category, or otherwise are fatal
+	 * which means we can't carry on anyway.
+	 */
+
+	if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
+		user_stack.sp_proto->lp_unlock_ast(astarg, 0);
+	else
+		user_stack.sp_proto->lp_lock_ast(astarg);
+}
+
+static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
+{
+	BUG_ON(user_stack.sp_proto == NULL);
+
+	user_stack.sp_proto->lp_blocking_ast(astarg, level);
+}
+
+static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
+			 int mode,
+			 union ocfs2_dlm_lksb *lksb,
+			 u32 flags,
+			 void *name,
+			 unsigned int namelen,
+			 void *astarg)
+{
+	int ret;
+
+	if (!lksb->lksb_fsdlm.sb_lvbptr)
+		lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
+					     sizeof(struct dlm_lksb);
+
+	ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+		       flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+		       fsdlm_lock_ast_wrapper, astarg,
+		       fsdlm_blocking_ast_wrapper);
+	return ret;
+}
+
+static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
+			   union ocfs2_dlm_lksb *lksb,
+			   u32 flags,
+			   void *astarg)
+{
+	int ret;
+
+	ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+			 flags, &lksb->lksb_fsdlm, astarg);
+	return ret;
+}
+
+static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+	return lksb->lksb_fsdlm.sb_status;
+}
+
+static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+	return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
+}
+
+static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+}
+
+/*
+ * Compare a requested locking protocol version against the current one.
+ *
+ * If the major numbers are different, they are incompatible.
+ * If the current minor is greater than the request, they are incompatible.
+ * If the current minor is less than or equal to the request, they are
+ * compatible, and the requester should run at the current minor version.
+ */
+static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
+			       struct ocfs2_protocol_version *request)
+{
+	if (existing->pv_major != request->pv_major)
+		return 1;
+
+	if (existing->pv_minor > request->pv_minor)
+		return 1;
+
+	if (existing->pv_minor < request->pv_minor)
+		request->pv_minor = existing->pv_minor;
+
+	return 0;
+}
+
+static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+	dlm_lockspace_t *fsdlm;
+	struct ocfs2_live_connection *control;
+	int rc = 0;
+
+	BUG_ON(conn == NULL);
+
+	rc = ocfs2_live_connection_new(conn, &control);
+	if (rc)
+		goto out;
+
+	/*
+	 * running_proto must have been set before we allowed any mounts
+	 * to proceed.
+	 */
+	if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
+		printk(KERN_ERR
+		       "Unable to mount with fs locking protocol version "
+		       "%u.%u because the userspace control daemon has "
+		       "negotiated %u.%u\n",
+		       conn->cc_version.pv_major, conn->cc_version.pv_minor,
+		       running_proto.pv_major, running_proto.pv_minor);
+		rc = -EPROTO;
+		ocfs2_live_connection_drop(control);
+		goto out;
+	}
+
+	rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
+			       &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
+	if (rc) {
+		ocfs2_live_connection_drop(control);
+		goto out;
+	}
+
+	conn->cc_private = control;
+	conn->cc_lockspace = fsdlm;
+out:
+	return rc;
+}
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+				   int hangup_pending)
+{
+	dlm_release_lockspace(conn->cc_lockspace, 2);
+	conn->cc_lockspace = NULL;
+	ocfs2_live_connection_drop(conn->cc_private);
+	conn->cc_private = NULL;
+	return 0;
+}
+
+static int user_cluster_this_node(unsigned int *this_node)
+{
+	int rc;
+
+	rc = ocfs2_control_get_this_node();
+	if (rc < 0)
+		return rc;
+
+	*this_node = rc;
+	return 0;
+}
+
+static struct ocfs2_stack_operations user_stack_ops = {
+	.connect	= user_cluster_connect,
+	.disconnect	= user_cluster_disconnect,
+	.this_node	= user_cluster_this_node,
+	.dlm_lock	= user_dlm_lock,
+	.dlm_unlock	= user_dlm_unlock,
+	.lock_status	= user_dlm_lock_status,
+	.lock_lvb	= user_dlm_lvb,
+	.dump_lksb	= user_dlm_dump_lksb,
+};
+
+static struct ocfs2_stack_plugin user_stack = {
+	.sp_name	= "user",
+	.sp_ops		= &user_stack_ops,
+	.sp_owner	= THIS_MODULE,
+};
+
+
 static int __init user_stack_init(void)
 {
-	return ocfs2_control_init();
+	int rc;
+
+	rc = ocfs2_control_init();
+	if (!rc) {
+		rc = ocfs2_stack_glue_register(&user_stack);
+		if (rc)
+			ocfs2_control_exit();
+	}
+
+	return rc;
 }
 
 static void __exit user_stack_exit(void)
 {
+	ocfs2_stack_glue_unregister(&user_stack);
 	ocfs2_control_exit();
 }
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index bf45d9bff8a7..119f60cea9cc 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -228,13 +228,20 @@ void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
 EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
 
 
+/*
+ * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
+ * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
+ * underlying stack plugins need to pilfer the lksb off of the lock_res.
+ * If some other structure needs to be passed as an astarg, the plugins
+ * will need to be given a different avenue to the lksb.
+ */
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
 		   union ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
 		   unsigned int namelen,
-		   void *astarg)
+		   struct ocfs2_lock_res *astarg)
 {
 	BUG_ON(lproto == NULL);
 
@@ -246,7 +253,7 @@ EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 		     union ocfs2_dlm_lksb *lksb,
 		     u32 flags,
-		     void *astarg)
+		     struct ocfs2_lock_res *astarg)
 {
 	BUG_ON(lproto == NULL);
 
@@ -360,7 +367,8 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
 	BUG_ON(group == NULL);
 	BUG_ON(group[grouplen] != '\0');
 
-	active_stack->sp_ops->hangup(group, grouplen);
+	if (active_stack->sp_ops->hangup)
+		active_stack->sp_ops->hangup(group, grouplen);
 
 	/* cluster_disconnect() was called with hangup_pending==1 */
 	ocfs2_stack_driver_put();
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index d88bc655644b..005e4f170e0f 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -26,6 +26,7 @@
 #include <linux/dlmconstants.h>
 
 #include "dlm/dlmapi.h"
+#include <linux/dlm.h>
 
 /*
  * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
@@ -60,6 +61,17 @@ struct ocfs2_locking_protocol {
 	void (*lp_unlock_ast)(void *astarg, int error);
 };
 
+
+/*
+ * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
+ * has a pointer to separately allocated lvb space.  This struct exists only to
+ * include in the lksb union to make space for a combined dlm_lksb and lvb.
+ */
+struct fsdlm_lksb_plus_lvb {
+	struct dlm_lksb lksb;
+	char lvb[DLM_LVB_LEN];
+};
+
 /*
  * A union of all lock status structures.  We define it here so that the
  * size of the union is known.  Lock status structures are embedded in
@@ -67,6 +79,8 @@ struct ocfs2_locking_protocol {
  */
 union ocfs2_dlm_lksb {
 	struct dlm_lockstatus lksb_o2dlm;
+	struct dlm_lksb lksb_fsdlm;
+	struct fsdlm_lksb_plus_lvb padding;
 };
 
 /*
@@ -221,17 +235,18 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
 void ocfs2_cluster_hangup(const char *group, int grouplen);
 int ocfs2_cluster_this_node(unsigned int *node);
 
+struct ocfs2_lock_res;
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
 		   union ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
 		   unsigned int namelen,
-		   void *astarg);
+		   struct ocfs2_lock_res *astarg);
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 		     union ocfs2_dlm_lksb *lksb,
 		     u32 flags,
-		     void *astarg);
+		     struct ocfs2_lock_res *astarg);
 
 int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
-- 
cgit v1.2.3


From 8f318311faf57481452895448e6ffaec7c38a146 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 4 Mar 2008 16:09:39 -0800
Subject: ocfs2: Change mlog_bug_on to BUG_ON in ocfs2_lockid.h

The masklog code is in the o2cb stack, but ocfs2_lockid.h now needs to
be included by the user stack.  The BUG() in ocfs2_lock_type_string()
does not need masklog support, so change it to a regular BUG_ON().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2_lockid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 86f3e3799c2b..82c200f7a8f1 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = {
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
 {
 #ifdef __KERNEL__
-	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
 #endif
 	return ocfs2_lock_type_strings[type];
 }
-- 
cgit v1.2.3


From b92eccdd28e1e3870a5b2aa625282c9ae8e35cec Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 28 Nov 2007 14:53:30 -0800
Subject: ocfs2: Add kbuild for ocfs2_stack_user.ko

Add ocfs2_stack_user.ko to the Makefile so that it builds.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index b734254329b2..b8d6d0225737 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -2,7 +2,11 @@ EXTRA_CFLAGS += -Ifs/ocfs2
 
 EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
 
-obj-$(CONFIG_OCFS2_FS) += ocfs2.o ocfs2_stackglue.o ocfs2_stack_o2cb.o
+obj-$(CONFIG_OCFS2_FS) += 	\
+	ocfs2.o			\
+	ocfs2_stackglue.o	\
+	ocfs2_stack_o2cb.o	\
+	ocfs2_stack_user.o
 
 ocfs2-objs := \
 	alloc.o 		\
@@ -33,6 +37,7 @@ ocfs2-objs := \
 
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
+ocfs2_stack_user-objs := stack_user.o
 
 obj-$(CONFIG_OCFS2_FS) += cluster/
 obj-$(CONFIG_OCFS2_FS) += dlm/
-- 
cgit v1.2.3


From 9341d22942d63d6a1e4cc90f246980dbb7e1ca94 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 4 Mar 2008 17:58:56 -0800
Subject: ocfs2: Allow selection of cluster plug-ins.

ocfs2 now supports plug-ins for the classic O2CB stack as well as
userspace cluster stacks in conjunction with fs/dlm.  This allows zero,
one, or both of the plug-ins to be selected in Kconfig.  For local mounts
(non-clustered), neither plug-in is needed.  Both plugins can be loaded
at one time, the runtime will select the one needed for the cluster
systme in use.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/Kconfig        | 26 ++++++++++++++++++++++++++
 fs/ocfs2/Makefile | 10 ++++++----
 2 files changed, 32 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index c509123bea49..028ae38ecc52 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -444,6 +444,32 @@ config OCFS2_FS
 	  For more information on OCFS2, see the file
 	  <file:Documentation/filesystems/ocfs2.txt>.
 
+config OCFS2_FS_O2CB
+	tristate "O2CB Kernelspace Clustering"
+	depends on OCFS2_FS
+	default y
+	help
+	  OCFS2 includes a simple kernelspace clustering package, the OCFS2
+	  Cluster Base.  It only requires a very small userspace component
+	  to configure it. This comes with the standard ocfs2-tools package.
+	  O2CB is limited to maintaining a cluster for OCFS2 file systems.
+	  It cannot manage any other cluster applications.
+
+	  It is always safe to say Y here, as the clustering method is
+	  run-time selectable.
+
+config OCFS2_FS_USERSPACE_CLUSTER
+	tristate "OCFS2 Userspace Clustering"
+	depends on OCFS2_FS && DLM
+	default y
+	help
+	  This option will allow OCFS2 to use userspace clustering services
+	  in conjunction with the DLM in fs/dlm.  If you are using a
+	  userspace cluster manager, say Y here.
+
+	  It is safe to say Y, as the clustering method is run-time
+	  selectable.
+
 config OCFS2_DEBUG_MASKLOG
 	bool "OCFS2 logging support"
 	depends on OCFS2_FS
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index b8d6d0225737..f6956de56fdb 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -4,9 +4,10 @@ EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
 
 obj-$(CONFIG_OCFS2_FS) += 	\
 	ocfs2.o			\
-	ocfs2_stackglue.o	\
-	ocfs2_stack_o2cb.o	\
-	ocfs2_stack_user.o
+	ocfs2_stackglue.o
+
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
+obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 
 ocfs2-objs := \
 	alloc.o 		\
@@ -39,5 +40,6 @@ ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
 
+# cluster/ is always needed when OCFS2_FS for masklog support
 obj-$(CONFIG_OCFS2_FS) += cluster/
-obj-$(CONFIG_OCFS2_FS) += dlm/
+obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
-- 
cgit v1.2.3


From 12eb0035d6f0466038ef2c6e5f6f9296b9b74d91 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:19 -0700
Subject: ocfs2/dlm: Rename slabcache dlm_mle_cache to o2dlm_mle

This patch renames dlm_mle_slabcache to prevent namespace clashes with fs/dlm.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmdomain.c | 4 +++-
 fs/ocfs2/dlm/dlmmaster.c | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0879d86113e3..2ce620742f9e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1816,8 +1816,10 @@ static int __init dlm_init(void)
 	dlm_print_version();
 
 	status = dlm_init_mle_cache();
-	if (status)
+	if (status) {
+		mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
 		return -1;
+	}
 
 	status = dlm_register_net_handlers();
 	if (status) {
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ea6b89577860..90797c591018 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -507,7 +507,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
 
 int dlm_init_mle_cache(void)
 {
-	dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
+	dlm_mle_cache = kmem_cache_create("o2dlm_mle",
 					  sizeof(struct dlm_master_list_entry),
 					  0, SLAB_HWCACHE_ALIGN,
 					  NULL);
-- 
cgit v1.2.3


From 724bdca9b8449d9ee5f779dc27ee3d906a04508c Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:20 -0700
Subject: ocfs2/dlm: Create slabcaches for lock and lockres

This patch makes the o2dlm allocate memory for lockres, lockname and lock
structures from slabcaches rather than kmalloc. This allows us to not only
make these allocs more efficient but also allows us to track the memory being
consumed by these structures.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmcommon.h |  7 ++++++
 fs/ocfs2/dlm/dlmdomain.c | 26 ++++++++++++++++++---
 fs/ocfs2/dlm/dlmlock.c   | 22 +++++++++++++++--
 fs/ocfs2/dlm/dlmmaster.c | 61 ++++++++++++++++++++++++++++++++++++++----------
 4 files changed, 99 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index dc8ea666efdb..7525a8ae3943 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -963,9 +963,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 					  DLM_LOCK_RES_MIGRATING));
 }
 
+/* create/destroy slab caches */
+int dlm_init_master_caches(void);
+void dlm_destroy_master_caches(void);
+
+int dlm_init_lock_cache(void);
+void dlm_destroy_lock_cache(void);
 
 int dlm_init_mle_cache(void);
 void dlm_destroy_mle_cache(void);
+
 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
 int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
 			 struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2ce620742f9e..b092364d0d52 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1818,21 +1818,41 @@ static int __init dlm_init(void)
 	status = dlm_init_mle_cache();
 	if (status) {
 		mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
-		return -1;
+		goto error;
+	}
+
+	status = dlm_init_master_caches();
+	if (status) {
+		mlog(ML_ERROR, "Could not create o2dlm_lockres and "
+		     "o2dlm_lockname slabcaches\n");
+		goto error;
+	}
+
+	status = dlm_init_lock_cache();
+	if (status) {
+		mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
+		goto error;
 	}
 
 	status = dlm_register_net_handlers();
 	if (status) {
-		dlm_destroy_mle_cache();
-		return -1;
+		mlog(ML_ERROR, "Unable to register network handlers\n");
+		goto error;
 	}
 
 	return 0;
+error:
+	dlm_destroy_lock_cache();
+	dlm_destroy_master_caches();
+	dlm_destroy_mle_cache();
+	return -1;
 }
 
 static void __exit dlm_exit (void)
 {
 	dlm_unregister_net_handlers();
+	dlm_destroy_lock_cache();
+	dlm_destroy_master_caches();
 	dlm_destroy_mle_cache();
 }
 
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 52578d907d9a..83a9f2972ac8 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,6 +53,8 @@
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
 
+static struct kmem_cache *dlm_lock_cache = NULL;
+
 static DEFINE_SPINLOCK(dlm_cookie_lock);
 static u64 dlm_next_cookie = 1;
 
@@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
 static void dlm_lock_release(struct kref *kref);
 static void dlm_lock_detach_lockres(struct dlm_lock *lock);
 
+int dlm_init_lock_cache(void)
+{
+	dlm_lock_cache = kmem_cache_create("o2dlm_lock",
+					   sizeof(struct dlm_lock),
+					   0, SLAB_HWCACHE_ALIGN, NULL);
+	if (dlm_lock_cache == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void dlm_destroy_lock_cache(void)
+{
+	if (dlm_lock_cache)
+		kmem_cache_destroy(dlm_lock_cache);
+}
+
 /* Tell us whether we can grant a new lock request.
  * locking:
  *   caller needs:  res->spinlock
@@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref)
 		mlog(0, "freeing kernel-allocated lksb\n");
 		kfree(lock->lksb);
 	}
-	kfree(lock);
+	kmem_cache_free(dlm_lock_cache, lock);
 }
 
 /* associate a lock with it's lockres, getting a ref on the lockres */
@@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
 	struct dlm_lock *lock;
 	int kernel_allocated = 0;
 
-	lock = kzalloc(sizeof(*lock), GFP_NOFS);
+	lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
 	if (!lock)
 		return NULL;
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 90797c591018..ac9ed31e5445 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -216,10 +216,10 @@ EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
 
 #endif  /*  0  */
 
-
+static struct kmem_cache *dlm_lockres_cache = NULL;
+static struct kmem_cache *dlm_lockname_cache = NULL;
 static struct kmem_cache *dlm_mle_cache = NULL;
 
-
 static void dlm_mle_release(struct kref *kref);
 static void dlm_init_mle(struct dlm_master_list_entry *mle,
 			enum dlm_mle_type type,
@@ -560,6 +560,35 @@ static void dlm_mle_release(struct kref *kref)
  * LOCK RESOURCE FUNCTIONS
  */
 
+int dlm_init_master_caches(void)
+{
+	dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
+					      sizeof(struct dlm_lock_resource),
+					      0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!dlm_lockres_cache)
+		goto bail;
+
+	dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
+					       DLM_LOCKID_NAME_MAX, 0,
+					       SLAB_HWCACHE_ALIGN, NULL);
+	if (!dlm_lockname_cache)
+		goto bail;
+
+	return 0;
+bail:
+	dlm_destroy_master_caches();
+	return -ENOMEM;
+}
+
+void dlm_destroy_master_caches(void)
+{
+	if (dlm_lockname_cache)
+		kmem_cache_destroy(dlm_lockname_cache);
+
+	if (dlm_lockres_cache)
+		kmem_cache_destroy(dlm_lockres_cache);
+}
+
 static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
 				  struct dlm_lock_resource *res,
 				  u8 owner)
@@ -642,9 +671,9 @@ static void dlm_lockres_release(struct kref *kref)
 	BUG_ON(!list_empty(&res->recovering));
 	BUG_ON(!list_empty(&res->purge));
 
-	kfree(res->lockname.name);
+	kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
 
-	kfree(res);
+	kmem_cache_free(dlm_lockres_cache, res);
 }
 
 void dlm_lockres_put(struct dlm_lock_resource *res)
@@ -700,20 +729,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 				   const char *name,
 				   unsigned int namelen)
 {
-	struct dlm_lock_resource *res;
+	struct dlm_lock_resource *res = NULL;
 
-	res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
+	res = (struct dlm_lock_resource *)
+				kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
 	if (!res)
-		return NULL;
+		goto error;
 
-	res->lockname.name = kmalloc(namelen, GFP_NOFS);
-	if (!res->lockname.name) {
-		kfree(res);
-		return NULL;
-	}
+	res->lockname.name = (char *)
+				kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+	if (!res->lockname.name)
+		goto error;
 
 	dlm_init_lockres(dlm, res, name, namelen);
 	return res;
+
+error:
+	if (res && res->lockname.name)
+		kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
+
+	if (res)
+		kmem_cache_free(dlm_lockres_cache, res);
+	return NULL;
 }
 
 void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
-- 
cgit v1.2.3


From 29576f8bb54045be944ba809d4fca1ad77c94165 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:21 -0700
Subject: ocfs2/dlm: Link all lockres' to a tracking list

This patch links all the lockres' to a tracking list in dlm_ctxt.
We will use this in an upcoming patch that will walk the entire
list and to dump the lockres states to a debugfs file.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmcommon.h |  4 ++++
 fs/ocfs2/dlm/dlmdomain.c | 11 +++++++++++
 fs/ocfs2/dlm/dlmmaster.c | 11 +++++++++++
 3 files changed, 26 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 7525a8ae3943..cc31abeadb8e 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -101,6 +101,7 @@ struct dlm_ctxt
 	struct list_head purge_list;
 	struct list_head pending_asts;
 	struct list_head pending_basts;
+	struct list_head tracking_list;
 	unsigned int purge_count;
 	spinlock_t spinlock;
 	spinlock_t ast_lock;
@@ -270,6 +271,9 @@ struct dlm_lock_resource
 	struct list_head dirty;
 	struct list_head recovering; // dlm_recovery_ctxt.resources list
 
+	/* Added during init and removed during release */
+	struct list_head tracking;	/* dlm->tracking_list */
+
 	/* unused lock resources have their last_used stamped and are
 	 * put on a list for the dlm thread to run. */
 	unsigned long    last_used;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index b092364d0d52..4f7695c851ed 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -644,6 +644,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm)
 void dlm_unregister_domain(struct dlm_ctxt *dlm)
 {
 	int leave = 0;
+	struct dlm_lock_resource *res;
 
 	spin_lock(&dlm_domain_lock);
 	BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
@@ -673,6 +674,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
 			msleep(500);
 			mlog(0, "%s: more migration to do\n", dlm->name);
 		}
+
+		/* This list should be empty. If not, print remaining lockres */
+		if (!list_empty(&dlm->tracking_list)) {
+			mlog(ML_ERROR, "Following lockres' are still on the "
+			     "tracking list:\n");
+			list_for_each_entry(res, &dlm->tracking_list, tracking)
+				dlm_print_one_lock_resource(res);
+		}
+
 		dlm_mark_domain_leaving(dlm);
 		dlm_leave_domain(dlm);
 		dlm_complete_dlm_shutdown(dlm);
@@ -1526,6 +1536,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	INIT_LIST_HEAD(&dlm->reco.node_data);
 	INIT_LIST_HEAD(&dlm->purge_list);
 	INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+	INIT_LIST_HEAD(&dlm->tracking_list);
 	dlm->reco.state = 0;
 
 	INIT_LIST_HEAD(&dlm->pending_asts);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ac9ed31e5445..97133465891c 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -639,6 +639,14 @@ static void dlm_lockres_release(struct kref *kref)
 	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
 	     res->lockname.name);
 
+	if (!list_empty(&res->tracking))
+		list_del_init(&res->tracking);
+	else {
+		mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+		     res->lockname.len, res->lockname.name);
+		dlm_print_one_lock_resource(res);
+	}
+
 	if (!hlist_unhashed(&res->hash_node) ||
 	    !list_empty(&res->granted) ||
 	    !list_empty(&res->converting) ||
@@ -706,6 +714,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 	INIT_LIST_HEAD(&res->dirty);
 	INIT_LIST_HEAD(&res->recovering);
 	INIT_LIST_HEAD(&res->purge);
+	INIT_LIST_HEAD(&res->tracking);
 	atomic_set(&res->asts_reserved, 0);
 	res->migration_pending = 0;
 	res->inflight_locks = 0;
@@ -721,6 +730,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 
 	res->last_used = 0;
 
+	list_add_tail(&res->tracking, &dlm->tracking_list);
+
 	memset(res->lvb, 0, DLM_LVB_LEN);
 	memset(res->refmap, 0, sizeof(res->refmap));
 }
-- 
cgit v1.2.3


From 6325b4a22b8f5e40ea9353288b3d6a32181f9718 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:22 -0700
Subject: ocfs2/dlm: Create debugfs dirs

This patch creates the debugfs directories that will hold the
files to be used to dump the dlm state.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmcommon.h |  2 ++
 fs/ocfs2/dlm/dlmdebug.c  | 50 +++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/dlm/dlmdebug.h  | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmdomain.c | 21 ++++++++++++++++++-
 4 files changed, 125 insertions(+), 2 deletions(-)
 create mode 100644 fs/ocfs2/dlm/dlmdebug.h

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index cc31abeadb8e..6a491403bbfb 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -123,6 +123,8 @@ struct dlm_ctxt
 	atomic_t remote_resources;
 	atomic_t unknown_resources;
 
+	struct dentry *dlm_debugfs_subroot;
+
 	/* NOTE: Next three are protected by dlm_domain_lock */
 	struct kref dlm_refs;
 	enum dlm_ctxt_state dlm_state;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 64239b37e5d4..62e2a4cbf60d 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -30,6 +30,7 @@
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
 #include <linux/spinlock.h>
+#include <linux/debugfs.h>
 
 #include "cluster/heartbeat.h"
 #include "cluster/nodemanager.h"
@@ -37,8 +38,8 @@
 
 #include "dlmapi.h"
 #include "dlmcommon.h"
-
 #include "dlmdomain.h"
+#include "dlmdebug.h"
 
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
@@ -266,3 +267,50 @@ const char *dlm_errname(enum dlm_status err)
 	return dlm_errnames[err];
 }
 EXPORT_SYMBOL_GPL(dlm_errname);
+
+
+#ifdef CONFIG_DEBUG_FS
+
+static struct dentry *dlm_debugfs_root = NULL;
+
+#define DLM_DEBUGFS_DIR				"o2dlm"
+
+/* subroot - domain dir */
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+	dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
+						      dlm_debugfs_root);
+	if (!dlm->dlm_debugfs_subroot) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	return 0;
+bail:
+	dlm_destroy_debugfs_subroot(dlm);
+	return -ENOMEM;
+}
+
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_debugfs_subroot)
+		debugfs_remove(dlm->dlm_debugfs_subroot);
+}
+
+/* debugfs root */
+int dlm_create_debugfs_root(void)
+{
+	dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
+	if (!dlm_debugfs_root) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void dlm_destroy_debugfs_root(void)
+{
+	if (dlm_debugfs_root)
+		debugfs_remove(dlm_debugfs_root);
+}
+#endif	/* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 000000000000..b96959512d96
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,54 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.h
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMDEBUG_H
+#define DLMDEBUG_H
+
+#ifdef CONFIG_DEBUG_FS
+
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
+
+int dlm_create_debugfs_root(void);
+void dlm_destroy_debugfs_root(void);
+
+#else
+
+static int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+	return 0;
+}
+static void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+}
+static int dlm_create_debugfs_root(void)
+{
+	return 0;
+}
+static void dlm_destroy_debugfs_root(void)
+{
+}
+
+#endif	/* CONFIG_DEBUG_FS */
+#endif	/* DLMDEBUG_H */
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 4f7695c851ed..c137d693cef3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -33,6 +33,7 @@
 #include <linux/spinlock.h>
 #include <linux/delay.h>
 #include <linux/err.h>
+#include <linux/debugfs.h>
 
 #include "cluster/heartbeat.h"
 #include "cluster/nodemanager.h"
@@ -40,8 +41,8 @@
 
 #include "dlmapi.h"
 #include "dlmcommon.h"
-
 #include "dlmdomain.h"
+#include "dlmdebug.h"
 
 #include "dlmver.h"
 
@@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
 
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
+	dlm_destroy_debugfs_subroot(dlm);
+
 	if (dlm->lockres_hash)
 		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
@@ -1494,6 +1497,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 				u32 key)
 {
 	int i;
+	int ret;
 	struct dlm_ctxt *dlm = NULL;
 
 	dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
@@ -1526,6 +1530,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();
 
+	ret = dlm_create_debugfs_subroot(dlm);
+	if (ret < 0) {
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->master_lock);
 	spin_lock_init(&dlm->ast_lock);
@@ -1851,8 +1864,13 @@ static int __init dlm_init(void)
 		goto error;
 	}
 
+	status = dlm_create_debugfs_root();
+	if (status)
+		goto error;
+
 	return 0;
 error:
+	dlm_unregister_net_handlers();
 	dlm_destroy_lock_cache();
 	dlm_destroy_master_caches();
 	dlm_destroy_mle_cache();
@@ -1861,6 +1879,7 @@ error:
 
 static void __exit dlm_exit (void)
 {
+	dlm_destroy_debugfs_root();
 	dlm_unregister_net_handlers();
 	dlm_destroy_lock_cache();
 	dlm_destroy_master_caches();
-- 
cgit v1.2.3


From 007dce53a29ccffc000ab5373d188f73881390fd Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:23 -0700
Subject: ocfs2/dlm: Dump the dlm state in a debugfs file

This patch dumps the dlm state (dlm_ctxt) into a debugfs file.
Useful for debugging.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmcommon.h |   1 +
 fs/ocfs2/dlm/dlmdebug.c  | 296 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmdebug.h  |  20 ++++
 fs/ocfs2/dlm/dlmdomain.c |   8 ++
 4 files changed, 325 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 6a491403bbfb..f7a51ca64a2e 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -123,6 +123,7 @@ struct dlm_ctxt
 	atomic_t remote_resources;
 	atomic_t unknown_resources;
 
+	struct dlm_debug_ctxt *dlm_debug_ctxt;
 	struct dentry *dlm_debugfs_subroot;
 
 	/* NOTE: Next three are protected by dlm_domain_lock */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 62e2a4cbf60d..e33540380013 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -274,6 +274,294 @@ EXPORT_SYMBOL_GPL(dlm_errname);
 static struct dentry *dlm_debugfs_root = NULL;
 
 #define DLM_DEBUGFS_DIR				"o2dlm"
+#define DLM_DEBUGFS_DLM_STATE			"dlm_state"
+
+/* begin - utils funcs */
+static void dlm_debug_free(struct kref *kref)
+{
+	struct dlm_debug_ctxt *dc;
+
+	dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
+
+	kfree(dc);
+}
+
+void dlm_debug_put(struct dlm_debug_ctxt *dc)
+{
+	if (dc)
+		kref_put(&dc->debug_refcnt, dlm_debug_free);
+}
+
+static void dlm_debug_get(struct dlm_debug_ctxt *dc)
+{
+	kref_get(&dc->debug_refcnt);
+}
+
+static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
+			     char *buf, int len)
+{
+	int out = 0;
+	int i = -1;
+
+	while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
+		out += snprintf(buf + out, len - out, "%d ", i);
+
+	return out;
+}
+
+static struct debug_buffer *debug_buffer_allocate(void)
+{
+	struct debug_buffer *db = NULL;
+
+	db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
+	if (!db)
+		goto bail;
+
+	db->len = PAGE_SIZE;
+	db->buf = kmalloc(db->len, GFP_KERNEL);
+	if (!db->buf)
+		goto bail;
+
+	return db;
+bail:
+	kfree(db);
+	return NULL;
+}
+
+static ssize_t debug_buffer_read(struct file *file, char __user *buf,
+				 size_t nbytes, loff_t *ppos)
+{
+	struct debug_buffer *db = file->private_data;
+
+	return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
+}
+
+static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
+{
+	struct debug_buffer *db = file->private_data;
+	loff_t new = -1;
+
+	switch (whence) {
+	case 0:
+		new = off;
+		break;
+	case 1:
+		new = file->f_pos + off;
+		break;
+	}
+
+	if (new < 0 || new > db->len)
+		return -EINVAL;
+
+	return (file->f_pos = new);
+}
+
+static int debug_buffer_release(struct inode *inode, struct file *file)
+{
+	struct debug_buffer *db = (struct debug_buffer *)file->private_data;
+
+	if (db)
+		kfree(db->buf);
+	kfree(db);
+
+	return 0;
+}
+/* end - util funcs */
+
+/* begin - debug state funcs */
+static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+	int out = 0;
+	struct dlm_reco_node_data *node;
+	char *state;
+	int lres, rres, ures, tres;
+
+	lres = atomic_read(&dlm->local_resources);
+	rres = atomic_read(&dlm->remote_resources);
+	ures = atomic_read(&dlm->unknown_resources);
+	tres = lres + rres + ures;
+
+	spin_lock(&dlm->spinlock);
+
+	switch (dlm->dlm_state) {
+	case DLM_CTXT_NEW:
+		state = "NEW"; break;
+	case DLM_CTXT_JOINED:
+		state = "JOINED"; break;
+	case DLM_CTXT_IN_SHUTDOWN:
+		state = "SHUTDOWN"; break;
+	case DLM_CTXT_LEAVING:
+		state = "LEAVING"; break;
+	default:
+		state = "UNKNOWN"; break;
+	}
+
+	/* Domain: xxxxxxxxxx  Key: 0xdfbac769 */
+	out += snprintf(db->buf + out, db->len - out,
+			"Domain: %s  Key: 0x%08x\n", dlm->name, dlm->key);
+
+	/* Thread Pid: xxx  Node: xxx  State: xxxxx */
+	out += snprintf(db->buf + out, db->len - out,
+			"Thread Pid: %d  Node: %d  State: %s\n",
+			dlm->dlm_thread_task->pid, dlm->node_num, state);
+
+	/* Number of Joins: xxx  Joining Node: xxx */
+	out += snprintf(db->buf + out, db->len - out,
+			"Number of Joins: %d  Joining Node: %d\n",
+			dlm->num_joins, dlm->joining_node);
+
+	/* Domain Map: xx xx xx */
+	out += snprintf(db->buf + out, db->len - out, "Domain Map: ");
+	out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
+				 db->buf + out, db->len - out);
+	out += snprintf(db->buf + out, db->len - out, "\n");
+
+	/* Live Map: xx xx xx */
+	out += snprintf(db->buf + out, db->len - out, "Live Map: ");
+	out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
+				 db->buf + out, db->len - out);
+	out += snprintf(db->buf + out, db->len - out, "\n");
+
+	/* Mastered Resources Total: xxx  Locally: xxx  Remotely: ... */
+	out += snprintf(db->buf + out, db->len - out,
+			"Mastered Resources Total: %d  Locally: %d  "
+			"Remotely: %d  Unknown: %d\n",
+			tres, lres, rres, ures);
+
+	/* Lists: Dirty=Empty  Purge=InUse  PendingASTs=Empty  ... */
+	out += snprintf(db->buf + out, db->len - out,
+			"Lists: Dirty=%s  Purge=%s  PendingASTs=%s  "
+			"PendingBASTs=%s  Master=%s\n",
+			(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
+			(list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
+			(list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
+			(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
+			(list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+
+	/* Purge Count: xxx  Refs: xxx */
+	out += snprintf(db->buf + out, db->len - out,
+			"Purge Count: %d  Refs: %d\n", dlm->purge_count,
+			atomic_read(&dlm->dlm_refs.refcount));
+
+	/* Dead Node: xxx */
+	out += snprintf(db->buf + out, db->len - out,
+			"Dead Node: %d\n", dlm->reco.dead_node);
+
+	/* What about DLM_RECO_STATE_FINALIZE? */
+	if (dlm->reco.state == DLM_RECO_STATE_ACTIVE)
+		state = "ACTIVE";
+	else
+		state = "INACTIVE";
+
+	/* Recovery Pid: xxxx  Master: xxx  State: xxxx */
+	out += snprintf(db->buf + out, db->len - out,
+			"Recovery Pid: %d  Master: %d  State: %s\n",
+			dlm->dlm_reco_thread_task->pid,
+			dlm->reco.new_master, state);
+
+	/* Recovery Map: xx xx */
+	out += snprintf(db->buf + out, db->len - out, "Recovery Map: ");
+	out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
+				 db->buf + out, db->len - out);
+	out += snprintf(db->buf + out, db->len - out, "\n");
+
+	/* Recovery Node State: */
+	out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
+	list_for_each_entry(node, &dlm->reco.node_data, list) {
+		switch (node->state) {
+		case DLM_RECO_NODE_DATA_INIT:
+			state = "INIT";
+			break;
+		case DLM_RECO_NODE_DATA_REQUESTING:
+			state = "REQUESTING";
+			break;
+		case DLM_RECO_NODE_DATA_DEAD:
+			state = "DEAD";
+			break;
+		case DLM_RECO_NODE_DATA_RECEIVING:
+			state = "RECEIVING";
+			break;
+		case DLM_RECO_NODE_DATA_REQUESTED:
+			state = "REQUESTED";
+			break;
+		case DLM_RECO_NODE_DATA_DONE:
+			state = "DONE";
+			break;
+		case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+			state = "FINALIZE-SENT";
+			break;
+		default:
+			state = "BAD";
+			break;
+		}
+		out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n",
+				node->node_num, state);
+	}
+
+	spin_unlock(&dlm->spinlock);
+
+	return out;
+}
+
+static int debug_state_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	struct debug_buffer *db = NULL;
+
+	db = debug_buffer_allocate();
+	if (!db)
+		goto bail;
+
+	db->len = debug_state_print(dlm, db);
+
+	file->private_data = db;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static struct file_operations debug_state_fops = {
+	.open =		debug_state_open,
+	.release =	debug_buffer_release,
+	.read =		debug_buffer_read,
+	.llseek =	debug_buffer_llseek,
+};
+/* end  - debug state funcs */
+
+/* files in subroot */
+int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+
+	/* for dumping dlm_ctxt */
+	dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
+						     S_IFREG|S_IRUSR,
+						     dlm->dlm_debugfs_subroot,
+						     dlm, &debug_state_fops);
+	if (!dc->debug_state_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	dlm_debug_get(dc);
+	return 0;
+
+bail:
+	dlm_debug_shutdown(dlm);
+	return -ENOMEM;
+}
+
+void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+
+	if (dc) {
+		if (dc->debug_state_dentry)
+			debugfs_remove(dc->debug_state_dentry);
+		dlm_debug_put(dc);
+	}
+}
 
 /* subroot - domain dir */
 int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
@@ -285,6 +573,14 @@ int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
+	dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
+				      GFP_KERNEL);
+	if (!dlm->dlm_debug_ctxt) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+	kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
+
 	return 0;
 bail:
 	dlm_destroy_debugfs_subroot(dlm);
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index b96959512d96..94cc10a4e19c 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -27,6 +27,19 @@
 
 #ifdef CONFIG_DEBUG_FS
 
+struct dlm_debug_ctxt {
+	struct kref debug_refcnt;
+	struct dentry *debug_state_dentry;
+};
+
+struct debug_buffer {
+	int len;
+	char *buf;
+};
+
+int dlm_debug_init(struct dlm_ctxt *dlm);
+void dlm_debug_shutdown(struct dlm_ctxt *dlm);
+
 int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
 void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
 
@@ -35,6 +48,13 @@ void dlm_destroy_debugfs_root(void);
 
 #else
 
+static int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+	return 0;
+}
+static void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+}
 static int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
 {
 	return 0;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c137d693cef3..63f8125824e8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -398,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
 static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
 {
 	dlm_unregister_domain_handlers(dlm);
+	dlm_debug_shutdown(dlm);
 	dlm_complete_thread(dlm);
 	dlm_complete_recovery_thread(dlm);
 	dlm_destroy_dlm_worker(dlm);
@@ -1418,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
+	status = dlm_debug_init(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
 	status = dlm_launch_thread(dlm);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1485,6 +1492,7 @@ bail:
 
 	if (status) {
 		dlm_unregister_domain_handlers(dlm);
+		dlm_debug_shutdown(dlm);
 		dlm_complete_thread(dlm);
 		dlm_complete_recovery_thread(dlm);
 		dlm_destroy_dlm_worker(dlm);
-- 
cgit v1.2.3


From 4e3d24ed1a1285fe3289653aacc965642706bacb Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:24 -0700
Subject: ocfs2/dlm: Dumps the lockres' into a debugfs file

This patch dumps all the lockres' alongwith all the locks into
a debugfs file. Useful for debugging.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmdebug.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmdebug.h |   8 ++
 2 files changed, 255 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index e33540380013..cccb1ce33723 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -273,8 +273,35 @@ EXPORT_SYMBOL_GPL(dlm_errname);
 
 static struct dentry *dlm_debugfs_root = NULL;
 
+/* NOTE: This function converts a lockname into a string. It uses knowledge
+ * of the format of the lockname that should be outside the purview of the dlm.
+ * We are adding only to make dlm debugging slightly easier.
+ *
+ * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
+ */
+static int stringify_lockname(const char *lockname, int locklen,
+			      char *buf, int len)
+{
+	int out = 0;
+	__be64 inode_blkno_be;
+
+#define OCFS2_DENTRY_LOCK_INO_START	18
+	if (*lockname == 'N') {
+		memcpy((__be64 *)&inode_blkno_be,
+		       (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START],
+		       sizeof(__be64));
+		out += snprintf(buf + out, len - out, "%.*s%08x",
+				OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
+				(unsigned int)be64_to_cpu(inode_blkno_be));
+	} else
+		out += snprintf(buf + out, len - out, "%.*s",
+				locklen, lockname);
+	return out;
+}
+
 #define DLM_DEBUGFS_DIR				"o2dlm"
 #define DLM_DEBUGFS_DLM_STATE			"dlm_state"
+#define DLM_DEBUGFS_LOCKING_STATE		"locking_state"
 
 /* begin - utils funcs */
 static void dlm_debug_free(struct kref *kref)
@@ -368,6 +395,213 @@ static int debug_buffer_release(struct inode *inode, struct file *file)
 }
 /* end - util funcs */
 
+/* begin - debug lockres funcs */
+static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
+{
+	int out;
+
+#define DEBUG_LOCK_VERSION	1
+	spin_lock(&lock->spinlock);
+	out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+		       "%d,%d,%d,%d\n",
+		       DEBUG_LOCK_VERSION,
+		       list_type, lock->ml.type, lock->ml.convert_type,
+		       lock->ml.node,
+		       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+		       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+		       !list_empty(&lock->ast_list),
+		       !list_empty(&lock->bast_list),
+		       lock->ast_pending, lock->bast_pending,
+		       lock->convert_pending, lock->lock_pending,
+		       lock->cancel_pending, lock->unlock_pending,
+		       atomic_read(&lock->lock_refs.refcount));
+	spin_unlock(&lock->spinlock);
+
+	return out;
+}
+
+static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
+{
+	struct dlm_lock *lock;
+	int i;
+	int out = 0;
+
+	out += snprintf(buf + out, len - out, "NAME:");
+	out += stringify_lockname(res->lockname.name, res->lockname.len,
+				  buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+#define DEBUG_LRES_VERSION	1
+	out += snprintf(buf + out, len - out,
+			"LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
+			DEBUG_LRES_VERSION,
+			res->owner, res->state, res->last_used,
+			!list_empty(&res->purge),
+			!list_empty(&res->dirty),
+			!list_empty(&res->recovering),
+			res->inflight_locks, res->migration_pending,
+			atomic_read(&res->asts_reserved),
+			atomic_read(&res->refs.refcount));
+
+	/* refmap */
+	out += snprintf(buf + out, len - out, "RMAP:");
+	out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	/* lvb */
+	out += snprintf(buf + out, len - out, "LVBX:");
+	for (i = 0; i < DLM_LVB_LEN; i++)
+		out += snprintf(buf + out, len - out,
+					"%02x", (unsigned char)res->lvb[i]);
+	out += snprintf(buf + out, len - out, "\n");
+
+	/* granted */
+	list_for_each_entry(lock, &res->granted, list)
+		out += dump_lock(lock, 0, buf + out, len - out);
+
+	/* converting */
+	list_for_each_entry(lock, &res->converting, list)
+		out += dump_lock(lock, 1, buf + out, len - out);
+
+	/* blocked */
+	list_for_each_entry(lock, &res->blocked, list)
+		out += dump_lock(lock, 2, buf + out, len - out);
+
+	out += snprintf(buf + out, len - out, "\n");
+
+	return out;
+}
+
+static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct debug_lockres *dl = m->private;
+	struct dlm_ctxt *dlm = dl->dl_ctxt;
+	struct dlm_lock_resource *res = NULL;
+
+	spin_lock(&dlm->spinlock);
+
+	if (dl->dl_res) {
+		list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
+			if (dl->dl_res) {
+				dlm_lockres_put(dl->dl_res);
+				dl->dl_res = NULL;
+			}
+			if (&res->tracking == &dlm->tracking_list) {
+				mlog(0, "End of list found, %p\n", res);
+				dl = NULL;
+				break;
+			}
+			dlm_lockres_get(res);
+			dl->dl_res = res;
+			break;
+		}
+	} else {
+		if (!list_empty(&dlm->tracking_list)) {
+			list_for_each_entry(res, &dlm->tracking_list, tracking)
+				break;
+			dlm_lockres_get(res);
+			dl->dl_res = res;
+		} else
+			dl = NULL;
+	}
+
+	if (dl) {
+		spin_lock(&dl->dl_res->spinlock);
+		dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
+		spin_unlock(&dl->dl_res->spinlock);
+	}
+
+	spin_unlock(&dlm->spinlock);
+
+	return dl;
+}
+
+static void lockres_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return NULL;
+}
+
+static int lockres_seq_show(struct seq_file *s, void *v)
+{
+	struct debug_lockres *dl = (struct debug_lockres *)v;
+
+	seq_printf(s, "%s", dl->dl_buf);
+
+	return 0;
+}
+
+static struct seq_operations debug_lockres_ops = {
+	.start =	lockres_seq_start,
+	.stop =		lockres_seq_stop,
+	.next =		lockres_seq_next,
+	.show =		lockres_seq_show,
+};
+
+static int debug_lockres_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	int ret = -ENOMEM;
+	struct seq_file *seq;
+	struct debug_lockres *dl = NULL;
+
+	dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
+	if (!dl) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	dl->dl_len = PAGE_SIZE;
+	dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
+	if (!dl->dl_buf) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = seq_open(file, &debug_lockres_ops);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	seq = (struct seq_file *) file->private_data;
+	seq->private = dl;
+
+	dlm_grab(dlm);
+	dl->dl_ctxt = dlm;
+
+	return 0;
+bail:
+	if (dl)
+		kfree(dl->dl_buf);
+	kfree(dl);
+	return ret;
+}
+
+static int debug_lockres_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = (struct seq_file *)file->private_data;
+	struct debug_lockres *dl = (struct debug_lockres *)seq->private;
+
+	if (dl->dl_res)
+		dlm_lockres_put(dl->dl_res);
+	dlm_put(dl->dl_ctxt);
+	kfree(dl->dl_buf);
+	return seq_release_private(inode, file);
+}
+
+static struct file_operations debug_lockres_fops = {
+	.open =		debug_lockres_open,
+	.release =	debug_lockres_release,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+};
+/* end - debug lockres funcs */
+
 /* begin - debug state funcs */
 static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
@@ -544,6 +778,17 @@ int dlm_debug_init(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
+	/* for dumping lockres */
+	dc->debug_lockres_dentry =
+			debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
+					    S_IFREG|S_IRUSR,
+					    dlm->dlm_debugfs_subroot,
+					    dlm, &debug_lockres_fops);
+	if (!dc->debug_lockres_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
 	dlm_debug_get(dc);
 	return 0;
 
@@ -557,6 +802,8 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
 	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
 
 	if (dc) {
+		if (dc->debug_lockres_dentry)
+			debugfs_remove(dc->debug_lockres_dentry);
 		if (dc->debug_state_dentry)
 			debugfs_remove(dc->debug_state_dentry);
 		dlm_debug_put(dc);
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 94cc10a4e19c..7c5b2b0a05ed 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -30,6 +30,7 @@
 struct dlm_debug_ctxt {
 	struct kref debug_refcnt;
 	struct dentry *debug_state_dentry;
+	struct dentry *debug_lockres_dentry;
 };
 
 struct debug_buffer {
@@ -37,6 +38,13 @@ struct debug_buffer {
 	char *buf;
 };
 
+struct debug_lockres {
+	int dl_len;
+	char *dl_buf;
+	struct dlm_ctxt *dl_ctxt;
+	struct dlm_lock_resource *dl_res;
+};
+
 int dlm_debug_init(struct dlm_ctxt *dlm);
 void dlm_debug_shutdown(struct dlm_ctxt *dlm);
 
-- 
cgit v1.2.3


From 751155a953e1fe558d3d3c3db7087712ffc15c3e Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:25 -0700
Subject: ocfs2/dlm: Move struct dlm_master_list_entry to dlmcommon.h

This patch moves some mle related definitions from dlmmaster.c
to dlmcommon.h. Future patches need these definitions to dump mle
debugging information.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.beckeroracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmcommon.h | 35 +++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmmaster.c | 37 -------------------------------------
 2 files changed, 35 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index f7a51ca64a2e..d5a86fb81a49 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -49,6 +49,41 @@
 /* Intended to make it easier for us to switch out hash functions */
 #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
 
+enum dlm_mle_type {
+	DLM_MLE_BLOCK,
+	DLM_MLE_MASTER,
+	DLM_MLE_MIGRATION
+};
+
+struct dlm_lock_name {
+	u8 len;
+	u8 name[DLM_LOCKID_NAME_MAX];
+};
+
+struct dlm_master_list_entry {
+	struct list_head list;
+	struct list_head hb_events;
+	struct dlm_ctxt *dlm;
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	atomic_t woken;
+	struct kref mle_refs;
+	int inuse;
+	unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	u8 master;
+	u8 new_master;
+	enum dlm_mle_type type;
+	struct o2hb_callback_func mle_hb_up;
+	struct o2hb_callback_func mle_hb_down;
+	union {
+		struct dlm_lock_resource *res;
+		struct dlm_lock_name name;
+	} u;
+};
+
 enum dlm_ast_type {
 	DLM_AST = 0,
 	DLM_BAST,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 97133465891c..94cadcb0cba2 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -52,43 +52,6 @@
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
 #include "cluster/masklog.h"
 
-enum dlm_mle_type {
-	DLM_MLE_BLOCK,
-	DLM_MLE_MASTER,
-	DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name
-{
-	u8 len;
-	u8 name[DLM_LOCKID_NAME_MAX];
-};
-
-struct dlm_master_list_entry
-{
-	struct list_head list;
-	struct list_head hb_events;
-	struct dlm_ctxt *dlm;
-	spinlock_t spinlock;
-	wait_queue_head_t wq;
-	atomic_t woken;
-	struct kref mle_refs;
-	int inuse;
-	unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-	unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-	unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-	u8 master;
-	u8 new_master;
-	enum dlm_mle_type type;
-	struct o2hb_callback_func mle_hb_up;
-	struct o2hb_callback_func mle_hb_down;
-	union {
-		struct dlm_lock_resource *res;
-		struct dlm_lock_name name;
-	} u;
-};
-
 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
 			      struct dlm_master_list_entry *mle,
 			      struct o2nm_node *node,
-- 
cgit v1.2.3


From d0129aceaecc2b1f5171b8e8036eb469b6e0fe81 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:26 -0700
Subject: ocfs2/dlm: Dumps the mles into a debugfs file

This patch dumps all mles it can fit in one page into a debugfs file.
Useful for debugging.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmdebug.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmdebug.h |   1 +
 2 files changed, 120 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index cccb1ce33723..6de326bf603a 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -302,6 +302,7 @@ static int stringify_lockname(const char *lockname, int locklen,
 #define DLM_DEBUGFS_DIR				"o2dlm"
 #define DLM_DEBUGFS_DLM_STATE			"dlm_state"
 #define DLM_DEBUGFS_LOCKING_STATE		"locking_state"
+#define DLM_DEBUGFS_MLE_STATE			"mle_state"
 
 /* begin - utils funcs */
 static void dlm_debug_free(struct kref *kref)
@@ -395,6 +396,112 @@ static int debug_buffer_release(struct inode *inode, struct file *file)
 }
 /* end - util funcs */
 
+/* begin - debug mle funcs */
+static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
+{
+	int out = 0;
+	unsigned int namelen;
+	const char *name;
+	char *mle_type;
+
+	if (mle->type != DLM_MLE_MASTER) {
+		namelen = mle->u.name.len;
+		name = mle->u.name.name;
+	} else {
+		namelen = mle->u.res->lockname.len;
+		name = mle->u.res->lockname.name;
+	}
+
+	if (mle->type == DLM_MLE_BLOCK)
+		mle_type = "BLK";
+	else if (mle->type == DLM_MLE_MASTER)
+		mle_type = "MAS";
+	else
+		mle_type = "MIG";
+
+	out += stringify_lockname(name, namelen, buf + out, len - out);
+	out += snprintf(buf + out, len - out,
+			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
+			mle_type, mle->master, mle->new_master,
+			!list_empty(&mle->hb_events),
+			!!mle->inuse,
+			atomic_read(&mle->mle_refs.refcount));
+
+	out += snprintf(buf + out, len - out, "Maybe=");
+	out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "Vote=");
+	out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "Response=");
+	out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "Node=");
+	out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "\n");
+
+	return out;
+}
+
+static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+	struct dlm_master_list_entry *mle;
+	int out = 0;
+	unsigned long total = 0;
+
+	out += snprintf(db->buf + out, db->len - out,
+			"Dumping MLEs for Domain: %s\n", dlm->name);
+
+	spin_lock(&dlm->master_lock);
+	list_for_each_entry(mle, &dlm->master_list, list) {
+		++total;
+		if (db->len - out < 200)
+			continue;
+		out += dump_mle(mle, db->buf + out, db->len - out);
+	}
+	spin_unlock(&dlm->master_lock);
+
+	out += snprintf(db->buf + out, db->len - out,
+			"Total on list: %ld\n", total);
+	return out;
+}
+
+static int debug_mle_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	struct debug_buffer *db;
+
+	db = debug_buffer_allocate();
+	if (!db)
+		goto bail;
+
+	db->len = debug_mle_print(dlm, db);
+
+	file->private_data = db;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static struct file_operations debug_mle_fops = {
+	.open =		debug_mle_open,
+	.release =	debug_buffer_release,
+	.read =		debug_buffer_read,
+	.llseek =	debug_buffer_llseek,
+};
+
+/* end - debug mle funcs */
+
 /* begin - debug lockres funcs */
 static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
 {
@@ -789,6 +896,16 @@ int dlm_debug_init(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
+	/* for dumping mles */
+	dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
+						   S_IFREG|S_IRUSR,
+						   dlm->dlm_debugfs_subroot,
+						   dlm, &debug_mle_fops);
+	if (!dc->debug_mle_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
 	dlm_debug_get(dc);
 	return 0;
 
@@ -802,6 +919,8 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
 	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
 
 	if (dc) {
+		if (dc->debug_mle_dentry)
+			debugfs_remove(dc->debug_mle_dentry);
 		if (dc->debug_lockres_dentry)
 			debugfs_remove(dc->debug_lockres_dentry);
 		if (dc->debug_state_dentry)
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 7c5b2b0a05ed..cbc69f295782 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -31,6 +31,7 @@ struct dlm_debug_ctxt {
 	struct kref debug_refcnt;
 	struct dentry *debug_state_dentry;
 	struct dentry *debug_lockres_dentry;
+	struct dentry *debug_mle_dentry;
 };
 
 struct debug_buffer {
-- 
cgit v1.2.3


From 7209300a9b987e017cae2ef9d7ef55b0fdd71869 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:27 -0700
Subject: ocfs2/dlm: Dumps the purgelist into a debugfs file

This patch dumps all the lockres' on the purgelist it can fit in one page
into a debugfs file. Useful for debugging.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmdebug.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmdebug.h |  1 +
 2 files changed, 72 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 6de326bf603a..a109005f70da 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -303,6 +303,7 @@ static int stringify_lockname(const char *lockname, int locklen,
 #define DLM_DEBUGFS_DLM_STATE			"dlm_state"
 #define DLM_DEBUGFS_LOCKING_STATE		"locking_state"
 #define DLM_DEBUGFS_MLE_STATE			"mle_state"
+#define DLM_DEBUGFS_PURGE_LIST			"purge_list"
 
 /* begin - utils funcs */
 static void dlm_debug_free(struct kref *kref)
@@ -396,6 +397,63 @@ static int debug_buffer_release(struct inode *inode, struct file *file)
 }
 /* end - util funcs */
 
+/* begin - purge list funcs */
+static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+	struct dlm_lock_resource *res;
+	int out = 0;
+	unsigned long total = 0;
+
+	out += snprintf(db->buf + out, db->len - out,
+			"Dumping Purgelist for Domain: %s\n", dlm->name);
+
+	spin_lock(&dlm->spinlock);
+	list_for_each_entry(res, &dlm->purge_list, purge) {
+		++total;
+		if (db->len - out < 100)
+			continue;
+		spin_lock(&res->spinlock);
+		out += stringify_lockname(res->lockname.name,
+					  res->lockname.len,
+					  db->buf + out, db->len - out);
+		out += snprintf(db->buf + out, db->len - out, "\t%ld\n",
+				(jiffies - res->last_used)/HZ);
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	out += snprintf(db->buf + out, db->len - out,
+			"Total on list: %ld\n", total);
+
+	return out;
+}
+
+static int debug_purgelist_open(struct inode *inode, struct file *file)
+{
+	struct dlm_ctxt *dlm = inode->i_private;
+	struct debug_buffer *db;
+
+	db = debug_buffer_allocate();
+	if (!db)
+		goto bail;
+
+	db->len = debug_purgelist_print(dlm, db);
+
+	file->private_data = db;
+
+	return 0;
+bail:
+	return -ENOMEM;
+}
+
+static struct file_operations debug_purgelist_fops = {
+	.open =		debug_purgelist_open,
+	.release =	debug_buffer_release,
+	.read =		debug_buffer_read,
+	.llseek =	debug_buffer_llseek,
+};
+/* end - purge list funcs */
+
 /* begin - debug mle funcs */
 static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 {
@@ -906,6 +964,17 @@ int dlm_debug_init(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
+	/* for dumping lockres on the purge list */
+	dc->debug_purgelist_dentry =
+			debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
+					    S_IFREG|S_IRUSR,
+					    dlm->dlm_debugfs_subroot,
+					    dlm, &debug_purgelist_fops);
+	if (!dc->debug_purgelist_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
 	dlm_debug_get(dc);
 	return 0;
 
@@ -919,6 +988,8 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
 	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
 
 	if (dc) {
+		if (dc->debug_purgelist_dentry)
+			debugfs_remove(dc->debug_purgelist_dentry);
 		if (dc->debug_mle_dentry)
 			debugfs_remove(dc->debug_mle_dentry);
 		if (dc->debug_lockres_dentry)
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index cbc69f295782..8857743e8f32 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -32,6 +32,7 @@ struct dlm_debug_ctxt {
 	struct dentry *debug_state_dentry;
 	struct dentry *debug_lockres_dentry;
 	struct dentry *debug_mle_dentry;
+	struct dentry *debug_purgelist_dentry;
 };
 
 struct debug_buffer {
-- 
cgit v1.2.3


From e5a0334cbd65e27f8dfd9985aa805874fe59e879 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:28 -0700
Subject: ocfs2/dlm: Move dlm_print_one_mle() from dlmmaster.c to dlmdebug.c

This patch helps in consolidating debugging related functions in dlmdebug.c.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmdebug.c  | 154 +++++++++++++++++++++++++----------------------
 fs/ocfs2/dlm/dlmdebug.h  |   2 +
 fs/ocfs2/dlm/dlmmaster.c |  89 +--------------------------
 3 files changed, 85 insertions(+), 160 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index a109005f70da..58e45795a6c6 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -268,11 +268,6 @@ const char *dlm_errname(enum dlm_status err)
 }
 EXPORT_SYMBOL_GPL(dlm_errname);
 
-
-#ifdef CONFIG_DEBUG_FS
-
-static struct dentry *dlm_debugfs_root = NULL;
-
 /* NOTE: This function converts a lockname into a string. It uses knowledge
  * of the format of the lockname that should be outside the purview of the dlm.
  * We are adding only to make dlm debugging slightly easier.
@@ -299,6 +294,88 @@ static int stringify_lockname(const char *lockname, int locklen,
 	return out;
 }
 
+static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
+			     char *buf, int len)
+{
+	int out = 0;
+	int i = -1;
+
+	while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
+		out += snprintf(buf + out, len - out, "%d ", i);
+
+	return out;
+}
+
+static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
+{
+	int out = 0;
+	unsigned int namelen;
+	const char *name;
+	char *mle_type;
+
+	if (mle->type != DLM_MLE_MASTER) {
+		namelen = mle->u.name.len;
+		name = mle->u.name.name;
+	} else {
+		namelen = mle->u.res->lockname.len;
+		name = mle->u.res->lockname.name;
+	}
+
+	if (mle->type == DLM_MLE_BLOCK)
+		mle_type = "BLK";
+	else if (mle->type == DLM_MLE_MASTER)
+		mle_type = "MAS";
+	else
+		mle_type = "MIG";
+
+	out += stringify_lockname(name, namelen, buf + out, len - out);
+	out += snprintf(buf + out, len - out,
+			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
+			mle_type, mle->master, mle->new_master,
+			!list_empty(&mle->hb_events),
+			!!mle->inuse,
+			atomic_read(&mle->mle_refs.refcount));
+
+	out += snprintf(buf + out, len - out, "Maybe=");
+	out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "Vote=");
+	out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "Response=");
+	out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "Node=");
+	out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
+				 buf + out, len - out);
+	out += snprintf(buf + out, len - out, "\n");
+
+	out += snprintf(buf + out, len - out, "\n");
+
+	return out;
+}
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+{
+	char *buf;
+
+	buf = (char *) get_zeroed_page(GFP_NOFS);
+	if (buf) {
+		dump_mle(mle, buf, PAGE_SIZE - 1);
+		free_page((unsigned long)buf);
+	}
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static struct dentry *dlm_debugfs_root = NULL;
+
 #define DLM_DEBUGFS_DIR				"o2dlm"
 #define DLM_DEBUGFS_DLM_STATE			"dlm_state"
 #define DLM_DEBUGFS_LOCKING_STATE		"locking_state"
@@ -326,18 +403,6 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
 	kref_get(&dc->debug_refcnt);
 }
 
-static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
-			     char *buf, int len)
-{
-	int out = 0;
-	int i = -1;
-
-	while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
-		out += snprintf(buf + out, len - out, "%d ", i);
-
-	return out;
-}
-
 static struct debug_buffer *debug_buffer_allocate(void)
 {
 	struct debug_buffer *db = NULL;
@@ -455,61 +520,6 @@ static struct file_operations debug_purgelist_fops = {
 /* end - purge list funcs */
 
 /* begin - debug mle funcs */
-static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
-{
-	int out = 0;
-	unsigned int namelen;
-	const char *name;
-	char *mle_type;
-
-	if (mle->type != DLM_MLE_MASTER) {
-		namelen = mle->u.name.len;
-		name = mle->u.name.name;
-	} else {
-		namelen = mle->u.res->lockname.len;
-		name = mle->u.res->lockname.name;
-	}
-
-	if (mle->type == DLM_MLE_BLOCK)
-		mle_type = "BLK";
-	else if (mle->type == DLM_MLE_MASTER)
-		mle_type = "MAS";
-	else
-		mle_type = "MIG";
-
-	out += stringify_lockname(name, namelen, buf + out, len - out);
-	out += snprintf(buf + out, len - out,
-			"\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
-			mle_type, mle->master, mle->new_master,
-			!list_empty(&mle->hb_events),
-			!!mle->inuse,
-			atomic_read(&mle->mle_refs.refcount));
-
-	out += snprintf(buf + out, len - out, "Maybe=");
-	out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
-				 buf + out, len - out);
-	out += snprintf(buf + out, len - out, "\n");
-
-	out += snprintf(buf + out, len - out, "Vote=");
-	out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
-				 buf + out, len - out);
-	out += snprintf(buf + out, len - out, "\n");
-
-	out += snprintf(buf + out, len - out, "Response=");
-	out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
-				 buf + out, len - out);
-	out += snprintf(buf + out, len - out, "\n");
-
-	out += snprintf(buf + out, len - out, "Node=");
-	out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
-				 buf + out, len - out);
-	out += snprintf(buf + out, len - out, "\n");
-
-	out += snprintf(buf + out, len - out, "\n");
-
-	return out;
-}
-
 static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
 	struct dlm_master_list_entry *mle;
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8857743e8f32..d34a62a3a625 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -25,6 +25,8 @@
 #ifndef DLMDEBUG_H
 #define DLMDEBUG_H
 
+void dlm_print_one_mle(struct dlm_master_list_entry *mle);
+
 #ifdef CONFIG_DEBUG_FS
 
 struct dlm_debug_ctxt {
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 94cadcb0cba2..efc015c6128a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -48,6 +48,7 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
 #include "dlmdomain.h"
+#include "dlmdebug.h"
 
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
 #include "cluster/masklog.h"
@@ -91,94 +92,6 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
 	return 1;
 }
 
-#define dlm_print_nodemap(m)  _dlm_print_nodemap(m,#m)
-static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
-{
-	int i;
-	printk("%s=[ ", mapname);
-	for (i=0; i<O2NM_MAX_NODES; i++)
-		if (test_bit(i, map))
-			printk("%d ", i);
-	printk("]");
-}
-
-static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
-{
-	int refs;
-	char *type;
-	char attached;
-	u8 master;
-	unsigned int namelen;
-	const char *name;
-	struct kref *k;
-	unsigned long *maybe = mle->maybe_map,
-		      *vote = mle->vote_map,
-		      *resp = mle->response_map,
-		      *node = mle->node_map;
-
-	k = &mle->mle_refs;
-	if (mle->type == DLM_MLE_BLOCK)
-		type = "BLK";
-	else if (mle->type == DLM_MLE_MASTER)
-		type = "MAS";
-	else
-		type = "MIG";
-	refs = atomic_read(&k->refcount);
-	master = mle->master;
-	attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
-
-	if (mle->type != DLM_MLE_MASTER) {
-		namelen = mle->u.name.len;
-		name = mle->u.name.name;
-	} else {
-		namelen = mle->u.res->lockname.len;
-		name = mle->u.res->lockname.name;
-	}
-
-	mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
-		  namelen, name, type, refs, master, mle->new_master, attached,
-		  mle->inuse);
-	dlm_print_nodemap(maybe);
-	printk(", ");
-	dlm_print_nodemap(vote);
-	printk(", ");
-	dlm_print_nodemap(resp);
-	printk(", ");
-	dlm_print_nodemap(node);
-	printk(", ");
-	printk("\n");
-}
-
-#if 0
-/* Code here is included but defined out as it aids debugging */
-
-static void dlm_dump_mles(struct dlm_ctxt *dlm)
-{
-	struct dlm_master_list_entry *mle;
-	
-	mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
-	spin_lock(&dlm->master_lock);
-	list_for_each_entry(mle, &dlm->master_list, list)
-		dlm_print_one_mle(mle);
-	spin_unlock(&dlm->master_lock);
-}
-
-int dlm_dump_all_mles(const char __user *data, unsigned int len)
-{
-	struct dlm_ctxt *dlm;
-
-	spin_lock(&dlm_domain_lock);
-	list_for_each_entry(dlm, &dlm_domains, list) {
-		mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
-		dlm_dump_mles(dlm);
-	}
-	spin_unlock(&dlm_domain_lock);
-	return len;
-}
-EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
-
-#endif  /*  0  */
-
 static struct kmem_cache *dlm_lockres_cache = NULL;
 static struct kmem_cache *dlm_lockname_cache = NULL;
 static struct kmem_cache *dlm_mle_cache = NULL;
-- 
cgit v1.2.3


From c834cdb15702dd0147875b352cc7d4df93d7d900 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 10 Mar 2008 15:16:29 -0700
Subject: ocfs2/dlm: Fix lockname in lockres print function

__dlm_print_one_lock_resource was printing lockname incorrectly.
Also, we now use printk directly instead of mlog as the latter prints
the line context which is not useful for this print.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmdebug.c | 126 ++++++++++++++++++++----------------------------
 1 file changed, 51 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 58e45795a6c6..53a9e6093e6f 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -5,7 +5,7 @@
  *
  * debug functionality for the dlm
  *
- * Copyright (C) 2004 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -44,11 +44,10 @@
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
 
+int stringify_lockname(const char *lockname, int locklen, char *buf, int len);
+
 void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 {
-	mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
-	       res->lockname.len, res->lockname.name,
-	       res->owner, res->state);
 	spin_lock(&res->spinlock);
 	__dlm_print_one_lock_resource(res);
 	spin_unlock(&res->spinlock);
@@ -59,75 +58,78 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
 	int bit;
 	assert_spin_locked(&res->spinlock);
 
-	mlog(ML_NOTICE, "  refmap nodes: [ ");
+	printk(KERN_NOTICE "  refmap nodes: [ ");
 	bit = 0;
 	while (1) {
 		bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
 		if (bit >= O2NM_MAX_NODES)
 			break;
-		printk("%u ", bit);
+		printk(KERN_NOTICE "%u ", bit);
 		bit++;
 	}
-	printk("], inflight=%u\n", res->inflight_locks);
+	printk(KERN_NOTICE "], inflight=%u\n", res->inflight_locks);
+}
+
+static void __dlm_print_lock(struct dlm_lock *lock)
+{
+	spin_lock(&lock->spinlock);
+
+	printk(KERN_NOTICE "    type=%d, conv=%d, node=%u, cookie=%u:%llu, "
+	       "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
+	       "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
+	       lock->ml.type, lock->ml.convert_type, lock->ml.node,
+	       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+	       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+	       atomic_read(&lock->lock_refs.refcount),
+	       (list_empty(&lock->ast_list) ? 'y' : 'n'),
+	       (lock->ast_pending ? 'y' : 'n'),
+	       (list_empty(&lock->bast_list) ? 'y' : 'n'),
+	       (lock->bast_pending ? 'y' : 'n'),
+	       (lock->convert_pending ? 'y' : 'n'),
+	       (lock->lock_pending ? 'y' : 'n'),
+	       (lock->cancel_pending ? 'y' : 'n'),
+	       (lock->unlock_pending ? 'y' : 'n'));
+
+	spin_unlock(&lock->spinlock);
 }
 
 void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 {
 	struct list_head *iter2;
 	struct dlm_lock *lock;
+	char buf[DLM_LOCKID_NAME_MAX];
 
 	assert_spin_locked(&res->spinlock);
 
-	mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
-	       res->lockname.len, res->lockname.name,
-	       res->owner, res->state);
-	mlog(ML_NOTICE, "  last used: %lu, on purge list: %s\n",
-	     res->last_used, list_empty(&res->purge) ? "no" : "yes");
+	stringify_lockname(res->lockname.name, res->lockname.len,
+			   buf, sizeof(buf) - 1);
+	printk(KERN_NOTICE "lockres: %s, owner=%u, state=%u\n",
+	       buf, res->owner, res->state);
+	printk(KERN_NOTICE "  last used: %lu, refcnt: %u, on purge list: %s\n",
+	       res->last_used, atomic_read(&res->refs.refcount),
+	       list_empty(&res->purge) ? "no" : "yes");
+	printk(KERN_NOTICE "  on dirty list: %s, on reco list: %s, "
+	       "migrating pending: %s\n",
+	       list_empty(&res->dirty) ? "no" : "yes",
+	       list_empty(&res->recovering) ? "no" : "yes",
+	       res->migration_pending ? "yes" : "no");
+	printk(KERN_NOTICE "  inflight locks: %d, asts reserved: %d\n",
+	       res->inflight_locks, atomic_read(&res->asts_reserved));
 	dlm_print_lockres_refmap(res);
-	mlog(ML_NOTICE, "  granted queue: \n");
+	printk(KERN_NOTICE "  granted queue:\n");
 	list_for_each(iter2, &res->granted) {
 		lock = list_entry(iter2, struct dlm_lock, list);
-		spin_lock(&lock->spinlock);
-		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-		       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-		       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-		       list_empty(&lock->ast_list) ? 'y' : 'n',
-		       lock->ast_pending ? 'y' : 'n',
-		       list_empty(&lock->bast_list) ? 'y' : 'n',
-		       lock->bast_pending ? 'y' : 'n');
-		spin_unlock(&lock->spinlock);
+		__dlm_print_lock(lock);
 	}
-	mlog(ML_NOTICE, "  converting queue: \n");
+	printk(KERN_NOTICE "  converting queue:\n");
 	list_for_each(iter2, &res->converting) {
 		lock = list_entry(iter2, struct dlm_lock, list);
-		spin_lock(&lock->spinlock);
-		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-		       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-		       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-		       list_empty(&lock->ast_list) ? 'y' : 'n',
-		       lock->ast_pending ? 'y' : 'n',
-		       list_empty(&lock->bast_list) ? 'y' : 'n',
-		       lock->bast_pending ? 'y' : 'n');
-		spin_unlock(&lock->spinlock);
+		__dlm_print_lock(lock);
 	}
-	mlog(ML_NOTICE, "  blocked queue: \n");
+	printk(KERN_NOTICE "  blocked queue:\n");
 	list_for_each(iter2, &res->blocked) {
 		lock = list_entry(iter2, struct dlm_lock, list);
-		spin_lock(&lock->spinlock);
-		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-		       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-		       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-		       list_empty(&lock->ast_list) ? 'y' : 'n',
-		       lock->ast_pending ? 'y' : 'n',
-		       list_empty(&lock->bast_list) ? 'y' : 'n',
-		       lock->bast_pending ? 'y' : 'n');
-		spin_unlock(&lock->spinlock);
+		__dlm_print_lock(lock);
 	}
 }
 
@@ -137,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
 }
 EXPORT_SYMBOL_GPL(dlm_print_one_lock);
 
-#if 0
-void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
-{
-	struct dlm_lock_resource *res;
-	struct hlist_node *iter;
-	struct hlist_head *bucket;
-	int i;
-
-	mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
-		  dlm->name, dlm->node_num, dlm->key);
-	if (!dlm || !dlm->name) {
-		mlog(ML_ERROR, "dlm=%p\n", dlm);
-		return;
-	}
-
-	spin_lock(&dlm->spinlock);
-	for (i=0; i<DLM_HASH_BUCKETS; i++) {
-		bucket = dlm_lockres_hash(dlm, i);
-		hlist_for_each_entry(res, iter, bucket, hash_node)
-			dlm_print_one_lock_resource(res);
-	}
-	spin_unlock(&dlm->spinlock);
-}
-#endif  /*  0  */
-
 static const char *dlm_errnames[] = {
 	[DLM_NORMAL] =			"DLM_NORMAL",
 	[DLM_GRANTED] =			"DLM_GRANTED",
@@ -274,8 +251,7 @@ EXPORT_SYMBOL_GPL(dlm_errname);
  *
  * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
  */
-static int stringify_lockname(const char *lockname, int locklen,
-			      char *buf, int len)
+int stringify_lockname(const char *lockname, int locklen, char *buf, int len)
 {
 	int out = 0;
 	__be64 inode_blkno_be;
-- 
cgit v1.2.3


From 8f50eb978935431ccbf89b0344efd4ce6a924875 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Fri, 14 Mar 2008 11:18:24 -0700
Subject: ocfs2/dlm: Cleanup lockres print

A previous patch added KERN_NOTICE to printks printing the lockres that
cluttered the output. This patch removes the log level. For people concerned
with syslog clutter, please note we now use this facility to print lockres
only during an error.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmdebug.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 53a9e6093e6f..5f6d858770a2 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -58,23 +58,23 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
 	int bit;
 	assert_spin_locked(&res->spinlock);
 
-	printk(KERN_NOTICE "  refmap nodes: [ ");
+	printk("  refmap nodes: [ ");
 	bit = 0;
 	while (1) {
 		bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
 		if (bit >= O2NM_MAX_NODES)
 			break;
-		printk(KERN_NOTICE "%u ", bit);
+		printk("%u ", bit);
 		bit++;
 	}
-	printk(KERN_NOTICE "], inflight=%u\n", res->inflight_locks);
+	printk("], inflight=%u\n", res->inflight_locks);
 }
 
 static void __dlm_print_lock(struct dlm_lock *lock)
 {
 	spin_lock(&lock->spinlock);
 
-	printk(KERN_NOTICE "    type=%d, conv=%d, node=%u, cookie=%u:%llu, "
+	printk("    type=%d, conv=%d, node=%u, cookie=%u:%llu, "
 	       "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
 	       "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
 	       lock->ml.type, lock->ml.convert_type, lock->ml.node,
@@ -103,30 +103,30 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 
 	stringify_lockname(res->lockname.name, res->lockname.len,
 			   buf, sizeof(buf) - 1);
-	printk(KERN_NOTICE "lockres: %s, owner=%u, state=%u\n",
+	printk("lockres: %s, owner=%u, state=%u\n",
 	       buf, res->owner, res->state);
-	printk(KERN_NOTICE "  last used: %lu, refcnt: %u, on purge list: %s\n",
+	printk("  last used: %lu, refcnt: %u, on purge list: %s\n",
 	       res->last_used, atomic_read(&res->refs.refcount),
 	       list_empty(&res->purge) ? "no" : "yes");
-	printk(KERN_NOTICE "  on dirty list: %s, on reco list: %s, "
+	printk("  on dirty list: %s, on reco list: %s, "
 	       "migrating pending: %s\n",
 	       list_empty(&res->dirty) ? "no" : "yes",
 	       list_empty(&res->recovering) ? "no" : "yes",
 	       res->migration_pending ? "yes" : "no");
-	printk(KERN_NOTICE "  inflight locks: %d, asts reserved: %d\n",
+	printk("  inflight locks: %d, asts reserved: %d\n",
 	       res->inflight_locks, atomic_read(&res->asts_reserved));
 	dlm_print_lockres_refmap(res);
-	printk(KERN_NOTICE "  granted queue:\n");
+	printk("  granted queue:\n");
 	list_for_each(iter2, &res->granted) {
 		lock = list_entry(iter2, struct dlm_lock, list);
 		__dlm_print_lock(lock);
 	}
-	printk(KERN_NOTICE "  converting queue:\n");
+	printk("  converting queue:\n");
 	list_for_each(iter2, &res->converting) {
 		lock = list_entry(iter2, struct dlm_lock, list);
 		__dlm_print_lock(lock);
 	}
-	printk(KERN_NOTICE "  blocked queue:\n");
+	printk("  blocked queue:\n");
 	list_for_each(iter2, &res->blocked) {
 		lock = list_entry(iter2, struct dlm_lock, list);
 		__dlm_print_lock(lock);
-- 
cgit v1.2.3


From 5cc3bf2786f63cceb191c3c02ddd83c6f38a7d64 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 5 Mar 2008 15:50:12 +0800
Subject: ocfs2:  Reconnect after idle time out.

Currently, o2net connects to a node on hb_up and disconnects on
hb_down and net timeout.

It disconnects on net timeout is ok, but it should attempt to
reconnect back. This is because sometimes nodes get overloaded
enough that the network connection breaks but the disk hb does not.
And if we get into that situation, we either fence (unnecessarily)
or wait for its disk hb to die (and sometimes hang in the process).

So in this updated scheme, when the network disconnects, we keep
attempting to reconnect till we succeed or we get a disk hb down
event.

If the other node is really dead, then we will eventually get a
node down event. If not, we should be able to connect again and
continue.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/cluster/tcp.c          | 51 +++++++++++++++++++++++++++++------------
 fs/ocfs2/cluster/tcp_internal.h |  2 ++
 2 files changed, 38 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index b8057c51b205..4ea4b0a26975 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -399,8 +399,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
 	mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
 
-	/* we won't reconnect after our valid conn goes away for
-	 * this hb iteration.. here so it shows up in the logs */
 	if (was_valid && !valid && err == 0)
 		err = -ENOTCONN;
 
@@ -430,11 +428,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 
 	if (!was_valid && valid) {
 		o2quo_conn_up(o2net_num_from_nn(nn));
-		/* this is a bit of a hack.  we only try reconnecting
-		 * when heartbeating starts until we get a connection.
-		 * if that connection then dies we don't try reconnecting.
-		 * the only way to start connecting again is to down
-		 * heartbeat and bring it back up. */
 		cancel_delayed_work(&nn->nn_connect_expired);
 		printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
 		       o2nm_this_node() > sc->sc_node->nd_num ?
@@ -457,6 +450,18 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 			delay = 0;
 		mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
 		queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+
+		/*
+		 * Delay the expired work after idle timeout.
+		 *
+		 * We might have lots of failed connection attempts that run
+		 * through here but we only cancel the connect_expired work when
+		 * a connection attempt succeeds.  So only the first enqueue of
+		 * the connect_expired work will do anything.  The rest will see
+		 * that it's already queued and do nothing.
+		 */
+		delay += msecs_to_jiffies(o2net_idle_timeout(NULL));
+		queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
 	}
 
 	/* keep track of the nn's sc ref for the caller */
@@ -1193,6 +1198,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
 	 * shut down already */
 	if (nn->nn_sc == sc) {
 		o2net_sc_reset_idle_timer(sc);
+		atomic_set(&nn->nn_timeout, 0);
 		o2net_set_nn_state(nn, sc, 1, 0);
 	}
 	spin_unlock(&nn->nn_lock);
@@ -1391,6 +1397,7 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
 static void o2net_idle_timer(unsigned long data)
 {
 	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
 	struct timeval now;
 
 	do_gettimeofday(&now);
@@ -1413,6 +1420,12 @@ static void o2net_idle_timer(unsigned long data)
 	     sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
 	     sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
 
+	/*
+	 * Initialize the nn_timeout so that the next connection attempt
+	 * will continue in o2net_start_connect.
+	 */
+	atomic_set(&nn->nn_timeout, 1);
+
 	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
 
@@ -1447,6 +1460,7 @@ static void o2net_start_connect(struct work_struct *work)
 	struct socket *sock = NULL;
 	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
 	int ret = 0, stop;
+	unsigned int timeout;
 
 	/* if we're greater we initiate tx, otherwise we accept */
 	if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1466,8 +1480,17 @@ static void o2net_start_connect(struct work_struct *work)
 	}
 
 	spin_lock(&nn->nn_lock);
-	/* see if we already have one pending or have given up */
-	stop = (nn->nn_sc || nn->nn_persistent_error);
+	/*
+	 * see if we already have one pending or have given up.
+	 * For nn_timeout, it is set when we close the connection
+	 * because of the idle time out. So it means that we have
+	 * at least connected to that node successfully once,
+	 * now try to connect to it again.
+	 */
+	timeout = atomic_read(&nn->nn_timeout);
+	stop = (nn->nn_sc ||
+		(nn->nn_persistent_error &&
+		(nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
 	spin_unlock(&nn->nn_lock);
 	if (stop)
 		goto out;
@@ -1579,6 +1602,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
 
 	/* don't reconnect until it's heartbeating again */
 	spin_lock(&nn->nn_lock);
+	atomic_set(&nn->nn_timeout, 0);
 	o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
 	spin_unlock(&nn->nn_lock);
 
@@ -1613,17 +1637,12 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 		(msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
 
 	if (node_num != o2nm_this_node()) {
-		/* heartbeat doesn't work unless a local node number is
-		 * configured and doing so brings up the o2net_wq, so we can
-		 * use it.. */
-		queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
-		                   msecs_to_jiffies(o2net_idle_timeout(node)));
-
 		/* believe it or not, accept and node hearbeating testing
 		 * can succeed for this node before we got here.. so
 		 * only use set_nn_state to clear the persistent error
 		 * if that hasn't already happened */
 		spin_lock(&nn->nn_lock);
+		atomic_set(&nn->nn_timeout, 0);
 		if (nn->nn_persistent_error)
 			o2net_set_nn_state(nn, NULL, 0, 0);
 		spin_unlock(&nn->nn_lock);
@@ -1747,6 +1766,7 @@ static int o2net_accept_one(struct socket *sock)
 	new_sock = NULL;
 
 	spin_lock(&nn->nn_lock);
+	atomic_set(&nn->nn_timeout, 0);
 	o2net_set_nn_state(nn, sc, 0, 0);
 	spin_unlock(&nn->nn_lock);
 
@@ -1941,6 +1961,7 @@ int o2net_init(void)
 	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
 		struct o2net_node *nn = o2net_nn_from_num(i);
 
+		atomic_set(&nn->nn_timeout, 0);
 		spin_lock_init(&nn->nn_lock);
 		INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
 		INIT_DELAYED_WORK(&nn->nn_connect_expired,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index d25b9af28500..b4c5586f46ea 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -95,6 +95,8 @@ struct o2net_node {
 	unsigned			nn_sc_valid:1;
 	/* if this is set tx just returns it */
 	int				nn_persistent_error;
+	/* It is only set to 1 after the idle time out. */
+	atomic_t			nn_timeout;
 
 	/* threads waiting for an sc to arrive wait on the wq for generation
 	 * to increase.  it is increased when a connecting socket succeeds
-- 
cgit v1.2.3


From a839c5afcdc345aecb35b0d3bcd0e09b571329c3 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 29 Jan 2008 14:35:18 -0800
Subject: sysfs: Allow removal of symlinks in the sysfs root

Allow callers of sysfs_remove_link() to pass a NULL kobj, in which case
sysfs_root will be used as the parent directory. This allows us to tear down
top level symlinks created via sysfs_create_link(), which already has
similar handling of a NULL parent object.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/symlink.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 5f66c4466151..817f5966edca 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -87,7 +87,14 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 
 void sysfs_remove_link(struct kobject * kobj, const char * name)
 {
-	sysfs_hash_and_remove(kobj->sd, name);
+	struct sysfs_dirent *parent_sd = NULL;
+
+	if (!kobj)
+		parent_sd = &sysfs_root;
+	else
+		parent_sd = kobj->sd;
+
+	sysfs_hash_and_remove(parent_sd, name);
 }
 
 static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
-- 
cgit v1.2.3


From 52f7c21b613f80cb425d115c9e5b4ed958a133c0 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 29 Jan 2008 17:08:26 -0800
Subject: ocfs2: Move /sys/o2cb to /sys/fs/o2cb

/sys/fs is where we really want file system specific sysfs objects.

Ocfs2-tools has been updated to look in /sys/fs/o2cb. We can maintain
backwards compatibility with old ocfs2-tools by using a sysfs symlink. After
some time (2 years), the symlink can be safely removed. This patch also adds
documentation to make it easier for people to figure out what /sys/fs/o2cb
is used for.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 Documentation/ABI/obsolete/o2cb            | 11 +++++++++++
 Documentation/ABI/stable/o2cb              | 10 ++++++++++
 Documentation/feature-removal-schedule.txt | 10 ++++++++++
 fs/ocfs2/cluster/sys.c                     |  9 +++++++++
 4 files changed, 40 insertions(+)
 create mode 100644 Documentation/ABI/obsolete/o2cb
 create mode 100644 Documentation/ABI/stable/o2cb

(limited to 'fs')

diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/obsolete/o2cb
new file mode 100644
index 000000000000..9c49d8e6c0cc
--- /dev/null
+++ b/Documentation/ABI/obsolete/o2cb
@@ -0,0 +1,11 @@
+What:		/sys/o2cb symlink
+Date:		Dec 2005
+KernelVersion:	2.6.16
+Contact:	ocfs2-devel@oss.oracle.com
+Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
+		be removed when new versions of ocfs2-tools which know to look
+		in /sys/fs/o2cb are sufficiently prevalent. Don't code new
+		software to look here, it should try /sys/fs/o2cb instead.
+		See Documentation/ABI/stable/o2cb for more information on usage.
+Users:		ocfs2-tools. It's sufficient to mail proposed changes to
+		ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/stable/o2cb b/Documentation/ABI/stable/o2cb
new file mode 100644
index 000000000000..5eb1545e0b8d
--- /dev/null
+++ b/Documentation/ABI/stable/o2cb
@@ -0,0 +1,10 @@
+What:		/sys/fs/o2cb/ (was /sys/o2cb)
+Date:		Dec 2005
+KernelVersion:	2.6.16
+Contact:	ocfs2-devel@oss.oracle.com
+Description:	Ocfs2-tools looks at 'interface-revision' for versioning
+		information. Each logmask/ file controls a set of debug prints
+		and can be written into with the strings "allow", "deny", or
+		"off". Reading the file returns the current state.
+Users:		ocfs2-tools. It's sufficient to mail proposed changes to
+		ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index bf0e3df8e7a1..4101f1f2d1eb 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -318,3 +318,13 @@ Why:	Not used in-tree. The current out-of-tree users used it to
 	code / infrastructure should be in the kernel and not in some
 	out-of-tree driver.
 Who:	Thomas Gleixner <tglx@linutronix.de>
+
+---------------------------
+
+What:	/sys/o2cb symlink
+When:	January 2010
+Why:	/sys/fs/o2cb is the proper location for this information - /sys/o2cb
+	exists as a symlink for backwards compatibility for old versions of
+	ocfs2-tools. 2 years should be sufficient time to phase in new versions
+	which know to look in /sys/fs/o2cb.
+Who:	ocfs2-devel@oss.oracle.com
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 0c095ce7723d..98429fd68499 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,6 +57,7 @@ static struct kset *o2cb_kset;
 void o2cb_sys_shutdown(void)
 {
 	mlog_sys_shutdown();
+	sysfs_remove_link(NULL, "o2cb");
 	kset_unregister(o2cb_kset);
 }
 
@@ -68,6 +69,14 @@ int o2cb_sys_init(void)
 	if (!o2cb_kset)
 		return -ENOMEM;
 
+	/*
+	 * Create this symlink for backwards compatibility with old
+	 * versions of ocfs2-tools which look for things in /sys/o2cb.
+	 */
+	ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
+	if (ret)
+		goto error;
+
 	ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
 	if (ret)
 		goto error;
-- 
cgit v1.2.3


From 677b975282e48d1818df4181336307377d56b04e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 30 Jan 2008 14:21:05 +0800
Subject: ocfs2: Add support for cross extent block

In ocfs2_merge_rec_left, when we find the merge extent is "CONTIG_RIGHT"
with the first extent record of the next extent block, we will merge it to
the next extent block and change all the related extent blocks accordingly.

In ocfs2_merge_rec_right, when we find the merge extent is "CONTIG_LEFT"
with the last extent record of the previous extent block, we will merge
it to the prevoius extent block and change all the related extent blocks
accordingly.

As for CONTIG_LEFTRIGHT, we will handle CONTIG_RIGHT first so that when
the index is zero, the merge process will be more efficient and easier.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 366 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 325 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 447206eb5c2e..f63cb32e224d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1450,6 +1450,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
  *   - When our insert into the right path leaf is at the leftmost edge
  *     and requires an update of the path immediately to it's left. This
  *     can occur at the end of some types of rotation and appending inserts.
+ *   - When we've adjusted the last extent record in the left path leaf and the
+ *     1st extent record in the right path leaf during cross extent block merge.
  */
 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
 				       struct ocfs2_path *left_path,
@@ -2712,24 +2714,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
 	}
 }
 
+static int ocfs2_get_right_path(struct inode *inode,
+				struct ocfs2_path *left_path,
+				struct ocfs2_path **ret_right_path)
+{
+	int ret;
+	u32 right_cpos;
+	struct ocfs2_path *right_path = NULL;
+	struct ocfs2_extent_list *left_el;
+
+	*ret_right_path = NULL;
+
+	/* This function shouldn't be called for non-trees. */
+	BUG_ON(left_path->p_tree_depth == 0);
+
+	left_el = path_leaf_el(left_path);
+	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
+
+	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+					     &right_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* This function shouldn't be called for the rightmost leaf. */
+	BUG_ON(right_cpos == 0);
+
+	right_path = ocfs2_new_path(path_root_bh(left_path),
+				    path_root_el(left_path));
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(inode, right_path, right_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*ret_right_path = right_path;
+out:
+	if (ret)
+		ocfs2_free_path(right_path);
+	return ret;
+}
+
 /*
  * Remove split_rec clusters from the record at index and merge them
- * onto the beginning of the record at index + 1.
+ * onto the beginning of the record "next" to it.
+ * For index < l_count - 1, the next means the extent rec at index + 1.
+ * For index == l_count - 1, the "next" means the 1st extent rec of the
+ * next extent block.
  */
-static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
-				handle_t *handle,
-				struct ocfs2_extent_rec *split_rec,
-				struct ocfs2_extent_list *el, int index)
+static int ocfs2_merge_rec_right(struct inode *inode,
+				 struct ocfs2_path *left_path,
+				 handle_t *handle,
+				 struct ocfs2_extent_rec *split_rec,
+				 int index)
 {
-	int ret;
+	int ret, next_free, i;
 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
 	struct ocfs2_extent_rec *left_rec;
 	struct ocfs2_extent_rec *right_rec;
+	struct ocfs2_extent_list *right_el;
+	struct ocfs2_path *right_path = NULL;
+	int subtree_index = 0;
+	struct ocfs2_extent_list *el = path_leaf_el(left_path);
+	struct buffer_head *bh = path_leaf_bh(left_path);
+	struct buffer_head *root_bh = NULL;
 
 	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
-
 	left_rec = &el->l_recs[index];
-	right_rec = &el->l_recs[index + 1];
+
+	if (index == le16_to_cpu(el->l_next_free_rec - 1) &&
+	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
+		/* we meet with a cross extent block merge. */
+		ret = ocfs2_get_right_path(inode, left_path, &right_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		right_el = path_leaf_el(right_path);
+		next_free = le16_to_cpu(right_el->l_next_free_rec);
+		BUG_ON(next_free <= 0);
+		right_rec = &right_el->l_recs[0];
+		if (ocfs2_is_empty_extent(right_rec)) {
+			BUG_ON(le16_to_cpu(next_free) <= 1);
+			right_rec = &right_el->l_recs[1];
+		}
+
+		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+		       le16_to_cpu(left_rec->e_leaf_clusters) !=
+		       le32_to_cpu(right_rec->e_cpos));
+
+		subtree_index = ocfs2_find_subtree_root(inode,
+							left_path, right_path);
+
+		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+						      handle->h_buffer_credits,
+						      right_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		root_bh = left_path->p_node[subtree_index].bh;
+		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+		ret = ocfs2_journal_access(handle, inode, root_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		for (i = subtree_index + 1;
+		     i < path_num_items(right_path); i++) {
+			ret = ocfs2_journal_access(handle, inode,
+						   right_path->p_node[i].bh,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_journal_access(handle, inode,
+						   left_path->p_node[i].bh,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+	} else {
+		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
+		right_rec = &el->l_recs[index + 1];
+	}
 
 	ret = ocfs2_journal_access(handle, inode, bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2751,30 +2876,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
 	if (ret)
 		mlog_errno(ret);
 
+	if (right_path) {
+		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+		if (ret)
+			mlog_errno(ret);
+
+		ocfs2_complete_edge_insert(inode, handle, left_path,
+					   right_path, subtree_index);
+	}
+out:
+	if (right_path)
+		ocfs2_free_path(right_path);
+	return ret;
+}
+
+static int ocfs2_get_left_path(struct inode *inode,
+			       struct ocfs2_path *right_path,
+			       struct ocfs2_path **ret_left_path)
+{
+	int ret;
+	u32 left_cpos;
+	struct ocfs2_path *left_path = NULL;
+
+	*ret_left_path = NULL;
+
+	/* This function shouldn't be called for non-trees. */
+	BUG_ON(right_path->p_tree_depth == 0);
+
+	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+					    right_path, &left_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* This function shouldn't be called for the leftmost leaf. */
+	BUG_ON(left_cpos == 0);
+
+	left_path = ocfs2_new_path(path_root_bh(right_path),
+				   path_root_el(right_path));
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(inode, left_path, left_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*ret_left_path = left_path;
 out:
+	if (ret)
+		ocfs2_free_path(left_path);
 	return ret;
 }
 
 /*
  * Remove split_rec clusters from the record at index and merge them
- * onto the tail of the record at index - 1.
+ * onto the tail of the record "before" it.
+ * For index > 0, the "before" means the extent rec at index - 1.
+ *
+ * For index == 0, the "before" means the last record of the previous
+ * extent block. And there is also a situation that we may need to
+ * remove the rightmost leaf extent block in the right_path and change
+ * the right path to indicate the new rightmost path.
  */
-static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
+static int ocfs2_merge_rec_left(struct inode *inode,
+				struct ocfs2_path *right_path,
 				handle_t *handle,
 				struct ocfs2_extent_rec *split_rec,
-				struct ocfs2_extent_list *el, int index)
+				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				int index)
 {
-	int ret, has_empty_extent = 0;
+	int ret, i, subtree_index = 0, has_empty_extent = 0;
 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
 	struct ocfs2_extent_rec *left_rec;
 	struct ocfs2_extent_rec *right_rec;
+	struct ocfs2_extent_list *el = path_leaf_el(right_path);
+	struct buffer_head *bh = path_leaf_bh(right_path);
+	struct buffer_head *root_bh = NULL;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_extent_list *left_el;
 
-	BUG_ON(index <= 0);
+	BUG_ON(index < 0);
 
-	left_rec = &el->l_recs[index - 1];
 	right_rec = &el->l_recs[index];
-	if (ocfs2_is_empty_extent(&el->l_recs[0]))
-		has_empty_extent = 1;
+	if (index == 0) {
+		/* we meet with a cross extent block merge. */
+		ret = ocfs2_get_left_path(inode, right_path, &left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		left_el = path_leaf_el(left_path);
+		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
+		       le16_to_cpu(left_el->l_count));
+
+		left_rec = &left_el->l_recs[
+				le16_to_cpu(left_el->l_next_free_rec) - 1];
+		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+		       le16_to_cpu(left_rec->e_leaf_clusters) !=
+		       le32_to_cpu(split_rec->e_cpos));
+
+		subtree_index = ocfs2_find_subtree_root(inode,
+							left_path, right_path);
+
+		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+						      handle->h_buffer_credits,
+						      left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		root_bh = left_path->p_node[subtree_index].bh;
+		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+		ret = ocfs2_journal_access(handle, inode, root_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		for (i = subtree_index + 1;
+		     i < path_num_items(right_path); i++) {
+			ret = ocfs2_journal_access(handle, inode,
+						   right_path->p_node[i].bh,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_journal_access(handle, inode,
+						   left_path->p_node[i].bh,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+	} else {
+		left_rec = &el->l_recs[index - 1];
+		if (ocfs2_is_empty_extent(&el->l_recs[0]))
+			has_empty_extent = 1;
+	}
 
 	ret = ocfs2_journal_access(handle, inode, bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2790,9 +3041,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
 		*left_rec = *split_rec;
 
 		has_empty_extent = 0;
-	} else {
+	} else
 		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
-	}
 
 	le32_add_cpu(&right_rec->e_cpos, split_clusters);
 	le64_add_cpu(&right_rec->e_blkno,
@@ -2805,13 +3055,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
 	if (ret)
 		mlog_errno(ret);
 
+	if (left_path) {
+		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+		if (ret)
+			mlog_errno(ret);
+
+		/*
+		 * In the situation that the right_rec is empty and the extent
+		 * block is empty also,  ocfs2_complete_edge_insert can't handle
+		 * it and we need to delete the right extent block.
+		 */
+		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
+		    le16_to_cpu(el->l_next_free_rec) == 1) {
+
+			ret = ocfs2_remove_rightmost_path(inode, handle,
+							  right_path, dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			/* Now the rightmost extent block has been deleted.
+			 * So we use the new rightmost path.
+			 */
+			ocfs2_mv_path(right_path, left_path);
+			left_path = NULL;
+		} else
+			ocfs2_complete_edge_insert(inode, handle, left_path,
+						   right_path, subtree_index);
+	}
 out:
+	if (left_path)
+		ocfs2_free_path(left_path);
 	return ret;
 }
 
 static int ocfs2_try_to_merge_extent(struct inode *inode,
 				     handle_t *handle,
-				     struct ocfs2_path *left_path,
+				     struct ocfs2_path *path,
 				     int split_index,
 				     struct ocfs2_extent_rec *split_rec,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
@@ -2819,7 +3100,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 
 {
 	int ret = 0;
-	struct ocfs2_extent_list *el = path_leaf_el(left_path);
+	struct ocfs2_extent_list *el = path_leaf_el(path);
 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
 
 	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
@@ -2832,7 +3113,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * extents - having more than one in a leaf is
 		 * illegal.
 		 */
-		ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+		ret = ocfs2_rotate_tree_left(inode, handle, path,
 					     dealloc);
 		if (ret) {
 			mlog_errno(ret);
@@ -2847,7 +3128,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * Left-right contig implies this.
 		 */
 		BUG_ON(!ctxt->c_split_covers_rec);
-		BUG_ON(split_index == 0);
 
 		/*
 		 * Since the leftright insert always covers the entire
@@ -2858,9 +3138,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * Since the adding of an empty extent shifts
 		 * everything back to the right, there's no need to
 		 * update split_index here.
+		 *
+		 * When the split_index is zero, we need to merge it to the
+		 * prevoius extent block. It is more efficient and easier
+		 * if we do merge_right first and merge_left later.
 		 */
-		ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
-					   handle, split_rec, el, split_index);
+		ret = ocfs2_merge_rec_right(inode, path,
+					    handle, split_rec,
+					    split_index);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2871,32 +3156,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 */
 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
-		/*
-		 * The left merge left us with an empty extent, remove
-		 * it.
-		 */
-		ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
+		/* The merge left us with an empty extent, remove it. */
+		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
-		split_index--;
+
 		rec = &el->l_recs[split_index];
 
 		/*
 		 * Note that we don't pass split_rec here on purpose -
-		 * we've merged it into the left side.
+		 * we've merged it into the rec already.
 		 */
-		ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
-					    handle, rec, el, split_index);
+		ret = ocfs2_merge_rec_left(inode, path,
+					   handle, rec,
+					   dealloc,
+					   split_index);
+
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
-
-		ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+		ret = ocfs2_rotate_tree_left(inode, handle, path,
 					     dealloc);
 		/*
 		 * Error from this last rotate is not critical, so
@@ -2915,8 +3198,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 */
 		if (ctxt->c_contig_type == CONTIG_RIGHT) {
 			ret = ocfs2_merge_rec_left(inode,
-						   path_leaf_bh(left_path),
-						   handle, split_rec, el,
+						   path,
+						   handle, split_rec,
+						   dealloc,
 						   split_index);
 			if (ret) {
 				mlog_errno(ret);
@@ -2924,8 +3208,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			}
 		} else {
 			ret = ocfs2_merge_rec_right(inode,
-						    path_leaf_bh(left_path),
-						    handle, split_rec, el,
+						    path,
+						    handle, split_rec,
 						    split_index);
 			if (ret) {
 				mlog_errno(ret);
@@ -2938,7 +3222,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			 * The merge may have left an empty extent in
 			 * our leaf. Try to rotate it away.
 			 */
-			ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+			ret = ocfs2_rotate_tree_left(inode, handle, path,
 						     dealloc);
 			if (ret)
 				mlog_errno(ret);
-- 
cgit v1.2.3


From ad5a4d7093a76fa245e277e6f0f0e168a08aeff7 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 30 Jan 2008 14:21:32 +0800
Subject: ocfs2: Enable cross extent block merge.

In ocfs2_figure_merge_contig_type, we judge whether there exists
a cross extent block merge and enable it by setting CONTIG_LEFT
and CONTIG_RIGHT accordingly.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 86 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f63cb32e224d..7d81aa6f5672 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3782,20 +3782,57 @@ out:
 }
 
 static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct inode *inode,
+ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 			       struct ocfs2_extent_list *el, int index,
 			       struct ocfs2_extent_rec *split_rec)
 {
-	struct ocfs2_extent_rec *rec;
+	int status;
 	enum ocfs2_contig_type ret = CONTIG_NONE;
+	u32 left_cpos, right_cpos;
+	struct ocfs2_extent_rec *rec = NULL;
+	struct ocfs2_extent_list *new_el;
+	struct ocfs2_path *left_path = NULL, *right_path = NULL;
+	struct buffer_head *bh;
+	struct ocfs2_extent_block *eb;
+
+	if (index > 0) {
+		rec = &el->l_recs[index - 1];
+	} else if (path->p_tree_depth > 0) {
+		status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+						       path, &left_cpos);
+		if (status)
+			goto out;
+
+		if (left_cpos != 0) {
+			left_path = ocfs2_new_path(path_root_bh(path),
+						   path_root_el(path));
+			if (!left_path)
+				goto out;
+
+			status = ocfs2_find_path(inode, left_path, left_cpos);
+			if (status)
+				goto out;
+
+			new_el = path_leaf_el(left_path);
+
+			if (le16_to_cpu(new_el->l_next_free_rec) !=
+			    le16_to_cpu(new_el->l_count)) {
+				bh = path_leaf_bh(left_path);
+				eb = (struct ocfs2_extent_block *)bh->b_data;
+				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+								 eb);
+				goto out;
+			}
+			rec = &new_el->l_recs[
+				le16_to_cpu(new_el->l_next_free_rec) - 1];
+		}
+	}
 
 	/*
 	 * We're careful to check for an empty extent record here -
 	 * the merge code will know what to do if it sees one.
 	 */
-
-	if (index > 0) {
-		rec = &el->l_recs[index - 1];
+	if (rec) {
 		if (index == 1 && ocfs2_is_empty_extent(rec)) {
 			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
 				ret = CONTIG_RIGHT;
@@ -3804,10 +3841,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
 		}
 	}
 
-	if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
+	rec = NULL;
+	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
+		rec = &el->l_recs[index + 1];
+	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
+		 path->p_tree_depth > 0) {
+		status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
+							path, &right_cpos);
+		if (status)
+			goto out;
+
+		if (right_cpos == 0)
+			goto out;
+
+		right_path = ocfs2_new_path(path_root_bh(path),
+					    path_root_el(path));
+		if (!right_path)
+			goto out;
+
+		status = ocfs2_find_path(inode, right_path, right_cpos);
+		if (status)
+			goto out;
+
+		new_el = path_leaf_el(right_path);
+		rec = &new_el->l_recs[0];
+		if (ocfs2_is_empty_extent(rec)) {
+			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
+				bh = path_leaf_bh(right_path);
+				eb = (struct ocfs2_extent_block *)bh->b_data;
+				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+								 eb);
+				goto out;
+			}
+			rec = &new_el->l_recs[1];
+		}
+	}
+
+	if (rec) {
 		enum ocfs2_contig_type contig_type;
 
-		rec = &el->l_recs[index + 1];
 		contig_type = ocfs2_extent_contig(inode, rec, split_rec);
 
 		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
@@ -3816,6 +3888,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
 			ret = contig_type;
 	}
 
+out:
+	if (left_path)
+		ocfs2_free_path(left_path);
+	if (right_path)
+		ocfs2_free_path(right_path);
+
 	return ret;
 }
 
@@ -4278,7 +4356,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 		goto out;
 	}
 
-	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
+	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
 							    split_index,
 							    split_rec);
 
-- 
cgit v1.2.3


From ffda89a3bf3b968bdc268584c6bc1da5c173cf12 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 3 Mar 2008 17:12:09 +0800
Subject: ocfs2: Add a new parameter for ocfs2_reserve_suballoc_bits

In some cases(Inode stealing from other nodes), we may not want
ocfs2_reserve_suballoc_bits to allocate new groups from the
global_bitmap since it may already be full. So add a new parameter
for this.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/suballoc.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 72c198a004df..3be4e73e8b13 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -46,6 +46,9 @@
 
 #include "buffer_head_io.h"
 
+#define NOT_ALLOC_NEW_GROUP		0
+#define ALLOC_NEW_GROUP			1
+
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -391,7 +394,8 @@ bail:
 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 				       struct ocfs2_alloc_context *ac,
 				       int type,
-				       u32 slot)
+				       u32 slot,
+				       int alloc_new_group)
 {
 	int status;
 	u32 bits_wanted = ac->ac_bits_wanted;
@@ -446,6 +450,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 			goto bail;
 		}
 
+		if (alloc_new_group != ALLOC_NEW_GROUP) {
+			mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
+			     "and we don't alloc a new group for it.\n",
+			     slot, bits_wanted, free_bits);
+			status = -ENOSPC;
+			goto bail;
+		}
+
 		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
 		if (status < 0) {
 			if (status != -ENOSPC)
@@ -490,7 +502,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 	(*ac)->ac_group_search = ocfs2_block_group_search;
 
 	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
-					     EXTENT_ALLOC_SYSTEM_INODE, slot);
+					     EXTENT_ALLOC_SYSTEM_INODE,
+					     slot, ALLOC_NEW_GROUP);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -527,7 +540,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 
 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
 					     INODE_ALLOC_SYSTEM_INODE,
-					     osb->slot_num);
+					     osb->slot_num, ALLOC_NEW_GROUP);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -557,7 +570,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 
 	status = ocfs2_reserve_suballoc_bits(osb, ac,
 					     GLOBAL_BITMAP_SYSTEM_INODE,
-					     OCFS2_INVALID_SLOT);
+					     OCFS2_INVALID_SLOT,
+					     ALLOC_NEW_GROUP);
 	if (status < 0 && status != -ENOSPC) {
 		mlog_errno(status);
 		goto bail;
-- 
cgit v1.2.3


From a4a4891164d4f6f383cc17e7c90828a7ca6a1146 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 3 Mar 2008 17:12:30 +0800
Subject: ocfs2: Add ac_alloc_slot in ocfs2_alloc_context

In inode stealing, we no longer restrict the allocation to
happen in the local node. So it is neccessary for us to add
a new member in ocfs2_alloc_context to indicate which slot
we are using for allocation. We also modify the process of
local alloc so that this member can be used there also.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/localalloc.c | 2 ++
 fs/ocfs2/suballoc.c   | 1 +
 fs/ocfs2/suballoc.h   | 1 +
 3 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ab83fd562429..b6d07198118c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -523,6 +523,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 	}
 
 	ac->ac_inode = local_alloc_inode;
+	/* We should never use localalloc from another slot */
+	ac->ac_alloc_slot = osb->slot_num;
 	ac->ac_which = OCFS2_AC_USE_LOCAL;
 	get_bh(osb->local_alloc_bh);
 	ac->ac_bh = osb->local_alloc_bh;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 3be4e73e8b13..33d55734c514 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -424,6 +424,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	}
 
 	ac->ac_inode = alloc_inode;
+	ac->ac_alloc_slot = slot;
 
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8799033bb459..544c600662bd 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *,
 struct ocfs2_alloc_context {
 	struct inode *ac_inode;    /* which bitmap are we allocating from? */
 	struct buffer_head *ac_bh; /* file entry bh */
+	u32    ac_alloc_slot;   /* which slot are we allocating from? */
 	u32    ac_bits_wanted;
 	u32    ac_bits_given;
 #define OCFS2_AC_USE_LOCAL 1
-- 
cgit v1.2.3


From 4d0ddb2ce25db2254d468233d942276ecf40bff8 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 5 Mar 2008 16:11:46 +0800
Subject: ocfs2: Add inode stealing for ocfs2_reserve_new_inode

Inode allocation is modified to look in other nodes allocators during
extreme out of space situations. We retry our own slot when space is freed
back to the global bitmap, or whenever we've allocated more than 1024 inodes
from another slot.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c      |  2 ++
 fs/ocfs2/localalloc.c |  2 ++
 fs/ocfs2/namei.c      |  2 +-
 fs/ocfs2/ocfs2.h      | 34 ++++++++++++++++++++--
 fs/ocfs2/suballoc.c   | 80 +++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/super.c      |  1 +
 6 files changed, 116 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 7d81aa6f5672..a26882144e8f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5150,6 +5150,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
 	status = ocfs2_flush_truncate_log(osb);
 	if (status < 0)
 		mlog_errno(status);
+	else
+		ocfs2_init_inode_steal_slot(osb);
 
 	mlog_exit(status);
 }
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index b6d07198118c..ce0dc147602a 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -447,6 +447,8 @@ out_mutex:
 	iput(main_bm_inode);
 
 out:
+	if (!status)
+		ocfs2_init_inode_steal_slot(osb);
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ae9ad9587516..ab5a2272d0eb 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
 	fe->i_blkno = cpu_to_le64(fe_blkno);
 	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
-	fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
+	fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
 	fe->i_uid = cpu_to_le32(current->fsuid);
 	if (dir->i_mode & S_ISGID) {
 		fe->i_gid = cpu_to_le32(dir->i_gid);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 9ff5811345a9..31692379c170 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -208,11 +208,14 @@ struct ocfs2_super
 	u32 s_feature_incompat;
 	u32 s_feature_ro_compat;
 
-	/* Protects s_next_generaion, osb_flags. Could protect more on
-	 * osb as it's very short lived. */
+	/* Protects s_next_generation, osb_flags and s_inode_steal_slot.
+	 * Could protect more on osb as it's very short lived.
+	 */
 	spinlock_t osb_lock;
 	u32 s_next_generation;
 	unsigned long osb_flags;
+	s16 s_inode_steal_slot;
+	atomic_t s_num_inodes_stolen;
 
 	unsigned long s_mount_opt;
 	unsigned int s_atime_quantum;
@@ -537,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
 	return pages_per_cluster;
 }
 
+static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->osb_lock);
+	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&osb->osb_lock);
+	atomic_set(&osb->s_num_inodes_stolen, 0);
+}
+
+static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
+					      s16 slot)
+{
+	spin_lock(&osb->osb_lock);
+	osb->s_inode_steal_slot = slot;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+{
+	s16 slot;
+
+	spin_lock(&osb->osb_lock);
+	slot = osb->s_inode_steal_slot;
+	spin_unlock(&osb->osb_lock);
+
+	return slot;
+}
+
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 33d55734c514..d2d278fb9819 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -49,6 +49,8 @@
 #define NOT_ALLOC_NEW_GROUP		0
 #define ALLOC_NEW_GROUP			1
 
+#define OCFS2_MAX_INODES_TO_STEAL	1024
+
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -109,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 						u64 *bg_blkno,
 						u16 *bg_bit_off);
 
-void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 {
 	struct inode *inode = ac->ac_inode;
 
@@ -120,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
 		mutex_unlock(&inode->i_mutex);
 
 		iput(inode);
+		ac->ac_inode = NULL;
 	}
-	if (ac->ac_bh)
+	if (ac->ac_bh) {
 		brelse(ac->ac_bh);
+		ac->ac_bh = NULL;
+	}
+}
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+{
+	ocfs2_free_ac_resource(ac);
 	kfree(ac);
 }
 
@@ -522,10 +532,42 @@ bail:
 	return status;
 }
 
+static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
+					      struct ocfs2_alloc_context *ac)
+{
+	int i, status = -ENOSPC;
+	s16 slot = ocfs2_get_inode_steal_slot(osb);
+
+	/* Start to steal inodes from the first slot after ours. */
+	if (slot == OCFS2_INVALID_SLOT)
+		slot = osb->slot_num + 1;
+
+	for (i = 0; i < osb->max_slots; i++, slot++) {
+		if (slot == osb->max_slots)
+			slot = 0;
+
+		if (slot == osb->slot_num)
+			continue;
+
+		status = ocfs2_reserve_suballoc_bits(osb, ac,
+						     INODE_ALLOC_SYSTEM_INODE,
+						     slot, NOT_ALLOC_NEW_GROUP);
+		if (status >= 0) {
+			ocfs2_set_inode_steal_slot(osb, slot);
+			break;
+		}
+
+		ocfs2_free_ac_resource(ac);
+	}
+
+	return status;
+}
+
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 			    struct ocfs2_alloc_context **ac)
 {
 	int status;
+	s16 slot = ocfs2_get_inode_steal_slot(osb);
 
 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
@@ -539,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 
 	(*ac)->ac_group_search = ocfs2_block_group_search;
 
+	/*
+	 * slot is set when we successfully steal inode from other nodes.
+	 * It is reset in 3 places:
+	 * 1. when we flush the truncate log
+	 * 2. when we complete local alloc recovery.
+	 * 3. when we successfully allocate from our own slot.
+	 * After it is set, we will go on stealing inodes until we find the
+	 * need to check our slots to see whether there is some space for us.
+	 */
+	if (slot != OCFS2_INVALID_SLOT &&
+	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
+		goto inode_steal;
+
+	atomic_set(&osb->s_num_inodes_stolen, 0);
 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
 					     INODE_ALLOC_SYSTEM_INODE,
 					     osb->slot_num, ALLOC_NEW_GROUP);
+	if (status >= 0) {
+		status = 0;
+
+		/*
+		 * Some inodes must be freed by us, so try to allocate
+		 * from our own next time.
+		 */
+		if (slot != OCFS2_INVALID_SLOT)
+			ocfs2_init_inode_steal_slot(osb);
+		goto bail;
+	} else if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_free_ac_resource(*ac);
+
+inode_steal:
+	status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
+	atomic_inc(&osb->s_num_inodes_stolen);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 96ebe36d5d77..df63ba20ae90 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1394,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	INIT_LIST_HEAD(&osb->blocked_lock_list);
 	osb->blocked_lock_count = 0;
 	spin_lock_init(&osb->osb_lock);
+	ocfs2_init_inode_steal_slot(osb);
 
 	atomic_set(&osb->alloc_stats.moves, 0);
 	atomic_set(&osb->alloc_stats.local_data, 0);
-- 
cgit v1.2.3


From 58dadcdbc2584db050969f9781727fc5a3f618db Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Fri, 28 Mar 2008 14:43:10 -0700
Subject: fs/ocfs2/aops.c: test for IS_ERR rather than 0

The function ocfs2_start_trans always returns either a valid pointer or a
value made with ERR_PTR, so its result should be tested with IS_ERR, not
with a test for 0.

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/aops.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 90383ed61005..17964c0505a9 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 unsigned to)
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	handle_t *handle = NULL;
+	handle_t *handle;
 	int ret = 0;
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (!handle) {
+	if (IS_ERR(handle)) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 	}
 out:
 	if (ret) {
-		if (handle)
+		if (!IS_ERR(handle))
 			ocfs2_commit_trans(osb, handle);
 		handle = ERR_PTR(ret);
 	}
-- 
cgit v1.2.3


From 5dabd69515765156605b09261abf969236a77803 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 21 Feb 2008 18:00:00 +0100
Subject: ocfs2: Improve rename locking

ocfs2_rename() was being too aggressive with the rename lock - we only need
it for certain forms of directory rename.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ab5a2272d0eb..d5d808fe0140 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	 *
 	 * And that's why, just like the VFS, we need a file system
 	 * rename lock. */
-	if (old_dentry != new_dentry) {
+	if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) {
 		status = ocfs2_rename_lock(osb);
 		if (status < 0) {
 			mlog_errno(status);
-- 
cgit v1.2.3


From c9ec14884d69a303eef4faae42bd3c4e25b19941 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sun, 27 Jan 2008 03:17:17 +0100
Subject: ocfs2: Convert ocfs2 over to unlocked_ioctl

As far as I can see there is nothing in ocfs2_ioctl that requires the BKL,
so use unlocked_ioctl

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/file.c  |  4 ++--
 fs/ocfs2/ioctl.c | 12 +++---------
 fs/ocfs2/ioctl.h |  3 +--
 3 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed5d5232e85d..9154c82d3258 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = {
 	.open		= ocfs2_file_open,
 	.aio_read	= ocfs2_file_aio_read,
 	.aio_write	= ocfs2_file_aio_write,
-	.ioctl		= ocfs2_ioctl,
+	.unlocked_ioctl	= ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
@@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = {
 	.fsync		= ocfs2_sync_file,
 	.release	= ocfs2_dir_release,
 	.open		= ocfs2_dir_open,
-	.ioctl		= ocfs2_ioctl,
+	.unlocked_ioctl	= ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index ab1c2167d7f4..b413166dd163 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -113,9 +113,9 @@ bail:
 	return status;
 }
 
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
-	unsigned int cmd, unsigned long arg)
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	unsigned int flags;
 	int new_clusters;
 	int status;
@@ -169,9 +169,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 #ifdef CONFIG_COMPAT
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-
 	switch (cmd) {
 	case OCFS2_IOC32_GETFLAGS:
 		cmd = OCFS2_IOC_GETFLAGS;
@@ -191,9 +188,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		return -ENOIOCTLCMD;
 	}
 
-	lock_kernel();
-	ret = ocfs2_ioctl(inode, file, cmd, arg);
-	unlock_kernel();
-	return ret;
+	return ocfs2_ioctl(file, cmd, arg);
 }
 #endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4d6c4f430d0d..cf9a5ee30fef 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -10,8 +10,7 @@
 #ifndef OCFS2_IOCTL_H
 #define OCFS2_IOCTL_H
 
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
-	unsigned int cmd, unsigned long arg);
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
 #endif /* OCFS2_IOCTL_H */
-- 
cgit v1.2.3


From b1f3550fa1471b691ad6c2f35b5b22e93eaa5855 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Tue, 4 Mar 2008 15:21:05 -0800
Subject: ocfs2: Use BUG_ON

if (...) BUG(); should be replaced with BUG_ON(...) when the test has no
side-effects to allow a definition of BUG_ON that drops the code completely.

The semantic patch that makes this change is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@ disable unlikely @ expression E,f; @@

(
  if (<... f(...) ...>) { BUG(); }
|
- if (unlikely(E)) { BUG(); }
+ BUG_ON(E);
)

@@ expression E,f; @@

(
  if (<... f(...) ...>) { BUG(); }
|
- if (E) { BUG(); }
+ BUG_ON(E);
)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c   | 3 +--
 fs/ocfs2/journal.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a26882144e8f..41f84c92094f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
 	BUG_ON(!next_free);
 
 	/* The tree code before us didn't allow enough room in the leaf. */
-	if (el->l_next_free_rec == el->l_count && !has_empty)
-		BUG();
+	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
 
 	/*
 	 * The easiest way to approach this is to just remove the
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index bffd2d714f67..9698338adc39 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -717,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
 
 	mlog_entry_void();
 
-	if (!journal)
-		BUG();
+	BUG_ON(!journal);
 
 	osb = journal->j_osb;
 
-- 
cgit v1.2.3


From 409753bf6da4a2db038027471abaf324e063db2f Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Fri, 28 Mar 2008 16:44:13 -0700
Subject: ocfs2/cluster: Get rid of arguments to the timeout routines

We keep seeing bug reports related to NULL pointer derefs in
o2net_set_nn_state(). When I originally wrote up the configurable timeout
patch, I had tried to plan for multiple clusters. This was silly.

The timeout routines all use o2nm_single_cluster so there's no point in
passing an argument at all. This patch removes the arguments and kills those
bugs dead.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/cluster/tcp.c | 47 ++++++++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 4ea4b0a26975..1170918a9311 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -142,23 +142,17 @@ static void o2net_idle_timer(unsigned long data);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
 
-/*
- * FIXME: These should use to_o2nm_cluster_from_node(), but we end up
- * losing our parent link to the cluster during shutdown. This can be
- * solved by adding a pre-removal callback to configfs, or passing
- * around the cluster with the node. -jeffm
- */
-static inline int o2net_reconnect_delay(struct o2nm_node *node)
+static inline int o2net_reconnect_delay(void)
 {
 	return o2nm_single_cluster->cl_reconnect_delay_ms;
 }
 
-static inline int o2net_keepalive_delay(struct o2nm_node *node)
+static inline int o2net_keepalive_delay(void)
 {
 	return o2nm_single_cluster->cl_keepalive_delay_ms;
 }
 
-static inline int o2net_idle_timeout(struct o2nm_node *node)
+static inline int o2net_idle_timeout(void)
 {
 	return o2nm_single_cluster->cl_idle_timeout_ms;
 }
@@ -444,9 +438,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 		/* delay if we're withing a RECONNECT_DELAY of the
 		 * last attempt */
 		delay = (nn->nn_last_connect_attempt +
-			 msecs_to_jiffies(o2net_reconnect_delay(NULL)))
+			 msecs_to_jiffies(o2net_reconnect_delay()))
 			- jiffies;
-		if (delay > msecs_to_jiffies(o2net_reconnect_delay(NULL)))
+		if (delay > msecs_to_jiffies(o2net_reconnect_delay()))
 			delay = 0;
 		mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
 		queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
@@ -460,7 +454,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 		 * the connect_expired work will do anything.  The rest will see
 		 * that it's already queued and do nothing.
 		 */
-		delay += msecs_to_jiffies(o2net_idle_timeout(NULL));
+		delay += msecs_to_jiffies(o2net_idle_timeout());
 		queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
 	}
 
@@ -1159,23 +1153,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
 	 * but isn't. This can ultimately cause corruption.
 	 */
 	if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
-				o2net_idle_timeout(sc->sc_node)) {
+				o2net_idle_timeout()) {
 		mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
 		     "%u ms, but we use %u ms locally.  disconnecting\n",
 		     SC_NODEF_ARGS(sc),
 		     be32_to_cpu(hand->o2net_idle_timeout_ms),
-		     o2net_idle_timeout(sc->sc_node));
+		     o2net_idle_timeout());
 		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
 		return -1;
 	}
 
 	if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
-			o2net_keepalive_delay(sc->sc_node)) {
+			o2net_keepalive_delay()) {
 		mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
 		     "%u ms, but we use %u ms locally.  disconnecting\n",
 		     SC_NODEF_ARGS(sc),
 		     be32_to_cpu(hand->o2net_keepalive_delay_ms),
-		     o2net_keepalive_delay(sc->sc_node));
+		     o2net_keepalive_delay());
 		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
 		return -1;
 	}
@@ -1353,12 +1347,11 @@ static void o2net_initialize_handshake(void)
 {
 	o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
 		O2HB_MAX_WRITE_TIMEOUT_MS);
-	o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(
-		o2net_idle_timeout(NULL));
+	o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout());
 	o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
-		o2net_keepalive_delay(NULL));
+		o2net_keepalive_delay());
 	o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
-		o2net_reconnect_delay(NULL));
+		o2net_reconnect_delay());
 }
 
 /* ------------------------------------------------------------ */
@@ -1404,8 +1397,8 @@ static void o2net_idle_timer(unsigned long data)
 
 	printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
 	     "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
-		     o2net_idle_timeout(sc->sc_node) / 1000,
-		     o2net_idle_timeout(sc->sc_node) % 1000);
+		     o2net_idle_timeout() / 1000,
+		     o2net_idle_timeout() % 1000);
 	mlog(ML_NOTICE, "here are some times that might help debug the "
 	     "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
 	     "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
@@ -1433,10 +1426,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
 {
 	o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
 	o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
-		      msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node)));
+		      msecs_to_jiffies(o2net_keepalive_delay()));
 	do_gettimeofday(&sc->sc_tv_timer);
 	mod_timer(&sc->sc_idle_timeout,
-	       jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node)));
+	       jiffies + msecs_to_jiffies(o2net_idle_timeout()));
 }
 
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
@@ -1578,8 +1571,8 @@ static void o2net_connect_expired(struct work_struct *work)
 		mlog(ML_ERROR, "no connection established with node %u after "
 		     "%u.%u seconds, giving up and returning errors.\n",
 		     o2net_num_from_nn(nn),
-		     o2net_idle_timeout(NULL) / 1000,
-		     o2net_idle_timeout(NULL) % 1000);
+		     o2net_idle_timeout() / 1000,
+		     o2net_idle_timeout() % 1000);
 
 		o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
 	}
@@ -1634,7 +1627,7 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 
 	/* ensure an immediate connect attempt */
 	nn->nn_last_connect_attempt = jiffies -
-		(msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
+		(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
 
 	if (node_num != o2nm_this_node()) {
 		/* believe it or not, accept and node hearbeating testing
-- 
cgit v1.2.3


From 93b06edb5127315473d87e075b2b1d1acf74659c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Fri, 4 Apr 2008 12:45:55 -0700
Subject: ocfs2: Only build ocfs2/dlm with the o2cb stack module

fs/ocfs2/dlm/ocfs2_dlm.ko and fs/ocfs2/dlm/ocfs2_dlmfs.ko get built if
CONFIG_FS_OCFS2 is specified. This isn't quite how it should happen any more
- the "o2cb" dlm modules should only be built if CONFIG_FS_OCFS2_O2CB is
set, so update the dlm Makefile accordingly.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index ce3f7c29d270..190361375700 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,6 @@
 EXTRA_CFLAGS += -Ifs/ocfs2
 
-obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
 	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
-- 
cgit v1.2.3


From 2309e9e040fe29469fb85a384636c455b62fe525 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 14 Apr 2008 10:46:19 -0700
Subject: ocfs2/net: Add debug interface to o2net

This patch exposes o2net information via debugfs. The information includes
the list of sockets (sock_containers) as well as the list of outstanding
messages (send_tracking). Useful for o2dlm debugging.

(This patch is derived from an earlier one written by Zach Brown that
exposed the same information via /proc.)

[Mark: checkpatch fixes]

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/cluster/Makefile       |   2 +-
 fs/ocfs2/cluster/netdebug.c     | 441 ++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/cluster/nodemanager.c  |   5 +-
 fs/ocfs2/cluster/tcp.c          |  68 +++++++
 fs/ocfs2/cluster/tcp.h          |  32 +++
 fs/ocfs2/cluster/tcp_internal.h |  24 ++-
 6 files changed, 569 insertions(+), 3 deletions(-)
 create mode 100644 fs/ocfs2/cluster/netdebug.c

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index cdd162f13650..bc8c5e7d8608 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
 
 ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
-	quorum.o tcp.o ver.o
+	quorum.o tcp.o netdebug.o ver.o
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
new file mode 100644
index 000000000000..7bf3c0ea7bd9
--- /dev/null
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -0,0 +1,441 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * netdebug.c
+ *
+ * debug functionality for o2net
+ *
+ * Copyright (C) 2005, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+
+#include <linux/uaccess.h>
+
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+
+#include "tcp_internal.h"
+
+#define O2NET_DEBUG_DIR		"o2net"
+#define SC_DEBUG_NAME		"sock_containers"
+#define NST_DEBUG_NAME		"send_tracking"
+
+static struct dentry *o2net_dentry;
+static struct dentry *sc_dentry;
+static struct dentry *nst_dentry;
+
+static DEFINE_SPINLOCK(o2net_debug_lock);
+
+static LIST_HEAD(sock_containers);
+static LIST_HEAD(send_tracking);
+
+void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+	spin_lock(&o2net_debug_lock);
+	list_add(&nst->st_net_debug_item, &send_tracking);
+	spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+	spin_lock(&o2net_debug_lock);
+	if (!list_empty(&nst->st_net_debug_item))
+		list_del_init(&nst->st_net_debug_item);
+	spin_unlock(&o2net_debug_lock);
+}
+
+static struct o2net_send_tracking
+			*next_nst(struct o2net_send_tracking *nst_start)
+{
+	struct o2net_send_tracking *nst, *ret = NULL;
+
+	assert_spin_locked(&o2net_debug_lock);
+
+	list_for_each_entry(nst, &nst_start->st_net_debug_item,
+			    st_net_debug_item) {
+		/* discover the head of the list */
+		if (&nst->st_net_debug_item == &send_tracking)
+			break;
+
+		/* use st_task to detect real nsts in the list */
+		if (nst->st_task != NULL) {
+			ret = nst;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+	spin_lock(&o2net_debug_lock);
+	nst = next_nst(dummy_nst);
+	spin_unlock(&o2net_debug_lock);
+
+	return nst;
+}
+
+static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+	spin_lock(&o2net_debug_lock);
+	nst = next_nst(dummy_nst);
+	list_del_init(&dummy_nst->st_net_debug_item);
+	if (nst)
+		list_add(&dummy_nst->st_net_debug_item,
+			 &nst->st_net_debug_item);
+	spin_unlock(&o2net_debug_lock);
+
+	return nst; /* unused, just needs to be null when done */
+}
+
+static int nst_seq_show(struct seq_file *seq, void *v)
+{
+	struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+	spin_lock(&o2net_debug_lock);
+	nst = next_nst(dummy_nst);
+
+	if (nst != NULL) {
+		/* get_task_comm isn't exported.  oh well. */
+		seq_printf(seq, "%p:\n"
+			   "  pid:          %lu\n"
+			   "  tgid:         %lu\n"
+			   "  process name: %s\n"
+			   "  node:         %u\n"
+			   "  sc:           %p\n"
+			   "  message id:   %d\n"
+			   "  message type: %u\n"
+			   "  message key:  0x%08x\n"
+			   "  sock acquiry: %lu.%lu\n"
+			   "  send start:   %lu.%lu\n"
+			   "  wait start:   %lu.%lu\n",
+			   nst, (unsigned long)nst->st_task->pid,
+			   (unsigned long)nst->st_task->tgid,
+			   nst->st_task->comm, nst->st_node,
+			   nst->st_sc, nst->st_id, nst->st_msg_type,
+			   nst->st_msg_key,
+			   nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
+			   nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
+			   nst->st_status_time.tv_sec,
+			   nst->st_status_time.tv_usec);
+	}
+
+	spin_unlock(&o2net_debug_lock);
+
+	return 0;
+}
+
+static void nst_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations nst_seq_ops = {
+	.start = nst_seq_start,
+	.next = nst_seq_next,
+	.stop = nst_seq_stop,
+	.show = nst_seq_show,
+};
+
+static int nst_fop_open(struct inode *inode, struct file *file)
+{
+	struct o2net_send_tracking *dummy_nst;
+	struct seq_file *seq;
+	int ret;
+
+	dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
+	if (dummy_nst == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	dummy_nst->st_task = NULL;
+
+	ret = seq_open(file, &nst_seq_ops);
+	if (ret)
+		goto out;
+
+	seq = file->private_data;
+	seq->private = dummy_nst;
+	o2net_debug_add_nst(dummy_nst);
+
+	dummy_nst = NULL;
+
+out:
+	kfree(dummy_nst);
+	return ret;
+}
+
+static int nst_fop_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct o2net_send_tracking *dummy_nst = seq->private;
+
+	o2net_debug_del_nst(dummy_nst);
+	return seq_release_private(inode, file);
+}
+
+static struct file_operations nst_seq_fops = {
+	.open = nst_fop_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = nst_fop_release,
+};
+
+void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+	spin_lock(&o2net_debug_lock);
+	list_add(&sc->sc_net_debug_item, &sock_containers);
+	spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+	spin_lock(&o2net_debug_lock);
+	list_del_init(&sc->sc_net_debug_item);
+	spin_unlock(&o2net_debug_lock);
+}
+
+static struct o2net_sock_container
+			*next_sc(struct o2net_sock_container *sc_start)
+{
+	struct o2net_sock_container *sc, *ret = NULL;
+
+	assert_spin_locked(&o2net_debug_lock);
+
+	list_for_each_entry(sc, &sc_start->sc_net_debug_item,
+			    sc_net_debug_item) {
+		/* discover the head of the list miscast as a sc */
+		if (&sc->sc_net_debug_item == &sock_containers)
+			break;
+
+		/* use sc_page to detect real scs in the list */
+		if (sc->sc_page != NULL) {
+			ret = sc;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct o2net_sock_container *sc, *dummy_sc = seq->private;
+
+	spin_lock(&o2net_debug_lock);
+	sc = next_sc(dummy_sc);
+	spin_unlock(&o2net_debug_lock);
+
+	return sc;
+}
+
+static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct o2net_sock_container *sc, *dummy_sc = seq->private;
+
+	spin_lock(&o2net_debug_lock);
+	sc = next_sc(dummy_sc);
+	list_del_init(&dummy_sc->sc_net_debug_item);
+	if (sc)
+		list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
+	spin_unlock(&o2net_debug_lock);
+
+	return sc; /* unused, just needs to be null when done */
+}
+
+#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec
+
+static int sc_seq_show(struct seq_file *seq, void *v)
+{
+	struct o2net_sock_container *sc, *dummy_sc = seq->private;
+
+	spin_lock(&o2net_debug_lock);
+	sc = next_sc(dummy_sc);
+
+	if (sc != NULL) {
+		struct inet_sock *inet = NULL;
+
+		__be32 saddr = 0, daddr = 0;
+		__be16 sport = 0, dport = 0;
+
+		if (sc->sc_sock) {
+			inet = inet_sk(sc->sc_sock->sk);
+			/* the stack's structs aren't sparse endian clean */
+			saddr = (__force __be32)inet->saddr;
+			daddr = (__force __be32)inet->daddr;
+			sport = (__force __be16)inet->sport;
+			dport = (__force __be16)inet->dport;
+		}
+
+		/* XXX sigh, inet-> doesn't have sparse annotation so any
+		 * use of it here generates a warning with -Wbitwise */
+		seq_printf(seq, "%p:\n"
+			   "  krefs:           %d\n"
+			   "  sock:            %u.%u.%u.%u:%u -> "
+					      "%u.%u.%u.%u:%u\n"
+			   "  remote node:     %s\n"
+			   "  page off:        %zu\n"
+			   "  handshake ok:    %u\n"
+			   "  timer:           %lu.%lu\n"
+			   "  data ready:      %lu.%lu\n"
+			   "  advance start:   %lu.%lu\n"
+			   "  advance stop:    %lu.%lu\n"
+			   "  func start:      %lu.%lu\n"
+			   "  func stop:       %lu.%lu\n"
+			   "  func key:        %u\n"
+			   "  func type:       %u\n",
+			   sc,
+			   atomic_read(&sc->sc_kref.refcount),
+			   NIPQUAD(saddr), inet ? ntohs(sport) : 0,
+			   NIPQUAD(daddr), inet ? ntohs(dport) : 0,
+			   sc->sc_node->nd_name,
+			   sc->sc_page_off,
+			   sc->sc_handshake_ok,
+			   TV_SEC_USEC(sc->sc_tv_timer),
+			   TV_SEC_USEC(sc->sc_tv_data_ready),
+			   TV_SEC_USEC(sc->sc_tv_advance_start),
+			   TV_SEC_USEC(sc->sc_tv_advance_stop),
+			   TV_SEC_USEC(sc->sc_tv_func_start),
+			   TV_SEC_USEC(sc->sc_tv_func_stop),
+			   sc->sc_msg_key,
+			   sc->sc_msg_type);
+	}
+
+
+	spin_unlock(&o2net_debug_lock);
+
+	return 0;
+}
+
+static void sc_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations sc_seq_ops = {
+	.start = sc_seq_start,
+	.next = sc_seq_next,
+	.stop = sc_seq_stop,
+	.show = sc_seq_show,
+};
+
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+	struct o2net_sock_container *dummy_sc;
+	struct seq_file *seq;
+	int ret;
+
+	dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
+	if (dummy_sc == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	dummy_sc->sc_page = NULL;
+
+	ret = seq_open(file, &sc_seq_ops);
+	if (ret)
+		goto out;
+
+	seq = file->private_data;
+	seq->private = dummy_sc;
+	o2net_debug_add_sc(dummy_sc);
+
+	dummy_sc = NULL;
+
+out:
+	kfree(dummy_sc);
+	return ret;
+}
+
+static int sc_fop_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct o2net_sock_container *dummy_sc = seq->private;
+
+	o2net_debug_del_sc(dummy_sc);
+	return seq_release_private(inode, file);
+}
+
+static struct file_operations sc_seq_fops = {
+	.open = sc_fop_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = sc_fop_release,
+};
+
+int o2net_debugfs_init(void)
+{
+	o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+	if (!o2net_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
+					 o2net_dentry, NULL,
+					 &nst_seq_fops);
+	if (!nst_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
+					o2net_dentry, NULL,
+					&sc_seq_fops);
+	if (!sc_dentry) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	return 0;
+bail:
+	if (sc_dentry)
+		debugfs_remove(sc_dentry);
+	if (nst_dentry)
+		debugfs_remove(nst_dentry);
+	if (o2net_dentry)
+		debugfs_remove(o2net_dentry);
+	return -ENOMEM;
+}
+
+void o2net_debugfs_exit(void)
+{
+	if (sc_dentry)
+		debugfs_remove(sc_dentry);
+	if (nst_dentry)
+		debugfs_remove(nst_dentry);
+	if (o2net_dentry)
+		debugfs_remove(o2net_dentry);
+}
+
+#endif	/* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 709fba25bf7e..cf9401e8cd0b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -959,7 +959,10 @@ static int __init init_o2nm(void)
 	cluster_print_version();
 
 	o2hb_init();
-	o2net_init();
+
+	ret = o2net_init();
+	if (ret)
+		goto out;
 
 	ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
 	if (!ocfs2_table_header) {
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1170918a9311..1e44ad14881a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -142,6 +142,54 @@ static void o2net_idle_timer(unsigned long data);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
 
+static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+			   u32 msgkey, struct task_struct *task, u8 node)
+{
+#ifdef CONFIG_DEBUG_FS
+	INIT_LIST_HEAD(&nst->st_net_debug_item);
+	nst->st_task = task;
+	nst->st_msg_type = msgtype;
+	nst->st_msg_key = msgkey;
+	nst->st_node = node;
+#endif
+}
+
+static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+#ifdef CONFIG_DEBUG_FS
+	do_gettimeofday(&nst->st_sock_time);
+#endif
+}
+
+static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+#ifdef CONFIG_DEBUG_FS
+	do_gettimeofday(&nst->st_send_time);
+#endif
+}
+
+static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+#ifdef CONFIG_DEBUG_FS
+	do_gettimeofday(&nst->st_status_time);
+#endif
+}
+
+static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+					 struct o2net_sock_container *sc)
+{
+#ifdef CONFIG_DEBUG_FS
+	nst->st_sc = sc;
+#endif
+}
+
+static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+{
+#ifdef CONFIG_DEBUG_FS
+	nst->st_id = msg_id;
+#endif
+}
+
 static inline int o2net_reconnect_delay(void)
 {
 	return o2nm_single_cluster->cl_reconnect_delay_ms;
@@ -290,6 +338,7 @@ static void sc_kref_release(struct kref *kref)
 	o2nm_node_put(sc->sc_node);
 	sc->sc_node = NULL;
 
+	o2net_debug_del_sc(sc);
 	kfree(sc);
 }
 
@@ -330,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
 
 	ret = sc;
 	sc->sc_page = page;
+	o2net_debug_add_sc(sc);
 	sc = NULL;
 	page = NULL;
 
@@ -913,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 	struct o2net_status_wait nsw = {
 		.ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
 	};
+	struct o2net_send_tracking nst;
+
+	o2net_init_nst(&nst, msg_type, key, current, target_node);
 
 	if (o2net_wq == NULL) {
 		mlog(0, "attempt to tx without o2netd running\n");
@@ -938,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 		goto out;
 	}
 
+	o2net_debug_add_nst(&nst);
+
+	o2net_set_nst_sock_time(&nst);
+
 	ret = wait_event_interruptible(nn->nn_sc_wq,
 				       o2net_tx_can_proceed(nn, &sc, &error));
 	if (!ret && error)
@@ -945,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 	if (ret)
 		goto out;
 
+	o2net_set_nst_sock_container(&nst, sc);
+
 	veclen = caller_veclen + 1;
 	vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
 	if (vec == NULL) {
@@ -971,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 		goto out;
 
 	msg->msg_num = cpu_to_be32(nsw.ns_id);
+	o2net_set_nst_msg_id(&nst, nsw.ns_id);
+
+	o2net_set_nst_send_time(&nst);
 
 	/* finally, convert the message header to network byte-order
 	 * and send */
@@ -985,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 	}
 
 	/* wait on other node's handler */
+	o2net_set_nst_status_time(&nst);
 	wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
 
 	/* Note that we avoid overwriting the callers status return
@@ -997,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 	mlog(0, "woken, returning system status %d, user status %d\n",
 	     ret, nsw.ns_status);
 out:
+	o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
 	if (sc)
 		sc_put(sc);
 	if (vec)
@@ -1935,6 +1999,9 @@ int o2net_init(void)
 
 	o2quo_init();
 
+	if (o2net_debugfs_init())
+		return -ENOMEM;
+
 	o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
 	o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
 	o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1976,4 +2043,5 @@ void o2net_exit(void)
 	kfree(o2net_hand);
 	kfree(o2net_keep_req);
 	kfree(o2net_keep_resp);
+	o2net_debugfs_exit();
 }
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index f36f66aab3dd..a705d5d19036 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -117,4 +117,36 @@ int o2net_num_connected_peers(void);
 int o2net_init(void);
 void o2net_exit(void);
 
+struct o2net_send_tracking;
+struct o2net_sock_container;
+
+#ifdef CONFIG_DEBUG_FS
+int o2net_debugfs_init(void);
+void o2net_debugfs_exit(void);
+void o2net_debug_add_nst(struct o2net_send_tracking *nst);
+void o2net_debug_del_nst(struct o2net_send_tracking *nst);
+void o2net_debug_add_sc(struct o2net_sock_container *sc);
+void o2net_debug_del_sc(struct o2net_sock_container *sc);
+#else
+static int o2net_debugfs_init(void)
+{
+	return 0;
+}
+static void o2net_debugfs_exit(void)
+{
+}
+static void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+}
+static void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+}
+static void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+}
+static void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+}
+#endif	/* CONFIG_DEBUG_FS */
+
 #endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index b4c5586f46ea..8d58cfe410b1 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -166,7 +166,9 @@ struct o2net_sock_container {
 	/* original handlers for the sockets */
 	void			(*sc_state_change)(struct sock *sk);
 	void			(*sc_data_ready)(struct sock *sk, int bytes);
-
+#ifdef CONFIG_DEBUG_FS
+	struct list_head        sc_net_debug_item;
+#endif
 	struct timeval 		sc_tv_timer;
 	struct timeval 		sc_tv_data_ready;
 	struct timeval 		sc_tv_advance_start;
@@ -208,4 +210,24 @@ struct o2net_status_wait {
 	struct list_head	ns_node_item;
 };
 
+#ifdef CONFIG_DEBUG_FS
+/* just for state dumps */
+struct o2net_send_tracking {
+	struct list_head		st_net_debug_item;
+	struct task_struct		*st_task;
+	struct o2net_sock_container	*st_sc;
+	u32				st_id;
+	u32				st_msg_type;
+	u32				st_msg_key;
+	u8				st_node;
+	struct timeval			st_sock_time;
+	struct timeval			st_send_time;
+	struct timeval			st_status_time;
+};
+#else
+struct o2net_send_tracking {
+	u32	dummy;
+};
+#endif	/* CONFIG_DEBUG_FS */
+
 #endif /* O2CLUSTER_TCP_INTERNAL_H */
-- 
cgit v1.2.3


From cb688371e27880d86c42323826846d1cd7caad8f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Tue, 26 Feb 2008 09:59:26 -0500
Subject: fs: Remove unnecessary inclusions of asm/semaphore.h

None of these files use any of the functionality promised by
asm/semaphore.h.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 fs/9p/fid.c           | 1 -
 fs/cramfs/inode.c     | 1 -
 fs/dlm/dlm_internal.h | 1 -
 fs/locks.c            | 1 -
 fs/reiserfs/xattr.c   | 1 -
 5 files changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index dfebdbe7440e..3031e3233dd6 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -26,7 +26,6 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
-#include <asm/semaphore.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 350680fd7da7..0c3b618c15b3 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -23,7 +23,6 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/mutex.h>
-#include <asm/semaphore.h>
 
 #include <asm/uaccess.h>
 
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d30ea8b433a2..7a8824f475f2 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -37,7 +37,6 @@
 #include <linux/jhash.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
-#include <asm/semaphore.h>
 #include <asm/uaccess.h>
 
 #include <linux/dlm.h>
diff --git a/fs/locks.c b/fs/locks.c
index 43c0af21a0c5..592faadbcec1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,7 +127,6 @@
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
 
-#include <asm/semaphore.h>
 #include <asm/uaccess.h>
 
 #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 344b9b96cc56..d7c4935c1034 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -44,7 +44,6 @@
 #include <net/checksum.h>
 #include <linux/smp_lock.h>
 #include <linux/stat.h>
-#include <asm/semaphore.h>
 
 #define FL_READONLY 128
 #define FL_DIR_SEM_HELD 256
-- 
cgit v1.2.3


From 6188e10d38b8d7244ee7776d5f1f88c837b4b93f Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Fri, 18 Apr 2008 22:21:05 -0400
Subject: Convert asm/semaphore.h users to linux/semaphore.h

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 Documentation/DocBook/kernel-locking.tmpl | 6 +++---
 arch/ia64/kernel/salinfo.c                | 2 +-
 drivers/base/core.c                       | 2 +-
 drivers/char/snsc.h                       | 2 +-
 drivers/firewire/fw-device.c              | 3 ++-
 drivers/i2c/i2c-core.c                    | 2 +-
 drivers/ieee1394/nodemgr.c                | 2 +-
 drivers/infiniband/core/user_mad.c        | 2 +-
 drivers/infiniband/hw/mthca/mthca_dev.h   | 3 +--
 drivers/input/serio/hp_sdc_mlc.c          | 2 +-
 drivers/macintosh/adb.c                   | 2 +-
 drivers/macintosh/windfarm_smu_sat.c      | 2 +-
 drivers/net/3c527.c                       | 2 +-
 drivers/net/hamradio/6pack.c              | 2 +-
 drivers/s390/cio/qdio.c                   | 2 +-
 drivers/scsi/aacraid/commctrl.c           | 2 +-
 drivers/scsi/aacraid/commsup.c            | 2 +-
 drivers/scsi/aacraid/dpcsup.c             | 2 +-
 drivers/scsi/megaraid/megaraid_ioctl.h    | 2 +-
 drivers/scsi/qla2xxx/qla_def.h            | 2 +-
 drivers/watchdog/sc1200wdt.c              | 2 +-
 fs/jffs2/jffs2_fs_i.h                     | 2 +-
 fs/jffs2/jffs2_fs_sb.h                    | 2 +-
 fs/reiserfs/journal.c                     | 2 +-
 fs/xfs/linux-2.6/sema.h                   | 2 +-
 include/linux/device.h                    | 2 +-
 include/linux/fs.h                        | 2 +-
 include/linux/hil_mlc.h                   | 2 +-
 include/linux/i2o.h                       | 2 +-
 include/linux/memory.h                    | 3 +--
 include/linux/parport.h                   | 2 +-
 lib/kernel_lock.c                         | 2 +-
 32 files changed, 35 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
index 2e9d6b41f034..435413ca40dc 100644
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -241,7 +241,7 @@
    </para>
    <para>
      The third type is a semaphore
-     (<filename class="headerfile">include/asm/semaphore.h</filename>): it
+     (<filename class="headerfile">include/linux/semaphore.h</filename>): it
      can have more than one holder at any time (the number decided at
      initialization time), although it is most commonly used as a
      single-holder lock (a mutex).  If you can't get a semaphore, your
@@ -290,7 +290,7 @@
      <para>
        If you have a data structure which is only ever accessed from
        user context, then you can use a simple semaphore
-       (<filename>linux/asm/semaphore.h</filename>) to protect it.  This 
+       (<filename>linux/linux/semaphore.h</filename>) to protect it.  This
        is the most trivial case: you initialize the semaphore to the number 
        of resources available (usually 1), and call
        <function>down_interruptible()</function> to grab the semaphore, and 
@@ -1656,7 +1656,7 @@ the amount of locking which needs to be done.
  #include &lt;linux/slab.h&gt;
  #include &lt;linux/string.h&gt;
 +#include &lt;linux/rcupdate.h&gt;
- #include &lt;asm/semaphore.h&gt;
+ #include &lt;linux/semaphore.h&gt;
  #include &lt;asm/errno.h&gt;
 
  struct object
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index 779c3cca206c..b11bb50a197a 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -44,8 +44,8 @@
 #include <linux/smp.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
+#include <linux/semaphore.h>
 
-#include <asm/semaphore.h>
 #include <asm/sal.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 24198ad01976..7c4b36ccb1a0 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -20,7 +20,7 @@
 #include <linux/notifier.h>
 #include <linux/genhd.h>
 #include <linux/kallsyms.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 #include "base.h"
 #include "power/power.h"
diff --git a/drivers/char/snsc.h b/drivers/char/snsc.h
index 8a98169b60c1..4be62eda9fbc 100644
--- a/drivers/char/snsc.h
+++ b/drivers/char/snsc.h
@@ -22,8 +22,8 @@
 #include <linux/kobject.h>
 #include <linux/fs.h>
 #include <linux/cdev.h>
+#include <linux/semaphore.h>
 #include <asm/sn/types.h>
-#include <asm/semaphore.h>
 
 #define CHUNKSIZE 127
 
diff --git a/drivers/firewire/fw-device.c b/drivers/firewire/fw-device.c
index 2d01bc1b9752..d9c8daf7ae7d 100644
--- a/drivers/firewire/fw-device.c
+++ b/drivers/firewire/fw-device.c
@@ -26,7 +26,8 @@
 #include <linux/delay.h>
 #include <linux/idr.h>
 #include <linux/string.h>
-#include <asm/semaphore.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
 #include <asm/system.h>
 #include <linux/ctype.h>
 #include "fw-transaction.h"
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 8b645c6b2cb5..e186df657119 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -35,8 +35,8 @@
 #include <linux/completion.h>
 #include <linux/hardirq.h>
 #include <linux/irqflags.h>
+#include <linux/semaphore.h>
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
 
 #include "i2c-core.h"
 
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 70afa3786f3f..29d833e71cbf 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -18,8 +18,8 @@
 #include <linux/moduleparam.h>
 #include <linux/mutex.h>
 #include <linux/freezer.h>
+#include <linux/semaphore.h>
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
 
 #include "csr.h"
 #include "highlevel.h"
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 4e915104ac4c..be953e87bf93 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -46,9 +46,9 @@
 #include <linux/mutex.h>
 #include <linux/kref.h>
 #include <linux/compat.h>
+#include <linux/semaphore.h>
 
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
 
 #include <rdma/ib_mad.h>
 #include <rdma/ib_user_mad.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 0e842e023400..7bc32f8e377e 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -46,8 +46,7 @@
 #include <linux/timer.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 #include "mthca_provider.h"
 #include "mthca_doorbell.h"
diff --git a/drivers/input/serio/hp_sdc_mlc.c b/drivers/input/serio/hp_sdc_mlc.c
index c45ea74d53e4..f1fd3b638a37 100644
--- a/drivers/input/serio/hp_sdc_mlc.c
+++ b/drivers/input/serio/hp_sdc_mlc.c
@@ -40,7 +40,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/string.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 #define PREFIX "HP SDC MLC: "
 
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index 28958101061f..20978205cd02 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -37,9 +37,9 @@
 #include <linux/device.h>
 #include <linux/kthread.h>
 #include <linux/platform_device.h>
+#include <linux/semaphore.h>
 
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
 #ifdef CONFIG_PPC
 #include <asm/prom.h>
 #include <asm/machdep.h>
diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c
index f449d775cdf4..797918d0e59c 100644
--- a/drivers/macintosh/windfarm_smu_sat.c
+++ b/drivers/macintosh/windfarm_smu_sat.c
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/wait.h>
 #include <linux/i2c.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 #include <asm/prom.h>
 #include <asm/smu.h>
 #include <asm/pmac_low_i2c.h>
diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c
index b72b89d53ec8..fae295b6809c 100644
--- a/drivers/net/3c527.c
+++ b/drivers/net/3c527.c
@@ -103,8 +103,8 @@ DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " Richard Procter <rnp@paradise.net.
 #include <linux/ethtool.h>
 #include <linux/completion.h>
 #include <linux/bitops.h>
+#include <linux/semaphore.h>
 
-#include <asm/semaphore.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <asm/io.h>
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 0a9b75139e0f..1da55dd2a5a0 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -33,7 +33,7 @@
 #include <linux/init.h>
 #include <linux/ip.h>
 #include <linux/tcp.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 #include <asm/atomic.h>
 
 #define SIXPACK_VERSION    "Revision: 0.3.0"
diff --git a/drivers/s390/cio/qdio.c b/drivers/s390/cio/qdio.c
index c359386708e9..10aa1e780801 100644
--- a/drivers/s390/cio/qdio.c
+++ b/drivers/s390/cio/qdio.c
@@ -38,11 +38,11 @@
 #include <linux/proc_fs.h>
 #include <linux/timer.h>
 #include <linux/mempool.h>
+#include <linux/semaphore.h>
 
 #include <asm/ccwdev.h>
 #include <asm/io.h>
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
 #include <asm/timex.h>
 
 #include <asm/debug.h>
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index abef05146d75..5fd83deab36c 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -39,7 +39,7 @@
 #include <linux/blkdev.h>
 #include <linux/delay.h> /* ssleep prototype */
 #include <linux/kthread.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 #include <asm/uaccess.h>
 
 #include "aacraid.h"
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index 23a8e9f8dcb4..ef67816a6fe5 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -41,11 +41,11 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/interrupt.h>
+#include <linux/semaphore.h>
 #include <scsi/scsi.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_cmnd.h>
-#include <asm/semaphore.h>
 
 #include "aacraid.h"
 
diff --git a/drivers/scsi/aacraid/dpcsup.c b/drivers/scsi/aacraid/dpcsup.c
index d1163ded132b..933f208eedba 100644
--- a/drivers/scsi/aacraid/dpcsup.c
+++ b/drivers/scsi/aacraid/dpcsup.c
@@ -36,7 +36,7 @@
 #include <linux/slab.h>
 #include <linux/completion.h>
 #include <linux/blkdev.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 #include "aacraid.h"
 
diff --git a/drivers/scsi/megaraid/megaraid_ioctl.h b/drivers/scsi/megaraid/megaraid_ioctl.h
index 706fa05a187a..05f6e4ec3453 100644
--- a/drivers/scsi/megaraid/megaraid_ioctl.h
+++ b/drivers/scsi/megaraid/megaraid_ioctl.h
@@ -18,7 +18,7 @@
 #define _MEGARAID_IOCTL_H_
 
 #include <linux/types.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 #include "mbox_defs.h"
 
diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index 094d95f0764c..299eccf6cabd 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -25,7 +25,7 @@
 #include <linux/firmware.h>
 #include <linux/aer.h>
 #include <linux/mutex.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_host.h>
diff --git a/drivers/watchdog/sc1200wdt.c b/drivers/watchdog/sc1200wdt.c
index 32ccd7c89c7d..35cddff7020f 100644
--- a/drivers/watchdog/sc1200wdt.c
+++ b/drivers/watchdog/sc1200wdt.c
@@ -38,8 +38,8 @@
 #include <linux/init.h>
 #include <linux/pnp.h>
 #include <linux/fs.h>
+#include <linux/semaphore.h>
 
-#include <asm/semaphore.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
 
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 0b78fdc9773b..a841f4973a74 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -15,7 +15,7 @@
 #include <linux/version.h>
 #include <linux/rbtree.h>
 #include <linux/posix_acl.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 struct jffs2_inode_info {
 	/* We need an internal mutex similar to inode->i_mutex.
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 3a2197f3c812..18fca2b9e531 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -16,7 +16,7 @@
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/list.h>
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index bb05a3e51b93..060eb3f598e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -38,7 +38,7 @@
 #include <asm/system.h>
 
 #include <linux/time.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 #include <linux/vmalloc.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index 2009e6d922ce..3abe7e9ceb33 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -20,8 +20,8 @@
 
 #include <linux/time.h>
 #include <linux/wait.h>
+#include <linux/semaphore.h>
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
 
 /*
  * sema_t structure just maps to struct semaphore in Linux kernel.
diff --git a/include/linux/device.h b/include/linux/device.h
index 2258d89bf523..c79b93e56fa0 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -20,7 +20,7 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/pm.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 #include <asm/atomic.h>
 #include <asm/device.h>
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b84b848431f2..91e8dec9e42b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -287,9 +287,9 @@ extern int dir_notify_enable;
 #include <linux/pid.h>
 #include <linux/mutex.h>
 #include <linux/capability.h>
+#include <linux/semaphore.h>
 
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
 #include <asm/byteorder.h>
 
 struct export_operations;
diff --git a/include/linux/hil_mlc.h b/include/linux/hil_mlc.h
index 8df29ca48a13..394a8405dd74 100644
--- a/include/linux/hil_mlc.h
+++ b/include/linux/hil_mlc.h
@@ -34,7 +34,7 @@
 #include <linux/hil.h>
 #include <linux/time.h>
 #include <linux/interrupt.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 #include <linux/serio.h>
 #include <linux/list.h>
 
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 7da5b98d90e6..e92170dda245 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -33,9 +33,9 @@
 #include <linux/mempool.h>
 #include <linux/mutex.h>
 #include <linux/scatterlist.h>
+#include <linux/semaphore.h>	/* Needed for MUTEX init macros */
 
 #include <asm/io.h>
-#include <asm/semaphore.h>	/* Needed for MUTEX init macros */
 
 /* message queue empty */
 #define I2O_QUEUE_EMPTY		0xffffffff
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 33f0ff0cf634..54d7866d9c0e 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -18,8 +18,7 @@
 #include <linux/sysdev.h>
 #include <linux/node.h>
 #include <linux/compiler.h>
-
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 struct memory_block {
 	unsigned long phys_index;
diff --git a/include/linux/parport.h b/include/linux/parport.h
index d1ad546c8c9e..dcb9e01a69ca 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -101,9 +101,9 @@ typedef enum {
 #include <linux/spinlock.h>
 #include <linux/wait.h>
 #include <linux/irqreturn.h>
+#include <linux/semaphore.h>
 #include <asm/system.h>
 #include <asm/ptrace.h>
-#include <asm/semaphore.h>
 
 /* Define this later. */
 struct parport;
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index fbc11a336bc5..cd3e82530b03 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -8,7 +8,7 @@
 #include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 
 /*
  * The 'big kernel semaphore'
-- 
cgit v1.2.3


From d57999e1527f0b0c818846dcba5a23015beb4823 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:27 -0800
Subject: [PATCH] do namei_flags calculation inside open_namei()

My end goal here is to make sure all users of may_open()
return filps.  This will ensure that we properly release
mount write counts which were taken for the filp in
may_open().

This patch moves the sys_open flags to namei flags
calculation into fs/namei.c.  We'll shortly be moving
the nameidata_to_filp() calls into namei.c, and this
gets the sys_open flags to a place where we can get
at them when we need them.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 43 ++++++++++++++++++++++++++++++++++---------
 fs/open.c  | 22 ++--------------------
 2 files changed, 36 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 8cf9bb9c2fc0..c70dbf720109 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1677,7 +1677,12 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 	return 0;
 }
 
-static int open_namei_create(struct nameidata *nd, struct path *path,
+/*
+ * Be careful about ever adding any more callers of this
+ * function.  Its flags must be in the namei format, not
+ * what get passed to sys_open().
+ */
+static int __open_namei_create(struct nameidata *nd, struct path *path,
 				int flag, int mode)
 {
 	int error;
@@ -1695,27 +1700,47 @@ static int open_namei_create(struct nameidata *nd, struct path *path,
 	return may_open(nd, 0, flag & ~O_TRUNC);
 }
 
+/*
+ * Note that while the flag value (low two bits) for sys_open means:
+ *	00 - read-only
+ *	01 - write-only
+ *	10 - read-write
+ *	11 - special
+ * it is changed into
+ *	00 - no permissions needed
+ *	01 - read-permission
+ *	10 - write-permission
+ *	11 - read-write
+ * for the internal routines (ie open_namei()/follow_link() etc)
+ * This is more logical, and also allows the 00 "no perm needed"
+ * to be used for symlinks (where the permissions are checked
+ * later).
+ *
+*/
+static inline int open_to_namei_flags(int flag)
+{
+	if ((flag+1) & O_ACCMODE)
+		flag++;
+	return flag;
+}
+
 /*
  *	open_namei()
  *
  * namei for open - this is in fact almost the whole open-routine.
  *
  * Note that the low bits of "flag" aren't the same as in the open
- * system call - they are 00 - no permissions needed
- *			  01 - read permission needed
- *			  10 - write permission needed
- *			  11 - read/write permissions needed
- * which is a lot more logical, and also allows the "no perm" needed
- * for symlinks (where the permissions are checked later).
+ * system call.  See open_to_namei_flags().
  * SMP-safe
  */
-int open_namei(int dfd, const char *pathname, int flag,
+int open_namei(int dfd, const char *pathname, int open_flag,
 		int mode, struct nameidata *nd)
 {
 	int acc_mode, error;
 	struct path path;
 	struct dentry *dir;
 	int count = 0;
+	int flag = open_to_namei_flags(open_flag);
 
 	acc_mode = ACC_MODE(flag);
 
@@ -1776,7 +1801,7 @@ do_last:
 
 	/* Negative dentry, just create the file */
 	if (!path.dentry->d_inode) {
-		error = open_namei_create(nd, &path, flag, mode);
+		error = __open_namei_create(nd, &path, flag, mode);
 		if (error)
 			goto exit;
 		return 0;
diff --git a/fs/open.c b/fs/open.c
index 3fa4e4ffce4c..5ab3f3f079c0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -796,31 +796,13 @@ cleanup_file:
 	return ERR_PTR(error);
 }
 
-/*
- * Note that while the flag value (low two bits) for sys_open means:
- *	00 - read-only
- *	01 - write-only
- *	10 - read-write
- *	11 - special
- * it is changed into
- *	00 - no permissions needed
- *	01 - read-permission
- *	10 - write-permission
- *	11 - read-write
- * for the internal routines (ie open_namei()/follow_link() etc). 00 is
- * used by symlinks.
- */
 static struct file *do_filp_open(int dfd, const char *filename, int flags,
 				 int mode)
 {
-	int namei_flags, error;
+	int error;
 	struct nameidata nd;
 
-	namei_flags = flags;
-	if ((namei_flags+1) & O_ACCMODE)
-		namei_flags++;
-
-	error = open_namei(dfd, filename, namei_flags, mode, &nd);
+	error = open_namei(dfd, filename, flags, mode, &nd);
 	if (!error)
 		return nameidata_to_filp(&nd, flags);
 
-- 
cgit v1.2.3


From a70e65df8812c52252fa07a2eb92a46451a4427f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 15 Feb 2008 14:37:28 -0800
Subject: [PATCH] merge open_namei() and do_filp_open()

open_namei() will, in the future, need to take mount write counts
over its creation and truncation (via may_open()) operations.  It
needs to keep these write counts until any potential filp that is
created gets __fput()'d.

This gets complicated in the error handling and becomes very murky
as to how far open_namei() actually got, and whether or not that
mount write count was taken.  That makes it a bad interface.

All that the current do_filp_open() really does is allocate the
nameidata on the stack, then call open_namei().

So, this merges those two functions and moves filp_open() over
to namei.c so it can be close to its buddy: do_filp_open().  It
also gets a kerneldoc comment in the process.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c         | 100 ++++++++++++++++++++++++++++++-----------------------
 fs/open.c          |  19 ----------
 include/linux/fs.h |   3 +-
 3 files changed, 59 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index c70dbf720109..a1f8bbbd58e5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1725,17 +1725,13 @@ static inline int open_to_namei_flags(int flag)
 }
 
 /*
- *	open_namei()
- *
- * namei for open - this is in fact almost the whole open-routine.
- *
  * Note that the low bits of "flag" aren't the same as in the open
  * system call.  See open_to_namei_flags().
- * SMP-safe
  */
-int open_namei(int dfd, const char *pathname, int open_flag,
-		int mode, struct nameidata *nd)
+struct file *do_filp_open(int dfd, const char *pathname,
+		int open_flag, int mode)
 {
+	struct nameidata nd;
 	int acc_mode, error;
 	struct path path;
 	struct dentry *dir;
@@ -1758,18 +1754,19 @@ int open_namei(int dfd, const char *pathname, int open_flag,
 	 */
 	if (!(flag & O_CREAT)) {
 		error = path_lookup_open(dfd, pathname, lookup_flags(flag),
-					 nd, flag);
+					 &nd, flag);
 		if (error)
-			return error;
+			return ERR_PTR(error);
 		goto ok;
 	}
 
 	/*
 	 * Create - we need to know the parent.
 	 */
-	error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
+	error = path_lookup_create(dfd, pathname, LOOKUP_PARENT,
+				   &nd, flag, mode);
 	if (error)
-		return error;
+		return ERR_PTR(error);
 
 	/*
 	 * We have the parent and last component. First of all, check
@@ -1777,14 +1774,14 @@ int open_namei(int dfd, const char *pathname, int open_flag,
 	 * will not do.
 	 */
 	error = -EISDIR;
-	if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
+	if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
 		goto exit;
 
-	dir = nd->path.dentry;
-	nd->flags &= ~LOOKUP_PARENT;
+	dir = nd.path.dentry;
+	nd.flags &= ~LOOKUP_PARENT;
 	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(nd);
-	path.mnt = nd->path.mnt;
+	path.dentry = lookup_hash(&nd);
+	path.mnt = nd.path.mnt;
 
 do_last:
 	error = PTR_ERR(path.dentry);
@@ -1793,18 +1790,18 @@ do_last:
 		goto exit;
 	}
 
-	if (IS_ERR(nd->intent.open.file)) {
+	if (IS_ERR(nd.intent.open.file)) {
 		mutex_unlock(&dir->d_inode->i_mutex);
-		error = PTR_ERR(nd->intent.open.file);
+		error = PTR_ERR(nd.intent.open.file);
 		goto exit_dput;
 	}
 
 	/* Negative dentry, just create the file */
 	if (!path.dentry->d_inode) {
-		error = __open_namei_create(nd, &path, flag, mode);
+		error = __open_namei_create(&nd, &path, flag, mode);
 		if (error)
 			goto exit;
-		return 0;
+		return nameidata_to_filp(&nd, open_flag);
 	}
 
 	/*
@@ -1829,23 +1826,23 @@ do_last:
 	if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
 		goto do_link;
 
-	path_to_nameidata(&path, nd);
+	path_to_nameidata(&path, &nd);
 	error = -EISDIR;
 	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
 		goto exit;
 ok:
-	error = may_open(nd, acc_mode, flag);
+	error = may_open(&nd, acc_mode, flag);
 	if (error)
 		goto exit;
-	return 0;
+	return nameidata_to_filp(&nd, open_flag);
 
 exit_dput:
-	path_put_conditional(&path, nd);
+	path_put_conditional(&path, &nd);
 exit:
-	if (!IS_ERR(nd->intent.open.file))
-		release_open_intent(nd);
-	path_put(&nd->path);
-	return error;
+	if (!IS_ERR(nd.intent.open.file))
+		release_open_intent(&nd);
+	path_put(&nd.path);
+	return ERR_PTR(error);
 
 do_link:
 	error = -ELOOP;
@@ -1861,42 +1858,59 @@ do_link:
 	 * stored in nd->last.name and we will have to putname() it when we
 	 * are done. Procfs-like symlinks just set LAST_BIND.
 	 */
-	nd->flags |= LOOKUP_PARENT;
-	error = security_inode_follow_link(path.dentry, nd);
+	nd.flags |= LOOKUP_PARENT;
+	error = security_inode_follow_link(path.dentry, &nd);
 	if (error)
 		goto exit_dput;
-	error = __do_follow_link(&path, nd);
+	error = __do_follow_link(&path, &nd);
 	if (error) {
 		/* Does someone understand code flow here? Or it is only
 		 * me so stupid? Anathema to whoever designed this non-sense
 		 * with "intent.open".
 		 */
-		release_open_intent(nd);
-		return error;
+		release_open_intent(&nd);
+		return ERR_PTR(error);
 	}
-	nd->flags &= ~LOOKUP_PARENT;
-	if (nd->last_type == LAST_BIND)
+	nd.flags &= ~LOOKUP_PARENT;
+	if (nd.last_type == LAST_BIND)
 		goto ok;
 	error = -EISDIR;
-	if (nd->last_type != LAST_NORM)
+	if (nd.last_type != LAST_NORM)
 		goto exit;
-	if (nd->last.name[nd->last.len]) {
-		__putname(nd->last.name);
+	if (nd.last.name[nd.last.len]) {
+		__putname(nd.last.name);
 		goto exit;
 	}
 	error = -ELOOP;
 	if (count++==32) {
-		__putname(nd->last.name);
+		__putname(nd.last.name);
 		goto exit;
 	}
-	dir = nd->path.dentry;
+	dir = nd.path.dentry;
 	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(nd);
-	path.mnt = nd->path.mnt;
-	__putname(nd->last.name);
+	path.dentry = lookup_hash(&nd);
+	path.mnt = nd.path.mnt;
+	__putname(nd.last.name);
 	goto do_last;
 }
 
+/**
+ * filp_open - open file and return file pointer
+ *
+ * @filename:	path to open
+ * @flags:	open flags as per the open(2) second argument
+ * @mode:	mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to.  But in generally you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+	return do_filp_open(AT_FDCWD, filename, flags, mode);
+}
+EXPORT_SYMBOL(filp_open);
+
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
diff --git a/fs/open.c b/fs/open.c
index 5ab3f3f079c0..8111947905d8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -796,25 +796,6 @@ cleanup_file:
 	return ERR_PTR(error);
 }
 
-static struct file *do_filp_open(int dfd, const char *filename, int flags,
-				 int mode)
-{
-	int error;
-	struct nameidata nd;
-
-	error = open_namei(dfd, filename, flags, mode, &nd);
-	if (!error)
-		return nameidata_to_filp(&nd, flags);
-
-	return ERR_PTR(error);
-}
-
-struct file *filp_open(const char *filename, int flags, int mode)
-{
-	return do_filp_open(AT_FDCWD, filename, flags, mode);
-}
-EXPORT_SYMBOL(filp_open);
-
 /**
  * lookup_instantiate_filp - instantiates the open intent filp
  * @nd: pointer to nameidata
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b84b848431f2..013b9c2b88e6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1735,7 +1735,8 @@ extern struct file *create_read_pipe(struct file *f);
 extern struct file *create_write_pipe(void);
 extern void free_write_pipe(struct file *);
 
-extern int open_namei(int dfd, const char *, int, int, struct nameidata *);
+extern struct file *do_filp_open(int dfd, const char *pathname,
+		int open_flag, int mode);
 extern int may_open(struct nameidata *, int, int);
 
 extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
-- 
cgit v1.2.3


From 8366025eb80dfa0d8d94b286d53027081c280ef1 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:30 -0800
Subject: [PATCH] r/o bind mounts: stub functions

This patch adds two function mnt_want_write() and mnt_drop_write().  These are
used like a lock pair around and fs operations that might cause a write to the
filesystem.

Before these can become useful, we must first cover each place in the VFS
where writes are performed with a want/drop pair.  When that is complete, we
can actually introduce code that will safely check the counts before allowing
r/w<->r/o transitions to occur.

Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mount.h |  3 +++
 2 files changed, 57 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 94f026ec990a..066b393578c1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -80,6 +80,60 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	return mnt;
 }
 
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/**
+ * mnt_want_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is
+ * about to be performed to it, and makes sure that
+ * writes are allowed before returning success.  When
+ * the write operation is finished, mnt_drop_write()
+ * must be called.  This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *mnt)
+{
+	if (__mnt_is_readonly(mnt))
+		return -EROFS;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done
+ * performing writes to it.  Must be matched with
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly ouside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*.  This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+	return (mnt->mnt_sb->s_flags & MS_RDONLY);
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5ee2df217cdf..2eecd2c8c760 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -71,9 +71,12 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt)
 	return mnt;
 }
 
+extern int mnt_want_write(struct vfsmount *mnt);
+extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mntput_no_expire(struct vfsmount *mnt);
 extern void mnt_pin(struct vfsmount *mnt);
 extern void mnt_unpin(struct vfsmount *mnt);
+extern int __mnt_is_readonly(struct vfsmount *mnt);
 
 static inline void mntput(struct vfsmount *mnt)
 {
-- 
cgit v1.2.3


From aceaf78da92a53f5e1b105649a1b8c0afdb2135c Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:31 -0800
Subject: [PATCH] r/o bind mounts: create helper to drop file write access

If someone decides to demote a file from r/w to just
r/o, they can use this same code as __fput().

NFS does just that, and will use this in the next
patch.

AV: drop write access in __fput() only after we evict from file list.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Cc: Erez Zadok <ezk@cs.sunysb.edu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J Bruce Fields" <bfields@fieldses.org>
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c      | 21 +++++++++++++++++++--
 fs/nfsd/nfs4state.c  |  3 ++-
 include/linux/file.h |  1 +
 3 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index 986ff4ed0a7c..3f73eb1f195a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -211,6 +211,23 @@ void fput(struct file *file)
 
 EXPORT_SYMBOL(fput);
 
+/**
+ * drop_file_write_access - give up ability to write to a file
+ * @file: the file to which we will stop writing
+ *
+ * This is a central place which will give up the ability
+ * to write to @file, along with access to write through
+ * its vfsmount.
+ */
+void drop_file_write_access(struct file *file)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+
+	put_write_access(inode);
+}
+EXPORT_SYMBOL_GPL(drop_file_write_access);
+
 /* __fput is called from task context when aio completion releases the last
  * last use of a struct file *.  Do not use otherwise.
  */
@@ -236,10 +253,10 @@ void __fput(struct file *file)
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
-	if (file->f_mode & FMODE_WRITE)
-		put_write_access(inode);
 	put_pid(file->f_owner.pid);
 	file_kill(file);
+	if (file->f_mode & FMODE_WRITE)
+		drop_file_write_access(file);
 	file->f_path.dentry = NULL;
 	file->f_path.mnt = NULL;
 	file_free(file);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bcb97d8e8b8b..81a75f3081f4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/cache.h>
+#include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/workqueue.h>
 #include <linux/smp_lock.h>
@@ -1239,7 +1240,7 @@ static inline void
 nfs4_file_downgrade(struct file *filp, unsigned int share_access)
 {
 	if (share_access & NFS4_SHARE_ACCESS_WRITE) {
-		put_write_access(filp->f_path.dentry->d_inode);
+		drop_file_write_access(filp);
 		filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
 	}
 }
diff --git a/include/linux/file.h b/include/linux/file.h
index 7239baac81a9..653477021e4c 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -61,6 +61,7 @@ extern struct kmem_cache *filp_cachep;
 
 extern void __fput(struct file *);
 extern void fput(struct file *);
+extern void drop_file_write_access(struct file *file);
 
 struct file_operations;
 struct vfsmount;
-- 
cgit v1.2.3


From 49e0d02cf018d4edf24bfc8531a816a26367e4ce Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:32 -0800
Subject: [PATCH] r/o bind mounts: drop write during emergency remount

The emergency remount code forcibly removes FMODE_WRITE from
filps.  The r/o bind mount code notices that this was done
without a proper mnt_drop_write() and properly gives a
warning.

This patch does a mnt_drop_write() to keep everything
balanced.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 09008dbd264e..01d5c40e9119 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,7 @@
 #include <linux/idr.h>
 #include <linux/kobject.h>
 #include <linux/mutex.h>
+#include <linux/file.h>
 #include <asm/uaccess.h>
 
 
@@ -567,10 +568,26 @@ static void mark_files_ro(struct super_block *sb)
 {
 	struct file *f;
 
+retry:
 	file_list_lock();
 	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-		if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
-			f->f_mode &= ~FMODE_WRITE;
+		struct vfsmount *mnt;
+		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+		       continue;
+		if (!file_count(f))
+			continue;
+		if (!(f->f_mode & FMODE_WRITE))
+			continue;
+		f->f_mode &= ~FMODE_WRITE;
+		mnt = mntget(f->f_path.mnt);
+		file_list_unlock();
+		/*
+		 * This can sleep, so we can't hold
+		 * the file_list_lock() spinlock.
+		 */
+		mnt_drop_write(mnt);
+		mntput(mnt);
+		goto retry;
 	}
 	file_list_unlock();
 }
-- 
cgit v1.2.3


From 0622753b800e4cc6cb9319b36b27658c72dd7cdc Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:34 -0800
Subject: [PATCH] r/o bind mounts: elevate write count for rmdir and unlink.

Elevate the write count during the vfs_rmdir() and vfs_unlink().

[AV: merged rmdir and unlink parts, added missing pieces in nfsd]

Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            |  9 +++++++++
 fs/nfsd/nfs4recover.c | 12 +++++++++++-
 fs/nfsd/vfs.c         |  8 +++++++-
 ipc/mqueue.c          |  5 ++++-
 4 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a1f8bbbd58e5..89ef3178eaaa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2190,7 +2190,12 @@ static long do_rmdir(int dfd, const char __user *pathname)
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto exit2;
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto exit3;
 	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+	mnt_drop_write(nd.path.mnt);
+exit3:
 	dput(dentry);
 exit2:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2271,7 +2276,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 		inode = dentry->d_inode;
 		if (inode)
 			atomic_inc(&inode->i_count);
+		error = mnt_want_write(nd.path.mnt);
+		if (error)
+			goto exit2;
 		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+		mnt_drop_write(nd.path.mnt);
 	exit2:
 		dput(dentry);
 	}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1ff90625860f..4e77a1a3bd73 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -46,6 +46,7 @@
 #include <linux/scatterlist.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
+#include <linux/mount.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -313,12 +314,17 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	if (!rec_dir_init || !clp->cl_firststate)
 		return;
 
+	status = mnt_want_write(rec_dir.path.mnt);
+	if (status)
+		goto out;
 	clp->cl_firststate = 0;
 	nfs4_save_user(&uid, &gid);
 	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
 	nfs4_reset_user(uid, gid);
 	if (status == 0)
 		nfsd4_sync_rec_dir();
+	mnt_drop_write(rec_dir.path.mnt);
+out:
 	if (status)
 		printk("NFSD: Failed to remove expired client state directory"
 				" %.*s\n", HEXDIR_LEN, clp->cl_recdir);
@@ -347,13 +353,17 @@ nfsd4_recdir_purge_old(void) {
 
 	if (!rec_dir_init)
 		return;
+	status = mnt_want_write(rec_dir.path.mnt);
+	if (status)
+		goto out;
 	status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old);
 	if (status == 0)
 		nfsd4_sync_rec_dir();
+	mnt_drop_write(rec_dir.path.mnt);
+out:
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
 			" directory %s\n", rec_dir.path.dentry->d_name.name);
-	return;
 }
 
 static int
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 46f59d5365a0..efff58a5818c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1750,6 +1750,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	if (!type)
 		type = rdentry->d_inode->i_mode & S_IFMT;
 
+	host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+	if (host_err)
+		goto out_nfserr;
+
 	if (type != S_IFDIR) { /* It's UNLINK */
 #ifdef MSNFS
 		if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
@@ -1765,10 +1769,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	dput(rdentry);
 
 	if (host_err)
-		goto out_nfserr;
+		goto out_drop;
 	if (EX_ISSYNC(fhp->fh_export))
 		host_err = nfsd_sync_dir(dentry);
 
+out_drop:
+	mnt_drop_write(fhp->fh_export->ex_path.mnt);
 out_nfserr:
 	err = nfserrno(host_err);
 out:
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 60f7a27f7a9e..34262c11f480 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -742,8 +742,11 @@ asmlinkage long sys_mq_unlink(const char __user *u_name)
 	inode = dentry->d_inode;
 	if (inode)
 		atomic_inc(&inode->i_count);
-
+	err = mnt_want_write(mqueue_mnt);
+	if (err)
+		goto out_err;
 	err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+	mnt_drop_write(mqueue_mnt);
 out_err:
 	dput(dentry);
 
-- 
cgit v1.2.3


From 463c3197263bd26ac59a00d2484990e17e35c50e Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:57 -0800
Subject: [PATCH] r/o bind mounts: get callers of vfs_mknod/create/mkdir()

This takes care of all of the direct callers of vfs_mknod().
Since a few of these cases also handle normal file creation
as well, this also covers some calls to vfs_create().

So that we don't have to make three mnt_want/drop_write()
calls inside of the switch statement, we move some of its
logic outside of the switch and into a helper function
suggested by Christoph.

This also encapsulates a fix for mknod(S_IFREG) that Miklos
found.

[AV: merged mkdir handling, added missing nfsd pieces]

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 48 +++++++++++++++++++++++++++++++++++++-----------
 fs/nfsd/nfs4recover.c |  4 ++++
 fs/nfsd/vfs.c         | 24 ++++++++++++++++++++++--
 net/unix/af_unix.c    |  4 ++++
 4 files changed, 67 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 89ef3178eaaa..3fbcf2021a2e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1984,6 +1984,23 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	return error;
 }
 
+static int may_mknod(mode_t mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFIFO:
+	case S_IFSOCK:
+	case 0: /* zero mode translates to S_IFREG */
+		return 0;
+	case S_IFDIR:
+		return -EPERM;
+	default:
+		return -EINVAL;
+	}
+}
+
 asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 				unsigned dev)
 {
@@ -2002,12 +2019,19 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 	if (error)
 		goto out;
 	dentry = lookup_create(&nd, 0);
-	error = PTR_ERR(dentry);
-
+	if (IS_ERR(dentry)) {
+		error = PTR_ERR(dentry);
+		goto out_unlock;
+	}
 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
 		mode &= ~current->fs->umask;
-	if (!IS_ERR(dentry)) {
-		switch (mode & S_IFMT) {
+	error = may_mknod(mode);
+	if (error)
+		goto out_dput;
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto out_dput;
+	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
 			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
 			break;
@@ -2018,14 +2042,11 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 		case S_IFIFO: case S_IFSOCK:
 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
 			break;
-		case S_IFDIR:
-			error = -EPERM;
-			break;
-		default:
-			error = -EINVAL;
-		}
-		dput(dentry);
 	}
+	mnt_drop_write(nd.path.mnt);
+out_dput:
+	dput(dentry);
+out_unlock:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 	path_put(&nd.path);
 out:
@@ -2083,7 +2104,12 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
 
 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
 		mode &= ~current->fs->umask;
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto out_dput;
 	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+	mnt_drop_write(nd.path.mnt);
+out_dput:
 	dput(dentry);
 out_unlock:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 4e77a1a3bd73..145b3c877a27 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -155,7 +155,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
 		goto out_put;
 	}
+	status = mnt_want_write(rec_dir.path.mnt);
+	if (status)
+		goto out_put;
 	status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU);
+	mnt_drop_write(rec_dir.path.mnt);
 out_put:
 	dput(dentry);
 out_unlock:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index efff58a5818c..14d582467029 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1255,23 +1255,35 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = 0;
 	switch (type) {
 	case S_IFREG:
+		host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+		if (host_err)
+			goto out_nfserr;
 		host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
 		break;
 	case S_IFDIR:
+		host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+		if (host_err)
+			goto out_nfserr;
 		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
 		break;
 	case S_IFCHR:
 	case S_IFBLK:
 	case S_IFIFO:
 	case S_IFSOCK:
+		host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+		if (host_err)
+			goto out_nfserr;
 		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
 		break;
 	default:
 	        printk("nfsd: bad file type %o in nfsd_create\n", type);
 		host_err = -EINVAL;
+		goto out_nfserr;
 	}
-	if (host_err < 0)
+	if (host_err < 0) {
+		mnt_drop_write(fhp->fh_export->ex_path.mnt);
 		goto out_nfserr;
+	}
 
 	if (EX_ISSYNC(fhp->fh_export)) {
 		err = nfserrno(nfsd_sync_dir(dentry));
@@ -1282,6 +1294,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err2 = nfsd_create_setattr(rqstp, resfhp, iap);
 	if (err2)
 		err = err2;
+	mnt_drop_write(fhp->fh_export->ex_path.mnt);
 	/*
 	 * Update the file handle to get the new inode info.
 	 */
@@ -1359,6 +1372,9 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		v_atime = verifier[1]&0x7fffffff;
 	}
 	
+	host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+	if (host_err)
+		goto out_nfserr;
 	if (dchild->d_inode) {
 		err = 0;
 
@@ -1390,12 +1406,15 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		case NFS3_CREATE_GUARDED:
 			err = nfserr_exist;
 		}
+		mnt_drop_write(fhp->fh_export->ex_path.mnt);
 		goto out;
 	}
 
 	host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
-	if (host_err < 0)
+	if (host_err < 0) {
+		mnt_drop_write(fhp->fh_export->ex_path.mnt);
 		goto out_nfserr;
+	}
 	if (created)
 		*created = 1;
 
@@ -1420,6 +1439,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (err2)
 		err = err2;
 
+	mnt_drop_write(fhp->fh_export->ex_path.mnt);
 	/*
 	 * Update the filehandle to get the new inode info.
 	 */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2851d0d15048..1454afcc06c4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -819,7 +819,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		 */
 		mode = S_IFSOCK |
 		       (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
+		err = mnt_want_write(nd.path.mnt);
+		if (err)
+			goto out_mknod_dput;
 		err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0);
+		mnt_drop_write(nd.path.mnt);
 		if (err)
 			goto out_mknod_dput;
 		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-- 
cgit v1.2.3


From 75c3f29de7451677c59580b0a959f694f36aac28 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:45 -0800
Subject: [PATCH] r/o bind mounts: write counts for link/symlink

[AV: add missing nfsd pieces]

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c    | 10 ++++++++++
 fs/nfsd/vfs.c | 14 +++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 3fbcf2021a2e..00df735fb509 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2387,7 +2387,12 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
 	if (IS_ERR(dentry))
 		goto out_unlock;
 
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto out_dput;
 	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO);
+	mnt_drop_write(nd.path.mnt);
+out_dput:
 	dput(dentry);
 out_unlock:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2482,7 +2487,12 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 	error = PTR_ERR(new_dentry);
 	if (IS_ERR(new_dentry))
 		goto out_unlock;
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto out_dput;
 	error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry);
+	mnt_drop_write(nd.path.mnt);
+out_dput:
 	dput(new_dentry);
 out_unlock:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 14d582467029..71f899482306 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1542,6 +1542,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (iap && (iap->ia_valid & ATTR_MODE))
 		mode = iap->ia_mode & S_IALLUGO;
 
+	host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+	if (host_err)
+		goto out_nfserr;
+
 	if (unlikely(path[plen] != 0)) {
 		char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
 		if (path_alloced == NULL)
@@ -1562,6 +1566,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = nfserrno(host_err);
 	fh_unlock(fhp);
 
+	mnt_drop_write(fhp->fh_export->ex_path.mnt);
+
 	cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
 	dput(dnew);
 	if (err==0) err = cerr;
@@ -1612,6 +1618,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	dold = tfhp->fh_dentry;
 	dest = dold->d_inode;
 
+	host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
+	if (host_err) {
+		err = nfserrno(host_err);
+		goto out_dput;
+	}
 	host_err = vfs_link(dold, dirp, dnew);
 	if (!host_err) {
 		if (EX_ISSYNC(ffhp->fh_export)) {
@@ -1625,7 +1636,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 		else
 			err = nfserrno(host_err);
 	}
-
+	mnt_drop_write(tfhp->fh_export->ex_path.mnt);
+out_dput:
 	dput(dnew);
 out_unlock:
 	fh_unlock(ffhp);
-- 
cgit v1.2.3


From 9079b1eb1753f217c3de9f1b7dd7fd549cc3f0cf Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:49 -0800
Subject: [PATCH] r/o bind mounts: get write access for vfs_rename() callers

This also uses the little helper in the NFS code to make an if() a little bit
less ugly.  We introduced the helper at the beginning of the series.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c    |  4 ++++
 fs/nfsd/vfs.c | 17 +++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 00df735fb509..83c843b3fea3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2718,8 +2718,12 @@ static int do_rename(int olddfd, const char *oldname,
 	if (new_dentry == trap)
 		goto exit5;
 
+	error = mnt_want_write(oldnd.path.mnt);
+	if (error)
+		goto exit5;
 	error = vfs_rename(old_dir->d_inode, old_dentry,
 				   new_dir->d_inode, new_dentry);
+	mnt_drop_write(oldnd.path.mnt);
 exit5:
 	dput(new_dentry);
 exit4:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 71f899482306..18a4cc9feeb3 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1710,13 +1710,20 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	if (ndentry == trap)
 		goto out_dput_new;
 
-#ifdef MSNFS
-	if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
+	if (svc_msnfs(ffhp) &&
 		((atomic_read(&odentry->d_count) > 1)
 		 || (atomic_read(&ndentry->d_count) > 1))) {
 			host_err = -EPERM;
-	} else
-#endif
+			goto out_dput_new;
+	}
+
+	host_err = -EXDEV;
+	if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
+		goto out_dput_new;
+	host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt);
+	if (host_err)
+		goto out_dput_new;
+
 	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
 	if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
 		host_err = nfsd_sync_dir(tdentry);
@@ -1724,6 +1731,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 			host_err = nfsd_sync_dir(fdentry);
 	}
 
+	mnt_drop_write(ffhp->fh_export->ex_path.mnt);
+
  out_dput_new:
 	dput(ndentry);
  out_dput_old:
-- 
cgit v1.2.3


From 18f335aff86913de3c76f88d32c8135c1da62ce6 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:38 -0800
Subject: [PATCH] r/o bind mounts: elevate write count for xattr_permission()
 callers

This basically audits the callers of xattr_permission(), which calls
permission() and can perform writes to the filesystem.

[AV: add missing parts - removexattr() and nfsd posix acls, plug for a leak
spotted by Miklos]

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfs4proc.c |  7 ++++++-
 fs/nfsd/vfs.c      |  4 ++++
 fs/xattr.c         | 40 ++++++++++++++++++++++++++++++++--------
 3 files changed, 42 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c593db047d8b..c309c881bd4e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -658,14 +658,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			return status;
 		}
 	}
+	status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt);
+	if (status)
+		return status;
 	status = nfs_ok;
 	if (setattr->sa_acl != NULL)
 		status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
 					    setattr->sa_acl);
 	if (status)
-		return status;
+		goto out;
 	status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
 				0, (time_t)0);
+out:
+	mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt);
 	return status;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 18a4cc9feeb3..626dfd38528f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2086,6 +2086,9 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 	} else
 		size = 0;
 
+	error = mnt_want_write(fhp->fh_export->ex_path.mnt);
+	if (error)
+		goto getout;
 	if (size)
 		error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
 	else {
@@ -2097,6 +2100,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 				error = 0;
 		}
 	}
+	mnt_drop_write(fhp->fh_export->ex_path.mnt);
 
 getout:
 	kfree(value);
diff --git a/fs/xattr.c b/fs/xattr.c
index 3acab1615460..f7062da505d4 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/xattr.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -32,8 +33,6 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 	 * filesystem  or on an immutable / append-only inode.
 	 */
 	if (mask & MAY_WRITE) {
-		if (IS_RDONLY(inode))
-			return -EROFS;
 		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 			return -EPERM;
 	}
@@ -262,7 +261,11 @@ sys_setxattr(char __user *path, char __user *name, void __user *value,
 	error = user_path_walk(path, &nd);
 	if (error)
 		return error;
-	error = setxattr(nd.path.dentry, name, value, size, flags);
+	error = mnt_want_write(nd.path.mnt);
+	if (!error) {
+		error = setxattr(nd.path.dentry, name, value, size, flags);
+		mnt_drop_write(nd.path.mnt);
+	}
 	path_put(&nd.path);
 	return error;
 }
@@ -277,7 +280,11 @@ sys_lsetxattr(char __user *path, char __user *name, void __user *value,
 	error = user_path_walk_link(path, &nd);
 	if (error)
 		return error;
-	error = setxattr(nd.path.dentry, name, value, size, flags);
+	error = mnt_want_write(nd.path.mnt);
+	if (!error) {
+		error = setxattr(nd.path.dentry, name, value, size, flags);
+		mnt_drop_write(nd.path.mnt);
+	}
 	path_put(&nd.path);
 	return error;
 }
@@ -295,7 +302,12 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
 		return error;
 	dentry = f->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = setxattr(dentry, name, value, size, flags);
+	error = mnt_want_write(f->f_path.mnt);
+	if (!error) {
+		error = setxattr(dentry, name, value, size, flags);
+		mnt_drop_write(f->f_path.mnt);
+	}
+out_fput:
 	fput(f);
 	return error;
 }
@@ -482,7 +494,11 @@ sys_removexattr(char __user *path, char __user *name)
 	error = user_path_walk(path, &nd);
 	if (error)
 		return error;
-	error = removexattr(nd.path.dentry, name);
+	error = mnt_want_write(nd.path.mnt);
+	if (!error) {
+		error = removexattr(nd.path.dentry, name);
+		mnt_drop_write(nd.path.mnt);
+	}
 	path_put(&nd.path);
 	return error;
 }
@@ -496,7 +512,11 @@ sys_lremovexattr(char __user *path, char __user *name)
 	error = user_path_walk_link(path, &nd);
 	if (error)
 		return error;
-	error = removexattr(nd.path.dentry, name);
+	error = mnt_want_write(nd.path.mnt);
+	if (!error) {
+		error = removexattr(nd.path.dentry, name);
+		mnt_drop_write(nd.path.mnt);
+	}
 	path_put(&nd.path);
 	return error;
 }
@@ -513,7 +533,11 @@ sys_fremovexattr(int fd, char __user *name)
 		return error;
 	dentry = f->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = removexattr(dentry, name);
+	error = mnt_want_write(f->f_path.mnt);
+	if (!error) {
+		error = removexattr(dentry, name);
+		mnt_drop_write(f->f_path.mnt);
+	}
 	fput(f);
 	return error;
 }
-- 
cgit v1.2.3


From a761a1c03a739f04afd6c8d37fd16405bbe754da Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:39 -0800
Subject: [PATCH] r/o bind mounts: elevate write count for ncp_ioctl()

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ncpfs/ioctl.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c67b4bdcf719..ad8f167e54bc 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -14,6 +14,7 @@
 #include <linux/ioctl.h>
 #include <linux/time.h>
 #include <linux/mm.h>
+#include <linux/mount.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
@@ -261,7 +262,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
 
-int ncp_ioctl(struct inode *inode, struct file *filp,
+static int __ncp_ioctl(struct inode *inode, struct file *filp,
 	      unsigned int cmd, unsigned long arg)
 {
 	struct ncp_server *server = NCP_SERVER(inode);
@@ -822,6 +823,57 @@ outrel:
 	return -EINVAL;
 }
 
+static int ncp_ioctl_need_write(unsigned int cmd)
+{
+	switch (cmd) {
+	case NCP_IOC_GET_FS_INFO:
+	case NCP_IOC_GET_FS_INFO_V2:
+	case NCP_IOC_NCPREQUEST:
+	case NCP_IOC_SETDENTRYTTL:
+	case NCP_IOC_SIGN_INIT:
+	case NCP_IOC_LOCKUNLOCK:
+	case NCP_IOC_SET_SIGN_WANTED:
+		return 1;
+	case NCP_IOC_GETOBJECTNAME:
+	case NCP_IOC_SETOBJECTNAME:
+	case NCP_IOC_GETPRIVATEDATA:
+	case NCP_IOC_SETPRIVATEDATA:
+	case NCP_IOC_SETCHARSETS:
+	case NCP_IOC_GETCHARSETS:
+	case NCP_IOC_CONN_LOGGED_IN:
+	case NCP_IOC_GETDENTRYTTL:
+	case NCP_IOC_GETMOUNTUID2:
+	case NCP_IOC_SIGN_WANTED:
+	case NCP_IOC_GETROOT:
+	case NCP_IOC_SETROOT:
+		return 0;
+	default:
+		/* unkown IOCTL command, assume write */
+		return 1;
+	}
+}
+
+int ncp_ioctl(struct inode *inode, struct file *filp,
+	      unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	if (ncp_ioctl_need_write(cmd)) {
+		/*
+		 * inside the ioctl(), any failures which
+		 * are because of file_permission() are
+		 * -EACCESS, so it seems consistent to keep
+		 *  that here.
+		 */
+		if (mnt_want_write(filp->f_path.mnt))
+			return -EACCES;
+	}
+	ret = __ncp_ioctl(inode, filp, cmd, arg);
+	if (ncp_ioctl_need_write(cmd))
+		mnt_drop_write(filp->f_path.mnt);
+	return ret;
+}
+
 #ifdef CONFIG_COMPAT
 long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-- 
cgit v1.2.3


From cdb70f3f74b31576cc4d707a3d3b00d159cab8bb Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:41 -0800
Subject: [PATCH] r/o bind mounts: write counts for touch_atime()

Remove handling of NULL mnt while we are at it - that can't happen these days.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 45 ++++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 53245ffcf93d..6f6250c08ce6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1199,42 +1199,37 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	struct timespec now;
 
-	if (inode->i_flags & S_NOATIME)
+	if (mnt_want_write(mnt))
 		return;
+	if (inode->i_flags & S_NOATIME)
+		goto out;
 	if (IS_NOATIME(inode))
-		return;
+		goto out;
 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
-		return;
-
-	/*
-	 * We may have a NULL vfsmount when coming from NFSD
-	 */
-	if (mnt) {
-		if (mnt->mnt_flags & MNT_NOATIME)
-			return;
-		if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
-			return;
+		goto out;
 
-		if (mnt->mnt_flags & MNT_RELATIME) {
-			/*
-			 * With relative atime, only update atime if the
-			 * previous atime is earlier than either the ctime or
-			 * mtime.
-			 */
-			if (timespec_compare(&inode->i_mtime,
-						&inode->i_atime) < 0 &&
-			    timespec_compare(&inode->i_ctime,
-						&inode->i_atime) < 0)
-				return;
-		}
+	if (mnt->mnt_flags & MNT_NOATIME)
+		goto out;
+	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
+		goto out;
+	if (mnt->mnt_flags & MNT_RELATIME) {
+		/*
+		 * With relative atime, only update atime if the previous
+		 * atime is earlier than either the ctime or mtime.
+		 */
+		if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 &&
+		    timespec_compare(&inode->i_ctime, &inode->i_atime) < 0)
+			goto out;
 	}
 
 	now = current_fs_time(inode->i_sb);
 	if (timespec_equal(&inode->i_atime, &now))
-		return;
+		goto out;
 
 	inode->i_atime = now;
 	mark_inode_dirty_sync(inode);
+out:
+	mnt_drop_write(mnt);
 }
 EXPORT_SYMBOL(touch_atime);
 
-- 
cgit v1.2.3


From 74f9fdfa1f229284ee1ea58fa47f2cdeeb12f6fe Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:42 -0800
Subject: [PATCH] r/o bind mounts: elevate write count for do_utimes()

Now includes fix for oops seen by akpm.

"never let a libc developer write your kernel code" - hch

"nor, apparently, a kernel developer" - akpm

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/utimes.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/utimes.c b/fs/utimes.c
index b18da9c0b97f..a2bef77dc9c9 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -2,6 +2,7 @@
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/linkage.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/sched.h>
 #include <linux/stat.h>
@@ -59,6 +60,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
 	struct inode *inode;
 	struct iattr newattrs;
 	struct file *f = NULL;
+	struct vfsmount *mnt;
 
 	error = -EINVAL;
 	if (times && (!nsec_valid(times[0].tv_nsec) ||
@@ -79,18 +81,20 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
 		if (!f)
 			goto out;
 		dentry = f->f_path.dentry;
+		mnt = f->f_path.mnt;
 	} else {
 		error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd);
 		if (error)
 			goto out;
 
 		dentry = nd.path.dentry;
+		mnt = nd.path.mnt;
 	}
 
 	inode = dentry->d_inode;
 
-	error = -EROFS;
-	if (IS_RDONLY(inode))
+	error = mnt_want_write(mnt);
+	if (error)
 		goto dput_and_out;
 
 	/* Don't worry, the checks are done in inode_change_ok() */
@@ -98,7 +102,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
 	if (times) {
 		error = -EPERM;
                 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-                        goto dput_and_out;
+			goto mnt_drop_write_and_out;
 
 		if (times[0].tv_nsec == UTIME_OMIT)
 			newattrs.ia_valid &= ~ATTR_ATIME;
@@ -118,22 +122,24 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
 	} else {
 		error = -EACCES;
                 if (IS_IMMUTABLE(inode))
-                        goto dput_and_out;
+			goto mnt_drop_write_and_out;
 
 		if (!is_owner_or_cap(inode)) {
 			if (f) {
 				if (!(f->f_mode & FMODE_WRITE))
-					goto dput_and_out;
+					goto mnt_drop_write_and_out;
 			} else {
 				error = vfs_permission(&nd, MAY_WRITE);
 				if (error)
-					goto dput_and_out;
+					goto mnt_drop_write_and_out;
 			}
 		}
 	}
 	mutex_lock(&inode->i_mutex);
 	error = notify_change(dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
+mnt_drop_write_and_out:
+	mnt_drop_write(mnt);
 dput_and_out:
 	if (f)
 		fput(f);
-- 
cgit v1.2.3


From 20ddee2c75339cc095f6191c3115f81da8955e96 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:43 -0800
Subject: [PATCH] r/o bind mounts: write count for file_update_time()

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 6f6250c08ce6..27ee1af50d02 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1250,10 +1250,13 @@ void file_update_time(struct file *file)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct timespec now;
 	int sync_it = 0;
+	int err;
 
 	if (IS_NOCMTIME(inode))
 		return;
-	if (IS_RDONLY(inode))
+
+	err = mnt_want_write(file->f_path.mnt);
+	if (err)
 		return;
 
 	now = current_fs_time(inode->i_sb);
@@ -1274,6 +1277,7 @@ void file_update_time(struct file *file)
 
 	if (sync_it)
 		mark_inode_dirty_sync(inode);
+	mnt_drop_write(file->f_path.mnt);
 }
 
 EXPORT_SYMBOL(file_update_time);
-- 
cgit v1.2.3


From 42a74f206b914db13ee1f5ae932dcd91a77c8579 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:46 -0800
Subject: [PATCH] r/o bind mounts: elevate write count for ioctls()

Some ioctl()s can cause writes to the filesystem.  Take these, and make them
use mnt_want/drop_write() instead.

[AV: updated]

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/ioctl.c              |  57 +++++++++++++++---------
 fs/ext3/ioctl.c              | 103 ++++++++++++++++++++++++++++---------------
 fs/ext4/ioctl.c              |  86 ++++++++++++++++++++----------------
 fs/fat/file.c                |  12 ++---
 fs/hfsplus/ioctl.c           |  40 ++++++++++-------
 fs/jfs/ioctl.c               |  33 +++++++++-----
 fs/ocfs2/ioctl.c             |  11 ++---
 fs/reiserfs/ioctl.c          |  63 +++++++++++++++++---------
 fs/xfs/linux-2.6/xfs_ioctl.c |  15 ++++---
 9 files changed, 265 insertions(+), 155 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index b8ea11fee5c6..de876fa793e1 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -12,6 +12,7 @@
 #include <linux/time.h>
 #include <linux/sched.h>
 #include <linux/compat.h>
+#include <linux/mount.h>
 #include <linux/smp_lock.h>
 #include <asm/current.h>
 #include <asm/uaccess.h>
@@ -23,6 +24,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
+	int ret;
 
 	ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg);
 
@@ -34,14 +36,19 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case EXT2_IOC_SETFLAGS: {
 		unsigned int oldflags;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
+		ret = mnt_want_write(filp->f_path.mnt);
+		if (ret)
+			return ret;
 
-		if (!is_owner_or_cap(inode))
-			return -EACCES;
+		if (!is_owner_or_cap(inode)) {
+			ret = -EACCES;
+			goto setflags_out;
+		}
 
-		if (get_user(flags, (int __user *) arg))
-			return -EFAULT;
+		if (get_user(flags, (int __user *) arg)) {
+			ret = -EFAULT;
+			goto setflags_out;
+		}
 
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~EXT2_DIRSYNC_FL;
@@ -50,7 +57,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		/* Is it quota file? Do not allow user to mess with it */
 		if (IS_NOQUOTA(inode)) {
 			mutex_unlock(&inode->i_mutex);
-			return -EPERM;
+			ret = -EPERM;
+			goto setflags_out;
 		}
 		oldflags = ei->i_flags;
 
@@ -63,7 +71,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
 			if (!capable(CAP_LINUX_IMMUTABLE)) {
 				mutex_unlock(&inode->i_mutex);
-				return -EPERM;
+				ret = -EPERM;
+				goto setflags_out;
 			}
 		}
 
@@ -75,20 +84,26 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		ext2_set_inode_flags(inode);
 		inode->i_ctime = CURRENT_TIME_SEC;
 		mark_inode_dirty(inode);
-		return 0;
+setflags_out:
+		mnt_drop_write(filp->f_path.mnt);
+		return ret;
 	}
 	case EXT2_IOC_GETVERSION:
 		return put_user(inode->i_generation, (int __user *) arg);
 	case EXT2_IOC_SETVERSION:
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
-		if (IS_RDONLY(inode))
-			return -EROFS;
-		if (get_user(inode->i_generation, (int __user *) arg))
-			return -EFAULT;	
-		inode->i_ctime = CURRENT_TIME_SEC;
-		mark_inode_dirty(inode);
-		return 0;
+		ret = mnt_want_write(filp->f_path.mnt);
+		if (ret)
+			return ret;
+		if (get_user(inode->i_generation, (int __user *) arg)) {
+			ret = -EFAULT;
+		} else {
+			inode->i_ctime = CURRENT_TIME_SEC;
+			mark_inode_dirty(inode);
+		}
+		mnt_drop_write(filp->f_path.mnt);
+		return ret;
 	case EXT2_IOC_GETRSVSZ:
 		if (test_opt(inode->i_sb, RESERVATION)
 			&& S_ISREG(inode->i_mode)
@@ -102,15 +117,16 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
 			return -ENOTTY;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
-
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(rsv_window_size, (int __user *)arg))
 			return -EFAULT;
 
+		ret = mnt_want_write(filp->f_path.mnt);
+		if (ret)
+			return ret;
+
 		if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS)
 			rsv_window_size = EXT2_MAX_RESERVE_BLOCKS;
 
@@ -131,6 +147,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			rsv->rsv_goal_size = rsv_window_size;
 		}
 		mutex_unlock(&ei->truncate_mutex);
+		mnt_drop_write(filp->f_path.mnt);
 		return 0;
 	}
 	default:
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 023a070f55f1..0d0c70151642 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -12,6 +12,7 @@
 #include <linux/capability.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
+#include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/compat.h>
 #include <linux/smp_lock.h>
@@ -38,14 +39,19 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		unsigned int oldflags;
 		unsigned int jflag;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
 
-		if (!is_owner_or_cap(inode))
-			return -EACCES;
+		if (!is_owner_or_cap(inode)) {
+			err = -EACCES;
+			goto flags_out;
+		}
 
-		if (get_user(flags, (int __user *) arg))
-			return -EFAULT;
+		if (get_user(flags, (int __user *) arg)) {
+			err = -EFAULT;
+			goto flags_out;
+		}
 
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~EXT3_DIRSYNC_FL;
@@ -54,7 +60,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		/* Is it quota file? Do not allow user to mess with it */
 		if (IS_NOQUOTA(inode)) {
 			mutex_unlock(&inode->i_mutex);
-			return -EPERM;
+			err = -EPERM;
+			goto flags_out;
 		}
 		oldflags = ei->i_flags;
 
@@ -70,7 +77,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
 			if (!capable(CAP_LINUX_IMMUTABLE)) {
 				mutex_unlock(&inode->i_mutex);
-				return -EPERM;
+				err = -EPERM;
+				goto flags_out;
 			}
 		}
 
@@ -81,7 +89,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
 			if (!capable(CAP_SYS_RESOURCE)) {
 				mutex_unlock(&inode->i_mutex);
-				return -EPERM;
+				err = -EPERM;
+				goto flags_out;
 			}
 		}
 
@@ -89,7 +98,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		handle = ext3_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
 			mutex_unlock(&inode->i_mutex);
-			return PTR_ERR(handle);
+			err = PTR_ERR(handle);
+			goto flags_out;
 		}
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
@@ -115,6 +125,8 @@ flags_err:
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
 		mutex_unlock(&inode->i_mutex);
+flags_out:
+		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
 	case EXT3_IOC_GETVERSION:
@@ -129,14 +141,18 @@ flags_err:
 
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
-		if (IS_RDONLY(inode))
-			return -EROFS;
-		if (get_user(generation, (int __user *) arg))
-			return -EFAULT;
-
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+		if (get_user(generation, (int __user *) arg)) {
+			err = -EFAULT;
+			goto setversion_out;
+		}
 		handle = ext3_journal_start(inode, 1);
-		if (IS_ERR(handle))
-			return PTR_ERR(handle);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto setversion_out;
+		}
 		err = ext3_reserve_inode_write(handle, inode, &iloc);
 		if (err == 0) {
 			inode->i_ctime = CURRENT_TIME_SEC;
@@ -144,6 +160,8 @@ flags_err:
 			err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 		}
 		ext3_journal_stop(handle);
+setversion_out:
+		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
 #ifdef CONFIG_JBD_DEBUG
@@ -179,18 +197,24 @@ flags_err:
 		}
 		return -ENOTTY;
 	case EXT3_IOC_SETRSVSZ: {
+		int err;
 
 		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
 			return -ENOTTY;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
 
-		if (!is_owner_or_cap(inode))
-			return -EACCES;
+		if (!is_owner_or_cap(inode)) {
+			err = -EACCES;
+			goto setrsvsz_out;
+		}
 
-		if (get_user(rsv_window_size, (int __user *)arg))
-			return -EFAULT;
+		if (get_user(rsv_window_size, (int __user *)arg)) {
+			err = -EFAULT;
+			goto setrsvsz_out;
+		}
 
 		if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
 			rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
@@ -208,7 +232,9 @@ flags_err:
 			rsv->rsv_goal_size = rsv_window_size;
 		}
 		mutex_unlock(&ei->truncate_mutex);
-		return 0;
+setrsvsz_out:
+		mnt_drop_write(filp->f_path.mnt);
+		return err;
 	}
 	case EXT3_IOC_GROUP_EXTEND: {
 		ext3_fsblk_t n_blocks_count;
@@ -218,17 +244,20 @@ flags_err:
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
-
-		if (get_user(n_blocks_count, (__u32 __user *)arg))
-			return -EFAULT;
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
 
+		if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+			err = -EFAULT;
+			goto group_extend_out;
+		}
 		err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
 		journal_lock_updates(EXT3_SB(sb)->s_journal);
 		journal_flush(EXT3_SB(sb)->s_journal);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+group_extend_out:
+		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
 	case EXT3_IOC_GROUP_ADD: {
@@ -239,18 +268,22 @@ flags_err:
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
 
 		if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
-				sizeof(input)))
-			return -EFAULT;
+				sizeof(input))) {
+			err = -EFAULT;
+			goto group_add_out;
+		}
 
 		err = ext3_group_add(sb, &input);
 		journal_lock_updates(EXT3_SB(sb)->s_journal);
 		journal_flush(EXT3_SB(sb)->s_journal);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+group_add_out:
+		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
 
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2ed7c37f897e..25b13ede8086 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/time.h>
 #include <linux/compat.h>
 #include <linux/smp_lock.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 
 int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
@@ -38,24 +39,25 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		unsigned int oldflags;
 		unsigned int jflag;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
-
 		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
 			return -EFAULT;
 
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~EXT4_DIRSYNC_FL;
 
+		err = -EPERM;
 		mutex_lock(&inode->i_mutex);
 		/* Is it quota file? Do not allow user to mess with it */
-		if (IS_NOQUOTA(inode)) {
-			mutex_unlock(&inode->i_mutex);
-			return -EPERM;
-		}
+		if (IS_NOQUOTA(inode))
+			goto flags_out;
+
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -68,10 +70,8 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
-				return -EPERM;
-			}
+			if (!capable(CAP_LINUX_IMMUTABLE))
+				goto flags_out;
 		}
 
 		/*
@@ -79,17 +79,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE)) {
-				mutex_unlock(&inode->i_mutex);
-				return -EPERM;
-			}
+			if (!capable(CAP_SYS_RESOURCE))
+				goto flags_out;
 		}
 
-
 		handle = ext4_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
-			mutex_unlock(&inode->i_mutex);
-			return PTR_ERR(handle);
+			err = PTR_ERR(handle);
+			goto flags_out;
 		}
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
@@ -107,14 +104,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext4_journal_stop(handle);
-		if (err) {
-			mutex_unlock(&inode->i_mutex);
-			return err;
-		}
+		if (err)
+			goto flags_out;
 
 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
 			err = ext4_change_inode_journal_flag(inode, jflag);
+flags_out:
 		mutex_unlock(&inode->i_mutex);
+		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
 	case EXT4_IOC_GETVERSION:
@@ -129,14 +126,20 @@ flags_err:
 
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
-		if (IS_RDONLY(inode))
-			return -EROFS;
-		if (get_user(generation, (int __user *) arg))
-			return -EFAULT;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+		if (get_user(generation, (int __user *) arg)) {
+			err = -EFAULT;
+			goto setversion_out;
+		}
 
 		handle = ext4_journal_start(inode, 1);
-		if (IS_ERR(handle))
-			return PTR_ERR(handle);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto setversion_out;
+		}
 		err = ext4_reserve_inode_write(handle, inode, &iloc);
 		if (err == 0) {
 			inode->i_ctime = ext4_current_time(inode);
@@ -144,6 +147,8 @@ flags_err:
 			err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 		}
 		ext4_journal_stop(handle);
+setversion_out:
+		mnt_drop_write(filp->f_path.mnt);
 		return err;
 	}
 #ifdef CONFIG_JBD2_DEBUG
@@ -179,19 +184,21 @@ flags_err:
 		}
 		return -ENOTTY;
 	case EXT4_IOC_SETRSVSZ: {
+		int err;
 
 		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
 			return -ENOTTY;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
-
 		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(rsv_window_size, (int __user *)arg))
 			return -EFAULT;
 
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+
 		if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
 			rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
 
@@ -208,6 +215,7 @@ flags_err:
 			rsv->rsv_goal_size = rsv_window_size;
 		}
 		up_write(&ei->i_data_sem);
+		mnt_drop_write(filp->f_path.mnt);
 		return 0;
 	}
 	case EXT4_IOC_GROUP_EXTEND: {
@@ -218,16 +226,18 @@ flags_err:
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
-
 		if (get_user(n_blocks_count, (__u32 __user *)arg))
 			return -EFAULT;
 
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+
 		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
 		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		mnt_drop_write(filp->f_path.mnt);
 
 		return err;
 	}
@@ -239,17 +249,19 @@ flags_err:
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
-
 		if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
 				sizeof(input)))
 			return -EFAULT;
 
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+
 		err = ext4_group_add(sb, &input);
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
 		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		mnt_drop_write(filp->f_path.mnt);
 
 		return err;
 	}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index c614175876e0..2a3bed967041 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
 
 #include <linux/capability.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
@@ -46,10 +47,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 
 		mutex_lock(&inode->i_mutex);
 
-		if (IS_RDONLY(inode)) {
-			err = -EROFS;
-			goto up;
-		}
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			goto up_no_drop_write;
 
 		/*
 		 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -105,7 +105,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 
 		MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
 		mark_inode_dirty(inode);
-	up:
+up:
+		mnt_drop_write(filp->f_path.mnt);
+up_no_drop_write:
 		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index b60c0affbec5..f457d2ca51ab 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -14,6 +14,7 @@
 
 #include <linux/capability.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/xattr.h>
 #include <asm/uaccess.h>
@@ -35,25 +36,32 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 			flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
 		return put_user(flags, (int __user *)arg);
 	case HFSPLUS_IOC_EXT2_SETFLAGS: {
-		if (IS_RDONLY(inode))
-			return -EROFS;
-
-		if (!is_owner_or_cap(inode))
-			return -EACCES;
-
-		if (get_user(flags, (int __user *)arg))
-			return -EFAULT;
-
+		int err = 0;
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+
+		if (!is_owner_or_cap(inode)) {
+			err = -EACCES;
+			goto setflags_out;
+		}
+		if (get_user(flags, (int __user *)arg)) {
+			err = -EFAULT;
+			goto setflags_out;
+		}
 		if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
 		    HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
-			if (!capable(CAP_LINUX_IMMUTABLE))
-				return -EPERM;
+			if (!capable(CAP_LINUX_IMMUTABLE)) {
+				err = -EPERM;
+				goto setflags_out;
+			}
 		}
 
 		/* don't silently ignore unsupported ext2 flags */
-		if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL))
-			return -EOPNOTSUPP;
-
+		if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+			err = -EOPNOTSUPP;
+			goto setflags_out;
+		}
 		if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
 			inode->i_flags |= S_IMMUTABLE;
 			HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -75,7 +83,9 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 
 		inode->i_ctime = CURRENT_TIME_SEC;
 		mark_inode_dirty(inode);
-		return 0;
+setflags_out:
+		mnt_drop_write(filp->f_path.mnt);
+		return err;
 	}
 	default:
 		return -ENOTTY;
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index a1f8e375ad21..afe222bf300f 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/ctype.h>
 #include <linux/capability.h>
+#include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/sched.h>
 #include <asm/current.h>
@@ -65,23 +66,30 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return put_user(flags, (int __user *) arg);
 	case JFS_IOC_SETFLAGS: {
 		unsigned int oldflags;
+		int err;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
 
-		if (!is_owner_or_cap(inode))
-			return -EACCES;
-
-		if (get_user(flags, (int __user *) arg))
-			return -EFAULT;
+		if (!is_owner_or_cap(inode)) {
+			err = -EACCES;
+			goto setflags_out;
+		}
+		if (get_user(flags, (int __user *) arg)) {
+			err = -EFAULT;
+			goto setflags_out;
+		}
 
 		flags = jfs_map_ext2(flags, 1);
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~JFS_DIRSYNC_FL;
 
 		/* Is it quota file? Do not allow user to mess with it */
-		if (IS_NOQUOTA(inode))
-			return -EPERM;
+		if (IS_NOQUOTA(inode)) {
+			err = -EPERM;
+			goto setflags_out;
+		}
 
 		/* Lock against other parallel changes of flags */
 		mutex_lock(&inode->i_mutex);
@@ -98,7 +106,8 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
 			if (!capable(CAP_LINUX_IMMUTABLE)) {
 				mutex_unlock(&inode->i_mutex);
-				return -EPERM;
+				err = -EPERM;
+				goto setflags_out;
 			}
 		}
 
@@ -110,7 +119,9 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		mutex_unlock(&inode->i_mutex);
 		inode->i_ctime = CURRENT_TIME_SEC;
 		mark_inode_dirty(inode);
-		return 0;
+setflags_out:
+		mnt_drop_write(filp->f_path.mnt);
+		return err;
 	}
 	default:
 		return -ENOTTY;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index b413166dd163..7b142f0ce995 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -60,10 +60,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 		goto bail;
 	}
 
-	status = -EROFS;
-	if (IS_RDONLY(inode))
-		goto bail_unlock;
-
 	status = -EACCES;
 	if (!is_owner_or_cap(inode))
 		goto bail_unlock;
@@ -134,8 +130,13 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (get_user(flags, (int __user *) arg))
 			return -EFAULT;
 
-		return ocfs2_set_inode_attr(inode, flags,
+		status = mnt_want_write(filp->f_path.mnt);
+		if (status)
+			return status;
+		status = ocfs2_set_inode_attr(inode, flags,
 			OCFS2_FL_MODIFIABLE);
+		mnt_drop_write(filp->f_path.mnt);
+		return status;
 	case OCFS2_IOC_RESVSP:
 	case OCFS2_IOC_RESVSP64:
 	case OCFS2_IOC_UNRESVSP:
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index e0f0f098a523..74363a7aacbc 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -4,6 +4,7 @@
 
 #include <linux/capability.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -25,6 +26,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		   unsigned long arg)
 {
 	unsigned int flags;
+	int err = 0;
 
 	switch (cmd) {
 	case REISERFS_IOC_UNPACK:
@@ -48,50 +50,67 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 			if (!reiserfs_attrs(inode->i_sb))
 				return -ENOTTY;
 
-			if (IS_RDONLY(inode))
-				return -EROFS;
+			err = mnt_want_write(filp->f_path.mnt);
+			if (err)
+				return err;
 
-			if (!is_owner_or_cap(inode))
-				return -EPERM;
-
-			if (get_user(flags, (int __user *)arg))
-				return -EFAULT;
-
-			/* Is it quota file? Do not allow user to mess with it. */
-			if (IS_NOQUOTA(inode))
-				return -EPERM;
+			if (!is_owner_or_cap(inode)) {
+				err = -EPERM;
+				goto setflags_out;
+			}
+			if (get_user(flags, (int __user *)arg)) {
+				err = -EFAULT;
+				goto setflags_out;
+			}
+			/*
+			 * Is it quota file? Do not allow user to mess with it
+			 */
+			if (IS_NOQUOTA(inode)) {
+				err = -EPERM;
+				goto setflags_out;
+			}
 			if (((flags ^ REISERFS_I(inode)->
 			      i_attrs) & (REISERFS_IMMUTABLE_FL |
 					  REISERFS_APPEND_FL))
-			    && !capable(CAP_LINUX_IMMUTABLE))
-				return -EPERM;
-
+			    && !capable(CAP_LINUX_IMMUTABLE)) {
+				err = -EPERM;
+				goto setflags_out;
+			}
 			if ((flags & REISERFS_NOTAIL_FL) &&
 			    S_ISREG(inode->i_mode)) {
 				int result;
 
 				result = reiserfs_unpack(inode, filp);
-				if (result)
-					return result;
+				if (result) {
+					err = result;
+					goto setflags_out;
+				}
 			}
 			sd_attrs_to_i_attrs(flags, inode);
 			REISERFS_I(inode)->i_attrs = flags;
 			inode->i_ctime = CURRENT_TIME_SEC;
 			mark_inode_dirty(inode);
-			return 0;
+setflags_out:
+			mnt_drop_write(filp->f_path.mnt);
+			return err;
 		}
 	case REISERFS_IOC_GETVERSION:
 		return put_user(inode->i_generation, (int __user *)arg);
 	case REISERFS_IOC_SETVERSION:
 		if (!is_owner_or_cap(inode))
 			return -EPERM;
-		if (IS_RDONLY(inode))
-			return -EROFS;
-		if (get_user(inode->i_generation, (int __user *)arg))
-			return -EFAULT;
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+		if (get_user(inode->i_generation, (int __user *)arg)) {
+			err = -EFAULT;
+			goto setversion_out;
+		}
 		inode->i_ctime = CURRENT_TIME_SEC;
 		mark_inode_dirty(inode);
-		return 0;
+setversion_out:
+		mnt_drop_write(filp->f_path.mnt);
+		return err;
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index bf7759793856..4ddb86b73c6b 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -535,8 +535,6 @@ xfs_attrmulti_attr_set(
 	char			*kbuf;
 	int			error = EFAULT;
 
-	if (IS_RDONLY(inode))
-		return -EROFS;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		return EPERM;
 	if (len > XATTR_SIZE_MAX)
@@ -562,8 +560,6 @@ xfs_attrmulti_attr_remove(
 	char			*name,
 	__uint32_t		flags)
 {
-	if (IS_RDONLY(inode))
-		return -EROFS;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		return EPERM;
 	return xfs_attr_remove(XFS_I(inode), name, flags);
@@ -573,6 +569,7 @@ STATIC int
 xfs_attrmulti_by_handle(
 	xfs_mount_t		*mp,
 	void			__user *arg,
+	struct file		*parfilp,
 	struct inode		*parinode)
 {
 	int			error;
@@ -626,13 +623,21 @@ xfs_attrmulti_by_handle(
 					&ops[i].am_length, ops[i].am_flags);
 			break;
 		case ATTR_OP_SET:
+			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+			if (ops[i].am_error)
+				break;
 			ops[i].am_error = xfs_attrmulti_attr_set(inode,
 					attr_name, ops[i].am_attrvalue,
 					ops[i].am_length, ops[i].am_flags);
+			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		case ATTR_OP_REMOVE:
+			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+			if (ops[i].am_error)
+				break;
 			ops[i].am_error = xfs_attrmulti_attr_remove(inode,
 					attr_name, ops[i].am_flags);
+			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		default:
 			ops[i].am_error = EINVAL;
@@ -1133,7 +1138,7 @@ xfs_ioctl(
 		return xfs_attrlist_by_handle(mp, arg, inode);
 
 	case XFS_IOC_ATTRMULTI_BY_HANDLE:
-		return xfs_attrmulti_by_handle(mp, arg, inode);
+		return xfs_attrmulti_by_handle(mp, arg, filp, inode);
 
 	case XFS_IOC_SWAPEXT: {
 		error = xfs_swapext((struct xfs_swapext __user *)arg);
-- 
cgit v1.2.3


From 4a3fd211ccfc08a88edc824300e25a87785c6a5f Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:48 -0800
Subject: [PATCH] r/o bind mounts: elevate write count for open()s

This is the first really tricky patch in the series.  It elevates the writer
count on a mount each time a non-special file is opened for write.

We used to do this in may_open(), but Miklos pointed out that __dentry_open()
is used as well to create filps.  This will cover even those cases, while a
call in may_open() would not have.

There is also an elevated count around the vfs_create() call in open_namei().
See the comments for more details, but we need this to fix a 'create, remount,
fail r/w open()' race.

Some filesystems forego the use of normal vfs calls to create
struct files.   Make sure that these users elevate the mnt
writer count because they will get __fput(), and we need
to make sure they're balanced.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c | 14 +++++++++++
 fs/namei.c      | 75 +++++++++++++++++++++++++++++++++++++++++++++++++--------
 fs/open.c       | 36 +++++++++++++++++++++++++--
 ipc/mqueue.c    | 16 ++++++++++--
 4 files changed, 127 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index 3f73eb1f195a..71efc7000226 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -199,6 +199,17 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
 	file->f_mapping = dentry->d_inode->i_mapping;
 	file->f_mode = mode;
 	file->f_op = fop;
+
+	/*
+	 * These mounts don't really matter in practice
+	 * for r/o bind mounts.  They aren't userspace-
+	 * visible.  We do this for consistency, and so
+	 * that we can do debugging checks at __fput()
+	 */
+	if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
+		error = mnt_want_write(mnt);
+		WARN_ON(error);
+	}
 	return error;
 }
 EXPORT_SYMBOL(init_file);
@@ -221,10 +232,13 @@ EXPORT_SYMBOL(fput);
  */
 void drop_file_write_access(struct file *file)
 {
+	struct vfsmount *mnt = file->f_path.mnt;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 
 	put_write_access(inode);
+	if (!special_file(inode->i_mode))
+		mnt_drop_write(mnt);
 }
 EXPORT_SYMBOL_GPL(drop_file_write_access);
 
diff --git a/fs/namei.c b/fs/namei.c
index 83c843b3fea3..e179f71bfcb0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 			return -EACCES;
 
 		flag &= ~O_TRUNC;
-	} else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
-		return -EROFS;
+	}
 
 	error = vfs_permission(nd, acc_mode);
 	if (error)
@@ -1724,18 +1723,32 @@ static inline int open_to_namei_flags(int flag)
 	return flag;
 }
 
+static int open_will_write_to_fs(int flag, struct inode *inode)
+{
+	/*
+	 * We'll never write to the fs underlying
+	 * a device file.
+	 */
+	if (special_file(inode->i_mode))
+		return 0;
+	return (flag & O_TRUNC);
+}
+
 /*
- * Note that the low bits of "flag" aren't the same as in the open
- * system call.  See open_to_namei_flags().
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
  */
 struct file *do_filp_open(int dfd, const char *pathname,
 		int open_flag, int mode)
 {
+	struct file *filp;
 	struct nameidata nd;
 	int acc_mode, error;
 	struct path path;
 	struct dentry *dir;
 	int count = 0;
+	int will_write;
 	int flag = open_to_namei_flags(open_flag);
 
 	acc_mode = ACC_MODE(flag);
@@ -1791,17 +1804,30 @@ do_last:
 	}
 
 	if (IS_ERR(nd.intent.open.file)) {
-		mutex_unlock(&dir->d_inode->i_mutex);
 		error = PTR_ERR(nd.intent.open.file);
-		goto exit_dput;
+		goto exit_mutex_unlock;
 	}
 
 	/* Negative dentry, just create the file */
 	if (!path.dentry->d_inode) {
-		error = __open_namei_create(&nd, &path, flag, mode);
+		/*
+		 * This write is needed to ensure that a
+		 * ro->rw transition does not occur between
+		 * the time when the file is created and when
+		 * a permanent write count is taken through
+		 * the 'struct file' in nameidata_to_filp().
+		 */
+		error = mnt_want_write(nd.path.mnt);
 		if (error)
+			goto exit_mutex_unlock;
+		error = __open_namei_create(&nd, &path, flag, mode);
+		if (error) {
+			mnt_drop_write(nd.path.mnt);
 			goto exit;
-		return nameidata_to_filp(&nd, open_flag);
+		}
+		filp = nameidata_to_filp(&nd, open_flag);
+		mnt_drop_write(nd.path.mnt);
+		return filp;
 	}
 
 	/*
@@ -1831,11 +1857,40 @@ do_last:
 	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
 		goto exit;
 ok:
+	/*
+	 * Consider:
+	 * 1. may_open() truncates a file
+	 * 2. a rw->ro mount transition occurs
+	 * 3. nameidata_to_filp() fails due to
+	 *    the ro mount.
+	 * That would be inconsistent, and should
+	 * be avoided. Taking this mnt write here
+	 * ensures that (2) can not occur.
+	 */
+	will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
+	if (will_write) {
+		error = mnt_want_write(nd.path.mnt);
+		if (error)
+			goto exit;
+	}
 	error = may_open(&nd, acc_mode, flag);
-	if (error)
+	if (error) {
+		if (will_write)
+			mnt_drop_write(nd.path.mnt);
 		goto exit;
-	return nameidata_to_filp(&nd, open_flag);
+	}
+	filp = nameidata_to_filp(&nd, open_flag);
+	/*
+	 * It is now safe to drop the mnt write
+	 * because the filp has had a write taken
+	 * on its behalf.
+	 */
+	if (will_write)
+		mnt_drop_write(nd.path.mnt);
+	return filp;
 
+exit_mutex_unlock:
+	mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
 	path_put_conditional(&path, &nd);
 exit:
diff --git a/fs/open.c b/fs/open.c
index 8111947905d8..e12f17010324 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -730,6 +730,35 @@ out:
 	return error;
 }
 
+/*
+ * You have to be very careful that these write
+ * counts get cleaned up in error cases and
+ * upon __fput().  This should probably never
+ * be called outside of __dentry_open().
+ */
+static inline int __get_file_write_access(struct inode *inode,
+					  struct vfsmount *mnt)
+{
+	int error;
+	error = get_write_access(inode);
+	if (error)
+		return error;
+	/*
+	 * Do not take mount writer counts on
+	 * special files since no writes to
+	 * the mount itself will occur.
+	 */
+	if (!special_file(inode->i_mode)) {
+		/*
+		 * Balanced in __fput()
+		 */
+		error = mnt_want_write(mnt);
+		if (error)
+			put_write_access(inode);
+	}
+	return error;
+}
+
 static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 					int flags, struct file *f,
 					int (*open)(struct inode *, struct file *))
@@ -742,7 +771,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 				FMODE_PREAD | FMODE_PWRITE;
 	inode = dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
-		error = get_write_access(inode);
+		error = __get_file_write_access(inode, mnt);
 		if (error)
 			goto cleanup_file;
 	}
@@ -784,8 +813,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 
 cleanup_all:
 	fops_put(f->f_op);
-	if (f->f_mode & FMODE_WRITE)
+	if (f->f_mode & FMODE_WRITE) {
 		put_write_access(inode);
+		if (!special_file(inode->i_mode))
+			mnt_drop_write(mnt);
+	}
 	file_kill(f);
 	f->f_path.dentry = NULL;
 	f->f_path.mnt = NULL;
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 34262c11f480..94fd3b08fb77 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -598,6 +598,7 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,
 			int oflag, mode_t mode, struct mq_attr __user *u_attr)
 {
 	struct mq_attr attr;
+	struct file *result;
 	int ret;
 
 	if (u_attr) {
@@ -612,13 +613,24 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,
 	}
 
 	mode &= ~current->fs->umask;
+	ret = mnt_want_write(mqueue_mnt);
+	if (ret)
+		goto out;
 	ret = vfs_create(dir->d_inode, dentry, mode, NULL);
 	dentry->d_fsdata = NULL;
 	if (ret)
-		goto out;
+		goto out_drop_write;
 
-	return dentry_open(dentry, mqueue_mnt, oflag);
+	result = dentry_open(dentry, mqueue_mnt, oflag);
+	/*
+	 * dentry_open() took a persistent mnt_want_write(),
+	 * so we can now drop this one.
+	 */
+	mnt_drop_write(mqueue_mnt);
+	return result;
 
+out_drop_write:
+	mnt_drop_write(mqueue_mnt);
 out:
 	dput(dentry);
 	mntput(mqueue_mnt);
-- 
cgit v1.2.3


From 2af482a7edfb8810539cacc2fdd8242611ca43bb Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:50 -0800
Subject: [PATCH] r/o bind mounts: elevate write count for chmod/chown callers

chown/chmod,etc...  don't call permission in the same way that the normal
"open for write" calls do.  They still write to the filesystem, so bump the
write count during these operations.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index e12f17010324..e2df8fd1eb0b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -567,12 +567,12 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 
 	audit_inode(NULL, dentry);
 
-	err = -EROFS;
-	if (IS_RDONLY(inode))
+	err = mnt_want_write(file->f_path.mnt);
+	if (err)
 		goto out_putf;
 	err = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto out_putf;
+		goto out_drop_write;
 	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
@@ -581,6 +581,8 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 	err = notify_change(dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
 
+out_drop_write:
+	mnt_drop_write(file->f_path.mnt);
 out_putf:
 	fput(file);
 out:
@@ -600,13 +602,13 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
 		goto out;
 	inode = nd.path.dentry->d_inode;
 
-	error = -EROFS;
-	if (IS_RDONLY(inode))
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
 		goto dput_and_out;
 
 	error = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto dput_and_out;
+		goto out_drop_write;
 
 	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
@@ -616,6 +618,8 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
 	error = notify_change(nd.path.dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
 
+out_drop_write:
+	mnt_drop_write(nd.path.mnt);
 dput_and_out:
 	path_put(&nd.path);
 out:
@@ -638,9 +642,6 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
 		printk(KERN_ERR "chown_common: NULL inode\n");
 		goto out;
 	}
-	error = -EROFS;
-	if (IS_RDONLY(inode))
-		goto out;
 	error = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		goto out;
@@ -671,7 +672,12 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
 	error = user_path_walk(filename, &nd);
 	if (error)
 		goto out;
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto out_release;
 	error = chown_common(nd.path.dentry, user, group);
+	mnt_drop_write(nd.path.mnt);
+out_release:
 	path_put(&nd.path);
 out:
 	return error;
@@ -691,7 +697,12 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
 	error = __user_walk_fd(dfd, filename, follow, &nd);
 	if (error)
 		goto out;
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto out_release;
 	error = chown_common(nd.path.dentry, user, group);
+	mnt_drop_write(nd.path.mnt);
+out_release:
 	path_put(&nd.path);
 out:
 	return error;
@@ -705,7 +716,12 @@ asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group
 	error = user_path_walk_link(filename, &nd);
 	if (error)
 		goto out;
+	error = mnt_want_write(nd.path.mnt);
+	if (error)
+		goto out_release;
 	error = chown_common(nd.path.dentry, user, group);
+	mnt_drop_write(nd.path.mnt);
+out_release:
 	path_put(&nd.path);
 out:
 	return error;
@@ -722,9 +738,14 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
 	if (!file)
 		goto out;
 
+	error = mnt_want_write(file->f_path.mnt);
+	if (error)
+		goto out_fput;
 	dentry = file->f_path.dentry;
 	audit_inode(NULL, dentry);
 	error = chown_common(dentry, user, group);
+	mnt_drop_write(file->f_path.mnt);
+out_fput:
 	fput(file);
 out:
 	return error;
-- 
cgit v1.2.3


From 9ac9b8474c39c3ae2c2b37d8e1f08db8a9146124 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:52 -0800
Subject: [PATCH] r/o bind mounts: write counts for truncate()

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index e2df8fd1eb0b..4d690e33446f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -244,21 +244,21 @@ static long do_sys_truncate(const char __user * path, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		goto dput_and_out;
 
-	error = vfs_permission(&nd, MAY_WRITE);
+	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto dput_and_out;
 
-	error = -EROFS;
-	if (IS_RDONLY(inode))
-		goto dput_and_out;
+	error = vfs_permission(&nd, MAY_WRITE);
+	if (error)
+		goto mnt_drop_write_and_out;
 
 	error = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto dput_and_out;
+		goto mnt_drop_write_and_out;
 
 	error = get_write_access(inode);
 	if (error)
-		goto dput_and_out;
+		goto mnt_drop_write_and_out;
 
 	/*
 	 * Make sure that there are no leases.  get_write_access() protects
@@ -276,6 +276,8 @@ static long do_sys_truncate(const char __user * path, loff_t length)
 
 put_write_and_out:
 	put_write_access(inode);
+mnt_drop_write_and_out:
+	mnt_drop_write(nd.path.mnt);
 dput_and_out:
 	path_put(&nd.path);
 out:
-- 
cgit v1.2.3


From 2f676cbc0d60ae806216c7a61c6971bd72dedde8 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:55 -0800
Subject: [PATCH] r/o bind mounts: make access() use new r/o helper

It is OK to let access() go without using a mnt_want/drop_write() pair because
it doesn't actually do writes to the filesystem, and it is inherently racy
anyway.  This is a rare case when it is OK to use __mnt_is_readonly()
directly.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 4d690e33446f..e58382d57e72 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -459,8 +459,17 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 	if(res || !(mode & S_IWOTH) ||
 	   special_file(nd.path.dentry->d_inode->i_mode))
 		goto out_path_release;
-
-	if(IS_RDONLY(nd.path.dentry->d_inode))
+	/*
+	 * This is a rare case where using __mnt_is_readonly()
+	 * is OK without a mnt_want/drop_write() pair.  Since
+	 * no actual write to the fs is performed here, we do
+	 * not need to telegraph to that to anyone.
+	 *
+	 * By doing this, we accept that this access is
+	 * inherently racy and know that the fs may change
+	 * state before we even see this result.
+	 */
+	if (__mnt_is_readonly(nd.path.mnt))
 		res = -EROFS;
 
 out_path_release:
-- 
cgit v1.2.3


From ec82687f29127a954dd0da95dc1e0a4ce92b560c Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:53 -0800
Subject: [PATCH] r/o bind mounts: elevate count for xfs timestamp updates

Elevate the write count during the xfs m/ctime updates.

XFS has to do it's own timestamp updates due to an unfortunate VFS
design limitation, so it will have to track writers by itself aswell.

[hch: split out from the touch_atime patch as it's not related to it at all]

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/xfs/linux-2.6/xfs_iops.c | 7 -------
 fs/xfs/linux-2.6/xfs_lrw.c  | 9 ++++++++-
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 0c958cf77758..a1237dad6430 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -155,13 +155,6 @@ xfs_ichgtime_fast(
 	 */
 	ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
 
-	/*
-	 * We're not supposed to change timestamps in readonly-mounted
-	 * filesystems.  Throw it away if anyone asks us.
-	 */
-	if (unlikely(IS_RDONLY(inode)))
-		return;
-
 	if (flags & XFS_ICHGTIME_MOD) {
 		tvp = &inode->i_mtime;
 		ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 21c0dbc74093..1ebd8004469c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,6 +51,7 @@
 #include "xfs_vnodeops.h"
 
 #include <linux/capability.h>
+#include <linux/mount.h>
 #include <linux/writeback.h>
 
 
@@ -670,10 +671,16 @@ start:
 	if (new_size > xip->i_size)
 		xip->i_new_size = new_size;
 
-	if (likely(!(ioflags & IO_INVIS))) {
+	/*
+	 * We're not supposed to change timestamps in readonly-mounted
+	 * filesystems.  Throw it away if anyone asks us.
+	 */
+	if (likely(!(ioflags & IO_INVIS) &&
+		   !mnt_want_write(file->f_path.mnt))) {
 		file_update_time(file);
 		xfs_ichgtime_fast(xip, inode,
 				  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		mnt_drop_write(file->f_path.mnt);
 	}
 
 	/*
-- 
cgit v1.2.3


From 2c463e95480829a2fe8f386589516e13b1289db6 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:56 -0800
Subject: [PATCH] r/o bind mounts: check mnt instead of superblock directly

If we depend on the inodes for writeability, we will not catch the r/o mounts
when implemented.

This patches uses __mnt_want_write().  It does not guarantee that the mount
will stay writeable after the check.  But, this is OK for one of the checks
because it is just for a printk().

The other two are probably unnecessary and duplicate existing checks in the
VFS.  This won't make them better checks than before, but it will make them
detect r/o mounts.

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/dir.c  | 3 ++-
 fs/nfsd/vfs.c | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6cea7479c5b4..d9e30ac2798d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -967,7 +967,8 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd)
 	if (nd->flags & LOOKUP_DIRECTORY)
 		return 0;
 	/* Are we trying to write to a read only partition? */
-	if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
+	if (__mnt_is_readonly(nd->path.mnt) &&
+	    (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
 		return 0;
 	return 1;
 }
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 626dfd38528f..304bf5f643c9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1912,7 +1912,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 		inode->i_mode,
 		IS_IMMUTABLE(inode)?	" immut" : "",
 		IS_APPEND(inode)?	" append" : "",
-		IS_RDONLY(inode)?	" ro" : "");
+		__mnt_is_readonly(exp->ex_path.mnt)?	" ro" : "");
 	dprintk("      owner %d/%d user %d/%d\n",
 		inode->i_uid, inode->i_gid, current->fsuid, current->fsgid);
 #endif
@@ -1923,7 +1923,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	 */
 	if (!(acc & MAY_LOCAL_ACCESS))
 		if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
-			if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode))
+			if (exp_rdonly(rqstp, exp) ||
+			    __mnt_is_readonly(exp->ex_path.mnt))
 				return nfserr_rofs;
 			if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
 				return nfserr_perm;
-- 
cgit v1.2.3


From 3d733633a633065729c9e4e254b2e5442c00ef7e Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:37:59 -0800
Subject: [PATCH] r/o bind mounts: track numbers of writers to mounts

This is the real meat of the entire series.  It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time.  Even an atomic_t in the mnt has massive scalaing
problems because the cacheline gets so terribly contended.

This uses a statically-allocated percpu variable.  All want/drop
operations are local to a cpu as long that cpu operates on the same
mount, and there are no writer count imbalances.  Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two

Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.

I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.

http://sr71.net/~dave/linux/openbench.c

The code in here is a a worst-possible case for this patch.  It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely.  This worst-case scenario causes a 3%
degredation in the benchmark.

I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory.  In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.

(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable.  So, instead of a CPU getting operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)

[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 252 +++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/mount.h |   7 ++
 2 files changed, 244 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 066b393578c1..e3ce18d91aad 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 	return tmp & (HASH_SIZE - 1);
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
+		atomic_set(&mnt->__mnt_writers, 0);
 		if (name) {
 			int size = strlen(name) + 1;
 			char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +84,92 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	return mnt;
 }
 
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly ouside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*.  This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+	return (mnt->mnt_sb->s_flags & MS_RDONLY);
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	struct lock_class_key lock_class; /* compiles out with !lockdep */
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
+{
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		lockdep_set_class(&writer->lock, &writer->lock_class);
+		writer->count = 0;
+	}
+	return 0;
+}
+fs_initcall(init_mnt_writers);
+
+static void unlock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_unlock(&cpu_writer->lock);
+	}
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	/*
+	 * This is in case anyone ever leaves an invalid,
+	 * old ->mnt and a count of 0.
+	 */
+	if (!cpu_writer->count)
+		return;
+	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+	cpu_writer->count = 0;
+}
+ /*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+					  struct vfsmount *mnt)
+{
+	if (cpu_writer->mnt == mnt)
+		return;
+	__clear_mnt_count(cpu_writer);
+	cpu_writer->mnt = mnt;
+}
+
 /*
  * Most r/o checks on a fs are for operations that take
  * discrete amounts of time, like a write() or unlink().
@@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name)
  */
 int mnt_want_write(struct vfsmount *mnt)
 {
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
-	return 0;
+	int ret = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	cpu_writer->count++;
+out:
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
+static void lock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = NULL;
+	}
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+	if (atomic_read(&mnt->__mnt_writers) >=
+	    MNT_WRITER_UNDERFLOW_LIMIT)
+		return;
+	/*
+	 * It isn't necessary to hold all of the locks
+	 * at the same time, but doing it this way makes
+	 * us share a lot more code.
+	 */
+	lock_mnt_writers();
+	/*
+	 * vfsmount_lock is for mnt_flags.
+	 */
+	spin_lock(&vfsmount_lock);
+	/*
+	 * If coalescing the per-cpu writer counts did not
+	 * get us back to a positive writer count, we have
+	 * a bug.
+	 */
+	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+		printk(KERN_DEBUG "leak detected on mount(%p) writers "
+				"count: %d\n",
+			mnt, atomic_read(&mnt->__mnt_writers));
+		WARN_ON(1);
+		/* use the flag to keep the dmesg spam down */
+		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+	}
+	spin_unlock(&vfsmount_lock);
+	unlock_mnt_writers();
+}
+
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
+	int must_check_underflow = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	if (cpu_writer->count > 0) {
+		cpu_writer->count--;
+	} else {
+		must_check_underflow = 1;
+		atomic_dec(&mnt->__mnt_writers);
+	}
+
+	spin_unlock(&cpu_writer->lock);
+	/*
+	 * Logically, we could call this each time,
+	 * but the __mnt_writers cacheline tends to
+	 * be cold, and makes this expensive.
+	 */
+	if (must_check_underflow)
+		handle_write_count_underflow(mnt);
+	/*
+	 * This could be done right after the spinlock
+	 * is taken because the spinlock keeps us on
+	 * the cpu, and disables preemption.  However,
+	 * putting it here bounds the amount that
+	 * __mnt_writers can underflow.  Without it,
+	 * we could theoretically wrap __mnt_writers.
+	 */
+	put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-/*
- * __mnt_is_readonly: check whether a mount is read-only
- * @mnt: the mount to check for its write status
- *
- * This shouldn't be used directly ouside of the VFS.
- * It does not guarantee that the filesystem will stay
- * r/w, just that it is right *now*.  This can not and
- * should not be used in place of IS_RDONLY(inode).
- */
-int __mnt_is_readonly(struct vfsmount *mnt)
+int mnt_make_readonly(struct vfsmount *mnt)
 {
-	return (mnt->mnt_sb->s_flags & MS_RDONLY);
+	int ret = 0;
+
+	lock_mnt_writers();
+	/*
+	 * With all the locks held, this value is stable
+	 */
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/*
+	 * actually set mount's r/o flag here to make
+	 * __mnt_is_readonly() true, which keeps anyone
+	 * from doing a successful mnt_want_write().
+	 */
+out:
+	unlock_mnt_writers();
+	return ret;
 }
-EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
@@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
+	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
+	/*
+	 * We don't have to hold all of the locks at the
+	 * same time here because we know that we're the
+	 * last reference to mnt and that no new writers
+	 * can come in.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+		if (cpu_writer->mnt != mnt)
+			continue;
+		spin_lock(&cpu_writer->lock);
+		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+		cpu_writer->count = 0;
+		/*
+		 * Might as well do this so that no one
+		 * ever sees the pointer and expects
+		 * it to be valid.
+		 */
+		cpu_writer->mnt = NULL;
+		spin_unlock(&cpu_writer->lock);
+	}
+	/*
+	 * This probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair.  If this
+	 * happens, the filesystem was probably unable
+	 * to make r/w->r/o transitions.
+	 */
+	WARN_ON(atomic_read(&mnt->__mnt_writers));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 2eecd2c8c760..8c8e94369ac8 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
@@ -30,6 +31,7 @@ struct mnt_namespace;
 #define MNT_RELATIME	0x20
 
 #define MNT_SHRINKABLE	0x100
+#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -62,6 +64,11 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
+	/*
+	 * This value is not stable unless all of the mnt_writers[] spinlocks
+	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
+	 */
+	atomic_t __mnt_writers;
 };
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
-- 
cgit v1.2.3


From 2e4b7fcd926006531935a4c79a5e9349fe51125b Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:38:00 -0800
Subject: [PATCH] r/o bind mounts: honor mount writer counts at remount

Originally from: Herbert Poetzl <herbert@13thfloor.at>

This is the core of the read-only bind mount patch set.

Note that this does _not_ add a "ro" option directly to the bind mount
operation.  If you require such a mount, you must first do the bind, then
follow it up with a 'mount -o remount,ro' operation:

If you wish to have a r/o bind mount of /foo on bar:

	mount --bind /foo /bar
	mount -o remount,ro /bar

Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/mount.h |  1 +
 2 files changed, 44 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index e3ce18d91aad..678f7ce060f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -105,7 +105,11 @@ struct vfsmount *alloc_vfsmnt(const char *name)
  */
 int __mnt_is_readonly(struct vfsmount *mnt)
 {
-	return (mnt->mnt_sb->s_flags & MS_RDONLY);
+	if (mnt->mnt_flags & MNT_READONLY)
+		return 1;
+	if (mnt->mnt_sb->s_flags & MS_RDONLY)
+		return 1;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
@@ -305,7 +309,7 @@ void mnt_drop_write(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-int mnt_make_readonly(struct vfsmount *mnt)
+static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
@@ -318,15 +322,25 @@ int mnt_make_readonly(struct vfsmount *mnt)
 		goto out;
 	}
 	/*
-	 * actually set mount's r/o flag here to make
-	 * __mnt_is_readonly() true, which keeps anyone
-	 * from doing a successful mnt_want_write().
+	 * nobody can do a successful mnt_want_write() with all
+	 * of the counts in MNT_DENIED_WRITE and the locks held.
 	 */
+	spin_lock(&vfsmount_lock);
+	if (!ret)
+		mnt->mnt_flags |= MNT_READONLY;
+	spin_unlock(&vfsmount_lock);
 out:
 	unlock_mnt_writers();
 	return ret;
 }
 
+static void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags &= ~MNT_READONLY;
+	spin_unlock(&vfsmount_lock);
+}
+
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
@@ -693,7 +707,7 @@ static int show_vfsmnt(struct seq_file *m, void *v)
 		seq_putc(m, '.');
 		mangle(m, mnt->mnt_sb->s_subtype);
 	}
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
+	seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
 	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
 		if (mnt->mnt_sb->s_flags & fs_infop->flag)
 			seq_puts(m, fs_infop->str);
@@ -1295,6 +1309,23 @@ out:
 	return err;
 }
 
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+	int error = 0;
+	int readonly_request = 0;
+
+	if (ms_flags & MS_RDONLY)
+		readonly_request = 1;
+	if (readonly_request == __mnt_is_readonly(mnt))
+		return 0;
+
+	if (readonly_request)
+		error = mnt_make_readonly(mnt);
+	else
+		__mnt_unmake_readonly(mnt);
+	return error;
+}
+
 /*
  * change filesystem flags. dir should be a physical root of filesystem.
  * If you've mounted a non-root directory somewhere and want to do remount
@@ -1317,7 +1348,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
 		return -EINVAL;
 
 	down_write(&sb->s_umount);
-	err = do_remount_sb(sb, flags, data, 0);
+	if (flags & MS_BIND)
+		err = change_mount_flags(nd->path.mnt, flags);
+	else
+		err = do_remount_sb(sb, flags, data, 0);
 	if (!err)
 		nd->path.mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
@@ -1701,6 +1735,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		mnt_flags |= MNT_NODIRATIME;
 	if (flags & MS_RELATIME)
 		mnt_flags |= MNT_RELATIME;
+	if (flags & MS_RDONLY)
+		mnt_flags |= MNT_READONLY;
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 8c8e94369ac8..d6600e3f7e45 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -29,6 +29,7 @@ struct mnt_namespace;
 #define MNT_NOATIME	0x08
 #define MNT_NODIRATIME	0x10
 #define MNT_RELATIME	0x20
+#define MNT_READONLY	0x40	/* does the user want this to be r/o? */
 
 #define MNT_SHRINKABLE	0x100
 #define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
-- 
cgit v1.2.3


From ad775f5a8faa5845377f093ca11caf577404add9 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Fri, 15 Feb 2008 14:38:01 -0800
Subject: [PATCH] r/o bind mounts: debugging for missed calls

There have been a few oopses caused by 'struct file's with NULL f_vfsmnts.
There was also a set of potentially missed mnt_want_write()s from
dentry_open() calls.

This patch provides a very simple debugging framework to catch these kinds of
bugs.  It will WARN_ON() them, but should stop us from having any oopses or
mnt_writer count imbalances.

I'm quite convinced that this is a good thing because it found bugs in the
stuff I was working on as soon as I wrote it.

[hch: made it conditional on a debug option.
      But it's still a little bit too ugly]

[hch: merged forced remount r/o fix from Dave and akpm's fix for the fix]

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c    | 11 +++++++++--
 fs/open.c          | 12 +++++++++++-
 fs/super.c         |  3 +++
 include/linux/fs.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug  | 10 ++++++++++
 5 files changed, 82 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index 71efc7000226..7a0a9b872251 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -42,6 +42,7 @@ static inline void file_free_rcu(struct rcu_head *head)
 static inline void file_free(struct file *f)
 {
 	percpu_counter_dec(&nr_files);
+	file_check_state(f);
 	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
@@ -207,6 +208,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
 	 * that we can do debugging checks at __fput()
 	 */
 	if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
+		file_take_write(file);
 		error = mnt_want_write(mnt);
 		WARN_ON(error);
 	}
@@ -237,8 +239,13 @@ void drop_file_write_access(struct file *file)
 	struct inode *inode = dentry->d_inode;
 
 	put_write_access(inode);
-	if (!special_file(inode->i_mode))
-		mnt_drop_write(mnt);
+
+	if (special_file(inode->i_mode))
+		return;
+	if (file_check_writeable(file) != 0)
+		return;
+	mnt_drop_write(mnt);
+	file_release_write(file);
 }
 EXPORT_SYMBOL_GPL(drop_file_write_access);
 
diff --git a/fs/open.c b/fs/open.c
index e58382d57e72..b70e7666bb2c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -806,6 +806,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 		error = __get_file_write_access(inode, mnt);
 		if (error)
 			goto cleanup_file;
+		if (!special_file(inode->i_mode))
+			file_take_write(f);
 	}
 
 	f->f_mapping = inode->i_mapping;
@@ -847,8 +849,16 @@ cleanup_all:
 	fops_put(f->f_op);
 	if (f->f_mode & FMODE_WRITE) {
 		put_write_access(inode);
-		if (!special_file(inode->i_mode))
+		if (!special_file(inode->i_mode)) {
+			/*
+			 * We don't consider this a real
+			 * mnt_want/drop_write() pair
+			 * because it all happenend right
+			 * here, so just reset the state.
+			 */
+			file_reset_write(f);
 			mnt_drop_write(mnt);
+		}
 	}
 	file_kill(f);
 	f->f_path.dentry = NULL;
diff --git a/fs/super.c b/fs/super.c
index 01d5c40e9119..1f8f05ede437 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -579,6 +579,9 @@ retry:
 		if (!(f->f_mode & FMODE_WRITE))
 			continue;
 		f->f_mode &= ~FMODE_WRITE;
+		if (file_check_writeable(f) != 0)
+			continue;
+		file_release_write(f);
 		mnt = mntget(f->f_path.mnt);
 		file_list_unlock();
 		/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 013b9c2b88e6..d1eeea669d2c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -776,6 +776,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
 		index <  ra->start + ra->size);
 }
 
+#define FILE_MNT_WRITE_TAKEN	1
+#define FILE_MNT_WRITE_RELEASED	2
+
 struct file {
 	/*
 	 * fu_list becomes invalid after file_free is called and queued via
@@ -810,6 +813,9 @@ struct file {
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
 	struct address_space	*f_mapping;
+#ifdef CONFIG_DEBUG_WRITECOUNT
+	unsigned long f_mnt_write_state;
+#endif
 };
 extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
@@ -818,6 +824,49 @@ extern spinlock_t files_lock;
 #define get_file(x)	atomic_inc(&(x)->f_count)
 #define file_count(x)	atomic_read(&(x)->f_count)
 
+#ifdef CONFIG_DEBUG_WRITECOUNT
+static inline void file_take_write(struct file *f)
+{
+	WARN_ON(f->f_mnt_write_state != 0);
+	f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
+}
+static inline void file_release_write(struct file *f)
+{
+	f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED;
+}
+static inline void file_reset_write(struct file *f)
+{
+	f->f_mnt_write_state = 0;
+}
+static inline void file_check_state(struct file *f)
+{
+	/*
+	 * At this point, either both or neither of these bits
+	 * should be set.
+	 */
+	WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN);
+	WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED);
+}
+static inline int file_check_writeable(struct file *f)
+{
+	if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN)
+		return 0;
+	printk(KERN_WARNING "writeable file with no "
+			    "mnt_want_write()\n");
+	WARN_ON(1);
+	return -EINVAL;
+}
+#else /* !CONFIG_DEBUG_WRITECOUNT */
+static inline void file_take_write(struct file *filp) {}
+static inline void file_release_write(struct file *filp) {}
+static inline void file_reset_write(struct file *filp) {}
+static inline void file_check_state(struct file *filp) {}
+static inline int file_check_writeable(struct file *filp)
+{
+	return 0;
+}
+#endif /* CONFIG_DEBUG_WRITECOUNT */
+
 #define	MAX_NON_LFS	((1UL<<31) - 1)
 
 /* Page cache limit. The filesystems should put that into their s_maxbytes 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 95de3102bc87..623ef24c2381 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -427,6 +427,16 @@ config DEBUG_VM
 
 	  If unsure, say N.
 
+config DEBUG_WRITECOUNT
+	bool "Debug filesystem writers count"
+	depends on DEBUG_KERNEL
+	help
+	  Enable this to catch wrong use of the writers count in struct
+	  vfsmount.  This will increase the size of each file struct by
+	  32 bits.
+
+	  If unsure, say N.
+
 config DEBUG_LIST
 	bool "Debug linked list manipulation"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3


From 6bcf19d02a5d7e627fa054f2f10e0a8d830df326 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sun, 9 Mar 2008 21:26:02 +0100
Subject: block: send disk "change" event for rescan_partitions()

Userspace likes to get notified that the disk may have changed, when
rescan_partitions() is called after partitioning or media change. It will
make it possible to update the state of the disk with the "change" event,
before the following partition "add" events are handled.

Cc: David Zeuthen <david@fubar.dk>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/check.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 03f808c5b79d..6149e4b58c88 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -473,6 +473,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		return 0;
 	if (IS_ERR(state))	/* I/O error reading the partition table */
 		return -EIO;
+
+	/* tell userspace that the media / partition table may have changed */
+	kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
+
 	for (p = 1; p < state->limit; p++) {
 		sector_t size = state->parts[p].size;
 		sector_t from = state->parts[p].from;
-- 
cgit v1.2.3


From c6f87733823d69a8f12e391688ceeb1ff4922530 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Thu, 13 Mar 2008 22:41:52 -0400
Subject: SYSFS: Explicitly include required header file slab.h.

After an experimental deletion of the unnecessary inclusion of
<linux/slab.h> from the header file <linux/percpu.h>, the following
files under fs/sysfs were exposed as needing to explicitly include
<linux/slab.h>.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c  | 1 +
 fs/sysfs/file.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4948d9bc405d..a1c3a1fab7f0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -20,6 +20,7 @@
 #include <linux/idr.h>
 #include <linux/completion.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index baa663e69388..a859c32ff93a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/kobject.h>
 #include <linux/kallsyms.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/poll.h>
 #include <linux/list.h>
-- 
cgit v1.2.3


From 2424b5dd062cbe3e0578ae7b11a1b360ad22f451 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 7 Apr 2008 15:35:01 -0700
Subject: sysfs: refill attribute buffer when reading from offset 0

Requiring userspace to close and re-open sysfs attributes has been the
policy since before 2.6.12.  It allows userspace to get a consistent
snapshot of kernel state and consume it with incremental reads and seeks.

Now, if the file position is zero the kernel assumes userspace wants to see
the new value.  The application for this change is to allow a userspace
RAID metadata handler to check the state of an array without causing any
memory allocations.  Thus not causing writeback to a raid array that might
be blocked waiting for userspace to take action.

Cc: Neil Brown <neilb@suse.de>
Acked-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/filesystems/sysfs.txt | 9 +++++++--
 fs/sysfs/file.c                     | 5 ++---
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 4598ef7b622b..7f27b8f840d0 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -176,8 +176,10 @@ implementations:
   Recall that an attribute should only be exporting one value, or an
   array of similar values, so this shouldn't be that expensive. 
 
-  This allows userspace to do partial reads and seeks arbitrarily over
-  the entire file at will. 
+  This allows userspace to do partial reads and forward seeks
+  arbitrarily over the entire file at will. If userspace seeks back to
+  zero or does a pread(2) with an offset of '0' the show() method will
+  be called again, rearmed, to fill the buffer.
 
 - On write(2), sysfs expects the entire buffer to be passed during the
   first write. Sysfs then passes the entire buffer to the store()
@@ -192,6 +194,9 @@ implementations:
 
 Other notes:
 
+- Writing causes the show() method to be rearmed regardless of current
+  file position.
+
 - The buffer will always be PAGE_SIZE bytes in length. On i386, this
   is 4096. 
 
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index a859c32ff93a..ade9a7e6a757 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -129,7 +129,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	ssize_t retval = 0;
 
 	mutex_lock(&buffer->mutex);
-	if (buffer->needs_read_fill) {
+	if (buffer->needs_read_fill || *ppos == 0) {
 		retval = fill_read_buffer(file->f_path.dentry,buffer);
 		if (retval)
 			goto out;
@@ -410,8 +410,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
  * return POLLERR|POLLPRI, and select will return the fd whether
  * it is waiting for read, write, or exceptions.
  * Once poll/select indicates that the value has changed, you
- * need to close and re-open the file, as simply seeking and reading
- * again will not get new data, or reset the state of 'poll'.
+ * need to close and re-open the file, or seek to 0 and read again.
  * Reminder: this only works for attributes which actively support
  * it, and it is not possible to test an attribute from userspace
  * to see if it supports poll (Neither 'poll' nor 'select' return
-- 
cgit v1.2.3


From c5dec1c3034f1ae3503efbf641ff3b0273b64797 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 11 Apr 2008 12:56:49 +0200
Subject: block: convert bio_copy_user to bio_copy_user_iov

This patch enables bio_copy_user to take struct sg_iovec (renamed
bio_copy_user_iov). bio_copy_user uses bio_copy_user_iov internally as
bio_map_user uses bio_map_user_iov.

The major changes are:

- adds sg_iovec array to struct bio_map_data

- adds __bio_copy_iov that copy data between bio and
sg_iovec. bio_copy_user_iov and bio_uncopy_user use it.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c            | 158 ++++++++++++++++++++++++++++++++++++++--------------
 include/linux/bio.h |   2 +
 2 files changed, 119 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 553b5b7960ad..6e0b6f66df03 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -444,22 +444,27 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 
 struct bio_map_data {
 	struct bio_vec *iovecs;
-	void __user *userptr;
+	int nr_sgvecs;
+	struct sg_iovec *sgvecs;
 };
 
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
+static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
+			     struct sg_iovec *iov, int iov_count)
 {
 	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
+	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
+	bmd->nr_sgvecs = iov_count;
 	bio->bi_private = bmd;
 }
 
 static void bio_free_map_data(struct bio_map_data *bmd)
 {
 	kfree(bmd->iovecs);
+	kfree(bmd->sgvecs);
 	kfree(bmd);
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs)
+static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
 {
 	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);
 
@@ -467,13 +472,71 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
 		return NULL;
 
 	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
-	if (bmd->iovecs)
+	if (!bmd->iovecs) {
+		kfree(bmd);
+		return NULL;
+	}
+
+	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL);
+	if (bmd->sgvecs)
 		return bmd;
 
+	kfree(bmd->iovecs);
 	kfree(bmd);
 	return NULL;
 }
 
+static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
+			  int uncopy)
+{
+	int ret = 0, i;
+	struct bio_vec *bvec;
+	int iov_idx = 0;
+	unsigned int iov_off = 0;
+	int read = bio_data_dir(bio) == READ;
+
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		char *bv_addr = page_address(bvec->bv_page);
+		unsigned int bv_len = bvec->bv_len;
+
+		while (bv_len && iov_idx < iov_count) {
+			unsigned int bytes;
+			char *iov_addr;
+
+			bytes = min_t(unsigned int,
+				      iov[iov_idx].iov_len - iov_off, bv_len);
+			iov_addr = iov[iov_idx].iov_base + iov_off;
+
+			if (!ret) {
+				if (!read && !uncopy)
+					ret = copy_from_user(bv_addr, iov_addr,
+							     bytes);
+				if (read && uncopy)
+					ret = copy_to_user(iov_addr, bv_addr,
+							   bytes);
+
+				if (ret)
+					ret = -EFAULT;
+			}
+
+			bv_len -= bytes;
+			bv_addr += bytes;
+			iov_addr += bytes;
+			iov_off += bytes;
+
+			if (iov[iov_idx].iov_len == iov_off) {
+				iov_idx++;
+				iov_off = 0;
+			}
+		}
+
+		if (uncopy)
+			__free_page(bvec->bv_page);
+	}
+
+	return ret;
+}
+
 /**
  *	bio_uncopy_user	-	finish previously mapped bio
  *	@bio: bio being terminated
@@ -484,55 +547,56 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
 int bio_uncopy_user(struct bio *bio)
 {
 	struct bio_map_data *bmd = bio->bi_private;
-	const int read = bio_data_dir(bio) == READ;
-	struct bio_vec *bvec;
-	int i, ret = 0;
+	int ret;
 
-	__bio_for_each_segment(bvec, bio, i, 0) {
-		char *addr = page_address(bvec->bv_page);
-		unsigned int len = bmd->iovecs[i].bv_len;
+	ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
 
-		if (read && !ret && copy_to_user(bmd->userptr, addr, len))
-			ret = -EFAULT;
-
-		__free_page(bvec->bv_page);
-		bmd->userptr += len;
-	}
 	bio_free_map_data(bmd);
 	bio_put(bio);
 	return ret;
 }
 
 /**
- *	bio_copy_user	-	copy user data to bio
+ *	bio_copy_user_iov	-	copy user data to bio
  *	@q: destination block queue
- *	@uaddr: start of user address
- *	@len: length in bytes
+ *	@iov:	the iovec.
+ *	@iov_count: number of elements in the iovec
  *	@write_to_vm: bool indicating writing to pages or not
  *
  *	Prepares and returns a bio for indirect user io, bouncing data
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
-			  unsigned int len, int write_to_vm)
+struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
+			      int iov_count, int write_to_vm)
 {
-	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	unsigned long start = uaddr >> PAGE_SHIFT;
 	struct bio_map_data *bmd;
 	struct bio_vec *bvec;
 	struct page *page;
 	struct bio *bio;
 	int i, ret;
+	int nr_pages = 0;
+	unsigned int len = 0;
 
-	bmd = bio_alloc_map_data(end - start);
+	for (i = 0; i < iov_count; i++) {
+		unsigned long uaddr;
+		unsigned long end;
+		unsigned long start;
+
+		uaddr = (unsigned long)iov[i].iov_base;
+		end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		start = uaddr >> PAGE_SHIFT;
+
+		nr_pages += end - start;
+		len += iov[i].iov_len;
+	}
+
+	bmd = bio_alloc_map_data(nr_pages, iov_count);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
-	bmd->userptr = (void __user *) uaddr;
-
 	ret = -ENOMEM;
-	bio = bio_alloc(GFP_KERNEL, end - start);
+	bio = bio_alloc(GFP_KERNEL, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
@@ -564,22 +628,12 @@ struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
 	 * success
 	 */
 	if (!write_to_vm) {
-		char __user *p = (char __user *) uaddr;
-
-		/*
-		 * for a write, copy in data to kernel pages
-		 */
-		ret = -EFAULT;
-		bio_for_each_segment(bvec, bio, i) {
-			char *addr = page_address(bvec->bv_page);
-
-			if (copy_from_user(addr, p, bvec->bv_len))
-				goto cleanup;
-			p += bvec->bv_len;
-		}
+		ret = __bio_copy_iov(bio, iov, iov_count, 0);
+		if (ret)
+			goto cleanup;
 	}
 
-	bio_set_map_data(bmd, bio);
+	bio_set_map_data(bmd, bio, iov, iov_count);
 	return bio;
 cleanup:
 	bio_for_each_segment(bvec, bio, i)
@@ -591,6 +645,28 @@ out_bmd:
 	return ERR_PTR(ret);
 }
 
+/**
+ *	bio_copy_user	-	copy user data to bio
+ *	@q: destination block queue
+ *	@uaddr: start of user address
+ *	@len: length in bytes
+ *	@write_to_vm: bool indicating writing to pages or not
+ *
+ *	Prepares and returns a bio for indirect user io, bouncing data
+ *	to/from kernel pages as necessary. Must be paired with
+ *	call bio_uncopy_user() on io completion.
+ */
+struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
+			  unsigned int len, int write_to_vm)
+{
+	struct sg_iovec iov;
+
+	iov.iov_base = (void __user *)uaddr;
+	iov.iov_len = len;
+
+	return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+}
+
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
 				      struct sg_iovec *iov, int iov_count,
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4c59bdccd3ee..d259690863fb 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -327,6 +327,8 @@ extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int);
+extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *,
+				     int, int);
 extern int bio_uncopy_user(struct bio *);
 void zero_fill_bio(struct bio *bio);
 
-- 
cgit v1.2.3


From 5416b704aef0b7350073421f4f6ac1a21bc213e7 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 13 Feb 2008 16:54:29 -0800
Subject: dlm: match signedness between dlm_config_info and cluster_set

cluster_set is only called from the macro CLUSTER_ATTR which defines read/write
access functions.  Make the signedness match to avoid sparse warnings every time
CLUSTER_ATTR is used (lines 149-159) all of the form:

fs/dlm/config.c:149:1: warning: incorrect type in argument 3 (different signedness)
fs/dlm/config.c:149:1:    expected unsigned int *info_field
fs/dlm/config.c:149:1:    got int extern [toplevel] *<noident>

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index c3ad1dff3b25..7ceaea3d983b 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -114,7 +114,7 @@ struct cluster_attribute {
 };
 
 static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
-			   unsigned int *info_field, int check_zero,
+			   int *info_field, int check_zero,
 			   const char *buf, size_t len)
 {
 	unsigned int x;
-- 
cgit v1.2.3


From 170e19ab2900b7c959d7a0e627fd12f383efcfa1 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 13 Feb 2008 23:29:38 +0200
Subject: dlm: make dlm_print_rsb() static

dlm_print_rsb() can now become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 2 +-
 fs/dlm/lock.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 8f250ac8b928..1e9e8ebee251 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -165,7 +165,7 @@ void dlm_print_lkb(struct dlm_lkb *lkb)
 	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
 }
 
-void dlm_print_rsb(struct dlm_rsb *r)
+static void dlm_print_rsb(struct dlm_rsb *r)
 {
 	printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
 	       r->res_nodeid, r->res_flags, r->res_first_lkid,
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 05d9c82e646b..88e93c80cc22 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -13,7 +13,6 @@
 #ifndef __LOCK_DOT_H__
 #define __LOCK_DOT_H__
 
-void dlm_print_rsb(struct dlm_rsb *r);
 void dlm_dump_rsb(struct dlm_rsb *r);
 void dlm_print_lkb(struct dlm_lkb *lkb);
 void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
-- 
cgit v1.2.3


From 761b9d3ffc953c24ceb55d8e12ff7e02b17e0484 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 21 Feb 2008 11:25:42 -0600
Subject: dlm: save master info after failed no-queue request

When a NOQUEUE request fails, the rsb res_master field is unnecessarily
reset to -1, instead of leaving the valid master setting in place.  We
want to save the looked-up master values while the rsb is on the "toss
list" so that another lookup can be avoided if the rsb is soon reused.
The fix is to simply leave res_master value alone.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1e9e8ebee251..2d3d1027ce2b 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1956,8 +1956,7 @@ static void confirm_master(struct dlm_rsb *r, int error)
 			list_del_init(&lkb->lkb_rsb_lookup);
 			r->res_first_lkid = lkb->lkb_id;
 			_request_lock(r, lkb);
-		} else
-			r->res_nodeid = -1;
+		}
 		break;
 
 	default:
-- 
cgit v1.2.3


From d44e0fc704143624b3e88fbf8fbcfda7a83fd299 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 18 Mar 2008 14:22:11 -0500
Subject: dlm: recover nodes that are removed and re-added

If a node is removed from a lockspace, and then added back before the
dlm is notified of the removal, the dlm will not detect the removal
and won't clear the old state from the node.  This is fixed by using a
list of added nodes so the membership recovery can detect when a newly
added node is already in the member list.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/config.c       | 48 +++++++++++++++++++++++++++++++++++++++---------
 fs/dlm/config.h       |  3 ++-
 fs/dlm/dlm_internal.h |  4 +++-
 fs/dlm/member.c       | 34 +++++++++++++++++++++++++++++-----
 fs/dlm/recoverd.c     |  1 +
 5 files changed, 74 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 7ceaea3d983b..eac23bd288b2 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -284,6 +284,7 @@ struct node {
 	struct list_head list; /* space->members */
 	int nodeid;
 	int weight;
+	int new;
 };
 
 static struct configfs_group_operations clusters_ops = {
@@ -565,6 +566,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
 	config_item_init_type_name(&nd->item, name, &node_type);
 	nd->nodeid = -1;
 	nd->weight = 1;  /* default weight of 1 if none is set */
+	nd->new = 1;     /* set to 0 once it's been read by dlm_nodeid_list() */
 
 	mutex_lock(&sp->members_lock);
 	list_add(&nd->list, &sp->members);
@@ -805,12 +807,13 @@ static void put_comm(struct comm *cm)
 }
 
 /* caller must free mem */
-int dlm_nodeid_list(char *lsname, int **ids_out)
+int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
+		    int **new_out, int *new_count_out)
 {
 	struct space *sp;
 	struct node *nd;
-	int i = 0, rv = 0;
-	int *ids;
+	int i = 0, rv = 0, ids_count = 0, new_count = 0;
+	int *ids, *new;
 
 	sp = get_space(lsname);
 	if (!sp)
@@ -818,23 +821,50 @@ int dlm_nodeid_list(char *lsname, int **ids_out)
 
 	mutex_lock(&sp->members_lock);
 	if (!sp->members_count) {
-		rv = 0;
+		rv = -EINVAL;
+		printk(KERN_ERR "dlm: zero members_count\n");
 		goto out;
 	}
 
-	ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
+	ids_count = sp->members_count;
+
+	ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL);
 	if (!ids) {
 		rv = -ENOMEM;
 		goto out;
 	}
 
-	rv = sp->members_count;
-	list_for_each_entry(nd, &sp->members, list)
+	list_for_each_entry(nd, &sp->members, list) {
 		ids[i++] = nd->nodeid;
+		if (nd->new)
+			new_count++;
+	}
+
+	if (ids_count != i)
+		printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i);
+
+	if (!new_count)
+		goto out_ids;
+
+	new = kcalloc(new_count, sizeof(int), GFP_KERNEL);
+	if (!new) {
+		kfree(ids);
+		rv = -ENOMEM;
+		goto out;
+	}
 
-	if (rv != i)
-		printk("bad nodeid count %d %d\n", rv, i);
+	i = 0;
+	list_for_each_entry(nd, &sp->members, list) {
+		if (nd->new) {
+			new[i++] = nd->nodeid;
+			nd->new = 0;
+		}
+	}
+	*new_count_out = new_count;
+	*new_out = new;
 
+ out_ids:
+	*ids_count_out = ids_count;
 	*ids_out = ids;
  out:
 	mutex_unlock(&sp->members_lock);
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index a3170fe22090..4f1d6fce58c5 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -35,7 +35,8 @@ extern struct dlm_config_info dlm_config;
 int dlm_config_init(void);
 void dlm_config_exit(void);
 int dlm_node_weight(char *lsname, int nodeid);
-int dlm_nodeid_list(char *lsname, int **ids_out);
+int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
+		    int **new_out, int *new_count_out);
 int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
 int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
 int dlm_our_nodeid(void);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d30ea8b433a2..c70c8e58358f 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -133,8 +133,10 @@ struct dlm_member {
 
 struct dlm_recover {
 	struct list_head	list;
-	int			*nodeids;
+	int			*nodeids;   /* nodeids of all members */
 	int			node_count;
+	int			*new;       /* nodeids of new members */
+	int			new_count;
 	uint64_t		seq;
 };
 
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index fa17f5a27883..26133f05ae3a 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -210,6 +210,23 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 		}
 	}
 
+	/* Add an entry to ls_nodes_gone for members that were removed and
+	   then added again, so that previous state for these nodes will be
+	   cleared during recovery. */
+
+	for (i = 0; i < rv->new_count; i++) {
+		if (!dlm_is_member(ls, rv->new[i]))
+			continue;
+		log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
+
+		memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
+		if (!memb)
+			return -ENOMEM;
+		memb->nodeid = rv->new[i];
+		list_add_tail(&memb->list, &ls->ls_nodes_gone);
+		neg++;
+	}
+
 	/* add new members to ls_nodes */
 
 	for (i = 0; i < rv->node_count; i++) {
@@ -314,15 +331,16 @@ int dlm_ls_stop(struct dlm_ls *ls)
 int dlm_ls_start(struct dlm_ls *ls)
 {
 	struct dlm_recover *rv = NULL, *rv_old;
-	int *ids = NULL;
-	int error, count;
+	int *ids = NULL, *new = NULL;
+	int error, ids_count = 0, new_count = 0;
 
 	rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
 	if (!rv)
 		return -ENOMEM;
 
-	error = count = dlm_nodeid_list(ls->ls_name, &ids);
-	if (error <= 0)
+	error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count,
+				&new, &new_count);
+	if (error < 0)
 		goto fail;
 
 	spin_lock(&ls->ls_recover_lock);
@@ -337,14 +355,19 @@ int dlm_ls_start(struct dlm_ls *ls)
 	}
 
 	rv->nodeids = ids;
-	rv->node_count = count;
+	rv->node_count = ids_count;
+	rv->new = new;
+	rv->new_count = new_count;
 	rv->seq = ++ls->ls_recover_seq;
 	rv_old = ls->ls_recover_args;
 	ls->ls_recover_args = rv;
 	spin_unlock(&ls->ls_recover_lock);
 
 	if (rv_old) {
+		log_error(ls, "unused recovery %llx %d",
+			  (unsigned long long)rv_old->seq, rv_old->node_count);
 		kfree(rv_old->nodeids);
+		kfree(rv_old->new);
 		kfree(rv_old);
 	}
 
@@ -354,6 +377,7 @@ int dlm_ls_start(struct dlm_ls *ls)
  fail:
 	kfree(rv);
 	kfree(ids);
+	kfree(new);
 	return error;
 }
 
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 997f9531d594..fd677c8c3d3b 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -257,6 +257,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
 	if (rv) {
 		ls_recover(ls, rv);
 		kfree(rv->nodeids);
+		kfree(rv->new);
 		kfree(rv);
 	}
 }
-- 
cgit v1.2.3


From 2402211a8389282fd2942fad4511f02c0eeeffc5 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Fri, 14 Mar 2008 15:09:15 -0500
Subject: dlm: move plock code from gfs2

Move the code that handles cluster posix locks from gfs2 into the dlm
so that it can be used by both gfs2 and ocfs2.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/Makefile                |   1 +
 fs/dlm/dlm_internal.h          |   2 +
 fs/dlm/main.c                  |   7 +
 fs/dlm/plock.c                 | 439 +++++++++++++++++++++++++++++++++++++++++
 fs/gfs2/locking/dlm/Makefile   |   2 +-
 fs/gfs2/locking/dlm/lock_dlm.h |  12 +-
 fs/gfs2/locking/dlm/main.c     |   8 -
 fs/gfs2/locking/dlm/mount.c    |  21 ++
 fs/gfs2/locking/dlm/plock.c    | 406 -------------------------------------
 include/linux/Kbuild           |   2 +-
 include/linux/dlm_plock.h      |  50 +++++
 include/linux/lock_dlm_plock.h |  41 ----
 12 files changed, 523 insertions(+), 468 deletions(-)
 create mode 100644 fs/dlm/plock.c
 delete mode 100644 fs/gfs2/locking/dlm/plock.c
 create mode 100644 include/linux/dlm_plock.h
 delete mode 100644 include/linux/lock_dlm_plock.h

(limited to 'fs')

diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index d248e60951ba..ca1c9124c8ce 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -10,6 +10,7 @@ dlm-y :=			ast.o \
 				midcomms.o \
 				netlink.o \
 				lowcomms.o \
+				plock.o \
 				rcom.o \
 				recover.o \
 				recoverd.o \
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index c70c8e58358f..caa1581e158f 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -582,6 +582,8 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
 int dlm_netlink_init(void);
 void dlm_netlink_exit(void);
 void dlm_timeout_warn(struct dlm_lkb *lkb);
+int dlm_plock_init(void);
+void dlm_plock_exit(void);
 
 #ifdef CONFIG_DLM_DEBUG
 int dlm_register_debugfs(void);
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 58487fb95a4c..b80e0aa3cfa5 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -46,10 +46,16 @@ static int __init init_dlm(void)
 	if (error)
 		goto out_user;
 
+	error = dlm_plock_init();
+	if (error)
+		goto out_netlink;
+
 	printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
 
 	return 0;
 
+ out_netlink:
+	dlm_netlink_exit();
  out_user:
 	dlm_user_exit();
  out_debug:
@@ -66,6 +72,7 @@ static int __init init_dlm(void)
 
 static void __exit exit_dlm(void)
 {
+	dlm_plock_exit();
 	dlm_netlink_exit();
 	dlm_user_exit();
 	dlm_config_exit();
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
new file mode 100644
index 000000000000..d6d6e370f89c
--- /dev/null
+++ b/fs/dlm/plock.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+
+static spinlock_t ops_lock;
+static struct list_head send_list;
+static struct list_head recv_list;
+static wait_queue_head_t send_wq;
+static wait_queue_head_t recv_wq;
+
+struct plock_op {
+	struct list_head list;
+	int done;
+	struct dlm_plock_info info;
+};
+
+struct plock_xop {
+	struct plock_op xop;
+	void *callback;
+	void *fl;
+	void *file;
+	struct file_lock flc;
+};
+
+
+static inline void set_version(struct dlm_plock_info *info)
+{
+	info->version[0] = DLM_PLOCK_VERSION_MAJOR;
+	info->version[1] = DLM_PLOCK_VERSION_MINOR;
+	info->version[2] = DLM_PLOCK_VERSION_PATCH;
+}
+
+static int check_version(struct dlm_plock_info *info)
+{
+	if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
+	    (DLM_PLOCK_VERSION_MINOR < info->version[1])) {
+		log_print("plock device version mismatch: "
+			  "kernel (%u.%u.%u), user (%u.%u.%u)",
+			  DLM_PLOCK_VERSION_MAJOR,
+			  DLM_PLOCK_VERSION_MINOR,
+			  DLM_PLOCK_VERSION_PATCH,
+			  info->version[0],
+			  info->version[1],
+			  info->version[2]);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void send_op(struct plock_op *op)
+{
+	set_version(&op->info);
+	INIT_LIST_HEAD(&op->list);
+	spin_lock(&ops_lock);
+	list_add_tail(&op->list, &send_list);
+	spin_unlock(&ops_lock);
+	wake_up(&send_wq);
+}
+
+int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		   int cmd, struct file_lock *fl)
+{
+	struct dlm_ls *ls;
+	struct plock_op *op;
+	struct plock_xop *xop;
+	int rv;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	xop = kzalloc(sizeof(*xop), GFP_KERNEL);
+	if (!xop) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	op = &xop->xop;
+	op->info.optype		= DLM_PLOCK_OP_LOCK;
+	op->info.pid		= fl->fl_pid;
+	op->info.ex		= (fl->fl_type == F_WRLCK);
+	op->info.wait		= IS_SETLKW(cmd);
+	op->info.fsid		= ls->ls_global_id;
+	op->info.number		= number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+	if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
+		/* fl_owner is lockd which doesn't distinguish
+		   processes on the nfs client */
+		op->info.owner	= (__u64) fl->fl_pid;
+		xop->callback	= fl->fl_lmops->fl_grant;
+		locks_init_lock(&xop->flc);
+		locks_copy_lock(&xop->flc, fl);
+		xop->fl		= fl;
+		xop->file	= file;
+	} else {
+		op->info.owner	= (__u64)(long) fl->fl_owner;
+		xop->callback	= NULL;
+	}
+
+	send_op(op);
+
+	if (xop->callback == NULL)
+		wait_event(recv_wq, (op->done != 0));
+	else {
+		rv = -EINPROGRESS;
+		goto out;
+	}
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		log_error(ls, "dlm_posix_lock: op on list %llx",
+			  (unsigned long long)number);
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	rv = op->info.rv;
+
+	if (!rv) {
+		if (posix_lock_file_wait(file, fl) < 0)
+			log_error(ls, "dlm_posix_lock: vfs lock error %llx",
+				  (unsigned long long)number);
+	}
+
+	kfree(xop);
+out:
+	dlm_put_lockspace(ls);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_lock);
+
+/* Returns failure iff a succesful lock operation should be canceled */
+static int dlm_plock_callback(struct plock_op *op)
+{
+	struct file *file;
+	struct file_lock *fl;
+	struct file_lock *flc;
+	int (*notify)(void *, void *, int) = NULL;
+	struct plock_xop *xop = (struct plock_xop *)op;
+	int rv = 0;
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		log_print("dlm_plock_callback: op on list %llx",
+			  (unsigned long long)op->info.number);
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	/* check if the following 2 are still valid or make a copy */
+	file = xop->file;
+	flc = &xop->flc;
+	fl = xop->fl;
+	notify = xop->callback;
+
+	if (op->info.rv) {
+		notify(flc, NULL, op->info.rv);
+		goto out;
+	}
+
+	/* got fs lock; bookkeep locally as well: */
+	flc->fl_flags &= ~FL_SLEEP;
+	if (posix_lock_file(file, flc, NULL)) {
+		/*
+		 * This can only happen in the case of kmalloc() failure.
+		 * The filesystem's own lock is the authoritative lock,
+		 * so a failure to get the lock locally is not a disaster.
+		 * As long as the fs cannot reliably cancel locks (especially
+		 * in a low-memory situation), we're better off ignoring
+		 * this failure than trying to recover.
+		 */
+		log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p",
+			  (unsigned long long)op->info.number, file, fl);
+	}
+
+	rv = notify(flc, NULL, 0);
+	if (rv) {
+		/* XXX: We need to cancel the fs lock here: */
+		log_print("dlm_plock_callback: lock granted after lock request "
+			  "failed; dangling lock!\n");
+		goto out;
+	}
+
+out:
+	kfree(xop);
+	return rv;
+}
+
+int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		     struct file_lock *fl)
+{
+	struct dlm_ls *ls;
+	struct plock_op *op;
+	int rv;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (!op) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	if (posix_lock_file_wait(file, fl) < 0)
+		log_error(ls, "dlm_posix_unlock: vfs unlock error %llx",
+			  (unsigned long long)number);
+
+	op->info.optype		= DLM_PLOCK_OP_UNLOCK;
+	op->info.pid		= fl->fl_pid;
+	op->info.fsid		= ls->ls_global_id;
+	op->info.number		= number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+	if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+		op->info.owner	= (__u64) fl->fl_pid;
+	else
+		op->info.owner	= (__u64)(long) fl->fl_owner;
+
+	send_op(op);
+	wait_event(recv_wq, (op->done != 0));
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		log_error(ls, "dlm_posix_unlock: op on list %llx",
+			  (unsigned long long)number);
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	rv = op->info.rv;
+
+	if (rv == -ENOENT)
+		rv = 0;
+
+	kfree(op);
+out:
+	dlm_put_lockspace(ls);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_unlock);
+
+int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		  struct file_lock *fl)
+{
+	struct dlm_ls *ls;
+	struct plock_op *op;
+	int rv;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (!op) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	op->info.optype		= DLM_PLOCK_OP_GET;
+	op->info.pid		= fl->fl_pid;
+	op->info.ex		= (fl->fl_type == F_WRLCK);
+	op->info.fsid		= ls->ls_global_id;
+	op->info.number		= number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+	if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+		op->info.owner	= (__u64) fl->fl_pid;
+	else
+		op->info.owner	= (__u64)(long) fl->fl_owner;
+
+	send_op(op);
+	wait_event(recv_wq, (op->done != 0));
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		log_error(ls, "dlm_posix_get: op on list %llx",
+			  (unsigned long long)number);
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	/* info.rv from userspace is 1 for conflict, 0 for no-conflict,
+	   -ENOENT if there are no locks on the file */
+
+	rv = op->info.rv;
+
+	fl->fl_type = F_UNLCK;
+	if (rv == -ENOENT)
+		rv = 0;
+	else if (rv > 0) {
+		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+		fl->fl_pid = op->info.pid;
+		fl->fl_start = op->info.start;
+		fl->fl_end = op->info.end;
+		rv = 0;
+	}
+
+	kfree(op);
+out:
+	dlm_put_lockspace(ls);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_get);
+
+/* a read copies out one plock request from the send list */
+static ssize_t dev_read(struct file *file, char __user *u, size_t count,
+			loff_t *ppos)
+{
+	struct dlm_plock_info info;
+	struct plock_op *op = NULL;
+
+	if (count < sizeof(info))
+		return -EINVAL;
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&send_list)) {
+		op = list_entry(send_list.next, struct plock_op, list);
+		list_move(&op->list, &recv_list);
+		memcpy(&info, &op->info, sizeof(info));
+	}
+	spin_unlock(&ops_lock);
+
+	if (!op)
+		return -EAGAIN;
+
+	if (copy_to_user(u, &info, sizeof(info)))
+		return -EFAULT;
+	return sizeof(info);
+}
+
+/* a write copies in one plock result that should match a plock_op
+   on the recv list */
+static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
+			 loff_t *ppos)
+{
+	struct dlm_plock_info info;
+	struct plock_op *op;
+	int found = 0;
+
+	if (count != sizeof(info))
+		return -EINVAL;
+
+	if (copy_from_user(&info, u, sizeof(info)))
+		return -EFAULT;
+
+	if (check_version(&info))
+		return -EINVAL;
+
+	spin_lock(&ops_lock);
+	list_for_each_entry(op, &recv_list, list) {
+		if (op->info.fsid == info.fsid && op->info.number == info.number &&
+		    op->info.owner == info.owner) {
+			list_del_init(&op->list);
+			found = 1;
+			op->done = 1;
+			memcpy(&op->info, &info, sizeof(info));
+			break;
+		}
+	}
+	spin_unlock(&ops_lock);
+
+	if (found) {
+		struct plock_xop *xop;
+		xop = (struct plock_xop *)op;
+		if (xop->callback)
+			count = dlm_plock_callback(op);
+		else
+			wake_up(&recv_wq);
+	} else
+		log_print("dev_write no op %x %llx", info.fsid,
+			  (unsigned long long)info.number);
+	return count;
+}
+
+static unsigned int dev_poll(struct file *file, poll_table *wait)
+{
+	unsigned int mask = 0;
+
+	poll_wait(file, &send_wq, wait);
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&send_list))
+		mask = POLLIN | POLLRDNORM;
+	spin_unlock(&ops_lock);
+
+	return mask;
+}
+
+static const struct file_operations dev_fops = {
+	.read    = dev_read,
+	.write   = dev_write,
+	.poll    = dev_poll,
+	.owner   = THIS_MODULE
+};
+
+static struct miscdevice plock_dev_misc = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = DLM_PLOCK_MISC_NAME,
+	.fops = &dev_fops
+};
+
+int dlm_plock_init(void)
+{
+	int rv;
+
+	spin_lock_init(&ops_lock);
+	INIT_LIST_HEAD(&send_list);
+	INIT_LIST_HEAD(&recv_list);
+	init_waitqueue_head(&send_wq);
+	init_waitqueue_head(&recv_wq);
+
+	rv = misc_register(&plock_dev_misc);
+	if (rv)
+		log_print("dlm_plock_init: misc_register failed %d", rv);
+	return rv;
+}
+
+void dlm_plock_exit(void)
+{
+	if (misc_deregister(&plock_dev_misc) < 0)
+		log_print("dlm_plock_exit: misc_deregister failed");
+}
+
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
index 89b93b6b45cf..2609bb6cd013 100644
--- a/fs/gfs2/locking/dlm/Makefile
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
-lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
+lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o
 
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 58fcf8c5bf39..a243cf69c54e 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -25,6 +25,7 @@
 #include <net/sock.h>
 
 #include <linux/dlm.h>
+#include <linux/dlm_plock.h>
 #include <linux/lm_interface.h>
 
 /*
@@ -173,17 +174,6 @@ void gdlm_cancel(void *);
 int gdlm_hold_lvb(void *, char **);
 void gdlm_unhold_lvb(void *, char *);
 
-/* plock.c */
-
-int gdlm_plock_init(void);
-void gdlm_plock_exit(void);
-int gdlm_plock(void *, struct lm_lockname *, struct file *, int,
-		struct file_lock *);
-int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
-		struct file_lock *);
-int gdlm_punlock(void *, struct lm_lockname *, struct file *,
-		struct file_lock *);
-
 /* mount.c */
 
 extern const struct lm_lockops gdlm_ops;
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
index 36a225850bd8..b9a03a7ff801 100644
--- a/fs/gfs2/locking/dlm/main.c
+++ b/fs/gfs2/locking/dlm/main.c
@@ -28,13 +28,6 @@ static int __init init_lock_dlm(void)
 		return error;
 	}
 
-	error = gdlm_plock_init();
-	if (error) {
-		gdlm_sysfs_exit();
-		gfs2_unregister_lockproto(&gdlm_ops);
-		return error;
-	}
-
 	printk(KERN_INFO
 	       "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
 	return 0;
@@ -42,7 +35,6 @@ static int __init init_lock_dlm(void)
 
 static void __exit exit_lock_dlm(void)
 {
-	gdlm_plock_exit();
 	gdlm_sysfs_exit();
 	gfs2_unregister_lockproto(&gdlm_ops);
 }
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index f2efff424224..470bdf650b50 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -236,6 +236,27 @@ static void gdlm_withdraw(void *lockspace)
 	gdlm_kobject_release(ls);
 }
 
+static int gdlm_plock(void *lockspace, struct lm_lockname *name,
+	       struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gdlm_ls *ls = lockspace;
+	return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl);
+}
+
+static int gdlm_punlock(void *lockspace, struct lm_lockname *name,
+		 struct file *file, struct file_lock *fl)
+{
+	struct gdlm_ls *ls = lockspace;
+	return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl);
+}
+
+static int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
+		   struct file *file, struct file_lock *fl)
+{
+	struct gdlm_ls *ls = lockspace;
+	return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl);
+}
+
 const struct lm_lockops gdlm_ops = {
 	.lm_proto_name = "lock_dlm",
 	.lm_mount = gdlm_mount,
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
deleted file mode 100644
index 2ebd374b3143..000000000000
--- a/fs/gfs2/locking/dlm/plock.c
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/miscdevice.h>
-#include <linux/lock_dlm_plock.h>
-#include <linux/poll.h>
-
-#include "lock_dlm.h"
-
-
-static spinlock_t ops_lock;
-static struct list_head send_list;
-static struct list_head recv_list;
-static wait_queue_head_t send_wq;
-static wait_queue_head_t recv_wq;
-
-struct plock_op {
-	struct list_head list;
-	int done;
-	struct gdlm_plock_info info;
-};
-
-struct plock_xop {
-	struct plock_op xop;
-	void *callback;
-	void *fl;
-	void *file;
-	struct file_lock flc;
-};
-
-
-static inline void set_version(struct gdlm_plock_info *info)
-{
-	info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
-	info->version[1] = GDLM_PLOCK_VERSION_MINOR;
-	info->version[2] = GDLM_PLOCK_VERSION_PATCH;
-}
-
-static int check_version(struct gdlm_plock_info *info)
-{
-	if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
-	    (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
-		log_error("plock device version mismatch: "
-			  "kernel (%u.%u.%u), user (%u.%u.%u)",
-			  GDLM_PLOCK_VERSION_MAJOR,
-			  GDLM_PLOCK_VERSION_MINOR,
-			  GDLM_PLOCK_VERSION_PATCH,
-			  info->version[0],
-			  info->version[1],
-			  info->version[2]);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static void send_op(struct plock_op *op)
-{
-	set_version(&op->info);
-	INIT_LIST_HEAD(&op->list);
-	spin_lock(&ops_lock);
-	list_add_tail(&op->list, &send_list);
-	spin_unlock(&ops_lock);
-	wake_up(&send_wq);
-}
-
-int gdlm_plock(void *lockspace, struct lm_lockname *name,
-	       struct file *file, int cmd, struct file_lock *fl)
-{
-	struct gdlm_ls *ls = lockspace;
-	struct plock_op *op;
-	struct plock_xop *xop;
-	int rv;
-
-	xop = kzalloc(sizeof(*xop), GFP_KERNEL);
-	if (!xop)
-		return -ENOMEM;
-
-	op = &xop->xop;
-	op->info.optype		= GDLM_PLOCK_OP_LOCK;
-	op->info.pid		= fl->fl_pid;
-	op->info.ex		= (fl->fl_type == F_WRLCK);
-	op->info.wait		= IS_SETLKW(cmd);
-	op->info.fsid		= ls->id;
-	op->info.number		= name->ln_number;
-	op->info.start		= fl->fl_start;
-	op->info.end		= fl->fl_end;
-	if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
-		/* fl_owner is lockd which doesn't distinguish
-		   processes on the nfs client */
-		op->info.owner	= (__u64) fl->fl_pid;
-		xop->callback	= fl->fl_lmops->fl_grant;
-		locks_init_lock(&xop->flc);
-		locks_copy_lock(&xop->flc, fl);
-		xop->fl		= fl;
-		xop->file	= file;
-	} else {
-		op->info.owner	= (__u64)(long) fl->fl_owner;
-		xop->callback	= NULL;
-	}
-
-	send_op(op);
-
-	if (xop->callback == NULL)
-		wait_event(recv_wq, (op->done != 0));
-	else
-		return -EINPROGRESS;
-
-	spin_lock(&ops_lock);
-	if (!list_empty(&op->list)) {
-		printk(KERN_INFO "plock op on list\n");
-		list_del(&op->list);
-	}
-	spin_unlock(&ops_lock);
-
-	rv = op->info.rv;
-
-	if (!rv) {
-		if (posix_lock_file_wait(file, fl) < 0)
-			log_error("gdlm_plock: vfs lock error %x,%llx",
-				  name->ln_type,
-				  (unsigned long long)name->ln_number);
-	}
-
-	kfree(xop);
-	return rv;
-}
-
-/* Returns failure iff a succesful lock operation should be canceled */
-static int gdlm_plock_callback(struct plock_op *op)
-{
-	struct file *file;
-	struct file_lock *fl;
-	struct file_lock *flc;
-	int (*notify)(void *, void *, int) = NULL;
-	struct plock_xop *xop = (struct plock_xop *)op;
-	int rv = 0;
-
-	spin_lock(&ops_lock);
-	if (!list_empty(&op->list)) {
-		printk(KERN_INFO "plock op on list\n");
-		list_del(&op->list);
-	}
-	spin_unlock(&ops_lock);
-
-	/* check if the following 2 are still valid or make a copy */
-	file = xop->file;
-	flc = &xop->flc;
-	fl = xop->fl;
-	notify = xop->callback;
-
-	if (op->info.rv) {
-		notify(flc, NULL, op->info.rv);
-		goto out;
-	}
-
-	/* got fs lock; bookkeep locally as well: */
-	flc->fl_flags &= ~FL_SLEEP;
-	if (posix_lock_file(file, flc, NULL)) {
-		/*
-		 * This can only happen in the case of kmalloc() failure.
-		 * The filesystem's own lock is the authoritative lock,
-		 * so a failure to get the lock locally is not a disaster.
-		 * As long as GFS cannot reliably cancel locks (especially
-		 * in a low-memory situation), we're better off ignoring
-		 * this failure than trying to recover.
-		 */
-		log_error("gdlm_plock: vfs lock error file %p fl %p",
-				file, fl);
-	}
-
-	rv = notify(flc, NULL, 0);
-	if (rv) {
-		/* XXX: We need to cancel the fs lock here: */
-		printk("gfs2 lock granted after lock request failed;"
-						" dangling lock!\n");
-		goto out;
-	}
-
-out:
-	kfree(xop);
-	return rv;
-}
-
-int gdlm_punlock(void *lockspace, struct lm_lockname *name,
-		 struct file *file, struct file_lock *fl)
-{
-	struct gdlm_ls *ls = lockspace;
-	struct plock_op *op;
-	int rv;
-
-	op = kzalloc(sizeof(*op), GFP_KERNEL);
-	if (!op)
-		return -ENOMEM;
-
-	if (posix_lock_file_wait(file, fl) < 0)
-		log_error("gdlm_punlock: vfs unlock error %x,%llx",
-			  name->ln_type, (unsigned long long)name->ln_number);
-
-	op->info.optype		= GDLM_PLOCK_OP_UNLOCK;
-	op->info.pid		= fl->fl_pid;
-	op->info.fsid		= ls->id;
-	op->info.number		= name->ln_number;
-	op->info.start		= fl->fl_start;
-	op->info.end		= fl->fl_end;
-	if (fl->fl_lmops && fl->fl_lmops->fl_grant)
-		op->info.owner	= (__u64) fl->fl_pid;
-	else
-		op->info.owner	= (__u64)(long) fl->fl_owner;
-
-	send_op(op);
-	wait_event(recv_wq, (op->done != 0));
-
-	spin_lock(&ops_lock);
-	if (!list_empty(&op->list)) {
-		printk(KERN_INFO "punlock op on list\n");
-		list_del(&op->list);
-	}
-	spin_unlock(&ops_lock);
-
-	rv = op->info.rv;
-
-	if (rv == -ENOENT)
-		rv = 0;
-
-	kfree(op);
-	return rv;
-}
-
-int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
-		   struct file *file, struct file_lock *fl)
-{
-	struct gdlm_ls *ls = lockspace;
-	struct plock_op *op;
-	int rv;
-
-	op = kzalloc(sizeof(*op), GFP_KERNEL);
-	if (!op)
-		return -ENOMEM;
-
-	op->info.optype		= GDLM_PLOCK_OP_GET;
-	op->info.pid		= fl->fl_pid;
-	op->info.ex		= (fl->fl_type == F_WRLCK);
-	op->info.fsid		= ls->id;
-	op->info.number		= name->ln_number;
-	op->info.start		= fl->fl_start;
-	op->info.end		= fl->fl_end;
-	if (fl->fl_lmops && fl->fl_lmops->fl_grant)
-		op->info.owner	= (__u64) fl->fl_pid;
-	else
-		op->info.owner	= (__u64)(long) fl->fl_owner;
-
-	send_op(op);
-	wait_event(recv_wq, (op->done != 0));
-
-	spin_lock(&ops_lock);
-	if (!list_empty(&op->list)) {
-		printk(KERN_INFO "plock_get op on list\n");
-		list_del(&op->list);
-	}
-	spin_unlock(&ops_lock);
-
-	/* info.rv from userspace is 1 for conflict, 0 for no-conflict,
-	   -ENOENT if there are no locks on the file */
-
-	rv = op->info.rv;
-
-	fl->fl_type = F_UNLCK;
-	if (rv == -ENOENT)
-		rv = 0;
-	else if (rv > 0) {
-		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
-		fl->fl_pid = op->info.pid;
-		fl->fl_start = op->info.start;
-		fl->fl_end = op->info.end;
-		rv = 0;
-	}
-
-	kfree(op);
-	return rv;
-}
-
-/* a read copies out one plock request from the send list */
-static ssize_t dev_read(struct file *file, char __user *u, size_t count,
-			loff_t *ppos)
-{
-	struct gdlm_plock_info info;
-	struct plock_op *op = NULL;
-
-	if (count < sizeof(info))
-		return -EINVAL;
-
-	spin_lock(&ops_lock);
-	if (!list_empty(&send_list)) {
-		op = list_entry(send_list.next, struct plock_op, list);
-		list_move(&op->list, &recv_list);
-		memcpy(&info, &op->info, sizeof(info));
-	}
-	spin_unlock(&ops_lock);
-
-	if (!op)
-		return -EAGAIN;
-
-	if (copy_to_user(u, &info, sizeof(info)))
-		return -EFAULT;
-	return sizeof(info);
-}
-
-/* a write copies in one plock result that should match a plock_op
-   on the recv list */
-static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
-			 loff_t *ppos)
-{
-	struct gdlm_plock_info info;
-	struct plock_op *op;
-	int found = 0;
-
-	if (count != sizeof(info))
-		return -EINVAL;
-
-	if (copy_from_user(&info, u, sizeof(info)))
-		return -EFAULT;
-
-	if (check_version(&info))
-		return -EINVAL;
-
-	spin_lock(&ops_lock);
-	list_for_each_entry(op, &recv_list, list) {
-		if (op->info.fsid == info.fsid && op->info.number == info.number &&
-		    op->info.owner == info.owner) {
-			list_del_init(&op->list);
-			found = 1;
-			op->done = 1;
-			memcpy(&op->info, &info, sizeof(info));
-			break;
-		}
-	}
-	spin_unlock(&ops_lock);
-
-	if (found) {
-		struct plock_xop *xop;
-		xop = (struct plock_xop *)op;
-		if (xop->callback)
-			count = gdlm_plock_callback(op);
-		else
-			wake_up(&recv_wq);
-	} else
-		printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
-			(unsigned long long)info.number);
-	return count;
-}
-
-static unsigned int dev_poll(struct file *file, poll_table *wait)
-{
-	unsigned int mask = 0;
-
-	poll_wait(file, &send_wq, wait);
-
-	spin_lock(&ops_lock);
-	if (!list_empty(&send_list))
-		mask = POLLIN | POLLRDNORM;
-	spin_unlock(&ops_lock);
-
-	return mask;
-}
-
-static const struct file_operations dev_fops = {
-	.read    = dev_read,
-	.write   = dev_write,
-	.poll    = dev_poll,
-	.owner   = THIS_MODULE
-};
-
-static struct miscdevice plock_dev_misc = {
-	.minor = MISC_DYNAMIC_MINOR,
-	.name = GDLM_PLOCK_MISC_NAME,
-	.fops = &dev_fops
-};
-
-int gdlm_plock_init(void)
-{
-	int rv;
-
-	spin_lock_init(&ops_lock);
-	INIT_LIST_HEAD(&send_list);
-	INIT_LIST_HEAD(&recv_list);
-	init_waitqueue_head(&send_wq);
-	init_waitqueue_head(&recv_wq);
-
-	rv = misc_register(&plock_dev_misc);
-	if (rv)
-		printk(KERN_INFO "gdlm_plock_init: misc_register failed %d",
-		       rv);
-	return rv;
-}
-
-void gdlm_plock_exit(void)
-{
-	if (misc_deregister(&plock_dev_misc) < 0)
-		printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed");
-}
-
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index b3d9ccde0c27..376fc972efed 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -100,7 +100,7 @@ header-y += ixjuser.h
 header-y += jffs2.h
 header-y += keyctl.h
 header-y += limits.h
-header-y += lock_dlm_plock.h
+header-y += dlm_plock.h
 header-y += magic.h
 header-y += major.h
 header-y += matroxfb.h
diff --git a/include/linux/dlm_plock.h b/include/linux/dlm_plock.h
new file mode 100644
index 000000000000..18d5fdbceb74
--- /dev/null
+++ b/include/linux/dlm_plock.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __DLM_PLOCK_DOT_H__
+#define __DLM_PLOCK_DOT_H__
+
+#define DLM_PLOCK_MISC_NAME		"dlm_plock"
+
+#define DLM_PLOCK_VERSION_MAJOR	1
+#define DLM_PLOCK_VERSION_MINOR	1
+#define DLM_PLOCK_VERSION_PATCH	0
+
+enum {
+	DLM_PLOCK_OP_LOCK = 1,
+	DLM_PLOCK_OP_UNLOCK,
+	DLM_PLOCK_OP_GET,
+};
+
+struct dlm_plock_info {
+	__u32 version[3];
+	__u8 optype;
+	__u8 ex;
+	__u8 wait;
+	__u8 pad;
+	__u32 pid;
+	__s32 nodeid;
+	__s32 rv;
+	__u32 fsid;
+	__u64 number;
+	__u64 start;
+	__u64 end;
+	__u64 owner;
+};
+
+#ifdef __KERNEL__
+int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		int cmd, struct file_lock *fl);
+int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		struct file_lock *fl);
+int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+		struct file_lock *fl);
+#endif /* __KERNEL__ */
+
+#endif
+
diff --git a/include/linux/lock_dlm_plock.h b/include/linux/lock_dlm_plock.h
deleted file mode 100644
index fc3415113973..000000000000
--- a/include/linux/lock_dlm_plock.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
- */
-
-#ifndef __LOCK_DLM_PLOCK_DOT_H__
-#define __LOCK_DLM_PLOCK_DOT_H__
-
-#define GDLM_PLOCK_MISC_NAME		"lock_dlm_plock"
-
-#define GDLM_PLOCK_VERSION_MAJOR	1
-#define GDLM_PLOCK_VERSION_MINOR	1
-#define GDLM_PLOCK_VERSION_PATCH	0
-
-enum {
-	GDLM_PLOCK_OP_LOCK = 1,
-	GDLM_PLOCK_OP_UNLOCK,
-	GDLM_PLOCK_OP_GET,
-};
-
-struct gdlm_plock_info {
-	__u32 version[3];
-	__u8 optype;
-	__u8 ex;
-	__u8 wait;
-	__u8 pad;
-	__u32 pid;
-	__s32 nodeid;
-	__s32 rv;
-	__u32 fsid;
-	__u64 number;
-	__u64 start;
-	__u64 end;
-	__u64 owner;
-};
-
-#endif
-
-- 
cgit v1.2.3


From 3d564fa3472d36cd6aa70514c37b8bbbec5b17ab Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 14 Apr 2008 14:06:29 -0500
Subject: dlm: common max length definitions

Add central definitions for max lockspace name length and max resource
name length.  The lack of central definitions has resulted in scattered
private definitions which we can now clean up, including an unused one
in dlm_device.h.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dlm_internal.h        | 2 --
 include/linux/dlm.h          | 4 ----
 include/linux/dlm_device.h   | 3 ---
 include/linux/dlmconstants.h | 4 ++++
 4 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index caa1581e158f..73c36f501707 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -43,8 +43,6 @@
 #include <linux/dlm.h>
 #include "config.h"
 
-#define DLM_LOCKSPACE_LEN	64
-
 /* Size of the temp buffer midcomms allocates on the stack.
    We try to make this large enough so most messages fit.
    FIXME: should sctp make this unnecessary? */
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
index c743fbc769db..188f6e6925a0 100644
--- a/include/linux/dlm.h
+++ b/include/linux/dlm.h
@@ -22,10 +22,6 @@
 /* Lock levels and flags are here */
 #include <linux/dlmconstants.h>
 
-
-#define DLM_RESNAME_MAXLEN	64
-
-
 typedef void dlm_lockspace_t;
 
 /*
diff --git a/include/linux/dlm_device.h b/include/linux/dlm_device.h
index 9642277a152a..dcfd2499902f 100644
--- a/include/linux/dlm_device.h
+++ b/include/linux/dlm_device.h
@@ -94,9 +94,6 @@ struct dlm_lock_result {
 #define DLM_USER_PURGE        6
 #define DLM_USER_DEADLOCK     7
 
-/* Arbitrary length restriction */
-#define MAX_LS_NAME_LEN 64
-
 /* Lockspace flags */
 #define DLM_USER_LSFLG_AUTOFREE   1
 #define DLM_USER_LSFLG_FORCEFREE  2
diff --git a/include/linux/dlmconstants.h b/include/linux/dlmconstants.h
index fddb3d3ff321..47bf08dc7566 100644
--- a/include/linux/dlmconstants.h
+++ b/include/linux/dlmconstants.h
@@ -18,6 +18,10 @@
  * Constants used by DLM interface.
  */
 
+#define DLM_LOCKSPACE_LEN       64
+#define DLM_RESNAME_MAXLEN      64
+
+
 /*
  * Lock Modes
  */
-- 
cgit v1.2.3


From ec98c6b9b47df6df1c1fa6cf3d427414f8c2cf16 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 20 Apr 2008 02:14:23 -0700
Subject: [SPARC]: Remove SunOS and Solaris binary support.

As per Documentation/feature-removal-schedule.txt

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/feature-removal-schedule.txt |   11 -
 Documentation/highuid.txt                  |    2 -
 Documentation/magic-number.txt             |    1 -
 arch/sparc/Kconfig                         |   12 -
 arch/sparc/defconfig                       |  165 ++--
 arch/sparc/kernel/Makefile                 |   10 +-
 arch/sparc/kernel/entry.S                  |  206 +----
 arch/sparc/kernel/errtbls.c                |  144 ---
 arch/sparc/kernel/head.S                   |   37 +-
 arch/sparc/kernel/sclow.S                  |   86 --
 arch/sparc/kernel/signal.c                 |  307 +------
 arch/sparc/kernel/sparc_ksyms.c            |    8 -
 arch/sparc/kernel/sunos_asm.S              |   67 --
 arch/sparc/kernel/sunos_ioctl.c            |  230 -----
 arch/sparc/kernel/sys_solaris.c            |   35 -
 arch/sparc/kernel/sys_sunos.c              | 1210 -------------------------
 arch/sparc/kernel/systbls.S                |  121 ---
 arch/sparc64/Kconfig                       |   47 +-
 arch/sparc64/Makefile                      |    1 -
 arch/sparc64/defconfig                     |   29 +-
 arch/sparc64/kernel/Makefile               |    9 -
 arch/sparc64/kernel/binfmt_aout32.c        |  419 ---------
 arch/sparc64/kernel/entry.S                |   61 +-
 arch/sparc64/kernel/signal.c               |    1 -
 arch/sparc64/kernel/signal32.c             |  300 +-----
 arch/sparc64/kernel/sparc64_ksyms.c        |   41 -
 arch/sparc64/kernel/sunos_ioctl32.c        |  275 ------
 arch/sparc64/kernel/sys_sparc.c            |   38 -
 arch/sparc64/kernel/sys_sunos32.c          | 1359 ----------------------------
 arch/sparc64/kernel/systbls.S              |  122 ---
 arch/sparc64/kernel/systbls.h              |    2 -
 arch/sparc64/kernel/ttable.S               |   14 +-
 arch/sparc64/solaris/Makefile              |   10 -
 arch/sparc64/solaris/conv.h                |   38 -
 arch/sparc64/solaris/entry64.S             |  223 -----
 arch/sparc64/solaris/fs.c                  |  745 ---------------
 arch/sparc64/solaris/ioctl.c               |  825 -----------------
 arch/sparc64/solaris/ipc.c                 |  126 ---
 arch/sparc64/solaris/misc.c                |  786 ----------------
 arch/sparc64/solaris/signal.c              |  429 ---------
 arch/sparc64/solaris/signal.h              |  108 ---
 arch/sparc64/solaris/socket.c              |  461 ----------
 arch/sparc64/solaris/socksys.c             |  203 -----
 arch/sparc64/solaris/socksys.h             |  208 -----
 arch/sparc64/solaris/systbl.S              |  285 ------
 arch/sparc64/solaris/timod.c               |  976 --------------------
 fs/Kconfig.binfmt                          |    2 +-
 include/asm-sparc/Kbuild                   |    1 -
 include/asm-sparc/a.out-core.h             |   52 --
 include/asm-sparc/a.out.h                  |   97 --
 include/asm-sparc/head.h                   |   33 -
 include/asm-sparc/ioctls.h                 |    2 -
 include/asm-sparc/mman.h                   |   13 -
 include/asm-sparc/namei.h                  |   15 +-
 include/asm-sparc/pconf.h                  |   25 -
 include/asm-sparc/processor.h              |    3 -
 include/asm-sparc/socket.h                 |    3 -
 include/asm-sparc/solerrno.h               |  132 ---
 include/asm-sparc/svr4.h                   |  119 ---
 include/asm-sparc/termios.h                |    5 -
 include/asm-sparc/user.h                   |   56 +-
 include/asm-sparc64/Kbuild                 |    1 -
 include/asm-sparc64/a.out-core.h           |   31 -
 include/asm-sparc64/a.out.h                |    1 -
 include/asm-sparc64/ioctls.h               |    2 -
 include/asm-sparc64/mman.h                 |   13 -
 include/asm-sparc64/namei.h                |   15 +-
 include/asm-sparc64/pconf.h                |   25 -
 include/asm-sparc64/socket.h               |    3 -
 include/asm-sparc64/solerrno.h             |  132 ---
 include/asm-sparc64/svr4.h                 |  120 ---
 include/asm-sparc64/termios.h              |    5 -
 include/asm-sparc64/ttable.h               |   18 -
 include/asm-sparc64/unistd.h               |   10 -
 include/asm-sparc64/user.h                 |   61 +-
 net/core/sock.c                            |    9 -
 76 files changed, 172 insertions(+), 11625 deletions(-)
 delete mode 100644 arch/sparc/kernel/errtbls.c
 delete mode 100644 arch/sparc/kernel/sclow.S
 delete mode 100644 arch/sparc/kernel/sunos_asm.S
 delete mode 100644 arch/sparc/kernel/sunos_ioctl.c
 delete mode 100644 arch/sparc/kernel/sys_solaris.c
 delete mode 100644 arch/sparc/kernel/sys_sunos.c
 delete mode 100644 arch/sparc64/kernel/binfmt_aout32.c
 delete mode 100644 arch/sparc64/kernel/sunos_ioctl32.c
 delete mode 100644 arch/sparc64/kernel/sys_sunos32.c
 delete mode 100644 arch/sparc64/solaris/Makefile
 delete mode 100644 arch/sparc64/solaris/conv.h
 delete mode 100644 arch/sparc64/solaris/entry64.S
 delete mode 100644 arch/sparc64/solaris/fs.c
 delete mode 100644 arch/sparc64/solaris/ioctl.c
 delete mode 100644 arch/sparc64/solaris/ipc.c
 delete mode 100644 arch/sparc64/solaris/misc.c
 delete mode 100644 arch/sparc64/solaris/signal.c
 delete mode 100644 arch/sparc64/solaris/signal.h
 delete mode 100644 arch/sparc64/solaris/socket.c
 delete mode 100644 arch/sparc64/solaris/socksys.c
 delete mode 100644 arch/sparc64/solaris/socksys.h
 delete mode 100644 arch/sparc64/solaris/systbl.S
 delete mode 100644 arch/sparc64/solaris/timod.c
 delete mode 100644 include/asm-sparc/a.out-core.h
 delete mode 100644 include/asm-sparc/a.out.h
 delete mode 100644 include/asm-sparc/pconf.h
 delete mode 100644 include/asm-sparc/solerrno.h
 delete mode 100644 include/asm-sparc/svr4.h
 delete mode 100644 include/asm-sparc64/a.out-core.h
 delete mode 100644 include/asm-sparc64/a.out.h
 delete mode 100644 include/asm-sparc64/pconf.h
 delete mode 100644 include/asm-sparc64/solerrno.h
 delete mode 100644 include/asm-sparc64/svr4.h

(limited to 'fs')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index af0e9393bf68..76ab78e83bb1 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -261,17 +261,6 @@ Who:	Michael Buesch <mb@bu3sch.de>
 
 ---------------------------
 
-What:	Solaris/SunOS syscall and binary support on Sparc
-When:	2.6.26
-Why:	Largely unmaintained and almost entirely unused.  File system
-	layering used to divert library and dynamic linker searches to
-	/usr/gnemul is extremely buggy and unfixable.  Making it work
-	is largely pointless as without a lot of work only the most
-	trivial of Solaris binaries can work with the emulation code.
-Who:	David S. Miller <davem@davemloft.net>
-
----------------------------
-
 What:	init_mm export
 When:	2.6.26
 Why:	Not used in-tree. The current out-of-tree users used it to
diff --git a/Documentation/highuid.txt b/Documentation/highuid.txt
index 76034d9dbfc0..6bad6f1d1cac 100644
--- a/Documentation/highuid.txt
+++ b/Documentation/highuid.txt
@@ -28,8 +28,6 @@ What's left to be done for 32-bit UIDs on all Linux architectures:
   uses the 32-bit UID system calls properly otherwise.
 
   This affects at least:
-	SunOS emulation
-	Solaris emulation
 	iBCS on Intel
 
 	sparc32 emulation on sparc64
diff --git a/Documentation/magic-number.txt b/Documentation/magic-number.txt
index bd450e797558..95070028d15e 100644
--- a/Documentation/magic-number.txt
+++ b/Documentation/magic-number.txt
@@ -95,7 +95,6 @@ RFCOMM_TTY_MAGIC      0x6d02                        net/bluetooth/rfcomm/tty.c
 USB_SERIAL_PORT_MAGIC 0x7301      usb_serial_port   drivers/usb/serial/usb-serial.h
 CG_MAGIC              0x00090255  ufs_cylinder_group include/linux/ufs_fs.h
 A2232_MAGIC           0x000a2232  gs_port           drivers/char/ser_a2232.h
-SOLARIS_SOCKET_MAGIC  0x000ADDED  sol_socket_struct arch/sparc64/solaris/socksys.h
 RPORT_MAGIC           0x00525001  r_port            drivers/char/rocket_int.h
 LSEMAGIC              0x05091998  lse               drivers/fc4/fc.c
 GDTIOCTL_MAGIC        0x06030f07  gdth_iowr_str     drivers/scsi/gdth_ioctl.h
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index c40343c54920..49590f8fe98c 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -27,9 +27,6 @@ config ARCH_NO_VIRT_TO_BUS
 config OF
 	def_bool y
 
-config ARCH_SUPPORTS_AOUT
-	def_bool y
-
 config HZ
 	int
 	default 100
@@ -257,15 +254,6 @@ config SPARC_LED
 
 source "fs/Kconfig.binfmt"
 
-config SUNOS_EMUL
-	bool "SunOS binary emulation"
-	help
-	  This allows you to run most SunOS binaries.  If you want to do this,
-	  say Y here and place appropriate files in /usr/gnemul/sunos. See
-	  <http://www.ultralinux.org/faq.html> for more information.  If you
-	  want to run SunOS binaries on an Ultra you must also say Y to
-	  "Kernel support for 32-bit a.out binaries" above.
-
 source "mm/Kconfig"
 
 endmenu
diff --git a/arch/sparc/defconfig b/arch/sparc/defconfig
index f7a509149199..6a2c57a2fe71 100644
--- a/arch/sparc/defconfig
+++ b/arch/sparc/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.23-rc1
-# Wed Jul 25 15:30:21 2007
+# Linux kernel version: 2.6.25
+# Sun Apr 20 01:49:51 2008
 #
 CONFIG_MMU=y
 CONFIG_HIGHMEM=y
@@ -9,18 +9,15 @@ CONFIG_ZONE_DMA=y
 CONFIG_GENERIC_ISA_DMA=y
 CONFIG_ARCH_NO_VIRT_TO_BUS=y
 CONFIG_OF=y
+CONFIG_HZ=100
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 
 #
-# Code maturity level options
+# General setup
 #
 CONFIG_EXPERIMENTAL=y
 CONFIG_BROKEN_ON_SMP=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
-
-#
-# General setup
-#
 CONFIG_LOCALVERSION=""
 CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SWAP=y
@@ -29,12 +26,23 @@ CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
-# CONFIG_USER_NS is not set
 # CONFIG_AUDIT is not set
 # CONFIG_IKCONFIG is not set
 CONFIG_LOG_BUF_SHIFT=14
+# CONFIG_CGROUPS is not set
+CONFIG_GROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_USER_SCHED=y
+# CONFIG_CGROUP_SCHED is not set
 CONFIG_SYSFS_DEPRECATED=y
+CONFIG_SYSFS_DEPRECATED_V2=y
 # CONFIG_RELAY is not set
+CONFIG_NAMESPACES=y
+# CONFIG_UTS_NS is not set
+# CONFIG_IPC_NS is not set
+# CONFIG_USER_NS is not set
+# CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE=""
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
@@ -49,6 +57,7 @@ CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
+CONFIG_COMPAT_BRK=y
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
 CONFIG_ANON_INODES=y
@@ -61,6 +70,13 @@ CONFIG_VM_EVENT_COUNTERS=y
 CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
+# CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
+CONFIG_HAVE_OPROFILE=y
+# CONFIG_HAVE_KPROBES is not set
+# CONFIG_HAVE_KRETPROBES is not set
+CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
 # CONFIG_TINY_SHMEM is not set
 CONFIG_BASE_SMALL=0
@@ -88,6 +104,7 @@ CONFIG_IOSCHED_CFQ=y
 CONFIG_DEFAULT_CFQ=y
 # CONFIG_DEFAULT_NOOP is not set
 CONFIG_DEFAULT_IOSCHED="cfq"
+CONFIG_CLASSIC_RCU=y
 
 #
 # General machine setup
@@ -113,14 +130,13 @@ CONFIG_SUN_PM=y
 CONFIG_PCI=y
 CONFIG_PCI_SYSCALL=y
 # CONFIG_ARCH_SUPPORTS_MSI is not set
+CONFIG_PCI_LEGACY=y
 # CONFIG_PCI_DEBUG is not set
 # CONFIG_NO_DMA is not set
 CONFIG_SUN_OPENPROMFS=m
 # CONFIG_SPARC_LED is not set
 CONFIG_BINFMT_ELF=y
-CONFIG_BINFMT_AOUT=y
 CONFIG_BINFMT_MISC=m
-CONFIG_SUNOS_EMUL=y
 CONFIG_SELECT_MEMORY_MODEL=y
 CONFIG_FLATMEM_MANUAL=y
 # CONFIG_DISCONTIGMEM_MANUAL is not set
@@ -128,6 +144,7 @@ CONFIG_FLATMEM_MANUAL=y
 CONFIG_FLATMEM=y
 CONFIG_FLAT_NODE_MEM_MAP=y
 # CONFIG_SPARSEMEM_STATIC is not set
+# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set
 CONFIG_SPLIT_PTLOCK_CPUS=4
 # CONFIG_RESOURCES_64BIT is not set
 CONFIG_ZONE_DMA_FLAG=1
@@ -148,6 +165,7 @@ CONFIG_XFRM=y
 CONFIG_XFRM_USER=m
 # CONFIG_XFRM_SUB_POLICY is not set
 # CONFIG_XFRM_MIGRATE is not set
+# CONFIG_XFRM_STATISTICS is not set
 CONFIG_NET_KEY=m
 # CONFIG_NET_KEY_MIGRATE is not set
 CONFIG_INET=y
@@ -170,6 +188,7 @@ CONFIG_INET_TUNNEL=y
 CONFIG_INET_XFRM_MODE_TRANSPORT=y
 CONFIG_INET_XFRM_MODE_TUNNEL=y
 CONFIG_INET_XFRM_MODE_BEET=y
+# CONFIG_INET_LRO is not set
 CONFIG_INET_DIAG=y
 CONFIG_INET_TCP_DIAG=y
 # CONFIG_TCP_CONG_ADVANCED is not set
@@ -191,8 +210,10 @@ CONFIG_INET6_XFRM_MODE_TUNNEL=m
 CONFIG_INET6_XFRM_MODE_BEET=m
 # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
 CONFIG_IPV6_SIT=m
+CONFIG_IPV6_NDISC_NODETYPE=y
 CONFIG_IPV6_TUNNEL=m
 # CONFIG_IPV6_MULTIPLE_TABLES is not set
+# CONFIG_IPV6_MROUTE is not set
 # CONFIG_NETWORK_SECMARK is not set
 # CONFIG_NETFILTER is not set
 # CONFIG_IP_DCCP is not set
@@ -214,10 +235,6 @@ CONFIG_SCTP_HMAC_MD5=y
 # CONFIG_LAPB is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
-
-#
-# QoS and/or fair queueing
-#
 # CONFIG_NET_SCHED is not set
 
 #
@@ -225,6 +242,7 @@ CONFIG_SCTP_HMAC_MD5=y
 #
 CONFIG_NET_PKTGEN=m
 # CONFIG_HAMRADIO is not set
+# CONFIG_CAN is not set
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
 CONFIG_AF_RXRPC=m
@@ -248,6 +266,7 @@ CONFIG_AF_RXRPC=m
 #
 # Generic Driver Options
 #
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_STANDALONE=y
 CONFIG_PREVENT_FIRMWARE_BUILD=y
 # CONFIG_FW_LOADER is not set
@@ -271,7 +290,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=16
 CONFIG_BLK_DEV_RAM_SIZE=4096
-CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
+# CONFIG_BLK_DEV_XIP is not set
 # CONFIG_CDROM_PKTCDVD is not set
 # CONFIG_ATA_OVER_ETH is not set
 CONFIG_MISC_DEVICES=y
@@ -279,6 +298,8 @@ CONFIG_MISC_DEVICES=y
 # CONFIG_EEPROM_93CX6 is not set
 # CONFIG_SGI_IOC4 is not set
 # CONFIG_TIFM_CORE is not set
+# CONFIG_ENCLOSURE_SERVICES is not set
+CONFIG_HAVE_IDE=y
 # CONFIG_IDE is not set
 
 #
@@ -318,6 +339,7 @@ CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_FC_ATTRS is not set
 # CONFIG_SCSI_ISCSI_ATTRS is not set
 # CONFIG_SCSI_SAS_LIBSAS is not set
+# CONFIG_SCSI_SRP_ATTRS is not set
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_BLK_DEV_3W_XXXX_RAID is not set
@@ -338,6 +360,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_IPS is not set
 # CONFIG_SCSI_INITIO is not set
 # CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_MVSAS is not set
 # CONFIG_SCSI_STEX is not set
 # CONFIG_SCSI_SYM53C8XX_2 is not set
 # CONFIG_SCSI_QLOGIC_1280 is not set
@@ -353,14 +376,7 @@ CONFIG_SCSI_SUNESP=y
 # CONFIG_SCSI_SRP is not set
 # CONFIG_ATA is not set
 # CONFIG_MD is not set
-
-#
-# Fusion MPT device support
-#
 # CONFIG_FUSION is not set
-# CONFIG_FUSION_SPI is not set
-# CONFIG_FUSION_FC is not set
-# CONFIG_FUSION_SAS is not set
 
 #
 # IEEE 1394 (FireWire) support
@@ -375,6 +391,7 @@ CONFIG_DUMMY=m
 # CONFIG_MACVLAN is not set
 # CONFIG_EQUALIZER is not set
 CONFIG_TUN=m
+# CONFIG_VETH is not set
 # CONFIG_ARCNET is not set
 # CONFIG_PHYLIB is not set
 CONFIG_NET_ETHERNET=y
@@ -388,11 +405,20 @@ CONFIG_SUNQE=m
 # CONFIG_NET_VENDOR_3COM is not set
 # CONFIG_NET_TULIP is not set
 # CONFIG_HP100 is not set
+# CONFIG_IBM_NEW_EMAC_ZMII is not set
+# CONFIG_IBM_NEW_EMAC_RGMII is not set
+# CONFIG_IBM_NEW_EMAC_TAH is not set
+# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
 # CONFIG_NET_PCI is not set
+# CONFIG_B44 is not set
 CONFIG_NETDEV_1000=y
 # CONFIG_ACENIC is not set
 # CONFIG_DL2K is not set
 # CONFIG_E1000 is not set
+# CONFIG_E1000E is not set
+# CONFIG_E1000E_ENABLED is not set
+# CONFIG_IP1000 is not set
+# CONFIG_IGB is not set
 # CONFIG_MYRI_SBUS is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
@@ -409,11 +435,15 @@ CONFIG_NETDEV_1000=y
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
 # CONFIG_CHELSIO_T3 is not set
+# CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
+# CONFIG_NIU is not set
 # CONFIG_MLX4_CORE is not set
+# CONFIG_TEHUTI is not set
+# CONFIG_BNX2X is not set
 # CONFIG_TR is not set
 
 #
@@ -421,13 +451,13 @@ CONFIG_NETDEV_10000=y
 #
 # CONFIG_WLAN_PRE80211 is not set
 # CONFIG_WLAN_80211 is not set
+# CONFIG_IWLWIFI_LEDS is not set
 # CONFIG_WAN is not set
 # CONFIG_FDDI is not set
 # CONFIG_HIPPI is not set
 # CONFIG_PPP is not set
 # CONFIG_SLIP is not set
 # CONFIG_NET_FC is not set
-# CONFIG_SHAPER is not set
 # CONFIG_NETCONSOLE is not set
 # CONFIG_NETPOLL is not set
 # CONFIG_NET_POLL_CONTROLLER is not set
@@ -449,7 +479,6 @@ CONFIG_INPUT_MOUSEDEV_PSAUX=y
 CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
 CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
 CONFIG_INPUT_JOYDEV=m
-# CONFIG_INPUT_TSDEV is not set
 CONFIG_INPUT_EVDEV=m
 CONFIG_INPUT_EVBUG=m
 
@@ -498,6 +527,7 @@ CONFIG_VT_CONSOLE=y
 CONFIG_HW_CONSOLE=y
 # CONFIG_VT_HW_CONSOLE_BINDING is not set
 # CONFIG_SERIAL_NONSTANDARD is not set
+# CONFIG_NOZOMI is not set
 
 #
 # Serial drivers
@@ -519,7 +549,6 @@ CONFIG_UNIX98_PTYS=y
 CONFIG_LEGACY_PTYS=y
 CONFIG_LEGACY_PTY_COUNT=256
 # CONFIG_IPMI_HANDLER is not set
-# CONFIG_WATCHDOG is not set
 CONFIG_HW_RANDOM=m
 CONFIG_JS_RTC=m
 # CONFIG_R3964 is not set
@@ -538,9 +567,9 @@ CONFIG_DEVPORT=y
 # CONFIG_POWER_SUPPLY is not set
 CONFIG_HWMON=y
 # CONFIG_HWMON_VID is not set
-# CONFIG_SENSORS_ABITUGURU is not set
-# CONFIG_SENSORS_ABITUGURU3 is not set
+# CONFIG_SENSORS_I5K_AMB is not set
 # CONFIG_SENSORS_F71805F is not set
+# CONFIG_SENSORS_F71882FG is not set
 # CONFIG_SENSORS_IT87 is not set
 # CONFIG_SENSORS_PC87360 is not set
 # CONFIG_SENSORS_PC87427 is not set
@@ -553,6 +582,14 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_W83627HF is not set
 # CONFIG_SENSORS_W83627EHF is not set
 # CONFIG_HWMON_DEBUG_CHIP is not set
+# CONFIG_THERMAL is not set
+# CONFIG_WATCHDOG is not set
+
+#
+# Sonics Silicon Backplane
+#
+CONFIG_SSB_POSSIBLE=y
+# CONFIG_SSB is not set
 
 #
 # Multifunction device drivers
@@ -569,15 +606,15 @@ CONFIG_HWMON=y
 #
 # Graphics support
 #
+# CONFIG_VGASTATE is not set
+# CONFIG_VIDEO_OUTPUT_CONTROL is not set
+# CONFIG_FB is not set
 # CONFIG_BACKLIGHT_LCD_SUPPORT is not set
 
 #
 # Display device support
 #
 # CONFIG_DISPLAY_SUPPORT is not set
-# CONFIG_VGASTATE is not set
-# CONFIG_VIDEO_OUTPUT_CONTROL is not set
-# CONFIG_FB is not set
 
 #
 # Console display driver support
@@ -592,6 +629,7 @@ CONFIG_DUMMY_CONSOLE=y
 CONFIG_HID_SUPPORT=y
 CONFIG_HID=y
 # CONFIG_HID_DEBUG is not set
+# CONFIG_HIDRAW is not set
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -601,33 +639,13 @@ CONFIG_USB_ARCH_HAS_EHCI=y
 #
 # NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
 #
-
-#
-# USB Gadget Support
-#
 # CONFIG_USB_GADGET is not set
 # CONFIG_MMC is not set
+# CONFIG_MEMSTICK is not set
 # CONFIG_NEW_LEDS is not set
 # CONFIG_INFINIBAND is not set
-
-#
-# Real Time Clock
-#
 # CONFIG_RTC_CLASS is not set
 
-#
-# DMA Engine support
-#
-# CONFIG_DMA_ENGINE is not set
-
-#
-# DMA Clients
-#
-
-#
-# DMA Devices
-#
-
 #
 # Userspace I/O
 #
@@ -664,18 +682,14 @@ CONFIG_FS_MBCACHE=y
 CONFIG_FS_POSIX_ACL=y
 CONFIG_XFS_FS=m
 CONFIG_XFS_QUOTA=y
-CONFIG_XFS_SECURITY=y
 CONFIG_XFS_POSIX_ACL=y
 CONFIG_XFS_RT=y
-# CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
-# CONFIG_MINIX_FS is not set
-CONFIG_ROMFS_FS=m
+CONFIG_DNOTIFY=y
 CONFIG_INOTIFY=y
 CONFIG_INOTIFY_USER=y
 # CONFIG_QUOTA is not set
 CONFIG_QUOTACTL=y
-CONFIG_DNOTIFY=y
 CONFIG_AUTOFS_FS=m
 CONFIG_AUTOFS4_FS=m
 # CONFIG_FUSE_FS is not set
@@ -704,7 +718,6 @@ CONFIG_PROC_SYSCTL=y
 CONFIG_SYSFS=y
 # CONFIG_TMPFS is not set
 # CONFIG_HUGETLB_PAGE is not set
-CONFIG_RAMFS=y
 # CONFIG_CONFIGFS_FS is not set
 
 #
@@ -721,14 +734,13 @@ CONFIG_BEFS_FS=m
 # CONFIG_EFS_FS is not set
 # CONFIG_CRAMFS is not set
 # CONFIG_VXFS_FS is not set
+# CONFIG_MINIX_FS is not set
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
+CONFIG_ROMFS_FS=m
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
-
-#
-# Network File Systems
-#
+CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
 # CONFIG_NFS_V4 is not set
@@ -760,10 +772,6 @@ CONFIG_AFS_FS=m
 # CONFIG_PARTITION_ADVANCED is not set
 CONFIG_MSDOS_PARTITION=y
 CONFIG_SUN_PARTITION=y
-
-#
-# Native Language Support
-#
 CONFIG_NLS=y
 CONFIG_NLS_DEFAULT="iso8859-1"
 # CONFIG_NLS_CODEPAGE_437 is not set
@@ -804,21 +812,14 @@ CONFIG_NLS_DEFAULT="iso8859-1"
 # CONFIG_NLS_KOI8_R is not set
 # CONFIG_NLS_KOI8_U is not set
 # CONFIG_NLS_UTF8 is not set
-
-#
-# Distributed Lock Manager
-#
 # CONFIG_DLM is not set
 
-#
-# Instrumentation Support
-#
-# CONFIG_PROFILING is not set
-
 #
 # Kernel hacking
 #
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
 # CONFIG_PRINTK_TIME is not set
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
 CONFIG_ENABLE_MUST_CHECK=y
 CONFIG_MAGIC_SYSRQ=y
 # CONFIG_UNUSED_SYMBOLS is not set
@@ -842,9 +843,12 @@ CONFIG_DEBUG_BUGVERBOSE=y
 # CONFIG_DEBUG_INFO is not set
 # CONFIG_DEBUG_VM is not set
 # CONFIG_DEBUG_LIST is not set
-CONFIG_FORCED_INLINING=y
+# CONFIG_DEBUG_SG is not set
+# CONFIG_BOOT_PRINTK_DELAY is not set
 # CONFIG_RCU_TORTURE_TEST is not set
+# CONFIG_BACKTRACE_SELF_TEST is not set
 # CONFIG_FAULT_INJECTION is not set
+# CONFIG_SAMPLES is not set
 # CONFIG_DEBUG_STACK_USAGE is not set
 
 #
@@ -853,9 +857,12 @@ CONFIG_FORCED_INLINING=y
 CONFIG_KEYS=y
 # CONFIG_KEYS_DEBUG_PROC_KEYS is not set
 # CONFIG_SECURITY is not set
+# CONFIG_SECURITY_FILE_CAPABILITIES is not set
 CONFIG_CRYPTO=y
 CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_AEAD=y
 CONFIG_CRYPTO_BLKCIPHER=y
+# CONFIG_CRYPTO_SEQIV is not set
 CONFIG_CRYPTO_HASH=y
 CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_HMAC=y
@@ -873,6 +880,10 @@ CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_CBC=y
 CONFIG_CRYPTO_PCBC=m
 # CONFIG_CRYPTO_LRW is not set
+# CONFIG_CRYPTO_XTS is not set
+# CONFIG_CRYPTO_CTR is not set
+# CONFIG_CRYPTO_GCM is not set
+# CONFIG_CRYPTO_CCM is not set
 # CONFIG_CRYPTO_CRYPTD is not set
 CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_FCRYPT is not set
@@ -887,11 +898,15 @@ CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_ARC4=m
 # CONFIG_CRYPTO_KHAZAD is not set
 # CONFIG_CRYPTO_ANUBIS is not set
+# CONFIG_CRYPTO_SEED is not set
+# CONFIG_CRYPTO_SALSA20 is not set
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_CRC32C=m
 # CONFIG_CRYPTO_CAMELLIA is not set
 # CONFIG_CRYPTO_TEST is not set
+CONFIG_CRYPTO_AUTHENC=y
+# CONFIG_CRYPTO_LZO is not set
 # CONFIG_CRYPTO_HW is not set
 
 #
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 2712bb166f6f..59700aaaae93 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -9,9 +9,9 @@ EXTRA_AFLAGS	:= -ansi
 IRQ_OBJS := irq.o sun4m_irq.o sun4c_irq.o sun4d_irq.o
 obj-y    := entry.o wof.o wuf.o etrap.o rtrap.o traps.o $(IRQ_OBJS) \
 	    process.o signal.o ioport.o setup.o idprom.o \
-	    sys_sparc.o sunos_asm.o systbls.o \
-	    time.o windows.o cpu.o devices.o sclow.o \
-	    tadpole.o tick14.o ptrace.o sys_solaris.o \
+	    sys_sparc.o systbls.o \
+	    time.o windows.o cpu.o devices.o \
+	    tadpole.o tick14.o ptrace.o \
 	    unaligned.o una_asm.o muldiv.o \
 	    prom.o of_device.o devres.o
 
@@ -25,7 +25,3 @@ obj-$(CONFIG_PCI) += ebus.o
 obj-$(CONFIG_SUN_PM) += apc.o pmc.o
 obj-$(CONFIG_MODULES) += module.o sparc_ksyms.o
 obj-$(CONFIG_SPARC_LED) += led.o
-
-ifdef CONFIG_SUNOS_EMUL
-obj-y += sys_sunos.o sunos_ioctl.o
-endif
diff --git a/arch/sparc/kernel/entry.S b/arch/sparc/kernel/entry.S
index c2eed8f71516..135644f8add7 100644
--- a/arch/sparc/kernel/entry.S
+++ b/arch/sparc/kernel/entry.S
@@ -1186,36 +1186,6 @@ srmmu_fault:
 
 	RESTORE_ALL
 
-#ifdef CONFIG_SUNOS_EMUL
-	/* SunOS uses syscall zero as the 'indirect syscall' it looks
-	 * like indir_syscall(scall_num, arg0, arg1, arg2...);  etc.
-	 * This is complete brain damage.
-	 */
-	.globl	sunos_indir
-sunos_indir:
-	mov	%o7, %l4
-	cmp	%o0, NR_SYSCALLS
-	blu,a	1f
-	 sll	%o0, 0x2, %o0
-
-	sethi	%hi(sunos_nosys), %l6
-	b	2f
-	 or	%l6, %lo(sunos_nosys), %l6
-
-1:
-	set	sunos_sys_table, %l7
-	ld	[%l7 + %o0], %l6
-
-2:	
-	mov	%o1, %o0
-	mov	%o2, %o1
-	mov	%o3, %o2
-	mov	%o4, %o3
-	mov	%o5, %o4
-	call	%l6
-	 mov	%l4, %o7
-#endif
-
 	.align	4
 	.globl	sys_nis_syscall
 sys_nis_syscall:
@@ -1232,6 +1202,16 @@ sys_execve:
 	call	sparc_execve
 	 mov	%l5, %o7
 
+	.globl	sunos_execv
+sunos_execv:
+	st	%g0, [%sp + STACKFRAME_SZ + PT_I2]
+
+	call	sparc_execve
+	 add	%sp, STACKFRAME_SZ, %o0
+
+	b	ret_sys_call
+	 ld	[%sp + STACKFRAME_SZ + PT_I0], %o0
+
 	.align	4
 	.globl	sys_pipe
 sys_pipe:
@@ -1394,7 +1374,7 @@ ret_from_fork:
 	b	ret_sys_call
 	 ld	[%sp + STACKFRAME_SZ + PT_I0], %o0
 
-	/* Linux native and SunOS system calls enter here... */
+	/* Linux native system calls enter here... */
 	.align	4
 	.globl	linux_sparc_syscall
 linux_sparc_syscall:
@@ -1472,170 +1452,6 @@ linux_syscall_trace2:
 	 st	%l2, [%sp + STACKFRAME_SZ + PT_NPC]
 
 
-	/*
-	 * Solaris system calls and indirect system calls enter here.
-         *
-	 * I have named the solaris indirect syscalls like that because
-	 * it seems like Solaris has some fast path syscalls that can
-	 * be handled as indirect system calls. - mig
-	 */
-
-linux_syscall_for_solaris:
-	sethi	%hi(sys_call_table), %l7
-	b	linux_sparc_syscall
-	 or	%l7, %lo(sys_call_table), %l7
-	
-	.align	4
-	.globl	solaris_syscall
-solaris_syscall:
-	cmp	%g1,59
-	be	linux_syscall_for_solaris
-	 cmp	%g1,2
-	be	linux_syscall_for_solaris
-	 cmp    %g1,42
-	be      linux_syscall_for_solaris
-	 cmp	%g1,119
-	be,a	linux_syscall_for_solaris
-	 mov	2, %g1
-1:	
-	SAVE_ALL_HEAD
-	 rd	%wim, %l3
-
-	wr	%l0, PSR_ET, %psr
-	nop
-	nop
-	mov	%i0, %l5
-
-	call	do_solaris_syscall
-	 add	%sp, STACKFRAME_SZ, %o0
-
-	st	%o0, [%sp + STACKFRAME_SZ + PT_I0]
-	set	PSR_C, %g2
-	cmp	%o0, -ERESTART_RESTARTBLOCK
-	bgeu	1f
-	 ld	[%sp + STACKFRAME_SZ + PT_PSR], %g3
-
-	/* System call success, clear Carry condition code. */		
-	andn	%g3, %g2, %g3
-	clr	%l6
-	b	2f
-	 st	%g3, [%sp + STACKFRAME_SZ + PT_PSR]	
-
-1:
-	/* System call failure, set Carry condition code.
-	 * Also, get abs(errno) to return to the process.
-	 */
-	sub	%g0, %o0, %o0
-	mov	1, %l6
-	st	%o0, [%sp + STACKFRAME_SZ + PT_I0]
-	or	%g3, %g2, %g3
-	st	%g3, [%sp + STACKFRAME_SZ + PT_PSR]
-
-	/* Advance the pc and npc over the trap instruction.
-	 * If the npc is unaligned (has a 1 in the lower byte), it means
-	 * the kernel does not want us to play magic (ie, skipping over
-	 * traps).  Mainly when the Solaris code wants to set some PC and
-	 * nPC (setcontext).
-	 */
-2:
-	ld	[%sp + STACKFRAME_SZ + PT_NPC], %l1	/* pc  = npc   */
-	andcc	%l1, 1, %g0
-	bne	1f
-	 add	%l1, 0x4, %l2			/* npc = npc+4 */
-	st	%l1, [%sp + STACKFRAME_SZ + PT_PC]
-	b	ret_trap_entry
-	 st	%l2, [%sp + STACKFRAME_SZ + PT_NPC]
-
-	/* kernel knows what it is doing, fixup npc and continue */
-1:
-	sub	%l1, 1, %l1
- 	b	ret_trap_entry	
-	 st	%l1, [%sp + STACKFRAME_SZ + PT_NPC]
-
-#ifndef CONFIG_SUNOS_EMUL
-	.align	4
-	.globl	sunos_syscall
-sunos_syscall:
-	SAVE_ALL_HEAD
-	 rd	%wim, %l3
-	wr	%l0, PSR_ET, %psr
-	nop
-	nop
-	mov	%i0, %l5
-	call	do_sunos_syscall
-	 add	%sp, STACKFRAME_SZ, %o0
-#endif
-
-	/* {net, open}bsd system calls enter here... */
-	.align	4
-	.globl	bsd_syscall
-bsd_syscall:
-	/* Direct access to user regs, must faster. */
-	cmp	%g1, NR_SYSCALLS
-	blu,a	1f
-	 sll	%g1, 2, %l4
-
-	set	sys_ni_syscall, %l7
-	b	bsd_is_too_hard
-	 nop
-
-1:
-	ld	[%l7 + %l4], %l7
-
-	.globl	bsd_is_too_hard
-bsd_is_too_hard:
-	rd	%wim, %l3
-	SAVE_ALL
-
-	wr	%l0, PSR_ET, %psr
-	WRITE_PAUSE
-
-2:
-	mov	%i0, %o0
-	mov	%i1, %o1
-	mov	%i2, %o2
-	mov	%i0, %l5
-	mov	%i3, %o3
-	mov	%i4, %o4
-	call	%l7
-	 mov	%i5, %o5
-
-	st	%o0, [%sp + STACKFRAME_SZ + PT_I0]
-	set	PSR_C, %g2
-	cmp	%o0, -ERESTART_RESTARTBLOCK
-	bgeu	1f
-	 ld	[%sp + STACKFRAME_SZ + PT_PSR], %g3
-
-	/* System call success, clear Carry condition code. */		
-	andn	%g3, %g2, %g3
-	clr	%l6
-	b	2f
-	 st	%g3, [%sp + STACKFRAME_SZ + PT_PSR]	
-
-1:
-	/* System call failure, set Carry condition code.
-	 * Also, get abs(errno) to return to the process.
-	 */
-	sub	%g0, %o0, %o0
-#if 0 /* XXX todo XXX */
-	sethi	%hi(bsd_xlatb_rorl), %o3
-	or	%o3, %lo(bsd_xlatb_rorl), %o3
-	sll	%o0, 2, %o0
-	ld	[%o3 + %o0], %o0
-#endif
-	mov	1, %l6
-	st	%o0, [%sp + STACKFRAME_SZ + PT_I0]
-	or	%g3, %g2, %g3
-	st	%g3, [%sp + STACKFRAME_SZ + PT_PSR]
-
-	/* Advance the pc and npc over the trap instruction. */
-2:
-	ld	[%sp + STACKFRAME_SZ + PT_NPC], %l1	/* pc  = npc   */
-	add	%l1, 0x4, %l2			/* npc = npc+4 */
-	st	%l1, [%sp + STACKFRAME_SZ + PT_PC]
-	b	ret_trap_entry
-	 st	%l2, [%sp + STACKFRAME_SZ + PT_NPC]
-
 /* Saving and restoring the FPU state is best done from lowlevel code.
  *
  * void fpsave(unsigned long *fpregs, unsigned long *fsr,
diff --git a/arch/sparc/kernel/errtbls.c b/arch/sparc/kernel/errtbls.c
deleted file mode 100644
index ed14df7116e9..000000000000
--- a/arch/sparc/kernel/errtbls.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/* errtbls.c: Error number conversion tables.
- *
- * Copyright (C) 1995, 2007 David S. Miller (davem@davemloft.net)
- *
- * Based upon preliminary work which is:
- *
- * Copyright (C) 1995 Adrian M. Rodriguez (adrian@remus.rutgers.edu)
- */
-
-#include <asm/solerrno.h>        /* Solaris errnos */
-
-/* Here is the table which converts between Linux error number values
- * to the equivalent under Solaris.  Note that since the Linux ones
- * have been set up to match exactly those of SunOS, no translation
- * table is needed for that OS.
- */
-
-int solaris_errno[] = {
-	0,
-	SOL_EPERM,
-	SOL_ENOENT,
-	SOL_ESRCH,
-	SOL_EINTR,
-	SOL_EIO,
-	SOL_ENXIO,
-	SOL_E2BIG,
-	SOL_ENOEXEC,
-	SOL_EBADF,
-	SOL_ECHILD,
-	SOL_EAGAIN,
-	SOL_ENOMEM,
-	SOL_EACCES,
-	SOL_EFAULT,
-	SOL_NOTBLK,
-	SOL_EBUSY,
-	SOL_EEXIST,
-	SOL_EXDEV,
-	SOL_ENODEV,
-	SOL_ENOTDIR,
-	SOL_EISDIR,
-	SOL_EINVAL,
-	SOL_ENFILE,
-	SOL_EMFILE,
-	SOL_ENOTTY,
-	SOL_ETXTBSY,
-	SOL_EFBIG,
-	SOL_ENOSPC,
-	SOL_ESPIPE,
-	SOL_EROFS,
-	SOL_EMLINK,
-	SOL_EPIPE,
-	SOL_EDOM,
-	SOL_ERANGE,
-	SOL_EWOULDBLOCK,
-	SOL_EINPROGRESS,
-	SOL_EALREADY,
-	SOL_ENOTSOCK,
-	SOL_EDESTADDRREQ,
-	SOL_EMSGSIZE,
-	SOL_EPROTOTYPE,
-	SOL_ENOPROTOOPT,
-	SOL_EPROTONOSUPPORT,
-	SOL_ESOCKTNOSUPPORT,
-	SOL_EOPNOTSUPP,
-	SOL_EPFNOSUPPORT,
-	SOL_EAFNOSUPPORT,
-	SOL_EADDRINUSE,
-	SOL_EADDRNOTAVAIL,
-	SOL_ENETDOWN,
-	SOL_ENETUNREACH,
-	SOL_ENETRESET,
-	SOL_ECONNABORTED,
-	SOL_ECONNRESET,
-	SOL_ENOBUFS,
-	SOL_EISCONN,
-	SOL_ENOTONN,
-	SOL_ESHUTDOWN,
-	SOL_ETOOMANYREFS,
-	SOL_ETIMEDOUT,
-	SOL_ECONNREFUSED,
-	SOL_ELOOP,
-	SOL_ENAMETOOLONG,
-	SOL_EHOSTDOWN,
-	SOL_EHOSTUNREACH,
-	SOL_ENOTEMPTY,
-	SOL_EPROCLIM,
-	SOL_EUSERS,
-	SOL_EDQUOT,
-	SOL_ESTALE,
-	SOL_EREMOTE,
-	SOL_ENOSTR,
-	SOL_ETIME,
-	SOL_ENOSR,
-	SOL_ENOMSG,
-	SOL_EBADMSG,
-	SOL_IDRM,
-	SOL_EDEADLK,
-	SOL_ENOLCK,
-	SOL_ENONET,
-	SOL_ERREMOTE,
-	SOL_ENOLINK,
-	SOL_EADV,
-	SOL_ESRMNT,
-	SOL_ECOMM,
-	SOL_EPROTO,
-	SOL_EMULTIHOP,
-	SOL_EINVAL,    /* EDOTDOT XXX??? */
-	SOL_REMCHG,
-	SOL_NOSYS,
-	SOL_STRPIPE,
-	SOL_EOVERFLOW,
-	SOL_EBADFD,
-	SOL_ECHRNG,
-	SOL_EL2NSYNC,
-	SOL_EL3HLT,
-	SOL_EL3RST,
-	SOL_NRNG,
-	SOL_EUNATCH,
-	SOL_ENOCSI,
-	SOL_EL2HLT,
-	SOL_EBADE,
-	SOL_EBADR,
-	SOL_EXFULL,
-	SOL_ENOANO,
-	SOL_EBADRQC,
-	SOL_EBADSLT,
-	SOL_EDEADLOCK,
-	SOL_EBFONT,
-	SOL_ELIBEXEC,
-	SOL_ENODATA,
-	SOL_ELIBBAD,
-	SOL_ENOPKG,
-	SOL_ELIBACC,
-	SOL_ENOTUNIQ,
-	SOL_ERESTART,
-	SOL_EUCLEAN,
-	SOL_ENOTNAM,
-	SOL_ENAVAIL,
-	SOL_EISNAM,
-	SOL_EREMOTEIO,
-	SOL_EILSEQ,
-	SOL_ELIBMAX,
-	SOL_ELIBSCN,
-};
diff --git a/arch/sparc/kernel/head.S b/arch/sparc/kernel/head.S
index 9a219e8b5ddb..b7f1e81c8ff2 100644
--- a/arch/sparc/kernel/head.S
+++ b/arch/sparc/kernel/head.S
@@ -78,11 +78,6 @@ sun4e_notsup:
         .asciz  "Sparc-Linux sun4e support does not exist\n\n"
 	.align 4
 
-#ifndef CONFIG_SUNOS_EMUL
-#undef SUNOS_SYSCALL_TRAP
-#define SUNOS_SYSCALL_TRAP SUNOS_NO_SYSCALL_TRAP
-#endif
-
 	/* The Sparc trap table, bootloader gives us control at _start. */
 	.text
 	.globl	start, _stext, _start, __stext
@@ -158,7 +153,7 @@ t_bad6f:BAD_TRAP(0x6f) BAD_TRAP(0x70) BAD_TRAP(0x71) BAD_TRAP(0x72) BAD_TRAP(0x7
 t_bad74:BAD_TRAP(0x74) BAD_TRAP(0x75) BAD_TRAP(0x76) BAD_TRAP(0x77) BAD_TRAP(0x78)
 t_bad79:BAD_TRAP(0x79) BAD_TRAP(0x7a) BAD_TRAP(0x7b) BAD_TRAP(0x7c) BAD_TRAP(0x7d)
 t_bad7e:BAD_TRAP(0x7e) BAD_TRAP(0x7f)
-t_sunos:SUNOS_SYSCALL_TRAP                  /* SunOS System Call             */
+t_bad80:BAD_TRAP(0x80)                      /* SunOS System Call             */
 t_sbkpt:BREAKPOINT_TRAP                     /* Software Breakpoint/KGDB      */
 t_divz:	TRAP_ENTRY(0x82, do_hw_divzero)     /* Divide by zero trap           */
 t_flwin:TRAP_ENTRY(0x83, do_flush_windows)  /* Flush Windows Trap            */
@@ -166,8 +161,8 @@ t_clwin:BAD_TRAP(0x84)                      /* Clean Windows Trap            */
 t_rchk:	BAD_TRAP(0x85)                      /* Range Check                   */
 t_funal:BAD_TRAP(0x86)                      /* Fix Unaligned Access Trap     */
 t_iovf:	BAD_TRAP(0x87)                      /* Integer Overflow Trap         */
-t_slowl:SOLARIS_SYSCALL_TRAP                /* Slowaris System Call          */
-t_netbs:NETBSD_SYSCALL_TRAP                 /* Net-B.S. System Call          */
+t_bad88:BAD_TRAP(0x88)                      /* Slowaris System Call          */
+t_bad89:BAD_TRAP(0x89)                      /* Net-B.S. System Call          */
 t_bad8a:BAD_TRAP(0x8a) BAD_TRAP(0x8b) BAD_TRAP(0x8c) BAD_TRAP(0x8d) BAD_TRAP(0x8e)
 t_bad8f:BAD_TRAP(0x8f)
 t_linux:LINUX_SYSCALL_TRAP                  /* Linux System Call             */
@@ -178,7 +173,7 @@ t_getcc:GETCC_TRAP                          /* Get Condition Codes           */
 t_setcc:SETCC_TRAP                          /* Set Condition Codes           */
 t_getpsr:GETPSR_TRAP                        /* Get PSR Register              */
 t_bada3:BAD_TRAP(0xa3) BAD_TRAP(0xa4) BAD_TRAP(0xa5) BAD_TRAP(0xa6)
-t_slowi:INDIRECT_SOLARIS_SYSCALL(156)
+t_bada7:BAD_TRAP(0xa7)
 t_bada8:BAD_TRAP(0xa8) BAD_TRAP(0xa9) BAD_TRAP(0xaa) BAD_TRAP(0xab)
 t_badac:BAD_TRAP(0xac) BAD_TRAP(0xad) BAD_TRAP(0xae) BAD_TRAP(0xaf) BAD_TRAP(0xb0)
 t_badb1:BAD_TRAP(0xb1) BAD_TRAP(0xb2) BAD_TRAP(0xb3) BAD_TRAP(0xb4) BAD_TRAP(0xb5)
@@ -243,19 +238,19 @@ trapbase_cpu1:
 	BAD_TRAP(0x74) BAD_TRAP(0x75) BAD_TRAP(0x76) BAD_TRAP(0x77) BAD_TRAP(0x78)
 	BAD_TRAP(0x79) BAD_TRAP(0x7a) BAD_TRAP(0x7b) BAD_TRAP(0x7c) BAD_TRAP(0x7d)
 	BAD_TRAP(0x7e) BAD_TRAP(0x7f)
-	SUNOS_SYSCALL_TRAP 
+	BAD_TRAP(0x80)
 	BREAKPOINT_TRAP
 	TRAP_ENTRY(0x82, do_hw_divzero)
 	TRAP_ENTRY(0x83, do_flush_windows) BAD_TRAP(0x84) BAD_TRAP(0x85)
-	BAD_TRAP(0x86) BAD_TRAP(0x87) SOLARIS_SYSCALL_TRAP
-	NETBSD_SYSCALL_TRAP BAD_TRAP(0x8a) BAD_TRAP(0x8b) BAD_TRAP(0x8c)
+	BAD_TRAP(0x86) BAD_TRAP(0x87) BAD_TRAP(0x88)
+	BAD_TRAP(0x89) BAD_TRAP(0x8a) BAD_TRAP(0x8b) BAD_TRAP(0x8c)
 	BAD_TRAP(0x8d) BAD_TRAP(0x8e) BAD_TRAP(0x8f)
 	LINUX_SYSCALL_TRAP BAD_TRAP(0x91) BAD_TRAP(0x92) BAD_TRAP(0x93) BAD_TRAP(0x94)
 	BAD_TRAP(0x95) BAD_TRAP(0x96) BAD_TRAP(0x97) BAD_TRAP(0x98) BAD_TRAP(0x99)
 	BAD_TRAP(0x9a) BAD_TRAP(0x9b) BAD_TRAP(0x9c) BAD_TRAP(0x9d) BAD_TRAP(0x9e)
 	BAD_TRAP(0x9f) GETCC_TRAP SETCC_TRAP GETPSR_TRAP
 	BAD_TRAP(0xa3) BAD_TRAP(0xa4) BAD_TRAP(0xa5) BAD_TRAP(0xa6)
-	INDIRECT_SOLARIS_SYSCALL(156) BAD_TRAP(0xa8) BAD_TRAP(0xa9) BAD_TRAP(0xaa) BAD_TRAP(0xab)
+	BAD_TRAP(0xa7) BAD_TRAP(0xa8) BAD_TRAP(0xa9) BAD_TRAP(0xaa) BAD_TRAP(0xab)
 	BAD_TRAP(0xac) BAD_TRAP(0xad) BAD_TRAP(0xae) BAD_TRAP(0xaf) BAD_TRAP(0xb0)
 	BAD_TRAP(0xb1) BAD_TRAP(0xb2) BAD_TRAP(0xb3) BAD_TRAP(0xb4) BAD_TRAP(0xb5)
 	BAD_TRAP(0xb6) BAD_TRAP(0xb7) BAD_TRAP(0xb8) BAD_TRAP(0xb9) BAD_TRAP(0xba)
@@ -311,19 +306,19 @@ trapbase_cpu2:
 	BAD_TRAP(0x74) BAD_TRAP(0x75) BAD_TRAP(0x76) BAD_TRAP(0x77) BAD_TRAP(0x78)
 	BAD_TRAP(0x79) BAD_TRAP(0x7a) BAD_TRAP(0x7b) BAD_TRAP(0x7c) BAD_TRAP(0x7d)
 	BAD_TRAP(0x7e) BAD_TRAP(0x7f)
-	SUNOS_SYSCALL_TRAP 
+	BAD_TRAP(0x80)
 	BREAKPOINT_TRAP
 	TRAP_ENTRY(0x82, do_hw_divzero)
 	TRAP_ENTRY(0x83, do_flush_windows) BAD_TRAP(0x84) BAD_TRAP(0x85)
-	BAD_TRAP(0x86) BAD_TRAP(0x87) SOLARIS_SYSCALL_TRAP
-	NETBSD_SYSCALL_TRAP BAD_TRAP(0x8a) BAD_TRAP(0x8b) BAD_TRAP(0x8c)
+	BAD_TRAP(0x86) BAD_TRAP(0x87) BAD_TRAP(0x88)
+	BAD_TRAP(0x89) BAD_TRAP(0x8a) BAD_TRAP(0x8b) BAD_TRAP(0x8c)
 	BAD_TRAP(0x8d) BAD_TRAP(0x8e) BAD_TRAP(0x8f)
 	LINUX_SYSCALL_TRAP BAD_TRAP(0x91) BAD_TRAP(0x92) BAD_TRAP(0x93) BAD_TRAP(0x94)
 	BAD_TRAP(0x95) BAD_TRAP(0x96) BAD_TRAP(0x97) BAD_TRAP(0x98) BAD_TRAP(0x99)
 	BAD_TRAP(0x9a) BAD_TRAP(0x9b) BAD_TRAP(0x9c) BAD_TRAP(0x9d) BAD_TRAP(0x9e)
 	BAD_TRAP(0x9f) GETCC_TRAP SETCC_TRAP GETPSR_TRAP
 	BAD_TRAP(0xa3) BAD_TRAP(0xa4) BAD_TRAP(0xa5) BAD_TRAP(0xa6)
-	INDIRECT_SOLARIS_SYSCALL(156) BAD_TRAP(0xa8) BAD_TRAP(0xa9) BAD_TRAP(0xaa) BAD_TRAP(0xab)
+	BAD_TRAP(0xa7) BAD_TRAP(0xa8) BAD_TRAP(0xa9) BAD_TRAP(0xaa) BAD_TRAP(0xab)
 	BAD_TRAP(0xac) BAD_TRAP(0xad) BAD_TRAP(0xae) BAD_TRAP(0xaf) BAD_TRAP(0xb0)
 	BAD_TRAP(0xb1) BAD_TRAP(0xb2) BAD_TRAP(0xb3) BAD_TRAP(0xb4) BAD_TRAP(0xb5)
 	BAD_TRAP(0xb6) BAD_TRAP(0xb7) BAD_TRAP(0xb8) BAD_TRAP(0xb9) BAD_TRAP(0xba)
@@ -379,19 +374,19 @@ trapbase_cpu3:
 	BAD_TRAP(0x74) BAD_TRAP(0x75) BAD_TRAP(0x76) BAD_TRAP(0x77) BAD_TRAP(0x78)
 	BAD_TRAP(0x79) BAD_TRAP(0x7a) BAD_TRAP(0x7b) BAD_TRAP(0x7c) BAD_TRAP(0x7d)
 	BAD_TRAP(0x7e) BAD_TRAP(0x7f)
-	SUNOS_SYSCALL_TRAP  
+	BAD_TRAP(0x80)
 	BREAKPOINT_TRAP
 	TRAP_ENTRY(0x82, do_hw_divzero)
 	TRAP_ENTRY(0x83, do_flush_windows) BAD_TRAP(0x84) BAD_TRAP(0x85)
-	BAD_TRAP(0x86) BAD_TRAP(0x87) SOLARIS_SYSCALL_TRAP
-	NETBSD_SYSCALL_TRAP BAD_TRAP(0x8a) BAD_TRAP(0x8b) BAD_TRAP(0x8c)
+	BAD_TRAP(0x86) BAD_TRAP(0x87) BAD_TRAP(0x88)
+	BAD_TRAP(0x89) BAD_TRAP(0x8a) BAD_TRAP(0x8b) BAD_TRAP(0x8c)
 	BAD_TRAP(0x8d) BAD_TRAP(0x8e) BAD_TRAP(0x8f)
 	LINUX_SYSCALL_TRAP BAD_TRAP(0x91) BAD_TRAP(0x92) BAD_TRAP(0x93) BAD_TRAP(0x94)
 	BAD_TRAP(0x95) BAD_TRAP(0x96) BAD_TRAP(0x97) BAD_TRAP(0x98) BAD_TRAP(0x99)
 	BAD_TRAP(0x9a) BAD_TRAP(0x9b) BAD_TRAP(0x9c) BAD_TRAP(0x9d) BAD_TRAP(0x9e)
 	BAD_TRAP(0x9f) GETCC_TRAP SETCC_TRAP GETPSR_TRAP
 	BAD_TRAP(0xa3) BAD_TRAP(0xa4) BAD_TRAP(0xa5) BAD_TRAP(0xa6)
-	INDIRECT_SOLARIS_SYSCALL(156) BAD_TRAP(0xa8) BAD_TRAP(0xa9) BAD_TRAP(0xaa) BAD_TRAP(0xab)
+	BAD_TRAP(0xa7) BAD_TRAP(0xa8) BAD_TRAP(0xa9) BAD_TRAP(0xaa) BAD_TRAP(0xab)
 	BAD_TRAP(0xac) BAD_TRAP(0xad) BAD_TRAP(0xae) BAD_TRAP(0xaf) BAD_TRAP(0xb0)
 	BAD_TRAP(0xb1) BAD_TRAP(0xb2) BAD_TRAP(0xb3) BAD_TRAP(0xb4) BAD_TRAP(0xb5)
 	BAD_TRAP(0xb6) BAD_TRAP(0xb7) BAD_TRAP(0xb8) BAD_TRAP(0xb9) BAD_TRAP(0xba)
diff --git a/arch/sparc/kernel/sclow.S b/arch/sparc/kernel/sclow.S
deleted file mode 100644
index 136e37c53d49..000000000000
--- a/arch/sparc/kernel/sclow.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* sclow.S: Low level special syscall handling.
- *          Basically these are cases where we can completely
- *          handle the system call without saving any state
- *          because we know that the process will not sleep.
- *
- * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
- */
-
-#include <asm/ptrace.h>
-#include <asm/asm-offsets.h>
-#include <asm/errno.h>
-#include <asm/winmacro.h>
-#include <asm/thread_info.h>
-#include <asm/psr.h>
-#include <asm/page.h>
-
-#define CC_AND_RETT  \
-	set	PSR_C, %l4; \
-	andn	%l0, %l4, %l4; \
-	wr	%l4, 0x0, %psr; \
-	nop; nop; nop; \
-	jmp	%l2; \
-	rett	%l2 + 4;
-
-#define SC_AND_RETT  \
-	set	PSR_C, %l4; \
-	or	%l0, %l4, %l4; \
-	wr	%l4, 0x0, %psr; \
-	nop; nop; nop; \
-	jmp	%l2; \
-	rett	%l2 + 4;
-
-#define LABEL(func)  func##_low
-
-	.globl	LABEL(sunosnop)
-LABEL(sunosnop):
-	CC_AND_RETT
-
-#if (ASIZ_task_uid == 2 && ASIZ_task_euid == 2)
-	.globl	LABEL(sunosgetuid)
-LABEL(sunosgetuid):
-	LOAD_CURRENT(l4, l5)
-	ld	[%l4 + TI_TASK], %l4
-	lduh	[%l4 + AOFF_task_uid], %i0
-	lduh	[%l4 + AOFF_task_euid], %i1
-	CC_AND_RETT
-#endif
-
-#if (ASIZ_task_gid == 2 && ASIZ_task_egid == 2)
-	.globl	LABEL(sunosgetgid)
-LABEL(sunosgetgid):
-	LOAD_CURRENT(l4, l5)
-	ld	[%l4 + TI_TASK], %l4
-	lduh	[%l4 + AOFF_task_gid], %i0
-	lduh	[%l4 + AOFF_task_egid], %i1
-	CC_AND_RETT
-#endif
-
-	.globl	LABEL(sunosmctl)
-LABEL(sunosmctl):
-	mov	0, %i0
-	CC_AND_RETT
-
-	.globl	LABEL(sunosgdtsize)
-LABEL(sunosgdtsize):	
-	mov	256, %i0
-	CC_AND_RETT
-
-	.globl	LABEL(getpagesize)
-LABEL(getpagesize):
-	set	PAGE_SIZE, %i0
-	CC_AND_RETT
-
-	/* XXX sys_nice() XXX */
-	/* XXX sys_setpriority() XXX */
-	/* XXX sys_getpriority() XXX */
-	/* XXX sys_setregid() XXX */
-	/* XXX sys_setgid() XXX */
-	/* XXX sys_setreuid() XXX */
-	/* XXX sys_setuid() XXX */
-	/* XXX sys_setfsuid() XXX */
-	/* XXX sys_setfsgid() XXX */
-	/* XXX sys_setpgid() XXX */
-	/* XXX sys_getpgid() XXX */
-	/* XXX sys_setsid() XXX */
-	/* XXX sys_getsid() XXX */
diff --git a/arch/sparc/kernel/signal.c b/arch/sparc/kernel/signal.c
index 9994cac95078..1f730619a24a 100644
--- a/arch/sparc/kernel/signal.c
+++ b/arch/sparc/kernel/signal.c
@@ -22,7 +22,6 @@
 
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
-#include <asm/svr4.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>	/* flush_sig_insns */
@@ -454,7 +453,6 @@ setup_frame(struct sigaction *sa, struct pt_regs *regs, int signr, sigset_t *old
 			break;
 		case SIGSYS:
 			if (info->si_code == (__SI_FAULT|0x100)) {
-				/* See sys_sunos.c */
 				sig_code = info->si_trapno;
 				break;
 			}
@@ -676,291 +674,17 @@ sigsegv:
 	force_sigsegv(signo, current);
 }
 
-/* Setup a Solaris stack frame */
-static inline void
-setup_svr4_frame(struct sigaction *sa, unsigned long pc, unsigned long npc,
-		 struct pt_regs *regs, int signr, sigset_t *oldset)
-{
-	svr4_signal_frame_t __user *sfp;
-	svr4_gregset_t  __user *gr;
-	svr4_siginfo_t  __user *si;
-	svr4_mcontext_t __user *mc;
-	svr4_gwindows_t __user *gw;
-	svr4_ucontext_t __user *uc;
-	svr4_sigset_t	setv;
-	struct thread_info *tp = current_thread_info();
-	int window = 0, err;
-
-	synchronize_user_stack();
-	sfp = (svr4_signal_frame_t __user *)
-		get_sigframe(sa, regs, SVR4_SF_ALIGNED + sizeof(struct reg_window));
-
-	if (invalid_frame_pointer(sfp, sizeof(*sfp)))
-		goto sigill_and_return;
-
-	/* Start with a clean frame pointer and fill it */
-	err = __clear_user(sfp, sizeof(*sfp));
-
-	/* Setup convenience variables */
-	si = &sfp->si;
-	uc = &sfp->uc;
-	gw = &sfp->gw;
-	mc = &uc->mcontext;
-	gr = &mc->greg;
-	
-	/* FIXME: where am I supposed to put this?
-	 * sc->sigc_onstack = old_status;
-	 * anyways, it does not look like it is used for anything at all.
-	 */
-	setv.sigbits[0] = oldset->sig[0];
-	setv.sigbits[1] = oldset->sig[1];
-	if (_NSIG_WORDS >= 4) {
-		setv.sigbits[2] = oldset->sig[2];
-		setv.sigbits[3] = oldset->sig[3];
-		err |= __copy_to_user(&uc->sigmask, &setv, sizeof(svr4_sigset_t));
-	} else
-		err |= __copy_to_user(&uc->sigmask, &setv,
-				      2 * sizeof(unsigned int));
-
-	/* Store registers */
-	err |= __put_user(regs->pc, &((*gr)[SVR4_PC]));
-	err |= __put_user(regs->npc, &((*gr)[SVR4_NPC]));
-	err |= __put_user(regs->psr, &((*gr)[SVR4_PSR]));
-	err |= __put_user(regs->y, &((*gr)[SVR4_Y]));
-	
-	/* Copy g[1..7] and o[0..7] registers */
-	err |= __copy_to_user(&(*gr)[SVR4_G1], &regs->u_regs[UREG_G1],
-			      sizeof(long) * 7);
-	err |= __copy_to_user(&(*gr)[SVR4_O0], &regs->u_regs[UREG_I0],
-			      sizeof(long) * 8);
-
-	/* Setup sigaltstack */
-	err |= __put_user(current->sas_ss_sp, &uc->stack.sp);
-	err |= __put_user(sas_ss_flags(regs->u_regs[UREG_FP]), &uc->stack.flags);
-	err |= __put_user(current->sas_ss_size, &uc->stack.size);
-
-	/* Save the currently window file: */
-
-	/* 1. Link sfp->uc->gwins to our windows */
-	err |= __put_user(gw, &mc->gwin);
-	    
-	/* 2. Number of windows to restore at setcontext(): */
-	err |= __put_user(tp->w_saved, &gw->count);
-
-	/* 3. Save each valid window
-	 *    Currently, it makes a copy of the windows from the kernel copy.
-	 *    David's code for SunOS, makes the copy but keeps the pointer to
-	 *    the kernel.  My version makes the pointer point to a userland 
-	 *    copy of those.  Mhm, I wonder if I shouldn't just ignore those
-	 *    on setcontext and use those that are on the kernel, the signal
-	 *    handler should not be modyfing those, mhm.
-	 *
-	 *    These windows are just used in case synchronize_user_stack failed
-	 *    to flush the user windows.
-	 */
-	for (window = 0; window < tp->w_saved; window++) {
-		err |= __put_user((int __user *) &(gw->win[window]), &gw->winptr[window]);
-		err |= __copy_to_user(&gw->win[window],
-				      &tp->reg_window[window],
-				      sizeof(svr4_rwindow_t));
-		err |= __put_user(0, gw->winptr[window]);
-	}
-
-	/* 4. We just pay attention to the gw->count field on setcontext */
-	tp->w_saved = 0; /* So process is allowed to execute. */
-
-	/* Setup the signal information.  Solaris expects a bunch of
-	 * information to be passed to the signal handler, we don't provide
-	 * that much currently, should use siginfo.
-	 */
-	err |= __put_user(signr, &si->siginfo.signo);
-	err |= __put_user(SVR4_SINOINFO, &si->siginfo.code);
-	if (err)
-		goto sigsegv;
-
-	regs->u_regs[UREG_FP] = (unsigned long) sfp;
-	regs->pc = (unsigned long) sa->sa_handler;
-	regs->npc = (regs->pc + 4);
-
-	/* Arguments passed to signal handler */
-	if (regs->u_regs[14]){
-		struct reg_window __user *rw = (struct reg_window __user *)
-			regs->u_regs[14];
-
-		err |= __put_user(signr, &rw->ins[0]);
-		err |= __put_user(si, &rw->ins[1]);
-		err |= __put_user(uc, &rw->ins[2]);
-		err |= __put_user(sfp, &rw->ins[6]);	/* frame pointer */
-		if (err)
-			goto sigsegv;
-
-		regs->u_regs[UREG_I0] = signr;
-		regs->u_regs[UREG_I1] = (unsigned long) si;
-		regs->u_regs[UREG_I2] = (unsigned long) uc;
-	}
-	return;
-
-sigill_and_return:
-	do_exit(SIGILL);
-sigsegv:
-	force_sigsegv(signr, current);
-}
-
-asmlinkage int svr4_getcontext(svr4_ucontext_t __user *uc, struct pt_regs *regs)
-{
-	svr4_gregset_t  __user *gr;
-	svr4_mcontext_t __user *mc;
-	svr4_sigset_t	setv;
-	int err = 0;
-
-	synchronize_user_stack();
-
-	if (current_thread_info()->w_saved)
-		return -EFAULT;
-
-	err = clear_user(uc, sizeof(*uc));
-	if (err)
-		return -EFAULT;
-
-	/* Setup convenience variables */
-	mc = &uc->mcontext;
-	gr = &mc->greg;
-
-	setv.sigbits[0] = current->blocked.sig[0];
-	setv.sigbits[1] = current->blocked.sig[1];
-	if (_NSIG_WORDS >= 4) {
-		setv.sigbits[2] = current->blocked.sig[2];
-		setv.sigbits[3] = current->blocked.sig[3];
-		err |= __copy_to_user(&uc->sigmask, &setv, sizeof(svr4_sigset_t));
-	} else
-		err |= __copy_to_user(&uc->sigmask, &setv,
-				      2 * sizeof(unsigned int));
-
-	/* Store registers */
-	err |= __put_user(regs->pc, &uc->mcontext.greg[SVR4_PC]);
-	err |= __put_user(regs->npc, &uc->mcontext.greg[SVR4_NPC]);
-	err |= __put_user(regs->psr, &uc->mcontext.greg[SVR4_PSR]);
-	err |= __put_user(regs->y, &uc->mcontext.greg[SVR4_Y]);
-	
-	/* Copy g[1..7] and o[0..7] registers */
-	err |= __copy_to_user(&(*gr)[SVR4_G1], &regs->u_regs[UREG_G1],
-			      sizeof(uint) * 7);
-	err |= __copy_to_user(&(*gr)[SVR4_O0], &regs->u_regs[UREG_I0],
-			      sizeof(uint) * 8);
-
-	/* Setup sigaltstack */
-	err |= __put_user(current->sas_ss_sp, &uc->stack.sp);
-	err |= __put_user(sas_ss_flags(regs->u_regs[UREG_FP]), &uc->stack.flags);
-	err |= __put_user(current->sas_ss_size, &uc->stack.size);
-
-	/* The register file is not saved
-	 * we have already stuffed all of it with sync_user_stack
-	 */
-	return (err ? -EFAULT : 0);
-}
-
-/* Set the context for a svr4 application, this is Solaris way to sigreturn */
-asmlinkage int svr4_setcontext(svr4_ucontext_t __user *c, struct pt_regs *regs)
-{
-	svr4_gregset_t  __user *gr;
-	unsigned long pc, npc, psr;
-	mm_segment_t old_fs;
-	sigset_t set;
-	svr4_sigset_t setv;
-	int err;
-	stack_t st;
-	
-	/* Fixme: restore windows, or is this already taken care of in
-	 * svr4_setup_frame when sync_user_windows is done?
-	 */
-	flush_user_windows();
-
-	if (current_thread_info()->w_saved)
-		goto sigsegv_and_return;
-
-	if (((unsigned long) c) & 3)
-		goto sigsegv_and_return;
-
-	if (!__access_ok((unsigned long)c, sizeof(*c)))
-		goto sigsegv_and_return;
-
-	/* Check for valid PC and nPC */
-	gr = &c->mcontext.greg;
-	err = __get_user(pc, &((*gr)[SVR4_PC]));
-	err |= __get_user(npc, &((*gr)[SVR4_NPC]));
-
-	if ((pc | npc) & 3)
-		goto sigsegv_and_return;
-
-	/* Retrieve information from passed ucontext */
-	/* note that nPC is ored a 1, this is used to inform entry.S */
-	/* that we don't want it to mess with our PC and nPC */
-
-	/* This is pretty much atomic, no amount locking would prevent
-	 * the races which exist anyways.
-	 */
-	err |= __copy_from_user(&setv, &c->sigmask, sizeof(svr4_sigset_t));
-	
-	err |= __get_user(st.ss_sp, &c->stack.sp);
-	err |= __get_user(st.ss_flags, &c->stack.flags);
-	err |= __get_user(st.ss_size, &c->stack.size);
-	
-	if (err)
-		goto sigsegv_and_return;
-		
-	/* It is more difficult to avoid calling this function than to
-	   call it and ignore errors.  */
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	do_sigaltstack((const stack_t __user *) &st, NULL,
-		       regs->u_regs[UREG_I6]);
-	set_fs(old_fs);
-	
-	set.sig[0] = setv.sigbits[0];
-	set.sig[1] = setv.sigbits[1];
-	if (_NSIG_WORDS >= 4) {
-		set.sig[2] = setv.sigbits[2];
-		set.sig[3] = setv.sigbits[3];
-	}
-	sigdelsetmask(&set, ~_BLOCKABLE);
-	spin_lock_irq(&current->sighand->siglock);
-	current->blocked = set;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	regs->pc = pc;
-	regs->npc = npc | 1;
-	err |= __get_user(regs->y, &((*gr)[SVR4_Y]));
-	err |= __get_user(psr, &((*gr)[SVR4_PSR]));
-	regs->psr &= ~(PSR_ICC);
-	regs->psr |= (psr & PSR_ICC);
-
-	/* Restore g[1..7] and o[0..7] registers */
-	err |= __copy_from_user(&regs->u_regs[UREG_G1], &(*gr)[SVR4_G1],
-			      sizeof(long) * 7);
-	err |= __copy_from_user(&regs->u_regs[UREG_I0], &(*gr)[SVR4_O0],
-			      sizeof(long) * 8);
-	return (err ? -EFAULT : 0);
-
-sigsegv_and_return:
-	force_sig(SIGSEGV, current);
-	return -EFAULT;
-}
-
 static inline void
 handle_signal(unsigned long signr, struct k_sigaction *ka,
-	      siginfo_t *info, sigset_t *oldset, struct pt_regs *regs,
-	      int svr4_signal)
+	      siginfo_t *info, sigset_t *oldset, struct pt_regs *regs)
 {
-	if (svr4_signal)
-		setup_svr4_frame(&ka->sa, regs->pc, regs->npc, regs, signr, oldset);
-	else {
-		if (ka->sa.sa_flags & SA_SIGINFO)
-			new_setup_rt_frame(ka, regs, signr, oldset, info);
-		else if (current->thread.new_signal)
-			new_setup_frame(ka, regs, signr, oldset);
-		else
-			setup_frame(&ka->sa, regs, signr, oldset, info);
-	}
+	if (ka->sa.sa_flags & SA_SIGINFO)
+		new_setup_rt_frame(ka, regs, signr, oldset, info);
+	else if (current->thread.new_signal)
+		new_setup_frame(ka, regs, signr, oldset);
+	else
+		setup_frame(&ka->sa, regs, signr, oldset, info);
+
 	spin_lock_irq(&current->sighand->siglock);
 	sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
 	if (!(ka->sa.sa_flags & SA_NOMASK))
@@ -1002,17 +726,6 @@ asmlinkage void do_signal(struct pt_regs * regs, unsigned long orig_i0, int rest
 	int signr;
 	sigset_t *oldset;
 
-	/*
-	 * XXX Disable svr4 signal handling until solaris emulation works.
-	 * It is buggy - Anton
-	 */
-#define SVR4_SIGNAL_BROKEN 1
-#ifdef SVR4_SIGNAL_BROKEN
-	int svr4_signal = 0;
-#else
-	int svr4_signal = current->personality == PER_SVR4;
-#endif
-
 	cookie.restart_syscall = restart_syscall;
 	cookie.orig_i0 = orig_i0;
 
@@ -1025,8 +738,8 @@ asmlinkage void do_signal(struct pt_regs * regs, unsigned long orig_i0, int rest
 	if (signr > 0) {
 		if (cookie.restart_syscall)
 			syscall_restart(cookie.orig_i0, regs, &ka.sa);
-		handle_signal(signr, &ka, &info, oldset,
-			      regs, svr4_signal);
+		handle_signal(signr, &ka, &info, oldset, regs);
+
 		/* a signal was successfully delivered; the saved
 		 * sigmask will have been stored in the signal frame,
 		 * and will be restored by sigreturn, so we can simply
diff --git a/arch/sparc/kernel/sparc_ksyms.c b/arch/sparc/kernel/sparc_ksyms.c
index 97b1de0e9094..0bcf98a7ef38 100644
--- a/arch/sparc/kernel/sparc_ksyms.c
+++ b/arch/sparc/kernel/sparc_ksyms.c
@@ -36,12 +36,10 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/idprom.h>
-#include <asm/svr4.h>
 #include <asm/head.h>
 #include <asm/smp.h>
 #include <asm/mostek.h>
 #include <asm/ptrace.h>
-#include <asm/user.h>
 #include <asm/uaccess.h>
 #include <asm/checksum.h>
 #ifdef CONFIG_SBUS
@@ -62,8 +60,6 @@ struct poll {
 	short revents;
 };
 
-extern int svr4_getcontext (svr4_ucontext_t *, struct pt_regs *);
-extern int svr4_setcontext (svr4_ucontext_t *, struct pt_regs *);
 extern void (*__copy_1page)(void *, const void *);
 extern void __memmove(void *, const void *, __kernel_size_t);
 extern void (*bzero_1page)(void *);
@@ -204,10 +200,6 @@ EXPORT_SYMBOL(kmap_atomic);
 EXPORT_SYMBOL(kunmap_atomic);
 #endif
 
-/* Solaris/SunOS binary compatibility */
-EXPORT_SYMBOL(svr4_setcontext);
-EXPORT_SYMBOL(svr4_getcontext);
-
 /* prom symbols */
 EXPORT_SYMBOL(idprom);
 EXPORT_SYMBOL(prom_root_node);
diff --git a/arch/sparc/kernel/sunos_asm.S b/arch/sparc/kernel/sunos_asm.S
deleted file mode 100644
index 07fe86014fb5..000000000000
--- a/arch/sparc/kernel/sunos_asm.S
+++ /dev/null
@@ -1,67 +0,0 @@
-/* $Id: sunos_asm.S,v 1.15 2000/01/11 17:33:21 jj Exp $
- * sunos_asm.S: SunOS system calls which must have a low-level
- *              entry point to operate correctly.
- *
- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
- *
- * Based upon preliminary work which is:
- *
- * Copyright (C) 1995 Adrian M. Rodriguez (adrian@remus.rutgers.edu)
- */
-
-#include <asm/ptrace.h>
-
-	.text
-	.align 4
-
-	/* When calling ret_sys_call, %o0 should contain the same
-	 * value as in [%sp + STACKFRAME_SZ + PT_I0] */
-
-	/* SunOS getpid() returns pid in %o0 and ppid in %o1 */
-	.globl	sunos_getpid
-sunos_getpid:
-	call	sys_getppid
-	 nop
-
-	call	sys_getpid
-	 st	%o0, [%sp + STACKFRAME_SZ + PT_I1]
-
-	b	ret_sys_call
-	 st	%o0, [%sp + STACKFRAME_SZ + PT_I0]
-
-	/* SunOS getuid() returns uid in %o0 and euid in %o1 */
-	.globl	sunos_getuid
-sunos_getuid:
-	call	sys_geteuid16
-	 nop
-
-	call	sys_getuid16
-	 st	%o0, [%sp + STACKFRAME_SZ + PT_I1]
-
-	b	ret_sys_call
-	 st	%o0, [%sp + STACKFRAME_SZ + PT_I0]
-
-	/* SunOS getgid() returns gid in %o0 and egid in %o1 */
-	.globl	sunos_getgid
-sunos_getgid:
-	call	sys_getegid16
-	 nop
-
-	call	sys_getgid16
-	 st	%o0, [%sp + STACKFRAME_SZ + PT_I1]
-
-	b	ret_sys_call
-	 st	%o0, [%sp + STACKFRAME_SZ + PT_I0]
-
-	/* SunOS's execv() call only specifies the argv argument, the
-	 * environment settings are the same as the calling processes.
-	 */
-	.globl	sunos_execv
-sunos_execv:
-	st	%g0, [%sp + STACKFRAME_SZ + PT_I2]
-
-	call	sparc_execve
-	 add	%sp, STACKFRAME_SZ, %o0
-
-	b	ret_sys_call
-	 ld	[%sp + STACKFRAME_SZ + PT_I0], %o0
diff --git a/arch/sparc/kernel/sunos_ioctl.c b/arch/sparc/kernel/sunos_ioctl.c
deleted file mode 100644
index e613cc6a10ba..000000000000
--- a/arch/sparc/kernel/sunos_ioctl.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/* $Id: sunos_ioctl.c,v 1.34 2000/09/03 14:10:56 anton Exp $
- * sunos_ioctl.c: The Linux Operating system: SunOS ioctl compatibility.
- * 
- * Copyright (C) 1995 Miguel de Icaza (miguel@nuclecu.unam.mx)
- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
- */
-
-#include <asm/uaccess.h>
-
-#include <linux/sched.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/termios.h>
-#include <linux/tty.h>
-#include <linux/ioctl.h>
-#include <linux/route.h>
-#include <linux/sockios.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/syscalls.h>
-#include <linux/file.h>
-
-#if 0
-extern char sunkbd_type;
-extern char sunkbd_layout;
-#endif
-
-/* NR_OPEN is now larger and dynamic in recent kernels. */
-#define SUNOS_NR_OPEN	256
-
-asmlinkage int sunos_ioctl (int fd, unsigned long cmd, unsigned long arg)
-{
-	int ret = -EBADF;
-
-	if (fd >= SUNOS_NR_OPEN || !fcheck(fd))
-		goto out;
-
-	/* First handle an easy compat. case for tty ldisc. */
-	if (cmd == TIOCSETD) {
-		int __user *p;
-		int ntty = N_TTY, tmp;
-		mm_segment_t oldfs;
-
-		p = (int __user *) arg;
-		ret = -EFAULT;
-		if (get_user(tmp, p))
-			goto out;
-		if (tmp == 2) {
-			oldfs = get_fs();
-			set_fs(KERNEL_DS);
-			ret = sys_ioctl(fd, cmd, (unsigned long) &ntty);
-			set_fs(oldfs);
-			ret = (ret == -EINVAL ? -EOPNOTSUPP : ret);
-			goto out;
-		}
-	}
-
-	/* Binary compatibility is good American knowhow fuckin' up. */
-	if (cmd == TIOCNOTTY) {
-		ret = sys_setsid();
-		goto out;
-	}
-
-	/* SunOS networking ioctls. */
-	switch (cmd) {
-	case _IOW('r', 10, struct rtentry):
-		ret = sys_ioctl(fd, SIOCADDRT, arg);
-		goto out;
-	case _IOW('r', 11, struct rtentry):
-		ret = sys_ioctl(fd, SIOCDELRT, arg);
-		goto out;
-	case _IOW('i', 12, struct ifreq):
-		ret = sys_ioctl(fd, SIOCSIFADDR, arg);
-		goto out;
-	case _IOWR('i', 13, struct ifreq):
-		ret = sys_ioctl(fd, SIOCGIFADDR, arg);
-		goto out;
-	case _IOW('i', 14, struct ifreq):
-		ret = sys_ioctl(fd, SIOCSIFDSTADDR, arg);
-		goto out;
-	case _IOWR('i', 15, struct ifreq):
-		ret = sys_ioctl(fd, SIOCGIFDSTADDR, arg);
-		goto out;
-	case _IOW('i', 16, struct ifreq):
-		ret = sys_ioctl(fd, SIOCSIFFLAGS, arg);
-		goto out;
-	case _IOWR('i', 17, struct ifreq):
-		ret = sys_ioctl(fd, SIOCGIFFLAGS, arg);
-		goto out;
-	case _IOW('i', 18, struct ifreq):
-		ret = sys_ioctl(fd, SIOCSIFMEM, arg);
-		goto out;
-	case _IOWR('i', 19, struct ifreq):
-		ret = sys_ioctl(fd, SIOCGIFMEM, arg);
-		goto out;
-	case _IOWR('i', 20, struct ifconf):
-		ret = sys_ioctl(fd, SIOCGIFCONF, arg);
-		goto out;
-	case _IOW('i', 21, struct ifreq): /* SIOCSIFMTU */
-		ret = sys_ioctl(fd, SIOCSIFMTU, arg);
-		goto out;
-	case _IOWR('i', 22, struct ifreq): /* SIOCGIFMTU */
-		ret = sys_ioctl(fd, SIOCGIFMTU, arg);
-		goto out;
-
-	case _IOWR('i', 23, struct ifreq):
-		ret = sys_ioctl(fd, SIOCGIFBRDADDR, arg);
-		goto out;
-	case _IOW('i', 24, struct ifreq):
-		ret = sys_ioctl(fd, SIOCSIFBRDADDR, arg);
-		goto out;
-	case _IOWR('i', 25, struct ifreq):
-		ret = sys_ioctl(fd, SIOCGIFNETMASK, arg);
-		goto out;
-	case _IOW('i', 26, struct ifreq):
-		ret = sys_ioctl(fd, SIOCSIFNETMASK, arg);
-		goto out;
-	case _IOWR('i', 27, struct ifreq):
-		ret = sys_ioctl(fd, SIOCGIFMETRIC, arg);
-		goto out;
-	case _IOW('i', 28, struct ifreq):
-		ret = sys_ioctl(fd, SIOCSIFMETRIC, arg);
-		goto out;
-
-	case _IOW('i', 30, struct arpreq):
-		ret = sys_ioctl(fd, SIOCSARP, arg);
-		goto out;
-	case _IOWR('i', 31, struct arpreq):
-		ret = sys_ioctl(fd, SIOCGARP, arg);
-		goto out;
-	case _IOW('i', 32, struct arpreq):
-		ret = sys_ioctl(fd, SIOCDARP, arg);
-		goto out;
-
-	case _IOW('i', 40, struct ifreq): /* SIOCUPPER */
-	case _IOW('i', 41, struct ifreq): /* SIOCLOWER */
-	case _IOW('i', 44, struct ifreq): /* SIOCSETSYNC */
-	case _IOW('i', 45, struct ifreq): /* SIOCGETSYNC */
-	case _IOW('i', 46, struct ifreq): /* SIOCSSDSTATS */
-	case _IOW('i', 47, struct ifreq): /* SIOCSSESTATS */
-	case _IOW('i', 48, struct ifreq): /* SIOCSPROMISC */
-		ret = -EOPNOTSUPP;
-		goto out;
-
-	case _IOW('i', 49, struct ifreq):
-		ret = sys_ioctl(fd, SIOCADDMULTI, arg);
-		goto out;
-	case _IOW('i', 50, struct ifreq):
-		ret = sys_ioctl(fd, SIOCDELMULTI, arg);
-		goto out;
-
-	/* FDDI interface ioctls, unsupported. */
-		
-	case _IOW('i', 51, struct ifreq): /* SIOCFDRESET */
-	case _IOW('i', 52, struct ifreq): /* SIOCFDSLEEP */
-	case _IOW('i', 53, struct ifreq): /* SIOCSTRTFMWAR */
-	case _IOW('i', 54, struct ifreq): /* SIOCLDNSTRTFW */
-	case _IOW('i', 55, struct ifreq): /* SIOCGETFDSTAT */
-	case _IOW('i', 56, struct ifreq): /* SIOCFDNMIINT */
-	case _IOW('i', 57, struct ifreq): /* SIOCFDEXUSER */
-	case _IOW('i', 58, struct ifreq): /* SIOCFDGNETMAP */
-	case _IOW('i', 59, struct ifreq): /* SIOCFDGIOCTL */
-		printk("FDDI ioctl, returning EOPNOTSUPP\n");
-		ret = -EOPNOTSUPP;
-		goto out;
-
-	case _IOW('t', 125, int):
-		/* More stupid tty sunos ioctls, just
-		 * say it worked.
-		 */
-		ret = 0;
-		goto out;
-	/* Non posix grp */
-	case _IOW('t', 118, int): {
-		int oldval, newval, __user *ptr;
-
-		cmd = TIOCSPGRP;
-		ptr = (int __user *) arg;
-		ret = -EFAULT;
-		if (get_user(oldval, ptr))
-			goto out;
-		ret = sys_ioctl(fd, cmd, arg);
-		__get_user(newval, ptr);
-		if (newval == -1) {
-			__put_user(oldval, ptr);
-			ret = -EIO;
-		}
-		if (ret == -ENOTTY)
-			ret = -EIO;
-		goto out;
-	}
-
-	case _IOR('t', 119, int): {
-		int oldval, newval, __user *ptr;
-
-		cmd = TIOCGPGRP;
-		ptr = (int __user *) arg;
-		ret = -EFAULT;
-		if (get_user(oldval, ptr))
-			goto out;
-		ret = sys_ioctl(fd, cmd, arg);
-		__get_user(newval, ptr);
-		if (newval == -1) {
-			__put_user(oldval, ptr);
-			ret = -EIO;
-		}
-		if (ret == -ENOTTY)
-			ret = -EIO;
-		goto out;
-	}
-	}
-
-#if 0
-	if ((cmd & 0xff00) == ('k' << 8)) {
-		printk ("[[KBIO: %8.8x\n", (unsigned int) cmd);
-	}
-#endif
-
-	ret = sys_ioctl(fd, cmd, arg);
-	/* so stupid... */
-	ret = (ret == -EINVAL ? -EOPNOTSUPP : ret);
-out:
-	return ret;
-}
-
-
diff --git a/arch/sparc/kernel/sys_solaris.c b/arch/sparc/kernel/sys_solaris.c
deleted file mode 100644
index 2226a5992484..000000000000
--- a/arch/sparc/kernel/sys_solaris.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * linux/arch/sparc/kernel/sys_solaris.c
- *
- * Copyright (C) 1996 Miguel de Icaza (miguel@nuclecu.unam.mx)
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/personality.h>
-#include <linux/ptrace.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-
-asmlinkage int
-do_solaris_syscall (struct pt_regs *regs)
-{
-	static int cnt = 0;
-	if (++cnt < 10) printk ("No solaris handler\n");
-	force_sig(SIGSEGV, current);
-	return 0;
-}
-
-#ifndef CONFIG_SUNOS_EMUL
-asmlinkage int
-do_sunos_syscall (struct pt_regs *regs)
-{
-	static int cnt = 0;
-	if (++cnt < 10) printk ("SunOS binary emulation not compiled in\n");
-	force_sig (SIGSEGV, current);
-	return 0;
-}
-#endif
diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c
deleted file mode 100644
index f5b608bbe8af..000000000000
--- a/arch/sparc/kernel/sys_sunos.c
+++ /dev/null
@@ -1,1210 +0,0 @@
-/* $Id: sys_sunos.c,v 1.137 2002/02/08 03:57:14 davem Exp $
- * sys_sunos.c: SunOS specific syscall compatibility support.
- *
- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
- * Copyright (C) 1995 Miguel de Icaza (miguel@nuclecu.unam.mx)
- *
- * Based upon preliminary work which is:
- *
- * Copyright (C) 1995 Adrian M. Rodriguez (adrian@remus.rutgers.edu)
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/resource.h>
-#include <linux/ipc.h>
-#include <linux/shm.h>
-#include <linux/msg.h>
-#include <linux/sem.h>
-#include <linux/signal.h>
-#include <linux/uio.h>
-#include <linux/utsname.h>
-#include <linux/major.h>
-#include <linux/stat.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/smp.h>
-#include <linux/smp_lock.h>
-#include <linux/syscalls.h>
-
-#include <net/sock.h>
-
-#include <asm/uaccess.h>
-#ifndef KERNEL_DS
-#include <linux/segment.h>
-#endif
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/pconf.h>
-#include <asm/idprom.h> /* for gethostid() */
-#include <asm/unistd.h>
-#include <asm/system.h>
-
-/* For the nfs mount emulation */
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/nfs.h>
-#include <linux/nfs2.h>
-#include <linux/nfs_mount.h>
-
-/* for sunos_select */
-#include <linux/time.h>
-#include <linux/personality.h>
-
-/* NR_OPEN is now larger and dynamic in recent kernels. */
-#define SUNOS_NR_OPEN	256
-
-/* We use the SunOS mmap() semantics. */
-asmlinkage unsigned long sunos_mmap(unsigned long addr, unsigned long len,
-				    unsigned long prot, unsigned long flags,
-				    unsigned long fd, unsigned long off)
-{
-	struct file * file = NULL;
-	unsigned long retval, ret_type;
-
-	if (flags & MAP_NORESERVE) {
-		static int cnt;
-		if (cnt++ < 10)
-			printk("%s: unimplemented SunOS MAP_NORESERVE mmap() flag\n",
-			       current->comm);
-		flags &= ~MAP_NORESERVE;
-	}
-	retval = -EBADF;
-	if (!(flags & MAP_ANONYMOUS)) {
-		if (fd >= SUNOS_NR_OPEN)
-			goto out;
-		file = fget(fd);
-		if (!file)
-			goto out;
-	}
-
-	retval = -EINVAL;
-	/* If this is ld.so or a shared library doing an mmap
-	 * of /dev/zero, transform it into an anonymous mapping.
-	 * SunOS is so stupid some times... hmph!
-	 */
-	if (file) {
-		if (imajor(file->f_path.dentry->d_inode) == MEM_MAJOR &&
-		    iminor(file->f_path.dentry->d_inode) == 5) {
-			flags |= MAP_ANONYMOUS;
-			fput(file);
-			file = NULL;
-		}
-	}
-	ret_type = flags & _MAP_NEW;
-	flags &= ~_MAP_NEW;
-
-	if (!(flags & MAP_FIXED))
-		addr = 0;
-	else {
-		if (ARCH_SUN4C_SUN4 &&
-		    (len > 0x20000000 ||
-		     ((flags & MAP_FIXED) &&
-		      addr < 0xe0000000 && addr + len > 0x20000000)))
-			goto out_putf;
-
-		/* See asm-sparc/uaccess.h */
-		if (len > TASK_SIZE - PAGE_SIZE ||
-		    addr + len > TASK_SIZE - PAGE_SIZE)
-			goto out_putf;
-	}
-
-	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-	down_write(&current->mm->mmap_sem);
-	retval = do_mmap(file, addr, len, prot, flags, off);
-	up_write(&current->mm->mmap_sem);
-	if (!ret_type)
-		retval = ((retval < PAGE_OFFSET) ? 0 : retval);
-
-out_putf:
-	if (file)
-		fput(file);
-out:
-	return retval;
-}
-
-/* lmbench calls this, just say "yeah, ok" */
-asmlinkage int sunos_mctl(unsigned long addr, unsigned long len, int function, char *arg)
-{
-	return 0;
-}
-
-/* SunOS is completely broken... it returns 0 on success, otherwise
- * ENOMEM.  For sys_sbrk() it wants the old brk value as a return
- * on success and ENOMEM as before on failure.
- */
-asmlinkage int sunos_brk(unsigned long brk)
-{
-	int freepages, retval = -ENOMEM;
-	unsigned long rlim;
-	unsigned long newbrk, oldbrk;
-
-	down_write(&current->mm->mmap_sem);
-	if (ARCH_SUN4C_SUN4) {
-		if (brk >= 0x20000000 && brk < 0xe0000000) {
-			goto out;
-		}
-	}
-
-	if (brk < current->mm->end_code)
-		goto out;
-
-	newbrk = PAGE_ALIGN(brk);
-	oldbrk = PAGE_ALIGN(current->mm->brk);
-	retval = 0;
-	if (oldbrk == newbrk) {
-		current->mm->brk = brk;
-		goto out;
-	}
-
-	/*
-	 * Always allow shrinking brk
-	 */
-	if (brk <= current->mm->brk) {
-		current->mm->brk = brk;
-		do_munmap(current->mm, newbrk, oldbrk-newbrk);
-		goto out;
-	}
-	/*
-	 * Check against rlimit and stack..
-	 */
-	retval = -ENOMEM;
-	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
-	if (rlim >= RLIM_INFINITY)
-		rlim = ~0;
-	if (brk - current->mm->end_code > rlim)
-		goto out;
-
-	/*
-	 * Check against existing mmap mappings.
-	 */
-	if (find_vma_intersection(current->mm, oldbrk, newbrk+PAGE_SIZE))
-		goto out;
-
-	/*
-	 * stupid algorithm to decide if we have enough memory: while
-	 * simple, it hopefully works in most obvious cases.. Easy to
-	 * fool it, but this should catch most mistakes.
-	 */
-	freepages = global_page_state(NR_FILE_PAGES);
-	freepages >>= 1;
-	freepages += nr_free_pages();
-	freepages += nr_swap_pages;
-	freepages -= num_physpages >> 4;
-	freepages -= (newbrk-oldbrk) >> PAGE_SHIFT;
-	if (freepages < 0)
-		goto out;
-	/*
-	 * Ok, we have probably got enough memory - let it rip.
-	 */
-	current->mm->brk = brk;
-	do_brk(oldbrk, newbrk-oldbrk);
-	retval = 0;
-out:
-	up_write(&current->mm->mmap_sem);
-	return retval;
-}
-
-asmlinkage unsigned long sunos_sbrk(int increment)
-{
-	int error;
-	unsigned long oldbrk;
-
-	/* This should do it hopefully... */
-	lock_kernel();
-	oldbrk = current->mm->brk;
-	error = sunos_brk(((int) current->mm->brk) + increment);
-	if (!error)
-		error = oldbrk;
-	unlock_kernel();
-	return error;
-}
-
-/* XXX Completely undocumented, and completely magic...
- * XXX I believe it is to increase the size of the stack by
- * XXX argument 'increment' and return the new end of stack
- * XXX area.  Wheee...
- */
-asmlinkage unsigned long sunos_sstk(int increment)
-{
-	lock_kernel();
-	printk("%s: Call to sunos_sstk(increment<%d>) is unsupported\n",
-	       current->comm, increment);
-	unlock_kernel();
-	return -1;
-}
-
-/* Give hints to the kernel as to what paging strategy to use...
- * Completely bogus, don't remind me.
- */
-#define VA_NORMAL     0 /* Normal vm usage expected */
-#define VA_ABNORMAL   1 /* Abnormal/random vm usage probable */
-#define VA_SEQUENTIAL 2 /* Accesses will be of a sequential nature */
-#define VA_INVALIDATE 3 /* Page table entries should be flushed ??? */
-static char *vstrings[] = {
-	"VA_NORMAL",
-	"VA_ABNORMAL",
-	"VA_SEQUENTIAL",
-	"VA_INVALIDATE",
-};
-
-asmlinkage void sunos_vadvise(unsigned long strategy)
-{
-	/* I wanna see who uses this... */
-	lock_kernel();
-	printk("%s: Advises us to use %s paging strategy\n",
-	       current->comm,
-	       strategy <= 3 ? vstrings[strategy] : "BOGUS");
-	unlock_kernel();
-}
-
-/* This just wants the soft limit (ie. rlim_cur element) of the RLIMIT_NOFILE
- * resource limit and is for backwards compatibility with older sunos
- * revs.
- */
-asmlinkage long sunos_getdtablesize(void)
-{
-	return SUNOS_NR_OPEN;
-}
-
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-asmlinkage unsigned long sunos_sigblock(unsigned long blk_mask)
-{
-	unsigned long old;
-
-	spin_lock_irq(&current->sighand->siglock);
-	old = current->blocked.sig[0];
-	current->blocked.sig[0] |= (blk_mask & _BLOCKABLE);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	return old;
-}
-
-asmlinkage unsigned long sunos_sigsetmask(unsigned long newmask)
-{
-	unsigned long retval;
-
-	spin_lock_irq(&current->sighand->siglock);
-	retval = current->blocked.sig[0];
-	current->blocked.sig[0] = (newmask & _BLOCKABLE);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	return retval;
-}
-
-/* SunOS getdents is very similar to the newer Linux (iBCS2 compliant)    */
-/* getdents system call, the format of the structure just has a different */
-/* layout (d_off+d_ino instead of d_ino+d_off) */
-struct sunos_dirent {
-    long           d_off;
-    unsigned long  d_ino;
-    unsigned short d_reclen;
-    unsigned short d_namlen;
-    char           d_name[1];
-};
-
-struct sunos_dirent_callback {
-    struct sunos_dirent __user *curr;
-    struct sunos_dirent __user *previous;
-    int count;
-    int error;
-};
-
-#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
-#define ROUND_UP(x) (((x)+sizeof(long)-1) & ~(sizeof(long)-1))
-
-static int sunos_filldir(void * __buf, const char * name, int namlen,
-			 loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct sunos_dirent __user *dirent;
-	struct sunos_dirent_callback * buf = __buf;
-	unsigned long d_ino;
-	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1);
-
-	buf->error = -EINVAL;	/* only used if we fail.. */
-	if (reclen > buf->count)
-		return -EINVAL;
-	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
-		return -EOVERFLOW;
-	dirent = buf->previous;
-	if (dirent)
-		put_user(offset, &dirent->d_off);
-	dirent = buf->curr;
-	buf->previous = dirent;
-	put_user(d_ino, &dirent->d_ino);
-	put_user(namlen, &dirent->d_namlen);
-	put_user(reclen, &dirent->d_reclen);
-	copy_to_user(dirent->d_name, name, namlen);
-	put_user(0, dirent->d_name + namlen);
-	dirent = (void __user *) dirent + reclen;
-	buf->curr = dirent;
-	buf->count -= reclen;
-	return 0;
-}
-
-asmlinkage int sunos_getdents(unsigned int fd, void __user *dirent, int cnt)
-{
-	struct file * file;
-	struct sunos_dirent __user *lastdirent;
-	struct sunos_dirent_callback buf;
-	int error = -EBADF;
-
-	if (fd >= SUNOS_NR_OPEN)
-		goto out;
-
-	file = fget(fd);
-	if (!file)
-		goto out;
-
-	error = -EINVAL;
-	if (cnt < (sizeof(struct sunos_dirent) + 255))
-		goto out_putf;
-
-	buf.curr = (struct sunos_dirent __user *) dirent;
-	buf.previous = NULL;
-	buf.count = cnt;
-	buf.error = 0;
-
-	error = vfs_readdir(file, sunos_filldir, &buf);
-	if (error < 0)
-		goto out_putf;
-
-	lastdirent = buf.previous;
-	error = buf.error;
-	if (lastdirent) {
-		put_user(file->f_pos, &lastdirent->d_off);
-		error = cnt - buf.count;
-	}
-
-out_putf:
-	fput(file);
-out:
-	return error;
-}
-
-/* Old sunos getdirentries, severely broken compatibility stuff here. */
-struct sunos_direntry {
-    unsigned long  d_ino;
-    unsigned short d_reclen;
-    unsigned short d_namlen;
-    char           d_name[1];
-};
-
-struct sunos_direntry_callback {
-    struct sunos_direntry __user *curr;
-    struct sunos_direntry __user *previous;
-    int count;
-    int error;
-};
-
-static int sunos_filldirentry(void * __buf, const char * name, int namlen,
-			      loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct sunos_direntry __user *dirent;
-	struct sunos_direntry_callback *buf = __buf;
-	unsigned long d_ino;
-	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1);
-
-	buf->error = -EINVAL;	/* only used if we fail.. */
-	if (reclen > buf->count)
-		return -EINVAL;
-	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
-		return -EOVERFLOW;
-	dirent = buf->previous;
-	dirent = buf->curr;
-	buf->previous = dirent;
-	put_user(d_ino, &dirent->d_ino);
-	put_user(namlen, &dirent->d_namlen);
-	put_user(reclen, &dirent->d_reclen);
-	copy_to_user(dirent->d_name, name, namlen);
-	put_user(0, dirent->d_name + namlen);
-	dirent = (void __user *) dirent + reclen;
-	buf->curr = dirent;
-	buf->count -= reclen;
-	return 0;
-}
-
-asmlinkage int sunos_getdirentries(unsigned int fd, void __user *dirent,
-				   int cnt, unsigned int __user *basep)
-{
-	struct file * file;
-	struct sunos_direntry __user *lastdirent;
-	struct sunos_direntry_callback buf;
-	int error = -EBADF;
-
-	if (fd >= SUNOS_NR_OPEN)
-		goto out;
-
-	file = fget(fd);
-	if (!file)
-		goto out;
-
-	error = -EINVAL;
-	if (cnt < (sizeof(struct sunos_direntry) + 255))
-		goto out_putf;
-
-	buf.curr = (struct sunos_direntry __user *) dirent;
-	buf.previous = NULL;
-	buf.count = cnt;
-	buf.error = 0;
-
-	error = vfs_readdir(file, sunos_filldirentry, &buf);
-	if (error < 0)
-		goto out_putf;
-
-	lastdirent = buf.previous;
-	error = buf.error;
-	if (lastdirent) {
-		put_user(file->f_pos, basep);
-		error = cnt - buf.count;
-	}
-
-out_putf:
-	fput(file);
-out:
-	return error;
-}
-
-struct sunos_utsname {
-	char sname[9];
-	char nname[9];
-	char nnext[56];
-	char rel[9];
-	char ver[9];
-	char mach[9];
-};
-
-asmlinkage int sunos_uname(struct sunos_utsname __user *name)
-{
-	int ret;
-	down_read(&uts_sem);
-	ret = copy_to_user(&name->sname[0], &utsname()->sysname[0],
-			   sizeof(name->sname) - 1);
-	if (!ret) {
-		ret |= __copy_to_user(&name->nname[0], &utsname()->nodename[0],
-				      sizeof(name->nname) - 1);
-		ret |= __put_user('\0', &name->nname[8]);
-		ret |= __copy_to_user(&name->rel[0], &utsname()->release[0],
-				      sizeof(name->rel) - 1);
-		ret |= __copy_to_user(&name->ver[0], &utsname()->version[0],
-				      sizeof(name->ver) - 1);
-		ret |= __copy_to_user(&name->mach[0], &utsname()->machine[0],
-				      sizeof(name->mach) - 1);
-	}
-	up_read(&uts_sem);
-	return ret ? -EFAULT : 0;
-}
-
-asmlinkage int sunos_nosys(void)
-{
-	struct pt_regs *regs;
-	siginfo_t info;
-	static int cnt;
-
-	lock_kernel();
-	regs = current->thread.kregs;
-	info.si_signo = SIGSYS;
-	info.si_errno = 0;
-	info.si_code = __SI_FAULT|0x100;
-	info.si_addr = (void __user *)regs->pc;
-	info.si_trapno = regs->u_regs[UREG_G1];
-	send_sig_info(SIGSYS, &info, current);
-	if (cnt++ < 4) {
-		printk("Process makes ni_syscall number %d, register dump:\n",
-		       (int) regs->u_regs[UREG_G1]);
-		show_regs(regs);
-	}
-	unlock_kernel();
-	return -ENOSYS;
-}
-
-/* This is not a real and complete implementation yet, just to keep
- * the easy SunOS binaries happy.
- */
-asmlinkage int sunos_fpathconf(int fd, int name)
-{
-	int ret;
-
-	switch(name) {
-	case _PCONF_LINK:
-		ret = LINK_MAX;
-		break;
-	case _PCONF_CANON:
-		ret = MAX_CANON;
-		break;
-	case _PCONF_INPUT:
-		ret = MAX_INPUT;
-		break;
-	case _PCONF_NAME:
-		ret = NAME_MAX;
-		break;
-	case _PCONF_PATH:
-		ret = PATH_MAX;
-		break;
-	case _PCONF_PIPE:
-		ret = PIPE_BUF;
-		break;
-	case _PCONF_CHRESTRICT:		/* XXX Investigate XXX */
-		ret = 1;
-		break;
-	case _PCONF_NOTRUNC:		/* XXX Investigate XXX */
-	case _PCONF_VDISABLE:
-		ret = 0;
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-	return ret;
-}
-
-asmlinkage int sunos_pathconf(char __user *path, int name)
-{
-	int ret;
-
-	ret = sunos_fpathconf(0, name); /* XXX cheese XXX */
-	return ret;
-}
-
-/* SunOS mount system call emulation */
-
-asmlinkage int sunos_select(int width, fd_set __user *inp, fd_set __user *outp,
-			    fd_set __user *exp, struct timeval __user *tvp)
-{
-	int ret;
-
-	/* SunOS binaries expect that select won't change the tvp contents */
-	ret = sys_select (width, inp, outp, exp, tvp);
-	if (ret == -EINTR && tvp) {
-		time_t sec, usec;
-
-		__get_user(sec, &tvp->tv_sec);
-		__get_user(usec, &tvp->tv_usec);
-
-		if (sec == 0 && usec == 0)
-			ret = 0;
-	}
-	return ret;
-}
-
-asmlinkage void sunos_nop(void)
-{
-	return;
-}
-
-/* SunOS mount/umount. */
-#define SMNT_RDONLY       1
-#define SMNT_NOSUID       2
-#define SMNT_NEWTYPE      4
-#define SMNT_GRPID        8
-#define SMNT_REMOUNT      16
-#define SMNT_NOSUB        32
-#define SMNT_MULTI        64
-#define SMNT_SYS5         128
-
-struct sunos_fh_t {
-	char fh_data [NFS_FHSIZE];
-};
-
-struct sunos_nfs_mount_args {
-	struct sockaddr_in  __user *addr; /* file server address */
-	struct nfs_fh __user *fh;     /* File handle to be mounted */
-	int        flags;      /* flags */
-	int        wsize;      /* write size in bytes */
-	int        rsize;      /* read size in bytes */
-	int        timeo;      /* initial timeout in .1 secs */
-	int        retrans;    /* times to retry send */
-	char       __user *hostname;  /* server's hostname */
-	int        acregmin;   /* attr cache file min secs */
-	int        acregmax;   /* attr cache file max secs */
-	int        acdirmin;   /* attr cache dir min secs */
-	int        acdirmax;   /* attr cache dir max secs */
-	char       __user *netname;   /* server's netname */
-};
-
-
-/* Bind the socket on a local reserved port and connect it to the
- * remote server.  This on Linux/i386 is done by the mount program,
- * not by the kernel.
- */
-static int
-sunos_nfs_get_server_fd (int fd, struct sockaddr_in *addr)
-{
-	struct sockaddr_in local;
-	struct sockaddr_in server;
-	int    try_port;
-	struct socket *socket;
-	struct inode  *inode;
-	struct file   *file;
-	int    ret, result = 0;
-
-	file = fget(fd);
-	if (!file)
-		goto out;
-
-	inode = file->f_path.dentry->d_inode;
-
-	socket = SOCKET_I(inode);
-	local.sin_family = AF_INET;
-	local.sin_addr.s_addr = htonl(INADDR_ANY);
-
-	/* IPPORT_RESERVED = 1024, can't find the definition in the kernel */
-	try_port = 1024;
-	do {
-		local.sin_port = htons (--try_port);
-		ret = socket->ops->bind(socket, (struct sockaddr*)&local,
-					sizeof(local));
-	} while (ret && try_port > (1024 / 2));
-
-	if (ret)
-		goto out_putf;
-
-	server.sin_family = AF_INET;
-	server.sin_addr = addr->sin_addr;
-	server.sin_port = NFS_PORT;
-
-	/* Call sys_connect */
-	ret = socket->ops->connect (socket, (struct sockaddr *) &server,
-				    sizeof (server), file->f_flags);
-	if (ret >= 0)
-		result = 1;
-
-out_putf:
-	fput(file);
-out:
-	return result;
-}
-
-static int get_default (int value, int def_value)
-{
-    if (value)
-	return value;
-    else
-	return def_value;
-}
-
-static int sunos_nfs_mount(char *dir_name, int linux_flags, void __user *data)
-{
-	int  server_fd, err;
-	char *the_name, *mount_page;
-	struct nfs_mount_data linux_nfs_mount;
-	struct sunos_nfs_mount_args sunos_mount;
-
-	/* Ok, here comes the fun part: Linux's nfs mount needs a
-	 * socket connection to the server, but SunOS mount does not
-	 * require this, so we use the information on the destination
-	 * address to create a socket and bind it to a reserved
-	 * port on this system
-	 */
-	if (copy_from_user(&sunos_mount, data, sizeof(sunos_mount)))
-		return -EFAULT;
-
-	server_fd = sys_socket (AF_INET, SOCK_DGRAM, IPPROTO_UDP);
-	if (server_fd < 0)
-		return -ENXIO;
-
-	if (copy_from_user(&linux_nfs_mount.addr,sunos_mount.addr,
-				sizeof(*sunos_mount.addr)) ||
-	    copy_from_user(&linux_nfs_mount.root,sunos_mount.fh,
-				sizeof(*sunos_mount.fh))) {
-		sys_close (server_fd);
-		return -EFAULT;
-	}
-
-	if (!sunos_nfs_get_server_fd (server_fd, &linux_nfs_mount.addr)){
-		sys_close (server_fd);
-		return -ENXIO;
-	}
-
-	/* Now, bind it to a locally reserved port */
-	linux_nfs_mount.version  = NFS_MOUNT_VERSION;
-	linux_nfs_mount.flags    = sunos_mount.flags;
-	linux_nfs_mount.fd       = server_fd;
-	
-	linux_nfs_mount.rsize    = get_default (sunos_mount.rsize, 8192);
-	linux_nfs_mount.wsize    = get_default (sunos_mount.wsize, 8192);
-	linux_nfs_mount.timeo    = get_default (sunos_mount.timeo, 10);
-	linux_nfs_mount.retrans  = sunos_mount.retrans;
-	
-	linux_nfs_mount.acregmin = sunos_mount.acregmin;
-	linux_nfs_mount.acregmax = sunos_mount.acregmax;
-	linux_nfs_mount.acdirmin = sunos_mount.acdirmin;
-	linux_nfs_mount.acdirmax = sunos_mount.acdirmax;
-
-	the_name = getname(sunos_mount.hostname);
-	if (IS_ERR(the_name))
-		return PTR_ERR(the_name);
-
-	strlcpy(linux_nfs_mount.hostname, the_name,
-		sizeof(linux_nfs_mount.hostname));
-	putname (the_name);
-	
-	mount_page = (char *) get_zeroed_page(GFP_KERNEL);
-	if (!mount_page)
-		return -ENOMEM;
-
-	memcpy(mount_page, &linux_nfs_mount, sizeof(linux_nfs_mount));
-
-	err = do_mount("", dir_name, "nfs", linux_flags, mount_page);
-
-	free_page((unsigned long) mount_page);
-	return err;
-}
-
-asmlinkage int
-sunos_mount(char __user *type, char __user *dir, int flags, void __user *data)
-{
-	int linux_flags = 0;
-	int ret = -EINVAL;
-	char *dev_fname = NULL;
-	char *dir_page, *type_page;
-
-	if (!capable (CAP_SYS_ADMIN))
-		return -EPERM;
-		
-	lock_kernel();
-	/* We don't handle the integer fs type */
-	if ((flags & SMNT_NEWTYPE) == 0)
-		goto out;
-
-	/* Do not allow for those flags we don't support */
-	if (flags & (SMNT_GRPID|SMNT_NOSUB|SMNT_MULTI|SMNT_SYS5))
-		goto out;
-
-	if (flags & SMNT_REMOUNT)
-		linux_flags |= MS_REMOUNT;
-	if (flags & SMNT_RDONLY)
-		linux_flags |= MS_RDONLY;
-	if (flags & SMNT_NOSUID)
-		linux_flags |= MS_NOSUID;
-
-	dir_page = getname(dir);
-	ret = PTR_ERR(dir_page);
-	if (IS_ERR(dir_page))
-		goto out;
-
-	type_page = getname(type);
-	ret = PTR_ERR(type_page);
-	if (IS_ERR(type_page))
-		goto out1;
-
-	if (strcmp(type_page, "ext2") == 0) {
-		dev_fname = getname(data);
-	} else if (strcmp(type_page, "iso9660") == 0) {
-		dev_fname = getname(data);
-	} else if (strcmp(type_page, "minix") == 0) {
-		dev_fname = getname(data);
-	} else if (strcmp(type_page, "nfs") == 0) {
-		ret = sunos_nfs_mount (dir_page, flags, data);
-		goto out2;
-        } else if (strcmp(type_page, "ufs") == 0) {
-		printk("Warning: UFS filesystem mounts unsupported.\n");
-		ret = -ENODEV;
-		goto out2;
-	} else if (strcmp(type_page, "proc")) {
-		ret = -ENODEV;
-		goto out2;
-	}
-	ret = PTR_ERR(dev_fname);
-	if (IS_ERR(dev_fname))
-		goto out2;
-	ret = do_mount(dev_fname, dir_page, type_page, linux_flags, NULL);
-	if (dev_fname)
-		putname(dev_fname);
-out2:
-	putname(type_page);
-out1:
-	putname(dir_page);
-out:
-	unlock_kernel();
-	return ret;
-}
-
-
-asmlinkage int sunos_setpgrp(pid_t pid, pid_t pgid)
-{
-	int ret;
-
-	/* So stupid... */
-	if ((!pid || pid == current->pid) &&
-	    !pgid) {
-		sys_setsid();
-		ret = 0;
-	} else {
-		ret = sys_setpgid(pid, pgid);
-	}
-	return ret;
-}
-
-/* So stupid... */
-asmlinkage int sunos_wait4(pid_t pid, unsigned int __user *stat_addr,
-			   int options, struct rusage __user*ru)
-{
-	int ret;
-
-	ret = sys_wait4((pid ? pid : -1), stat_addr, options, ru);
-	return ret;
-}
-
-asmlinkage int sunos_killpg(int pgrp, int sig)
-{
-	int ret;
-
-	rcu_read_lock();
-	ret = -EINVAL;
-	if (pgrp > 0)
-		ret = kill_pgrp(find_vpid(pgrp), sig, 0);
-	rcu_read_unlock();
-
-	return ret;
-}
-
-asmlinkage int sunos_audit(void)
-{
-	lock_kernel();
-	printk ("sys_audit\n");
-	unlock_kernel();
-	return -1;
-}
-
-asmlinkage unsigned long sunos_gethostid(void)
-{
-	unsigned long ret;
-
-	lock_kernel();
-	ret = ((unsigned long)idprom->id_machtype << 24) |
-		(unsigned long)idprom->id_sernum;
-	unlock_kernel();
-	return ret;
-}
-
-/* sysconf options, for SunOS compatibility */
-#define   _SC_ARG_MAX             1
-#define   _SC_CHILD_MAX           2
-#define   _SC_CLK_TCK             3
-#define   _SC_NGROUPS_MAX         4
-#define   _SC_OPEN_MAX            5
-#define   _SC_JOB_CONTROL         6
-#define   _SC_SAVED_IDS           7
-#define   _SC_VERSION             8
-
-asmlinkage long sunos_sysconf (int name)
-{
-	long ret;
-
-	switch (name){
-	case _SC_ARG_MAX:
-		ret = ARG_MAX;
-		break;
-	case _SC_CHILD_MAX:
-		ret = current->signal->rlim[RLIMIT_NPROC].rlim_cur;
-		break;
-	case _SC_CLK_TCK:
-		ret = HZ;
-		break;
-	case _SC_NGROUPS_MAX:
-		ret = NGROUPS_MAX;
-		break;
-	case _SC_OPEN_MAX:
-		ret = current->signal->rlim[RLIMIT_NOFILE].rlim_cur;
-		break;
-	case _SC_JOB_CONTROL:
-		ret = 1;	/* yes, we do support job control */
-		break;
-	case _SC_SAVED_IDS:
-		ret = 1;	/* yes, we do support saved uids  */
-		break;
-	case _SC_VERSION:
-		/* mhm, POSIX_VERSION is in /usr/include/unistd.h
-		 * should it go on /usr/include/linux?
-		 */
-		ret = 199009L; 
-		break;
-	default:
-		ret = -1;
-		break;
-	};
-	return ret;
-}
-
-asmlinkage int sunos_semsys(int op, unsigned long arg1, unsigned long arg2,
-			    unsigned long arg3, void *ptr)
-{
-	union semun arg4;
-	int ret;
-
-	switch (op) {
-	case 0:
-		/* Most arguments match on a 1:1 basis but cmd doesn't */
-		switch(arg3) {
-		case 4:
-			arg3=GETPID; break;
-		case 5:
-			arg3=GETVAL; break;
-		case 6:
-			arg3=GETALL; break;
-		case 3:
-			arg3=GETNCNT; break;
-		case 7:
-			arg3=GETZCNT; break;
-		case 8:
-			arg3=SETVAL; break;
-		case 9:
-			arg3=SETALL; break;
-		}
-		/* sys_semctl(): */
-		/* value to modify semaphore to */
-		arg4.__pad = (void __user *) ptr;
-		ret = sys_semctl((int)arg1, (int)arg2, (int)arg3, arg4 );
-		break;
-	case 1:
-		/* sys_semget(): */
-		ret = sys_semget((key_t)arg1, (int)arg2, (int)arg3);
-		break;
-	case 2:
-		/* sys_semop(): */
-		ret = sys_semop((int)arg1, (struct sembuf __user *)arg2, (unsigned)arg3);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	};
-	return ret;
-}
-
-asmlinkage int sunos_msgsys(int op, unsigned long arg1, unsigned long arg2,
-			    unsigned long arg3, unsigned long arg4)
-{
-	struct sparc_stackf *sp;
-	unsigned long arg5;
-	int rval;
-
-	switch(op) {
-	case 0:
-		rval = sys_msgget((key_t)arg1, (int)arg2);
-		break;
-	case 1:
-		rval = sys_msgctl((int)arg1, (int)arg2,
-				  (struct msqid_ds __user *)arg3);
-		break;
-	case 2:
-		lock_kernel();
-		sp = (struct sparc_stackf *)current->thread.kregs->u_regs[UREG_FP];
-		arg5 = sp->xxargs[0];
-		unlock_kernel();
-		rval = sys_msgrcv((int)arg1, (struct msgbuf __user *)arg2,
-				  (size_t)arg3, (long)arg4, (int)arg5);
-		break;
-	case 3:
-		rval = sys_msgsnd((int)arg1, (struct msgbuf __user *)arg2,
-				  (size_t)arg3, (int)arg4);
-		break;
-	default:
-		rval = -EINVAL;
-		break;
-	}
-	return rval;
-}
-
-asmlinkage int sunos_shmsys(int op, unsigned long arg1, unsigned long arg2,
-			    unsigned long arg3)
-{
-	unsigned long raddr;
-	int rval;
-
-	switch(op) {
-	case 0:
-		/* do_shmat(): attach a shared memory area */
-		rval = do_shmat((int)arg1,(char __user *)arg2,(int)arg3,&raddr);
-		if (!rval)
-			rval = (int) raddr;
-		break;
-	case 1:
-		/* sys_shmctl(): modify shared memory area attr. */
-		rval = sys_shmctl((int)arg1,(int)arg2,(struct shmid_ds __user *)arg3);
-		break;
-	case 2:
-		/* sys_shmdt(): detach a shared memory area */
-		rval = sys_shmdt((char __user *)arg1);
-		break;
-	case 3:
-		/* sys_shmget(): get a shared memory area */
-		rval = sys_shmget((key_t)arg1,(int)arg2,(int)arg3);
-		break;
-	default:
-		rval = -EINVAL;
-		break;
-	};
-	return rval;
-}
-
-#define SUNOS_EWOULDBLOCK 35
-
-/* see the sunos man page read(2v) for an explanation
-   of this garbage. We use O_NDELAY to mark
-   file descriptors that have been set non-blocking 
-   using 4.2BSD style calls. (tridge) */
-
-static inline int check_nonblock(int ret, int fd)
-{
-	if (ret == -EAGAIN) {
-		struct file * file = fget(fd);
-		if (file) {
-			if (file->f_flags & O_NDELAY)
-				ret = -SUNOS_EWOULDBLOCK;
-			fput(file);
-		}
-	}
-	return ret;
-}
-
-asmlinkage int sunos_read(unsigned int fd, char __user *buf, int count)
-{
-	int ret;
-
-	ret = check_nonblock(sys_read(fd,buf,count),fd);
-	return ret;
-}
-
-asmlinkage int sunos_readv(unsigned long fd, const struct iovec __user *vector,
-			   long count)
-{
-	int ret;
-
-	ret = check_nonblock(sys_readv(fd,vector,count),fd);
-	return ret;
-}
-
-asmlinkage int sunos_write(unsigned int fd, char __user *buf, int count)
-{
-	int ret;
-
-	ret = check_nonblock(sys_write(fd,buf,count),fd);
-	return ret;
-}
-
-asmlinkage int sunos_writev(unsigned long fd,
-			    const struct iovec __user *vector, long count)
-{
-	int ret;
-
-	ret = check_nonblock(sys_writev(fd,vector,count),fd);
-	return ret;
-}
-
-asmlinkage int sunos_recv(int fd, void __user *ubuf, int size, unsigned flags)
-{
-	int ret;
-
-	ret = check_nonblock(sys_recv(fd,ubuf,size,flags),fd);
-	return ret;
-}
-
-asmlinkage int sunos_send(int fd, void __user *buff, int len, unsigned flags)
-{
-	int ret;
-
-	ret = check_nonblock(sys_send(fd,buff,len,flags),fd);
-	return ret;
-}
-
-asmlinkage int sunos_accept(int fd, struct sockaddr __user *sa,
-			    int __user *addrlen)
-{
-	int ret;
-
-	while (1) {
-		ret = check_nonblock(sys_accept(fd,sa,addrlen),fd);	
-		if (ret != -ENETUNREACH && ret != -EHOSTUNREACH)
-			break;
-	}
-
-	return ret;
-}
-
-#define SUNOS_SV_INTERRUPT 2
-
-asmlinkage int
-sunos_sigaction(int sig, const struct old_sigaction __user *act,
-		struct old_sigaction __user *oact)
-{
-	struct k_sigaction new_ka, old_ka;
-	int ret;
-
-	if (act) {
-		old_sigset_t mask;
-
-		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
-		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
-		    __get_user(new_ka.sa.sa_flags, &act->sa_flags))
-			return -EFAULT;
-		__get_user(mask, &act->sa_mask);
-		new_ka.sa.sa_restorer = NULL;
-		new_ka.ka_restorer = NULL;
-		siginitset(&new_ka.sa.sa_mask, mask);
-		new_ka.sa.sa_flags ^= SUNOS_SV_INTERRUPT;
-	}
-
-	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
-	if (!ret && oact) {
-		/* In the clone() case we could copy half consistent
-		 * state to the user, however this could sleep and
-		 * deadlock us if we held the signal lock on SMP.  So for
-		 * now I take the easy way out and do no locking.
-		 * But then again we don't support SunOS lwp's anyways ;-)
-		 */
-		old_ka.sa.sa_flags ^= SUNOS_SV_INTERRUPT;
-		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
-		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
-		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags))
-			 return -EFAULT;
-		__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
-	}
-
-	return ret;
-}
-
-
-asmlinkage int sunos_setsockopt(int fd, int level, int optname,
-				char __user *optval, int optlen)
-{
-	int tr_opt = optname;
-	int ret;
-
-	if (level == SOL_IP) {
-		/* Multicast socketopts (ttl, membership) */
-		if (tr_opt >=2 && tr_opt <= 6)
-			tr_opt += 30;
-	}
-	ret = sys_setsockopt(fd, level, tr_opt, optval, optlen);
-	return ret;
-}
-
-asmlinkage int sunos_getsockopt(int fd, int level, int optname,
-				char __user *optval, int __user *optlen)
-{
-	int tr_opt = optname;
-	int ret;
-
-	if (level == SOL_IP) {
-		/* Multicast socketopts (ttl, membership) */
-		if (tr_opt >=2 && tr_opt <= 6)
-			tr_opt += 30;
-	}
-	ret = sys_getsockopt(fd, level, tr_opt, optval, optlen);
-	return ret;
-}
diff --git a/arch/sparc/kernel/systbls.S b/arch/sparc/kernel/systbls.S
index 9064485dc40b..5a7c4c8345c3 100644
--- a/arch/sparc/kernel/systbls.S
+++ b/arch/sparc/kernel/systbls.S
@@ -81,124 +81,3 @@ sys_call_table:
 /*305*/	.long sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait
 /*310*/	.long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 /*315*/	.long sys_timerfd_settime, sys_timerfd_gettime
-
-#ifdef CONFIG_SUNOS_EMUL
-	/* Now the SunOS syscall table. */
-
-	.align 4
-	.globl sunos_sys_table
-sunos_sys_table:
-/*0*/	.long sunos_indir, sys_exit, sys_fork
-	.long sunos_read, sunos_write, sys_open
-	.long sys_close, sunos_wait4, sys_creat
-	.long sys_link, sys_unlink, sunos_execv
-	.long sys_chdir, sunos_nosys, sys_mknod
-	.long sys_chmod, sys_lchown16, sunos_brk
-	.long sunos_nosys, sys_lseek, sunos_getpid
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_getuid, sunos_nosys, sys_ptrace
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sys_access, sunos_nosys, sunos_nosys
-	.long sys_sync, sys_kill, sys_newstat
-	.long sunos_nosys, sys_newlstat, sys_dup
-	.long sys_pipe, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_getgid
-	.long sunos_nosys, sunos_nosys
-/*50*/	.long sunos_nosys, sys_acct, sunos_nosys
-	.long sunos_mctl, sunos_ioctl, sys_reboot
-	.long sunos_nosys, sys_symlink, sys_readlink
-	.long sys_execve, sys_umask, sys_chroot
-	.long sys_newfstat, sunos_nosys, sys_getpagesize
-	.long sys_msync, sys_vfork, sunos_nosys
-	.long sunos_nosys, sunos_sbrk, sunos_sstk
-	.long sunos_mmap, sunos_vadvise, sys_munmap
-	.long sys_mprotect, sys_madvise, sys_vhangup
-	.long sunos_nosys, sys_mincore, sys_getgroups16
-	.long sys_setgroups16, sys_getpgrp, sunos_setpgrp
-	.long sys_setitimer, sunos_nosys, sys_swapon
-	.long sys_getitimer, sys_gethostname, sys_sethostname
-	.long sunos_getdtablesize, sys_dup2, sunos_nop
-	.long sys_fcntl, sunos_select, sunos_nop
-	.long sys_fsync, sys_setpriority, sys_socket
-	.long sys_connect, sunos_accept
-/*100*/	.long sys_getpriority, sunos_send, sunos_recv
-	.long sunos_nosys, sys_bind, sunos_setsockopt
-	.long sys_listen, sunos_nosys, sunos_sigaction
-	.long sunos_sigblock, sunos_sigsetmask, sys_sigpause
-	.long sys_sigstack, sys_recvmsg, sys_sendmsg
-	.long sunos_nosys, sys_gettimeofday, sys_getrusage
-	.long sunos_getsockopt, sunos_nosys, sunos_readv
-	.long sunos_writev, sys_settimeofday, sys_fchown16
-	.long sys_fchmod, sys_recvfrom, sys_setreuid16
-	.long sys_setregid16, sys_rename, sys_truncate
-	.long sys_ftruncate, sys_flock, sunos_nosys
-	.long sys_sendto, sys_shutdown, sys_socketpair
-	.long sys_mkdir, sys_rmdir, sys_utimes
-	.long sys_sigreturn, sunos_nosys, sys_getpeername
-	.long sunos_gethostid, sunos_nosys, sys_getrlimit
-	.long sys_setrlimit, sunos_killpg, sunos_nosys
-	.long sunos_nosys, sunos_nosys
-/*150*/	.long sys_getsockname, sunos_nosys, sunos_nosys
-	.long sys_poll, sunos_nosys, sunos_nosys
-	.long sunos_getdirentries, sys_statfs, sys_fstatfs
-	.long sys_oldumount, sunos_nosys, sunos_nosys
-	.long sys_getdomainname, sys_setdomainname
-	.long sunos_nosys, sys_quotactl, sunos_nosys
-	.long sunos_mount, sys_ustat, sunos_semsys
-	.long sunos_msgsys, sunos_shmsys, sunos_audit
-	.long sunos_nosys, sunos_getdents, sys_setsid
-	.long sys_fchdir, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sys_sigpending, sunos_nosys
-	.long sys_setpgid, sunos_pathconf, sunos_fpathconf
-	.long sunos_sysconf, sunos_uname, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-/*200*/	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys
-/*250*/	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys
-/*260*/	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys
-/*270*/	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys
-/*280*/	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys
-/*290*/	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys
-/*300*/	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys
-/*310*/	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys, sunos_nosys, sunos_nosys
-	.long sunos_nosys
-
-#endif
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 463d1be32c98..b782d874759c 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -87,9 +87,6 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
 	bool
 	def_bool y
 
-config ARCH_SUPPORTS_AOUT
-	def_bool y
-
 choice
 	prompt "Kernel page size"
 	default SPARC64_PAGE_SIZE_8KB
@@ -147,11 +144,6 @@ config HOTPLUG_CPU
 
 source "init/Kconfig"
 
-config SYSVIPC_COMPAT
-	bool
-	depends on COMPAT && SYSVIPC
-	default y
-
 config GENERIC_HARDIRQS
 	bool
 	default y
@@ -379,6 +371,10 @@ config SUN_OPENPROMFS
 	  To compile the /proc/openprom support as a module, choose M here: the
 	  module will be called openpromfs.  If unsure, choose M.
 
+menu "Executable file formats"
+
+source "fs/Kconfig.binfmt"
+
 config SPARC32_COMPAT
 	bool "Kernel support for Linux/Sparc 32bit binary compatibility"
 	help
@@ -391,37 +387,10 @@ config COMPAT
 	default y
 	select COMPAT_BINFMT_ELF
 
-config BINFMT_AOUT32
-	bool "Kernel support for 32-bit (ie. SunOS) a.out binaries"
-	depends on SPARC32_COMPAT && ARCH_SUPPORTS_AOUT
-	help
-	  This allows you to run 32-bit a.out format binaries on your Ultra.
-	  If you want to run SunOS binaries (see SunOS binary emulation below)
-	  or other a.out binaries, say Y. If unsure, say N.
-
-menu "Executable file formats"
-
-source "fs/Kconfig.binfmt"
-
-config SUNOS_EMUL
-	bool "SunOS binary emulation"
-	depends on BINFMT_AOUT32
-	help
-	  This allows you to run most SunOS binaries.  If you want to do this,
-	  say Y here and place appropriate files in /usr/gnemul/sunos. See
-	  <http://www.ultralinux.org/faq.html> for more information.  If you
-	  want to run SunOS binaries on an Ultra you must also say Y to
-	  "Kernel support for 32-bit a.out binaries" above.
-
-config SOLARIS_EMUL
-	tristate "Solaris binary emulation (EXPERIMENTAL)"
-	depends on SPARC32_COMPAT && NET && EXPERIMENTAL
-	help
-	  This is experimental code which will enable you to run (many)
-	  Solaris binaries on your SPARC Linux machine.
-
-	  To compile this code as a module, choose M here: the
-	  module will be called solaris.
+config SYSVIPC_COMPAT
+	bool
+	depends on COMPAT && SYSVIPC
+	default y
 
 endmenu
 
diff --git a/arch/sparc64/Makefile b/arch/sparc64/Makefile
index f0c22f826982..9cb75c852b45 100644
--- a/arch/sparc64/Makefile
+++ b/arch/sparc64/Makefile
@@ -27,7 +27,6 @@ endif
 head-y := arch/sparc64/kernel/head.o arch/sparc64/kernel/init_task.o
 
 core-y				+= arch/sparc64/kernel/ arch/sparc64/mm/
-core-$(CONFIG_SOLARIS_EMUL)	+= arch/sparc64/solaris/
 core-y				+= arch/sparc64/math-emu/
 libs-y				+= arch/sparc64/prom/ arch/sparc64/lib/
 drivers-$(CONFIG_OPROFILE)	+= arch/sparc64/oprofile/
diff --git a/arch/sparc64/defconfig b/arch/sparc64/defconfig
index 9d4bd2229493..e1835868ad36 100644
--- a/arch/sparc64/defconfig
+++ b/arch/sparc64/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.25-rc3
-# Wed Mar 26 04:33:35 2008
+# Linux kernel version: 2.6.25
+# Sun Apr 20 01:33:21 2008
 #
 CONFIG_SPARC=y
 CONFIG_SPARC64=y
@@ -22,7 +22,6 @@ CONFIG_HAVE_SETUP_PER_CPU_AREA=y
 CONFIG_ARCH_NO_VIRT_TO_BUS=y
 CONFIG_OF=y
 CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
-CONFIG_ARCH_SUPPORTS_AOUT=y
 CONFIG_SPARC64_PAGE_SIZE_8KB=y
 # CONFIG_SPARC64_PAGE_SIZE_64KB is not set
 # CONFIG_SPARC64_PAGE_SIZE_512KB is not set
@@ -61,6 +60,7 @@ CONFIG_RT_GROUP_SCHED=y
 CONFIG_USER_SCHED=y
 # CONFIG_CGROUP_SCHED is not set
 CONFIG_SYSFS_DEPRECATED=y
+CONFIG_SYSFS_DEPRECATED_V2=y
 CONFIG_RELAY=y
 CONFIG_NAMESPACES=y
 # CONFIG_UTS_NS is not set
@@ -100,7 +100,9 @@ CONFIG_PROFILING=y
 CONFIG_OPROFILE=m
 CONFIG_HAVE_OPROFILE=y
 CONFIG_KPROBES=y
+CONFIG_KRETPROBES=y
 CONFIG_HAVE_KPROBES=y
+CONFIG_HAVE_KRETPROBES=y
 CONFIG_PROC_PAGE_MONITOR=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -131,8 +133,6 @@ CONFIG_DEFAULT_AS=y
 # CONFIG_DEFAULT_NOOP is not set
 CONFIG_DEFAULT_IOSCHED="anticipatory"
 CONFIG_CLASSIC_RCU=y
-# CONFIG_PREEMPT_RCU is not set
-CONFIG_SYSVIPC_COMPAT=y
 CONFIG_GENERIC_HARDIRQS=y
 
 #
@@ -182,9 +182,6 @@ CONFIG_PCI_MSI=y
 # CONFIG_PCI_LEGACY is not set
 # CONFIG_PCI_DEBUG is not set
 CONFIG_SUN_OPENPROMFS=m
-CONFIG_SPARC32_COMPAT=y
-CONFIG_COMPAT=y
-# CONFIG_BINFMT_AOUT32 is not set
 
 #
 # Executable file formats
@@ -192,13 +189,14 @@ CONFIG_COMPAT=y
 CONFIG_BINFMT_ELF=y
 CONFIG_COMPAT_BINFMT_ELF=y
 CONFIG_BINFMT_MISC=m
-CONFIG_SOLARIS_EMUL=y
+CONFIG_SPARC32_COMPAT=y
+CONFIG_COMPAT=y
+CONFIG_SYSVIPC_COMPAT=y
 CONFIG_SCHED_SMT=y
 CONFIG_SCHED_MC=y
 # CONFIG_PREEMPT_NONE is not set
 CONFIG_PREEMPT_VOLUNTARY=y
 # CONFIG_PREEMPT is not set
-# CONFIG_RCU_TRACE is not set
 # CONFIG_CMDLINE_BOOL is not set
 
 #
@@ -263,8 +261,10 @@ CONFIG_INET6_XFRM_MODE_TUNNEL=m
 CONFIG_INET6_XFRM_MODE_BEET=m
 # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
 CONFIG_IPV6_SIT=m
+CONFIG_IPV6_NDISC_NODETYPE=y
 CONFIG_IPV6_TUNNEL=m
 # CONFIG_IPV6_MULTIPLE_TABLES is not set
+# CONFIG_IPV6_MROUTE is not set
 # CONFIG_NETWORK_SECMARK is not set
 # CONFIG_NETFILTER is not set
 CONFIG_IP_DCCP=m
@@ -368,7 +368,7 @@ CONFIG_IDE=y
 CONFIG_BLK_DEV_IDE=y
 
 #
-# Please see Documentation/ide.txt for help/info on IDE drives
+# Please see Documentation/ide/ide.txt for help/info on IDE drives
 #
 # CONFIG_BLK_DEV_IDE_SATA is not set
 CONFIG_BLK_DEV_IDEDISK=y
@@ -384,7 +384,6 @@ CONFIG_IDE_PROC_FS=y
 #
 # IDE chipset support/bugfixes
 #
-CONFIG_IDE_GENERIC=y
 # CONFIG_BLK_DEV_PLATFORM is not set
 CONFIG_BLK_DEV_IDEDMA_SFF=y
 
@@ -422,7 +421,7 @@ CONFIG_BLK_DEV_ALI15X3=y
 # CONFIG_BLK_DEV_VIA82CXXX is not set
 # CONFIG_BLK_DEV_TC86C001 is not set
 CONFIG_BLK_DEV_IDEDMA=y
-CONFIG_IDE_ARCH_OBSOLETE_INIT=y
+# CONFIG_BLK_DEV_HD_ONLY is not set
 # CONFIG_BLK_DEV_HD is not set
 
 #
@@ -588,7 +587,6 @@ CONFIG_E1000_NAPI=y
 # CONFIG_SIS190 is not set
 # CONFIG_SKGE is not set
 # CONFIG_SKY2 is not set
-# CONFIG_SK98LIN is not set
 # CONFIG_VIA_VELOCITY is not set
 CONFIG_TIGON3=m
 CONFIG_BNX2=m
@@ -613,6 +611,7 @@ CONFIG_NIU=m
 #
 # CONFIG_WLAN_PRE80211 is not set
 # CONFIG_WLAN_80211 is not set
+# CONFIG_IWLWIFI_LEDS is not set
 
 #
 # USB Network Adapters
@@ -1472,7 +1471,7 @@ CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_CRC32C=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_TEST=m
-CONFIG_CRYPTO_AUTHENC=m
+CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_LZO is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile
index 459462e80a12..63c6ae0dd273 100644
--- a/arch/sparc64/kernel/Makefile
+++ b/arch/sparc64/kernel/Makefile
@@ -21,7 +21,6 @@ obj-$(CONFIG_PCI)	 += ebus.o isa.o pci_common.o \
 obj-$(CONFIG_PCI_MSI)	+= pci_msi.o
 obj-$(CONFIG_SMP)	 += smp.o trampoline.o hvtramp.o
 obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o
-obj-$(CONFIG_BINFMT_AOUT32) += binfmt_aout32.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_US3_FREQ) += us3_cpufreq.o
 obj-$(CONFIG_US2E_FREQ) += us2e_cpufreq.o
@@ -30,11 +29,3 @@ obj-$(CONFIG_SUN_LDOMS) += ldc.o vio.o viohs.o ds.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDIT)$(CONFIG_SPARC32_COMPAT) += compat_audit.o
 obj-y += $(obj-yy)
-
-ifdef CONFIG_SUNOS_EMUL
-  obj-y += sys_sunos32.o sunos_ioctl32.o
-else
-  ifdef CONFIG_SOLARIS_EMUL
-    obj-y += sys_sunos32.o sunos_ioctl32.o
-  endif
-endif
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c
deleted file mode 100644
index 9877f2d7672d..000000000000
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- *  linux/fs/binfmt_aout.c
- *
- *  Copyright (C) 1991, 1992, 1996  Linus Torvalds
- *
- *  Hacked a bit by DaveM to make it work with 32-bit SunOS
- *  binaries on the sparc64 port.
- */
-
-#include <linux/module.h>
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/slab.h>
-#include <linux/binfmts.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/pgalloc.h>
-#include <asm/mmu_context.h>
-#include <asm/a.out-core.h>
-
-static int load_aout32_binary(struct linux_binprm *, struct pt_regs * regs);
-static int load_aout32_library(struct file*);
-static int aout32_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
-
-static struct linux_binfmt aout32_format = {
-	.module		= THIS_MODULE,
-	.load_binary	= load_aout32_binary,
-	.load_shlib	= load_aout32_library,
-	.core_dump	= aout32_core_dump,
-	.min_coredump	= PAGE_SIZE,
-};
-
-static void set_brk(unsigned long start, unsigned long end)
-{
-	start = PAGE_ALIGN(start);
-	end = PAGE_ALIGN(end);
-	if (end <= start)
-		return;
-	down_write(&current->mm->mmap_sem);
-	do_brk(start, end - start);
-	up_write(&current->mm->mmap_sem);
-}
-
-/*
- * These are the only things you should do on a core-file: use only these
- * macros to write out all the necessary info.
- */
-
-static int dump_write(struct file *file, const void *addr, int nr)
-{
-	return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-#define DUMP_WRITE(addr, nr)	\
-	if (!dump_write(file, (void *)(addr), (nr))) \
-		goto end_coredump;
-
-#define DUMP_SEEK(offset) \
-if (file->f_op->llseek) { \
-	if (file->f_op->llseek(file,(offset),0) != (offset)) \
- 		goto end_coredump; \
-} else file->f_pos = (offset)
-
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- *
- * Note that setuid/setgid files won't make a core-dump if the uid/gid
- * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
- * field, which also makes sure the core-dumps won't be recursive if the
- * dumping of the process results in another error..
- */
-
-static int aout32_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
-{
-	mm_segment_t fs;
-	int has_dumped = 0;
-	unsigned long dump_start, dump_size;
-	struct user dump;
-#       define START_DATA(u)    (u.u_tsize)
-#       define START_STACK(u)   ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1))
-
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-	has_dumped = 1;
-	current->flags |= PF_DUMPCORE;
-       	strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
-	dump.signal = signr;
-	aout_dump_thread(regs, &dump);
-
-/* If the size of the dump file exceeds the rlimit, then see what would happen
-   if we wrote the stack, but not the data area.  */
-	if (dump.u_dsize + dump.u_ssize > limit)
-		dump.u_dsize = 0;
-
-/* Make sure we have enough room to write the stack and data areas. */
-	if (dump.u_ssize > limit)
-		dump.u_ssize = 0;
-
-/* make sure we actually have a data and stack area to dump */
-	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, (void __user *) START_DATA(dump), dump.u_dsize))
-		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void __user *) START_STACK(dump), dump.u_ssize))
-		dump.u_ssize = 0;
-
-	set_fs(KERNEL_DS);
-/* struct user */
-	DUMP_WRITE(&dump,sizeof(dump));
-/* now we start writing out the user space info */
-	set_fs(USER_DS);
-/* Dump the data area */
-	if (dump.u_dsize != 0) {
-		dump_start = START_DATA(dump);
-		dump_size = dump.u_dsize;
-		DUMP_WRITE(dump_start,dump_size);
-	}
-/* Now prepare to dump the stack area */
-	if (dump.u_ssize != 0) {
-		dump_start = START_STACK(dump);
-		dump_size = dump.u_ssize;
-		DUMP_WRITE(dump_start,dump_size);
-	}
-/* Finally dump the task struct.  Not be used by gdb, but could be useful */
-	set_fs(KERNEL_DS);
-	DUMP_WRITE(current,sizeof(*current));
-end_coredump:
-	set_fs(fs);
-	return has_dumped;
-}
-
-/*
- * create_aout32_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-
-static u32 __user *create_aout32_tables(char __user *p, struct linux_binprm *bprm)
-{
-	u32 __user *argv;
-	u32 __user *envp;
-	u32 __user *sp;
-	int argc = bprm->argc;
-	int envc = bprm->envc;
-
-	sp = (u32 __user *)((-(unsigned long)sizeof(char *))&(unsigned long)p);
-
-	/* This imposes the proper stack alignment for a new process. */
-	sp = (u32 __user *) (((unsigned long) sp) & ~7);
-	if ((envc+argc+3)&1)
-		--sp;
-
-	sp -= envc+1;
-	envp = sp;
-	sp -= argc+1;
-	argv = sp;
-	put_user(argc,--sp);
-	current->mm->arg_start = (unsigned long) p;
-	while (argc-->0) {
-		char c;
-		put_user(((u32)(unsigned long)(p)),argv++);
-		do {
-			get_user(c,p++);
-		} while (c);
-	}
-	put_user(0,argv);
-	current->mm->arg_end = current->mm->env_start = (unsigned long) p;
-	while (envc-->0) {
-		char c;
-		put_user(((u32)(unsigned long)(p)),envp++);
-		do {
-			get_user(c,p++);
-		} while (c);
-	}
-	put_user(0,envp);
-	current->mm->env_end = (unsigned long) p;
-	return sp;
-}
-
-/*
- * These are the functions used to load a.out style executables and shared
- * libraries.  There is no binary dependent code anywhere else.
- */
-
-static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
-{
-	struct exec ex;
-	unsigned long error;
-	unsigned long fd_offset;
-	unsigned long rlim;
-	unsigned long orig_thr_flags;
-	int retval;
-
-	ex = *((struct exec *) bprm->buf);		/* exec-header */
-	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
-	     N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
-	    N_TRSIZE(ex) || N_DRSIZE(ex) ||
-	    bprm->file->f_path.dentry->d_inode->i_size < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
-		return -ENOEXEC;
-	}
-
-	fd_offset = N_TXTOFF(ex);
-
-	/* Check initial limits. This avoids letting people circumvent
-	 * size limits imposed on them by creating programs with large
-	 * arrays in the data or bss.
-	 */
-	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
-	if (rlim >= RLIM_INFINITY)
-		rlim = ~0;
-	if (ex.a_data + ex.a_bss > rlim)
-		return -ENOMEM;
-
-	/* Flush all traces of the currently running executable */
-	retval = flush_old_exec(bprm);
-	if (retval)
-		return retval;
-
-	/* OK, This is the point of no return */
-	set_personality(PER_SUNOS);
-
-	current->mm->end_code = ex.a_text +
-		(current->mm->start_code = N_TXTADDR(ex));
-	current->mm->end_data = ex.a_data +
-		(current->mm->start_data = N_DATADDR(ex));
-	current->mm->brk = ex.a_bss +
-		(current->mm->start_brk = N_BSSADDR(ex));
-	current->mm->free_area_cache = current->mm->mmap_base;
-	current->mm->cached_hole_size = 0;
-
-	current->mm->mmap = NULL;
-	compute_creds(bprm);
- 	current->flags &= ~PF_FORKNOEXEC;
-	if (N_MAGIC(ex) == NMAGIC) {
-		loff_t pos = fd_offset;
-		/* Fuck me plenty... */
-		down_write(&current->mm->mmap_sem);	
-		error = do_brk(N_TXTADDR(ex), ex.a_text);
-		up_write(&current->mm->mmap_sem);
-		bprm->file->f_op->read(bprm->file, (char __user *)N_TXTADDR(ex),
-			  ex.a_text, &pos);
-		down_write(&current->mm->mmap_sem);
-		error = do_brk(N_DATADDR(ex), ex.a_data);
-		up_write(&current->mm->mmap_sem);
-		bprm->file->f_op->read(bprm->file, (char __user *)N_DATADDR(ex),
-			  ex.a_data, &pos);
-		goto beyond_if;
-	}
-
-	if (N_MAGIC(ex) == OMAGIC) {
-		loff_t pos = fd_offset;
-		down_write(&current->mm->mmap_sem);
-		do_brk(N_TXTADDR(ex) & PAGE_MASK,
-			ex.a_text+ex.a_data + PAGE_SIZE - 1);
-		up_write(&current->mm->mmap_sem);
-		bprm->file->f_op->read(bprm->file, (char __user *)N_TXTADDR(ex),
-			  ex.a_text+ex.a_data, &pos);
-	} else {
-		static unsigned long error_time;
-		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
-		    (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time) > 5*HZ)
-		{
-			printk(KERN_NOTICE "executable not page aligned\n");
-			error_time = jiffies;
-		}
-
-		if (!bprm->file->f_op->mmap) {
-			loff_t pos = fd_offset;
-			down_write(&current->mm->mmap_sem);
-			do_brk(0, ex.a_text+ex.a_data);
-			up_write(&current->mm->mmap_sem);
-			bprm->file->f_op->read(bprm->file,
-				  (char __user *)N_TXTADDR(ex),
-				  ex.a_text+ex.a_data, &pos);
-			goto beyond_if;
-		}
-
-	        down_write(&current->mm->mmap_sem);
-		error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
-			PROT_READ | PROT_EXEC,
-			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
-			fd_offset);
-	        up_write(&current->mm->mmap_sem);
-
-		if (error != N_TXTADDR(ex)) {
-			send_sig(SIGKILL, current, 0);
-			return error;
-		}
-
-	        down_write(&current->mm->mmap_sem);
- 		error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
-				PROT_READ | PROT_WRITE | PROT_EXEC,
-				MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
-				fd_offset + ex.a_text);
-	        up_write(&current->mm->mmap_sem);
-		if (error != N_DATADDR(ex)) {
-			send_sig(SIGKILL, current, 0);
-			return error;
-		}
-	}
-beyond_if:
-	set_binfmt(&aout32_format);
-
-	set_brk(current->mm->start_brk, current->mm->brk);
-
-	/* Make sure STACK_TOP returns the right thing.  */
-	orig_thr_flags = current_thread_info()->flags;
-	current_thread_info()->flags |= _TIF_32BIT;
-
-	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
-	if (retval < 0) { 
-		current_thread_info()->flags = orig_thr_flags;
-
-		/* Someone check-me: is this error path enough? */ 
-		send_sig(SIGKILL, current, 0); 
-		return retval;
-	}
-
-	current->mm->start_stack =
-		(unsigned long) create_aout32_tables((char __user *)bprm->p, bprm);
-	tsb_context_switch(current->mm);
-
-	start_thread32(regs, ex.a_entry, current->mm->start_stack);
-	if (current->ptrace & PT_PTRACED)
-		send_sig(SIGTRAP, current, 0);
-	return 0;
-}
-
-/* N.B. Move to .h file and use code in fs/binfmt_aout.c? */
-static int load_aout32_library(struct file *file)
-{
-	struct inode * inode;
-	unsigned long bss, start_addr, len;
-	unsigned long error;
-	int retval;
-	struct exec ex;
-
-	inode = file->f_path.dentry->d_inode;
-
-	retval = -ENOEXEC;
-	error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
-	if (error != sizeof(ex))
-		goto out;
-
-	/* We come in here for the regular a.out style of shared libraries */
-	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
-	    N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
-	    inode->i_size < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
-		goto out;
-	}
-
-	if (N_MAGIC(ex) == ZMAGIC && N_TXTOFF(ex) &&
-	    (N_TXTOFF(ex) < inode->i_sb->s_blocksize)) {
-		printk("N_TXTOFF < BLOCK_SIZE. Please convert library\n");
-		goto out;
-	}
-
-	if (N_FLAGS(ex))
-		goto out;
-
-	/* For  QMAGIC, the starting address is 0x20 into the page.  We mask
-	   this off to get the starting address for the page */
-
-	start_addr =  ex.a_entry & 0xfffff000;
-
-	/* Now use mmap to map the library into memory. */
-	down_write(&current->mm->mmap_sem);
-	error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
-			PROT_READ | PROT_WRITE | PROT_EXEC,
-			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
-			N_TXTOFF(ex));
-	up_write(&current->mm->mmap_sem);
-	retval = error;
-	if (error != start_addr)
-		goto out;
-
-	len = PAGE_ALIGN(ex.a_text + ex.a_data);
-	bss = ex.a_text + ex.a_data + ex.a_bss;
-	if (bss > len) {
-		down_write(&current->mm->mmap_sem);
-		error = do_brk(start_addr + len, bss - len);
-		up_write(&current->mm->mmap_sem);
-		retval = error;
-		if (error != start_addr + len)
-			goto out;
-	}
-	retval = 0;
-out:
-	return retval;
-}
-
-static int __init init_aout32_binfmt(void)
-{
-	return register_binfmt(&aout32_format);
-}
-
-static void __exit exit_aout32_binfmt(void)
-{
-	unregister_binfmt(&aout32_format);
-}
-
-module_init(init_aout32_binfmt);
-module_exit(exit_aout32_binfmt);
diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
index 49eca4b1cf25..fb43c76bdc26 100644
--- a/arch/sparc64/kernel/entry.S
+++ b/arch/sparc64/kernel/entry.S
@@ -1353,63 +1353,6 @@ breakpoint_trap:
 	ba,pt		%xcc, rtrap
 	 nop
 
-#if defined(CONFIG_SUNOS_EMUL) || defined(CONFIG_SOLARIS_EMUL) || \
-    defined(CONFIG_SOLARIS_EMUL_MODULE)
-	/* SunOS uses syscall zero as the 'indirect syscall' it looks
-	 * like indir_syscall(scall_num, arg0, arg1, arg2...);  etc.
-	 * This is complete brain damage.
-	 */
-	.globl	sunos_indir
-sunos_indir:
-	srl		%o0, 0, %o0
-	mov		%o7, %l4
-	cmp		%o0, NR_SYSCALLS
-	blu,a,pt	%icc, 1f
-	 sll		%o0, 0x2, %o0
-	sethi		%hi(sunos_nosys), %l6
-	b,pt		%xcc, 2f
-	 or		%l6, %lo(sunos_nosys), %l6
-1:	sethi		%hi(sunos_sys_table), %l7
-	or		%l7, %lo(sunos_sys_table), %l7
-	lduw		[%l7 + %o0], %l6
-2:	mov		%o1, %o0
-	mov		%o2, %o1
-	mov		%o3, %o2
-	mov		%o4, %o3
-	mov		%o5, %o4
-	call		%l6
-	 mov		%l4, %o7
-
-	.globl	sunos_getpid
-sunos_getpid:
-	call	sys_getppid
-	 nop
-	call	sys_getpid
-	 stx	%o0, [%sp + PTREGS_OFF + PT_V9_I1]
-	b,pt	%xcc, ret_sys_call
-	 stx	%o0, [%sp + PTREGS_OFF + PT_V9_I0]
-
-	/* SunOS getuid() returns uid in %o0 and euid in %o1 */
-	.globl	sunos_getuid
-sunos_getuid:
-	call	sys32_geteuid16
-	 nop
-	call	sys32_getuid16
-	 stx	%o0, [%sp + PTREGS_OFF + PT_V9_I1]
-	b,pt	%xcc, ret_sys_call
-	 stx	%o0, [%sp + PTREGS_OFF + PT_V9_I0]
-
-	/* SunOS getgid() returns gid in %o0 and egid in %o1 */
-	.globl	sunos_getgid
-sunos_getgid:
-	call	sys32_getegid16
-	 nop
-	call	sys32_getgid16
-	 stx	%o0, [%sp + PTREGS_OFF + PT_V9_I1]
-	b,pt	%xcc, ret_sys_call
-	 stx	%o0, [%sp + PTREGS_OFF + PT_V9_I0]
-#endif
-
 	/* SunOS's execv() call only specifies the argv argument, the
 	 * environment settings are the same as the calling processes.
 	 */
@@ -1591,7 +1534,7 @@ linux_syscall_trace:
 	 mov		%i4, %o4
 
 
-	/* Linux 32-bit and SunOS system calls enter here... */
+	/* Linux 32-bit system calls enter here... */
 	.align	32
 	.globl	linux_sparc_syscall32
 linux_sparc_syscall32:
@@ -1614,7 +1557,7 @@ linux_sparc_syscall32:
 	 srl		%i3, 0, %o3				! IEU0
 	ba,a,pt		%xcc, 3f
 
-	/* Linux native and SunOS system calls enter here... */
+	/* Linux native system calls enter here... */
 	.align	32
 	.globl	linux_sparc_syscall, ret_sys_call
 linux_sparc_syscall:
diff --git a/arch/sparc64/kernel/signal.c b/arch/sparc64/kernel/signal.c
index 9d51956e8e2f..1c47009eb5ec 100644
--- a/arch/sparc64/kernel/signal.c
+++ b/arch/sparc64/kernel/signal.c
@@ -25,7 +25,6 @@
 
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
-#include <asm/svr4.h>
 #include <asm/pgtable.h>
 #include <asm/fpumacro.h>
 #include <asm/uctx.h>
diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c
index 8c1c121330fb..74e0512f135c 100644
--- a/arch/sparc64/kernel/signal32.c
+++ b/arch/sparc64/kernel/signal32.c
@@ -23,7 +23,6 @@
 
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
-#include <asm/svr4.h>
 #include <asm/pgtable.h>
 #include <asm/psrcompat.h>
 #include <asm/fpumacro.h>
@@ -798,281 +797,6 @@ sigsegv:
 	force_sigsegv(signo, current);
 }
 
-/* Setup a Solaris stack frame */
-static void
-setup_svr4_frame32(struct sigaction *sa, unsigned long pc, unsigned long npc,
-		   struct pt_regs *regs, int signr, sigset_t *oldset)
-{
-	svr4_signal_frame_t __user *sfp;
-	svr4_gregset_t  __user *gr;
-	svr4_siginfo_t  __user *si;
-	svr4_mcontext_t __user *mc;
-	svr4_gwindows_t __user *gw;
-	svr4_ucontext_t __user *uc;
-	svr4_sigset_t setv;
-	unsigned int psr;
-	int i, err;
-
-	synchronize_user_stack();
-	save_and_clear_fpu();
-	
-	regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;
-	sfp = (svr4_signal_frame_t __user *)
-		get_sigframe(sa, regs,
-			     sizeof(struct reg_window32) + SVR4_SF_ALIGNED);
-
-	if (invalid_frame_pointer(sfp, sizeof(*sfp)))
-		do_exit(SIGILL);
-
-	/* Start with a clean frame pointer and fill it */
-	err = clear_user(sfp, sizeof(*sfp));
-
-	/* Setup convenience variables */
-	si = &sfp->si;
-	uc = &sfp->uc;
-	gw = &sfp->gw;
-	mc = &uc->mcontext;
-	gr = &mc->greg;
-	
-	/* FIXME: where am I supposed to put this?
-	 * sc->sigc_onstack = old_status;
-	 * anyways, it does not look like it is used for anything at all.
-	 */
-	setv.sigbits[0] = oldset->sig[0];
-	setv.sigbits[1] = (oldset->sig[0] >> 32);
-	if (_NSIG_WORDS >= 2) {
-		setv.sigbits[2] = oldset->sig[1];
-		setv.sigbits[3] = (oldset->sig[1] >> 32);
-		err |= __copy_to_user(&uc->sigmask, &setv, sizeof(svr4_sigset_t));
-	} else
-		err |= __copy_to_user(&uc->sigmask, &setv,
-				      2 * sizeof(unsigned int));
-	
-	/* Store registers */
-	if (test_thread_flag(TIF_32BIT)) {
-		regs->tpc &= 0xffffffff;
-		regs->tnpc &= 0xffffffff;
-	}
-	err |= __put_user(regs->tpc, &((*gr)[SVR4_PC]));
-	err |= __put_user(regs->tnpc, &((*gr)[SVR4_NPC]));
-	psr = tstate_to_psr(regs->tstate);
-	if (current_thread_info()->fpsaved[0] & FPRS_FEF)
-		psr |= PSR_EF;
-	err |= __put_user(psr, &((*gr)[SVR4_PSR]));
-	err |= __put_user(regs->y, &((*gr)[SVR4_Y]));
-	
-	/* Copy g[1..7] and o[0..7] registers */
-	for (i = 0; i < 7; i++)
-		err |= __put_user(regs->u_regs[UREG_G1+i], (&(*gr)[SVR4_G1])+i);
-	for (i = 0; i < 8; i++)
-		err |= __put_user(regs->u_regs[UREG_I0+i], (&(*gr)[SVR4_O0])+i);
-
-	/* Setup sigaltstack */
-	err |= __put_user(current->sas_ss_sp, &uc->stack.sp);
-	err |= __put_user(sas_ss_flags(regs->u_regs[UREG_FP]), &uc->stack.flags);
-	err |= __put_user(current->sas_ss_size, &uc->stack.size);
-
-	/* Save the currently window file: */
-
-	/* 1. Link sfp->uc->gwins to our windows */
-	err |= __put_user(ptr_to_compat(gw), &mc->gwin);
-	    
-	/* 2. Number of windows to restore at setcontext (): */
-	err |= __put_user(get_thread_wsaved(), &gw->count);
-
-	/* 3. We just pay attention to the gw->count field on setcontext */
-	set_thread_wsaved(0); /* So process is allowed to execute. */
-
-	/* Setup the signal information.  Solaris expects a bunch of
-	 * information to be passed to the signal handler, we don't provide
-	 * that much currently, should use siginfo.
-	 */
-	err |= __put_user(signr, &si->siginfo.signo);
-	err |= __put_user(SVR4_SINOINFO, &si->siginfo.code);
-	if (err)
-		goto sigsegv;
-
-	regs->u_regs[UREG_FP] = (unsigned long) sfp;
-	regs->tpc = (unsigned long) sa->sa_handler;
-	regs->tnpc = (regs->tpc + 4);
-	if (test_thread_flag(TIF_32BIT)) {
-		regs->tpc &= 0xffffffff;
-		regs->tnpc &= 0xffffffff;
-	}
-
-	/* Arguments passed to signal handler */
-	if (regs->u_regs[14]){
-		struct reg_window32 __user *rw = (struct reg_window32 __user *)
-			(regs->u_regs[14] & 0x00000000ffffffffUL);
-
-		err |= __put_user(signr, &rw->ins[0]);
-		err |= __put_user((u64)si, &rw->ins[1]);
-		err |= __put_user((u64)uc, &rw->ins[2]);
-		err |= __put_user((u64)sfp, &rw->ins[6]);	/* frame pointer */
-		if (err)
-			goto sigsegv;
-
-		regs->u_regs[UREG_I0] = signr;
-		regs->u_regs[UREG_I1] = (u32)(u64) si;
-		regs->u_regs[UREG_I2] = (u32)(u64) uc;
-	}
-	return;
-
-sigsegv:
-	force_sigsegv(signr, current);
-}
-
-asmlinkage int
-svr4_getcontext(svr4_ucontext_t __user *uc, struct pt_regs *regs)
-{
-	svr4_gregset_t  __user *gr;
-	svr4_mcontext_t __user *mc;
-	svr4_sigset_t setv;
-	int i, err;
-	u32 psr;
-
-	synchronize_user_stack();
-	save_and_clear_fpu();
-	
-	if (get_thread_wsaved())
-		do_exit(SIGSEGV);
-
-	err = clear_user(uc, sizeof(*uc));
-
-	/* Setup convenience variables */
-	mc = &uc->mcontext;
-	gr = &mc->greg;
-
-	setv.sigbits[0] = current->blocked.sig[0];
-	setv.sigbits[1] = (current->blocked.sig[0] >> 32);
-	if (_NSIG_WORDS >= 2) {
-		setv.sigbits[2] = current->blocked.sig[1];
-		setv.sigbits[3] = (current->blocked.sig[1] >> 32);
-		err |= __copy_to_user(&uc->sigmask, &setv, sizeof(svr4_sigset_t));
-	} else
-		err |= __copy_to_user(&uc->sigmask, &setv, 2 * sizeof(unsigned));
-
-	/* Store registers */
-	if (test_thread_flag(TIF_32BIT)) {
-		regs->tpc &= 0xffffffff;
-		regs->tnpc &= 0xffffffff;
-	}
-	err |= __put_user(regs->tpc, &uc->mcontext.greg[SVR4_PC]);
-	err |= __put_user(regs->tnpc, &uc->mcontext.greg[SVR4_NPC]);
-
-	psr = tstate_to_psr(regs->tstate) & ~PSR_EF;		   
-	if (current_thread_info()->fpsaved[0] & FPRS_FEF)
-		psr |= PSR_EF;
-	err |= __put_user(psr, &uc->mcontext.greg[SVR4_PSR]);
-
-	err |= __put_user(regs->y, &uc->mcontext.greg[SVR4_Y]);
-	
-	/* Copy g[1..7] and o[0..7] registers */
-	for (i = 0; i < 7; i++)
-		err |= __put_user(regs->u_regs[UREG_G1+i], (&(*gr)[SVR4_G1])+i);
-	for (i = 0; i < 8; i++)
-		err |= __put_user(regs->u_regs[UREG_I0+i], (&(*gr)[SVR4_O0])+i);
-
-	/* Setup sigaltstack */
-	err |= __put_user(current->sas_ss_sp, &uc->stack.sp);
-	err |= __put_user(sas_ss_flags(regs->u_regs[UREG_FP]), &uc->stack.flags);
-	err |= __put_user(current->sas_ss_size, &uc->stack.size);
-
-	/* The register file is not saved
-	 * we have already stuffed all of it with sync_user_stack
-	 */
-	return (err ? -EFAULT : 0);
-}
-
-
-/* Set the context for a svr4 application, this is Solaris way to sigreturn */
-asmlinkage int svr4_setcontext(svr4_ucontext_t __user *c, struct pt_regs *regs)
-{
-	svr4_gregset_t  __user *gr;
-	mm_segment_t old_fs;
-	u32 pc, npc, psr, u_ss_sp;
-	sigset_t set;
-	svr4_sigset_t setv;
-	int i, err;
-	stack_t st;
-	
-	/* Fixme: restore windows, or is this already taken care of in
-	 * svr4_setup_frame when sync_user_windows is done?
-	 */
-	flush_user_windows();
-	
-	if (get_thread_wsaved())
-		goto sigsegv;
-
-	if (((unsigned long) c) & 3){
-		printk("Unaligned structure passed\n");
-		goto sigsegv;
-	}
-
-	if (!__access_ok(c, sizeof(*c))) {
-		/* Miguel, add nice debugging msg _here_. ;-) */
-		goto sigsegv;
-	}
-
-	/* Check for valid PC and nPC */
-	gr = &c->mcontext.greg;
-	err = __get_user(pc, &((*gr)[SVR4_PC]));
-	err |= __get_user(npc, &((*gr)[SVR4_NPC]));
-	if ((pc | npc) & 3)
-		goto sigsegv;
-	
-	/* Retrieve information from passed ucontext */
-	/* note that nPC is ored a 1, this is used to inform entry.S */
-	/* that we don't want it to mess with our PC and nPC */
-	
-	err |= copy_from_user(&setv, &c->sigmask, sizeof(svr4_sigset_t));
-	set.sig[0] = setv.sigbits[0] | (((long)setv.sigbits[1]) << 32);
-	if (_NSIG_WORDS >= 2)
-		set.sig[1] = setv.sigbits[2] | (((long)setv.sigbits[3]) << 32);
-	
-	err |= __get_user(u_ss_sp, &c->stack.sp);
-	st.ss_sp = compat_ptr(u_ss_sp);
-	err |= __get_user(st.ss_flags, &c->stack.flags);
-	err |= __get_user(st.ss_size, &c->stack.size);
-	if (err)
-		goto sigsegv;
-		
-	/* It is more difficult to avoid calling this function than to
-	   call it and ignore errors.  */
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	do_sigaltstack((stack_t __user *) &st, NULL, regs->u_regs[UREG_I6]);
-	set_fs(old_fs);
-	
-	sigdelsetmask(&set, ~_BLOCKABLE);
-	spin_lock_irq(&current->sighand->siglock);
-	current->blocked = set;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	regs->tpc = pc;
-	regs->tnpc = npc | 1;
-	if (test_thread_flag(TIF_32BIT)) {
-		regs->tpc &= 0xffffffff;
-		regs->tnpc &= 0xffffffff;
-	}
-	err |= __get_user(regs->y, &((*gr)[SVR4_Y]));
-	err |= __get_user(psr, &((*gr)[SVR4_PSR]));
-	regs->tstate &= ~(TSTATE_ICC|TSTATE_XCC);
-	regs->tstate |= psr_to_tstate_icc(psr);
-
-	/* Restore g[1..7] and o[0..7] registers */
-	for (i = 0; i < 7; i++)
-		err |= __get_user(regs->u_regs[UREG_G1+i], (&(*gr)[SVR4_G1])+i);
-	for (i = 0; i < 8; i++)
-		err |= __get_user(regs->u_regs[UREG_I0+i], (&(*gr)[SVR4_O0])+i);
-	if (err)
-		goto sigsegv;
-
-	return -EINTR;
-sigsegv:
-	return -EFAULT;
-}
-
 static void setup_rt_frame32(struct k_sigaction *ka, struct pt_regs *regs,
 			     unsigned long signr, sigset_t *oldset,
 			     siginfo_t *info)
@@ -1216,20 +940,14 @@ sigsegv:
 
 static inline void handle_signal32(unsigned long signr, struct k_sigaction *ka,
 				   siginfo_t *info,
-				   sigset_t *oldset, struct pt_regs *regs,
-				   int svr4_signal)
+				   sigset_t *oldset, struct pt_regs *regs)
 {
-	if (svr4_signal)
-		setup_svr4_frame32(&ka->sa, regs->tpc, regs->tnpc,
-				   regs, signr, oldset);
-	else {
-		if (ka->sa.sa_flags & SA_SIGINFO)
-			setup_rt_frame32(ka, regs, signr, oldset, info);
-		else if (test_thread_flag(TIF_NEWSIGNALS))
-			new_setup_frame32(ka, regs, signr, oldset);
-		else
-			setup_frame32(&ka->sa, regs, signr, oldset, info);
-	}
+	if (ka->sa.sa_flags & SA_SIGINFO)
+		setup_rt_frame32(ka, regs, signr, oldset, info);
+	else if (test_thread_flag(TIF_NEWSIGNALS))
+		new_setup_frame32(ka, regs, signr, oldset);
+	else
+		setup_frame32(&ka->sa, regs, signr, oldset, info);
 	spin_lock_irq(&current->sighand->siglock);
 	sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
 	if (!(ka->sa.sa_flags & SA_NOMASK))
@@ -1270,7 +988,6 @@ void do_signal32(sigset_t *oldset, struct pt_regs * regs,
 	struct signal_deliver_cookie cookie;
 	struct k_sigaction ka;
 	int signr;
-	int svr4_signal = current->personality == PER_SVR4;
 	
 	cookie.restart_syscall = restart_syscall;
 	cookie.orig_i0 = orig_i0;
@@ -1279,8 +996,7 @@ void do_signal32(sigset_t *oldset, struct pt_regs * regs,
 	if (signr > 0) {
 		if (cookie.restart_syscall)
 			syscall_restart32(orig_i0, regs, &ka.sa);
-		handle_signal32(signr, &ka, &info, oldset,
-				regs, svr4_signal);
+		handle_signal32(signr, &ka, &info, oldset, regs);
 
 		/* a signal was successfully delivered; the saved
 		 * sigmask will have been stored in the signal frame,
diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c
index 051b8d9cb989..38736460b8db 100644
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -33,13 +33,11 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/idprom.h>
-#include <asm/svr4.h>
 #include <asm/elf.h>
 #include <asm/head.h>
 #include <asm/smp.h>
 #include <asm/mostek.h>
 #include <asm/ptrace.h>
-#include <asm/user.h>
 #include <asm/uaccess.h>
 #include <asm/checksum.h>
 #include <asm/fpumacro.h>
@@ -73,13 +71,8 @@ extern __kernel_size_t strlen(const char *);
 extern void linux_sparc_syscall(void);
 extern void rtrap(void);
 extern void show_regs(struct pt_regs *);
-extern void solaris_syscall(void);
 extern void syscall_trace(struct pt_regs *, int);
-extern u32 sunos_sys_table[], sys_call_table32[];
-extern void tl0_solaris(void);
 extern void sys_sigsuspend(void);
-extern int svr4_getcontext(svr4_ucontext_t *uc, struct pt_regs *regs);
-extern int svr4_setcontext(svr4_ucontext_t *uc, struct pt_regs *regs);
 extern int compat_sys_ioctl(unsigned int fd, unsigned int cmd, u32 arg);
 extern int (*handle_mathemu)(struct pt_regs *, struct fpustate *);
 extern long sparc32_open(const char __user * filename, int flags, int mode);
@@ -90,8 +83,6 @@ extern int __ashrdi3(int, int);
 
 extern int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs);
 
-extern unsigned int sys_call_table[];
-
 extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
 extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
 		      unsigned long *);
@@ -213,11 +204,6 @@ EXPORT_SYMBOL(pci_dma_supported);
 /* I/O device mmaping on Sparc64. */
 EXPORT_SYMBOL(io_remap_pfn_range);
 
-#if defined(CONFIG_COMPAT) && defined(CONFIG_NET)
-/* Solaris/SunOS binary compatibility */
-EXPORT_SYMBOL(verify_compat_iovec);
-#endif
-
 EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(put_fs_struct);
 
@@ -254,30 +240,6 @@ EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(__strlen_user);
 EXPORT_SYMBOL(__strnlen_user);
 
-#ifdef CONFIG_SOLARIS_EMUL_MODULE
-EXPORT_SYMBOL(linux_sparc_syscall);
-EXPORT_SYMBOL(rtrap);
-EXPORT_SYMBOL(show_regs);
-EXPORT_SYMBOL(solaris_syscall);
-EXPORT_SYMBOL(syscall_trace);
-EXPORT_SYMBOL(sunos_sys_table);
-EXPORT_SYMBOL(sys_call_table32);
-EXPORT_SYMBOL(tl0_solaris);
-EXPORT_SYMBOL(sys_sigsuspend);
-EXPORT_SYMBOL(sys_getppid);
-EXPORT_SYMBOL(sys_getpid);
-EXPORT_SYMBOL(sys_geteuid);
-EXPORT_SYMBOL(sys_getuid);
-EXPORT_SYMBOL(sys_getegid);
-EXPORT_SYMBOL(sysctl_nr_open);
-EXPORT_SYMBOL(sys_getgid);
-EXPORT_SYMBOL(svr4_getcontext);
-EXPORT_SYMBOL(svr4_setcontext);
-EXPORT_SYMBOL(compat_sys_ioctl);
-EXPORT_SYMBOL(sys_ioctl);
-EXPORT_SYMBOL(sparc32_open);
-#endif
-
 /* Special internal versions of library functions. */
 EXPORT_SYMBOL(_clear_page);
 EXPORT_SYMBOL(clear_user_page);
@@ -334,9 +296,6 @@ EXPORT_SYMBOL(do_BUG);
 /* for ns8703 */
 EXPORT_SYMBOL(ns87303_lock);
 
-/* for solaris compat module */
-EXPORT_SYMBOL_GPL(sys_call_table);
-
 EXPORT_SYMBOL(tick_ops);
 
 EXPORT_SYMBOL(xor_vis_2);
diff --git a/arch/sparc64/kernel/sunos_ioctl32.c b/arch/sparc64/kernel/sunos_ioctl32.c
deleted file mode 100644
index 75d2bad49839..000000000000
--- a/arch/sparc64/kernel/sunos_ioctl32.c
+++ /dev/null
@@ -1,275 +0,0 @@
-/* $Id: sunos_ioctl32.c,v 1.11 2000/07/30 23:12:24 davem Exp $
- * sunos_ioctl32.c: SunOS ioctl compatibility on sparc64.
- *
- * Copyright (C) 1995 Miguel de Icaza (miguel@nuclecu.unam.mx)
- * Copyright (C) 1995, 1996, 1997 David S. Miller (davem@caip.rutgers.edu)
- */
-
-#include <asm/uaccess.h>
-
-#include <linux/sched.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/termios.h>
-#include <linux/tty.h>
-#include <linux/ioctl.h>
-#include <linux/route.h>
-#include <linux/sockios.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/syscalls.h>
-#include <linux/compat.h>
-
-#define SUNOS_NR_OPEN	256
-
-struct rtentry32 {
-        u32   		rt_pad1;
-        struct sockaddr rt_dst;         /* target address               */
-        struct sockaddr rt_gateway;     /* gateway addr (RTF_GATEWAY)   */
-        struct sockaddr rt_genmask;     /* target network mask (IP)     */
-        unsigned short  rt_flags;
-        short           rt_pad2;
-        u32   		rt_pad3;
-        unsigned char   rt_tos;
-        unsigned char   rt_class;
-        short           rt_pad4;
-        short           rt_metric;      /* +1 for binary compatibility! */
-        /* char * */ u32 rt_dev;        /* forcing the device at add    */
-        u32   		rt_mtu;         /* per route MTU/Window         */
-        u32   		rt_window;      /* Window clamping              */
-        unsigned short  rt_irtt;        /* Initial RTT                  */
-
-};
-
-struct ifmap32 {
-	u32 mem_start;
-	u32 mem_end;
-	unsigned short base_addr;
-	unsigned char irq;
-	unsigned char dma;
-	unsigned char port;
-};
-
-struct ifreq32 {
-#define IFHWADDRLEN     6
-#define IFNAMSIZ        16
-        union {
-                char    ifrn_name[IFNAMSIZ];            /* if name, e.g. "en0" */
-        } ifr_ifrn;
-        union {
-                struct  sockaddr ifru_addr;
-                struct  sockaddr ifru_dstaddr;
-                struct  sockaddr ifru_broadaddr;
-                struct  sockaddr ifru_netmask;
-                struct  sockaddr ifru_hwaddr;
-                short   ifru_flags;
-                int     ifru_ivalue;
-                int     ifru_mtu;
-                struct  ifmap32 ifru_map;
-                char    ifru_slave[IFNAMSIZ];   /* Just fits the size */
-                compat_caddr_t ifru_data;
-        } ifr_ifru;
-};
-
-struct ifconf32 {
-        int     ifc_len;                        /* size of buffer       */
-        compat_caddr_t  ifcbuf;
-};
-
-extern asmlinkage int compat_sys_ioctl(unsigned int, unsigned int, u32);
-
-asmlinkage int sunos_ioctl (int fd, u32 cmd, u32 arg)
-{
-	int ret = -EBADF;
-
-	if(fd >= SUNOS_NR_OPEN)
-		goto out;
-	if(!fcheck(fd))
-		goto out;
-
-	if(cmd == TIOCSETD) {
-		mm_segment_t old_fs = get_fs();
-		int __user *p;
-		int ntty = N_TTY;
-		int tmp;
-
-		p = (int __user *) (unsigned long) arg;
-		ret = -EFAULT;
-		if(get_user(tmp, p))
-			goto out;
-		if(tmp == 2) {
-			set_fs(KERNEL_DS);
-			ret = sys_ioctl(fd, cmd, (unsigned long) &ntty);
-			set_fs(old_fs);
-			ret = (ret == -EINVAL ? -EOPNOTSUPP : ret);
-			goto out;
-		}
-	}
-	if(cmd == TIOCNOTTY) {
-		ret = sys_setsid();
-		goto out;
-	}
-	switch(cmd) {
-	case _IOW('r', 10, struct rtentry32):
-		ret = compat_sys_ioctl(fd, SIOCADDRT, arg);
-		goto out;
-	case _IOW('r', 11, struct rtentry32):
-		ret = compat_sys_ioctl(fd, SIOCDELRT, arg);
-		goto out;
-
-	case _IOW('i', 12, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCSIFADDR, arg);
-		goto out;
-	case _IOWR('i', 13, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCGIFADDR, arg);
-		goto out;
-	case _IOW('i', 14, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCSIFDSTADDR, arg);
-		goto out;
-	case _IOWR('i', 15, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCGIFDSTADDR, arg);
-		goto out;
-	case _IOW('i', 16, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCSIFFLAGS, arg);
-		goto out;
-	case _IOWR('i', 17, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCGIFFLAGS, arg);
-		goto out;
-	case _IOW('i', 18, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCSIFMEM, arg);
-		goto out;
-	case _IOWR('i', 19, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCGIFMEM, arg);
-		goto out;
-
-	case _IOWR('i', 20, struct ifconf32):
-		ret = compat_sys_ioctl(fd, SIOCGIFCONF, arg);
-		goto out;
-
-	case _IOW('i', 21, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCSIFMTU, arg);
-		goto out;
-
-	case _IOWR('i', 22, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCGIFMTU, arg);
-		goto out;
-
-	case _IOWR('i', 23, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCGIFBRDADDR, arg);
-		goto out;
-	case _IOW('i', 24, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCSIFBRDADDR, arg);
-		goto out;
-	case _IOWR('i', 25, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCGIFNETMASK, arg);
-		goto out;
-	case _IOW('i', 26, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCSIFNETMASK, arg);
-		goto out;
-	case _IOWR('i', 27, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCGIFMETRIC, arg);
-		goto out;
-	case _IOW('i', 28, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCSIFMETRIC, arg);
-		goto out;
-
-	case _IOW('i', 30, struct arpreq):
-		ret = compat_sys_ioctl(fd, SIOCSARP, arg);
-		goto out;
-	case _IOWR('i', 31, struct arpreq):
-		ret = compat_sys_ioctl(fd, SIOCGARP, arg);
-		goto out;
-	case _IOW('i', 32, struct arpreq):
-		ret = compat_sys_ioctl(fd, SIOCDARP, arg);
-		goto out;
-
-	case _IOW('i', 40, struct ifreq32): /* SIOCUPPER */
-	case _IOW('i', 41, struct ifreq32): /* SIOCLOWER */
-	case _IOW('i', 44, struct ifreq32): /* SIOCSETSYNC */
-	case _IOW('i', 45, struct ifreq32): /* SIOCGETSYNC */
-	case _IOW('i', 46, struct ifreq32): /* SIOCSSDSTATS */
-	case _IOW('i', 47, struct ifreq32): /* SIOCSSESTATS */
-	case _IOW('i', 48, struct ifreq32): /* SIOCSPROMISC */
-		ret = -EOPNOTSUPP;
-		goto out;
-
-	case _IOW('i', 49, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCADDMULTI, arg);
-		goto out;
-	case _IOW('i', 50, struct ifreq32):
-		ret = compat_sys_ioctl(fd, SIOCDELMULTI, arg);
-		goto out;
-
-	/* FDDI interface ioctls, unsupported. */
-		
-	case _IOW('i', 51, struct ifreq32): /* SIOCFDRESET */
-	case _IOW('i', 52, struct ifreq32): /* SIOCFDSLEEP */
-	case _IOW('i', 53, struct ifreq32): /* SIOCSTRTFMWAR */
-	case _IOW('i', 54, struct ifreq32): /* SIOCLDNSTRTFW */
-	case _IOW('i', 55, struct ifreq32): /* SIOCGETFDSTAT */
-	case _IOW('i', 56, struct ifreq32): /* SIOCFDNMIINT */
-	case _IOW('i', 57, struct ifreq32): /* SIOCFDEXUSER */
-	case _IOW('i', 58, struct ifreq32): /* SIOCFDGNETMAP */
-	case _IOW('i', 59, struct ifreq32): /* SIOCFDGIOCTL */
-		printk("FDDI ioctl, returning EOPNOTSUPP\n");
-		ret = -EOPNOTSUPP;
-		goto out;
-
-	case _IOW('t', 125, int):
-		/* More stupid tty sunos ioctls, just
-		 * say it worked.
-		 */
-		ret = 0;
-		goto out;
-
-	/* Non posix grp */
-	case _IOW('t', 118, int): {
-		int oldval, newval, __user *ptr;
-
-		cmd = TIOCSPGRP;
-		ptr = (int __user *) (unsigned long) arg;
-		ret = -EFAULT;
-		if(get_user(oldval, ptr))
-			goto out;
-		ret = compat_sys_ioctl(fd, cmd, arg);
-		__get_user(newval, ptr);
-		if(newval == -1) {
-			__put_user(oldval, ptr);
-			ret = -EIO;
-		}
-		if(ret == -ENOTTY)
-			ret = -EIO;
-		goto out;
-	}
-
-	case _IOR('t', 119, int): {
-		int oldval, newval, __user *ptr;
-
-		cmd = TIOCGPGRP;
-		ptr = (int __user *) (unsigned long) arg;
-		ret = -EFAULT;
-		if(get_user(oldval, ptr))
-			goto out;
-		ret = compat_sys_ioctl(fd, cmd, arg);
-		__get_user(newval, ptr);
-		if(newval == -1) {
-			__put_user(oldval, ptr);
-			ret = -EIO;
-		}
-		if(ret == -ENOTTY)
-			ret = -EIO;
-		goto out;
-	}
-	};
-
-	ret = compat_sys_ioctl(fd, cmd, arg);
-	/* so stupid... */
-	ret = (ret == -EINVAL ? -EOPNOTSUPP : ret);
-out:
-	return ret;
-}
diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c
index f952745d0f3d..73ed01ba40dc 100644
--- a/arch/sparc64/kernel/sys_sparc.c
+++ b/arch/sparc64/kernel/sys_sparc.c
@@ -720,44 +720,6 @@ out:
 	return err;
 }
 
-asmlinkage long solaris_syscall(struct pt_regs *regs)
-{
-	static int count;
-
-	regs->tpc = regs->tnpc;
-	regs->tnpc += 4;
-	if (test_thread_flag(TIF_32BIT)) {
-		regs->tpc &= 0xffffffff;
-		regs->tnpc &= 0xffffffff;
-	}
-	if (++count <= 5) {
-		printk ("For Solaris binary emulation you need solaris module loaded\n");
-		show_regs (regs);
-	}
-	send_sig(SIGSEGV, current, 1);
-
-	return -ENOSYS;
-}
-
-#ifndef CONFIG_SUNOS_EMUL
-asmlinkage long sunos_syscall(struct pt_regs *regs)
-{
-	static int count;
-
-	regs->tpc = regs->tnpc;
-	regs->tnpc += 4;
-	if (test_thread_flag(TIF_32BIT)) {
-		regs->tpc &= 0xffffffff;
-		regs->tnpc &= 0xffffffff;
-	}
-	if (++count <= 20)
-		printk ("SunOS binary emulation not compiled in\n");
-	force_sig(SIGSEGV, current);
-
-	return -ENOSYS;
-}
-#endif
-
 asmlinkage long sys_utrap_install(utrap_entry_t type,
 				  utrap_handler_t new_p,
 				  utrap_handler_t new_d,
diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c
deleted file mode 100644
index e91194fe39d7..000000000000
--- a/arch/sparc64/kernel/sys_sunos32.c
+++ /dev/null
@@ -1,1359 +0,0 @@
-/* $Id: sys_sunos32.c,v 1.64 2002/02/09 19:49:31 davem Exp $
- * sys_sunos32.c: SunOS binary compatibility layer on sparc64.
- *
- * Copyright (C) 1995, 1996, 1997 David S. Miller (davem@caip.rutgers.edu)
- * Copyright (C) 1995 Miguel de Icaza (miguel@nuclecu.unam.mx)
- *
- * Based upon preliminary work which is:
- *
- * Copyright (C) 1995 Adrian M. Rodriguez (adrian@remus.rutgers.edu)
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/compat.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/resource.h>
-#include <linux/ipc.h>
-#include <linux/shm.h>
-#include <linux/msg.h>
-#include <linux/sem.h>
-#include <linux/signal.h>
-#include <linux/uio.h>
-#include <linux/utsname.h>
-#include <linux/major.h>
-#include <linux/stat.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/errno.h>
-#include <linux/smp.h>
-#include <linux/smp_lock.h>
-#include <linux/syscalls.h>
-
-#include <asm/uaccess.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/pconf.h>
-#include <asm/idprom.h> /* for gethostid() */
-#include <asm/unistd.h>
-#include <asm/system.h>
-#include <asm/compat_signal.h>
-
-/* For the nfs mount emulation */
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/nfs.h>
-#include <linux/nfs2.h>
-#include <linux/nfs_mount.h>
-
-/* for sunos_select */
-#include <linux/time.h>
-#include <linux/personality.h>
-
-/* For SOCKET_I */
-#include <net/sock.h>
-#include <net/compat.h>
-
-#define SUNOS_NR_OPEN	256
-
-asmlinkage u32 sunos_mmap(u32 addr, u32 len, u32 prot, u32 flags, u32 fd, u32 off)
-{
-	struct file *file = NULL;
-	unsigned long retval, ret_type;
-
-	if (flags & MAP_NORESERVE) {
-		static int cnt;
-		if (cnt++ < 10)
-			printk("%s:  unimplemented SunOS MAP_NORESERVE mmap() flag\n",
-			       current->comm);
-		flags &= ~MAP_NORESERVE;
-	}
-	retval = -EBADF;
-	if (!(flags & MAP_ANONYMOUS)) {
-		struct inode * inode;
-		if (fd >= SUNOS_NR_OPEN)
-			goto out;
- 		file = fget(fd);
-		if (!file)
-			goto out;
-		inode = file->f_path.dentry->d_inode;
-		if (imajor(inode) == MEM_MAJOR && iminor(inode) == 5) {
-			flags |= MAP_ANONYMOUS;
-			fput(file);
-			file = NULL;
-		}
-	}
-
-	retval = -EINVAL;
-	if (!(flags & MAP_FIXED))
-		addr = 0;
-	else if (len > 0xf0000000 || addr > 0xf0000000 - len)
-		goto out_putf;
-	ret_type = flags & _MAP_NEW;
-	flags &= ~_MAP_NEW;
-
-	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-	down_write(&current->mm->mmap_sem);
-	retval = do_mmap(file,
-			 (unsigned long) addr, (unsigned long) len,
-			 (unsigned long) prot, (unsigned long) flags,
-			 (unsigned long) off);
-	up_write(&current->mm->mmap_sem);
-	if (!ret_type)
-		retval = ((retval < 0xf0000000) ? 0 : retval);
-out_putf:
-	if (file)
-		fput(file);
-out:
-	return (u32) retval;
-}
-
-asmlinkage int sunos_mctl(u32 addr, u32 len, int function, u32 arg)
-{
-	return 0;
-}
-
-asmlinkage int sunos_brk(u32 baddr)
-{
-	int freepages, retval = -ENOMEM;
-	unsigned long rlim;
-	unsigned long newbrk, oldbrk, brk = (unsigned long) baddr;
-
-	down_write(&current->mm->mmap_sem);
-	if (brk < current->mm->end_code)
-		goto out;
-	newbrk = PAGE_ALIGN(brk);
-	oldbrk = PAGE_ALIGN(current->mm->brk);
-	retval = 0;
-	if (oldbrk == newbrk) {
-		current->mm->brk = brk;
-		goto out;
-	}
-	/* Always allow shrinking brk. */
-	if (brk <= current->mm->brk) {
-		current->mm->brk = brk;
-		do_munmap(current->mm, newbrk, oldbrk-newbrk);
-		goto out;
-	}
-	/* Check against rlimit and stack.. */
-	retval = -ENOMEM;
-	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
-	if (rlim >= RLIM_INFINITY)
-		rlim = ~0;
-	if (brk - current->mm->end_code > rlim)
-		goto out;
-	/* Check against existing mmap mappings. */
-	if (find_vma_intersection(current->mm, oldbrk, newbrk+PAGE_SIZE))
-		goto out;
-	/* stupid algorithm to decide if we have enough memory: while
-	 * simple, it hopefully works in most obvious cases.. Easy to
-	 * fool it, but this should catch most mistakes.
-	 */
-	freepages = global_page_state(NR_FILE_PAGES);
-	freepages >>= 1;
-	freepages += nr_free_pages();
-	freepages += nr_swap_pages;
-	freepages -= num_physpages >> 4;
-	freepages -= (newbrk-oldbrk) >> PAGE_SHIFT;
-	if (freepages < 0)
-		goto out;
-	/* Ok, we have probably got enough memory - let it rip. */
-	current->mm->brk = brk;
-	do_brk(oldbrk, newbrk-oldbrk);
-	retval = 0;
-out:
-	up_write(&current->mm->mmap_sem);
-	return retval;
-}
-
-asmlinkage u32 sunos_sbrk(int increment)
-{
-	int error, oldbrk;
-
-	/* This should do it hopefully... */
-	oldbrk = (int)current->mm->brk;
-	error = sunos_brk(((int) current->mm->brk) + increment);
-	if (!error)
-		error = oldbrk;
-	return error;
-}
-
-asmlinkage u32 sunos_sstk(int increment)
-{
-	printk("%s: Call to sunos_sstk(increment<%d>) is unsupported\n",
-	       current->comm, increment);
-
-	return (u32)-1;
-}
-
-/* Give hints to the kernel as to what paging strategy to use...
- * Completely bogus, don't remind me.
- */
-#define VA_NORMAL     0 /* Normal vm usage expected */
-#define VA_ABNORMAL   1 /* Abnormal/random vm usage probable */
-#define VA_SEQUENTIAL 2 /* Accesses will be of a sequential nature */
-#define VA_INVALIDATE 3 /* Page table entries should be flushed ??? */
-static char *vstrings[] = {
-	"VA_NORMAL",
-	"VA_ABNORMAL",
-	"VA_SEQUENTIAL",
-	"VA_INVALIDATE",
-};
-
-asmlinkage void sunos_vadvise(u32 strategy)
-{
-	static int count;
-
-	/* I wanna see who uses this... */
-	if (count++ < 5)
-		printk("%s: Advises us to use %s paging strategy\n",
-		       current->comm,
-		       strategy <= 3 ? vstrings[strategy] : "BOGUS");
-}
-
-/* This just wants the soft limit (ie. rlim_cur element) of the RLIMIT_NOFILE
- * resource limit and is for backwards compatibility with older sunos
- * revs.
- */
-asmlinkage int sunos_getdtablesize(void)
-{
-	return SUNOS_NR_OPEN;
-}
-
-
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-asmlinkage u32 sunos_sigblock(u32 blk_mask)
-{
-	u32 old;
-
-	spin_lock_irq(&current->sighand->siglock);
-	old = (u32) current->blocked.sig[0];
-	current->blocked.sig[0] |= (blk_mask & _BLOCKABLE);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	return old;
-}
-
-asmlinkage u32 sunos_sigsetmask(u32 newmask)
-{
-	u32 retval;
-
-	spin_lock_irq(&current->sighand->siglock);
-	retval = (u32) current->blocked.sig[0];
-	current->blocked.sig[0] = (newmask & _BLOCKABLE);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	return retval;
-}
-
-/* SunOS getdents is very similar to the newer Linux (iBCS2 compliant)    */
-/* getdents system call, the format of the structure just has a different */
-/* layout (d_off+d_ino instead of d_ino+d_off) */
-struct sunos_dirent {
-    s32		d_off;
-    u32		d_ino;
-    u16		d_reclen;
-    u16		d_namlen;
-    char	d_name[1];
-};
-
-struct sunos_dirent_callback {
-    struct sunos_dirent __user *curr;
-    struct sunos_dirent __user *previous;
-    int count;
-    int error;
-};
-
-#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
-#define ROUND_UP(x) (((x)+sizeof(s32)-1) & ~(sizeof(s32)-1))
-
-static int sunos_filldir(void * __buf, const char * name, int namlen,
-			 loff_t offset, ino_t ino, unsigned int d_type)
-{
-	struct sunos_dirent __user *dirent;
-	struct sunos_dirent_callback * buf = (struct sunos_dirent_callback *) __buf;
-	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1);
-	u32 d_ino;
-
-	buf->error = -EINVAL;	/* only used if we fail.. */
-	if (reclen > buf->count)
-		return -EINVAL;
-	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
-		return -EOVERFLOW;
-	dirent = buf->previous;
-	if (dirent)
-		put_user(offset, &dirent->d_off);
-	dirent = buf->curr;
-	buf->previous = dirent;
-	put_user(d_ino, &dirent->d_ino);
-	put_user(namlen, &dirent->d_namlen);
-	put_user(reclen, &dirent->d_reclen);
-	if (copy_to_user(dirent->d_name, name, namlen))
-		return -EFAULT;
-	put_user(0, dirent->d_name + namlen);
-	dirent = (void __user *) dirent + reclen;
-	buf->curr = dirent;
-	buf->count -= reclen;
-	return 0;
-}
-
-asmlinkage int sunos_getdents(unsigned int fd, void __user *dirent, int cnt)
-{
-	struct file * file;
-	struct sunos_dirent __user *lastdirent;
-	struct sunos_dirent_callback buf;
-	int error = -EBADF;
-
-	if (fd >= SUNOS_NR_OPEN)
-		goto out;
-
-	file = fget(fd);
-	if (!file)
-		goto out;
-
-	error = -EINVAL;
-	if (cnt < (sizeof(struct sunos_dirent) + 255))
-		goto out_putf;
-
-	buf.curr = (struct sunos_dirent __user *) dirent;
-	buf.previous = NULL;
-	buf.count = cnt;
-	buf.error = 0;
-
-	error = vfs_readdir(file, sunos_filldir, &buf);
-	if (error < 0)
-		goto out_putf;
-
-	lastdirent = buf.previous;
-	error = buf.error;
-	if (lastdirent) {
-		put_user(file->f_pos, &lastdirent->d_off);
-		error = cnt - buf.count;
-	}
-
-out_putf:
-	fput(file);
-out:
-	return error;
-}
-
-/* Old sunos getdirentries, severely broken compatibility stuff here. */
-struct sunos_direntry {
-    u32		d_ino;
-    u16		d_reclen;
-    u16		d_namlen;
-    char	d_name[1];
-};
-
-struct sunos_direntry_callback {
-    struct sunos_direntry __user *curr;
-    struct sunos_direntry __user *previous;
-    int count;
-    int error;
-};
-
-static int sunos_filldirentry(void * __buf, const char * name, int namlen,
-			      loff_t offset, ino_t ino, unsigned int d_type)
-{
-	struct sunos_direntry __user *dirent;
-	struct sunos_direntry_callback * buf =
-		(struct sunos_direntry_callback *) __buf;
-	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 1);
-	u32 d_ino;
-
-	buf->error = -EINVAL;	/* only used if we fail.. */
-	if (reclen > buf->count)
-		return -EINVAL;
-	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
-		return -EOVERFLOW;
-	dirent = buf->previous;
-	dirent = buf->curr;
-	buf->previous = dirent;
-	put_user(d_ino, &dirent->d_ino);
-	put_user(namlen, &dirent->d_namlen);
-	put_user(reclen, &dirent->d_reclen);
-	if (copy_to_user(dirent->d_name, name, namlen))
-		return -EFAULT;
-	put_user(0, dirent->d_name + namlen);
-	dirent = (void __user *) dirent + reclen;
-	buf->curr = dirent;
-	buf->count -= reclen;
-	return 0;
-}
-
-asmlinkage int sunos_getdirentries(unsigned int fd,
-				   void __user *dirent,
-				   int cnt,
-				   unsigned int __user *basep)
-{
-	struct file * file;
-	struct sunos_direntry __user *lastdirent;
-	int error = -EBADF;
-	struct sunos_direntry_callback buf;
-
-	if (fd >= SUNOS_NR_OPEN)
-		goto out;
-
-	file = fget(fd);
-	if (!file)
-		goto out;
-
-	error = -EINVAL;
-	if (cnt < (sizeof(struct sunos_direntry) + 255))
-		goto out_putf;
-
-	buf.curr = (struct sunos_direntry __user *) dirent;
-	buf.previous = NULL;
-	buf.count = cnt;
-	buf.error = 0;
-
-	error = vfs_readdir(file, sunos_filldirentry, &buf);
-	if (error < 0)
-		goto out_putf;
-
-	lastdirent = buf.previous;
-	error = buf.error;
-	if (lastdirent) {
-		put_user(file->f_pos, basep);
-		error = cnt - buf.count;
-	}
-
-out_putf:
-	fput(file);
-out:
-	return error;
-}
-
-struct sunos_utsname {
-	char sname[9];
-	char nname[9];
-	char nnext[56];
-	char rel[9];
-	char ver[9];
-	char mach[9];
-};
-
-asmlinkage int sunos_uname(struct sunos_utsname __user *name)
-{
-	int ret;
-
-	down_read(&uts_sem);
-	ret = copy_to_user(&name->sname[0], &utsname()->sysname[0],
-			   sizeof(name->sname) - 1);
-	ret |= copy_to_user(&name->nname[0], &utsname()->nodename[0],
-			    sizeof(name->nname) - 1);
-	ret |= put_user('\0', &name->nname[8]);
-	ret |= copy_to_user(&name->rel[0], &utsname()->release[0],
-			    sizeof(name->rel) - 1);
-	ret |= copy_to_user(&name->ver[0], &utsname()->version[0],
-			    sizeof(name->ver) - 1);
-	ret |= copy_to_user(&name->mach[0], &utsname()->machine[0],
-			    sizeof(name->mach) - 1);
-	up_read(&uts_sem);
-	return (ret ? -EFAULT : 0);
-}
-
-asmlinkage int sunos_nosys(void)
-{
-	struct pt_regs *regs;
-	siginfo_t info;
-	static int cnt;
-
-	regs = current_thread_info()->kregs;
-	if (test_thread_flag(TIF_32BIT)) {
-		regs->tpc &= 0xffffffff;
-		regs->tnpc &= 0xffffffff;
-	}
-	info.si_signo = SIGSYS;
-	info.si_errno = 0;
-	info.si_code = __SI_FAULT|0x100;
-	info.si_addr = (void __user *)regs->tpc;
-	info.si_trapno = regs->u_regs[UREG_G1];
-	send_sig_info(SIGSYS, &info, current);
-	if (cnt++ < 4) {
-		printk("Process makes ni_syscall number %d, register dump:\n",
-		       (int) regs->u_regs[UREG_G1]);
-		show_regs(regs);
-	}
-	return -ENOSYS;
-}
-
-/* This is not a real and complete implementation yet, just to keep
- * the easy SunOS binaries happy.
- */
-asmlinkage int sunos_fpathconf(int fd, int name)
-{
-	int ret;
-
-	switch(name) {
-	case _PCONF_LINK:
-		ret = LINK_MAX;
-		break;
-	case _PCONF_CANON:
-		ret = MAX_CANON;
-		break;
-	case _PCONF_INPUT:
-		ret = MAX_INPUT;
-		break;
-	case _PCONF_NAME:
-		ret = NAME_MAX;
-		break;
-	case _PCONF_PATH:
-		ret = PATH_MAX;
-		break;
-	case _PCONF_PIPE:
-		ret = PIPE_BUF;
-		break;
-	case _PCONF_CHRESTRICT:		/* XXX Investigate XXX */
-		ret = 1;
-		break;
-	case _PCONF_NOTRUNC:		/* XXX Investigate XXX */
-	case _PCONF_VDISABLE:
-		ret = 0;
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-	return ret;
-}
-
-asmlinkage int sunos_pathconf(u32 u_path, int name)
-{
-	int ret;
-
-	ret = sunos_fpathconf(0, name); /* XXX cheese XXX */
-	return ret;
-}
-
-asmlinkage int sunos_select(int width, u32 inp, u32 outp, u32 exp, u32 tvp_x)
-{
-	int ret;
-
-	/* SunOS binaries expect that select won't change the tvp contents */
-	ret = compat_sys_select(width, compat_ptr(inp), compat_ptr(outp),
-				compat_ptr(exp), compat_ptr(tvp_x));
-	if (ret == -EINTR && tvp_x) {
-		struct compat_timeval __user *tvp = compat_ptr(tvp_x);
-		time_t sec, usec;
-
-		__get_user(sec, &tvp->tv_sec);
-		__get_user(usec, &tvp->tv_usec);
-		if (sec == 0 && usec == 0)
-			ret = 0;
-	}
-	return ret;
-}
-
-asmlinkage void sunos_nop(void)
-{
-	return;
-}
-
-#if 0 /* This code doesn't translate user pointers correctly,
-       * disable for now. -DaveM
-       */
-
-/* XXXXXXXXXX SunOS mount/umount. XXXXXXXXXXX */
-#define SMNT_RDONLY       1
-#define SMNT_NOSUID       2
-#define SMNT_NEWTYPE      4
-#define SMNT_GRPID        8
-#define SMNT_REMOUNT      16
-#define SMNT_NOSUB        32
-#define SMNT_MULTI        64
-#define SMNT_SYS5         128
-
-struct sunos_fh_t {
-	char fh_data [NFS_FHSIZE];
-};
-
-struct sunos_nfs_mount_args {
-	struct sockaddr_in  *addr; /* file server address */
-	struct nfs_fh *fh;     /* File handle to be mounted */
-	int        flags;      /* flags */
-	int        wsize;      /* write size in bytes */
-	int        rsize;      /* read size in bytes */
-	int        timeo;      /* initial timeout in .1 secs */
-	int        retrans;    /* times to retry send */
-	char       *hostname;  /* server's hostname */
-	int        acregmin;   /* attr cache file min secs */
-	int        acregmax;   /* attr cache file max secs */
-	int        acdirmin;   /* attr cache dir min secs */
-	int        acdirmax;   /* attr cache dir max secs */
-	char       *netname;   /* server's netname */
-};
-
-
-/* Bind the socket on a local reserved port and connect it to the
- * remote server.  This on Linux/i386 is done by the mount program,
- * not by the kernel. 
- */
-/* XXXXXXXXXXXXXXXXXXXX */
-static int
-sunos_nfs_get_server_fd (int fd, struct sockaddr_in *addr)
-{
-	struct sockaddr_in local;
-	struct sockaddr_in server;
-	int    try_port;
-	int    ret;
-	struct socket *socket;
-	struct inode  *inode;
-	struct file   *file;
-
-	file = fget(fd);
-	if (!file)
-		return 0;
-
-	inode = file->f_path.dentry->d_inode;
-
-	socket = SOCKET_I(inode);
-	local.sin_family = AF_INET;
-	local.sin_addr.s_addr = htonl(INADDR_ANY);
-
-	/* IPPORT_RESERVED = 1024, can't find the definition in the kernel */
-	try_port = 1024;
-	do {
-		local.sin_port = htons (--try_port);
-		ret = socket->ops->bind(socket, (struct sockaddr*)&local,
-					sizeof(local));
-	} while (ret && try_port > (1024 / 2));
-
-	if (ret) {
-		fput(file);
-		return 0;
-	}
-
-	server.sin_family = AF_INET;
-	server.sin_addr = addr->sin_addr;
-	server.sin_port = NFS_PORT;
-
-	/* Call sys_connect */
-	ret = socket->ops->connect (socket, (struct sockaddr *) &server,
-				    sizeof (server), file->f_flags);
-	fput(file);
-	if (ret < 0)
-		return 0;
-	return 1;
-}
-
-/* XXXXXXXXXXXXXXXXXXXX */
-static int get_default (int value, int def_value)
-{
-    if (value)
-	return value;
-    else
-	return def_value;
-}
-
-/* XXXXXXXXXXXXXXXXXXXX */
-static int sunos_nfs_mount(char *dir_name, int linux_flags, void __user *data)
-{
-	int  server_fd, err;
-	char *the_name, *mount_page;
-	struct nfs_mount_data linux_nfs_mount;
-	struct sunos_nfs_mount_args sunos_mount;
-
-	/* Ok, here comes the fun part: Linux's nfs mount needs a
-	 * socket connection to the server, but SunOS mount does not
-	 * require this, so we use the information on the destination
-	 * address to create a socket and bind it to a reserved
-	 * port on this system
-	 */
-	if (copy_from_user(&sunos_mount, data, sizeof(sunos_mount)))
-		return -EFAULT;
-
-	server_fd = sys_socket (AF_INET, SOCK_DGRAM, IPPROTO_UDP);
-	if (server_fd < 0)
-		return -ENXIO;
-
-	if (copy_from_user(&linux_nfs_mount.addr, sunos_mount.addr,
-			   sizeof(*sunos_mount.addr)) ||
-	    copy_from_user(&linux_nfs_mount.root, sunos_mount.fh,
-			   sizeof(*sunos_mount.fh))) {
-		sys_close (server_fd);
-		return -EFAULT;
-	}
-
-	if (!sunos_nfs_get_server_fd (server_fd, &linux_nfs_mount.addr)){
-		sys_close (server_fd);
-		return -ENXIO;
-	}
-
-	/* Now, bind it to a locally reserved port */
-	linux_nfs_mount.version  = NFS_MOUNT_VERSION;
-	linux_nfs_mount.flags    = sunos_mount.flags;
-	linux_nfs_mount.fd       = server_fd;
-	
-	linux_nfs_mount.rsize    = get_default (sunos_mount.rsize, 8192);
-	linux_nfs_mount.wsize    = get_default (sunos_mount.wsize, 8192);
-	linux_nfs_mount.timeo    = get_default (sunos_mount.timeo, 10);
-	linux_nfs_mount.retrans  = sunos_mount.retrans;
-	
-	linux_nfs_mount.acregmin = sunos_mount.acregmin;
-	linux_nfs_mount.acregmax = sunos_mount.acregmax;
-	linux_nfs_mount.acdirmin = sunos_mount.acdirmin;
-	linux_nfs_mount.acdirmax = sunos_mount.acdirmax;
-
-	the_name = getname(sunos_mount.hostname);
-	if (IS_ERR(the_name))
-		return PTR_ERR(the_name);
-
-	strlcpy(linux_nfs_mount.hostname, the_name,
-		sizeof(linux_nfs_mount.hostname));
-	putname (the_name);
-	
-	mount_page = (char *) get_zeroed_page(GFP_KERNEL);
-	if (!mount_page)
-		return -ENOMEM;
-
-	memcpy(mount_page, &linux_nfs_mount, sizeof(linux_nfs_mount));
-
-	err = do_mount("", dir_name, "nfs", linux_flags, mount_page);
-
-	free_page((unsigned long) mount_page);
-	return err;
-}
-
-/* XXXXXXXXXXXXXXXXXXXX */
-asmlinkage int
-sunos_mount(char *type, char *dir, int flags, void *data)
-{
-	int linux_flags = 0;
-	int ret = -EINVAL;
-	char *dev_fname = 0;
-	char *dir_page, *type_page;
-
-	if (!capable (CAP_SYS_ADMIN))
-		return -EPERM;
-
-	/* We don't handle the integer fs type */
-	if ((flags & SMNT_NEWTYPE) == 0)
-		goto out;
-
-	/* Do not allow for those flags we don't support */
-	if (flags & (SMNT_GRPID|SMNT_NOSUB|SMNT_MULTI|SMNT_SYS5))
-		goto out;
-
-	if (flags & SMNT_REMOUNT)
-		linux_flags |= MS_REMOUNT;
-	if (flags & SMNT_RDONLY)
-		linux_flags |= MS_RDONLY;
-	if (flags & SMNT_NOSUID)
-		linux_flags |= MS_NOSUID;
-
-	dir_page = getname(dir);
-	ret = PTR_ERR(dir_page);
-	if (IS_ERR(dir_page))
-		goto out;
-
-	type_page = getname(type);
-	ret = PTR_ERR(type_page);
-	if (IS_ERR(type_page))
-		goto out1;
-
-	if (strcmp(type_page, "ext2") == 0) {
-		dev_fname = getname(data);
-	} else if (strcmp(type_page, "iso9660") == 0) {
-		dev_fname = getname(data);
-	} else if (strcmp(type_page, "minix") == 0) {
-		dev_fname = getname(data);
-	} else if (strcmp(type_page, "nfs") == 0) {
-		ret = sunos_nfs_mount (dir_page, flags, data);
-		goto out2;
-        } else if (strcmp(type_page, "ufs") == 0) {
-		printk("Warning: UFS filesystem mounts unsupported.\n");
-		ret = -ENODEV;
-		goto out2;
-	} else if (strcmp(type_page, "proc")) {
-		ret = -ENODEV;
-		goto out2;
-	}
-	ret = PTR_ERR(dev_fname);
-	if (IS_ERR(dev_fname))
-		goto out2;
-	lock_kernel();
-	ret = do_mount(dev_fname, dir_page, type_page, linux_flags, NULL);
-	unlock_kernel();
-	if (dev_fname)
-		putname(dev_fname);
-out2:
-	putname(type_page);
-out1:
-	putname(dir_page);
-out:
-	return ret;
-}
-#endif
-
-asmlinkage int sunos_setpgrp(pid_t pid, pid_t pgid)
-{
-	int ret;
-
-	/* So stupid... */
-	if ((!pid || pid == current->pid) &&
-	    !pgid) {
-		sys_setsid();
-		ret = 0;
-	} else {
-		ret = sys_setpgid(pid, pgid);
-	}
-	return ret;
-}
-
-/* So stupid... */
-extern long compat_sys_wait4(compat_pid_t, compat_uint_t __user *, int,
-			     struct compat_rusage __user *);
-
-asmlinkage int sunos_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, struct compat_rusage __user *ru)
-{
-	int ret;
-
-	ret = compat_sys_wait4((pid ? pid : ((compat_pid_t)-1)),
-			       stat_addr, options, ru);
-	return ret;
-}
-
-asmlinkage int sunos_killpg(int pgrp, int sig)
-{
-	int ret;
-
-	rcu_read_lock();
-	ret = -EINVAL;
-	if (pgrp > 0)
-		ret = kill_pgrp(find_vpid(pgrp), sig, 0);
-	rcu_read_unlock();
-
-	return ret;
-}
-
-asmlinkage int sunos_audit(void)
-{
-	printk ("sys_audit\n");
-	return -1;
-}
-
-asmlinkage u32 sunos_gethostid(void)
-{
-	u32 ret;
-
-	ret = (((u32)idprom->id_machtype << 24) | ((u32)idprom->id_sernum));
-
-	return ret;
-}
-
-/* sysconf options, for SunOS compatibility */
-#define   _SC_ARG_MAX             1
-#define   _SC_CHILD_MAX           2
-#define   _SC_CLK_TCK             3
-#define   _SC_NGROUPS_MAX         4
-#define   _SC_OPEN_MAX            5
-#define   _SC_JOB_CONTROL         6
-#define   _SC_SAVED_IDS           7
-#define   _SC_VERSION             8
-
-asmlinkage s32 sunos_sysconf (int name)
-{
-	s32 ret;
-
-	switch (name){
-	case _SC_ARG_MAX:
-		ret = ARG_MAX;
-		break;
-	case _SC_CHILD_MAX:
-		ret = current->signal->rlim[RLIMIT_NPROC].rlim_cur;
-		break;
-	case _SC_CLK_TCK:
-		ret = HZ;
-		break;
-	case _SC_NGROUPS_MAX:
-		ret = NGROUPS_MAX;
-		break;
-	case _SC_OPEN_MAX:
-		ret = current->signal->rlim[RLIMIT_NOFILE].rlim_cur;
-		break;
-	case _SC_JOB_CONTROL:
-		ret = 1;	/* yes, we do support job control */
-		break;
-	case _SC_SAVED_IDS:
-		ret = 1;	/* yes, we do support saved uids  */
-		break;
-	case _SC_VERSION:
-		/* mhm, POSIX_VERSION is in /usr/include/unistd.h
-		 * should it go on /usr/include/linux?
-		 */
-		ret = 199009;
-		break;
-	default:
-		ret = -1;
-		break;
-	};
-	return ret;
-}
-
-asmlinkage int sunos_semsys(int op, u32 arg1, u32 arg2, u32 arg3, void __user *ptr)
-{
-	union semun arg4;
-	int ret;
-
-	switch (op) {
-	case 0:
-		/* Most arguments match on a 1:1 basis but cmd doesn't */
-		switch(arg3) {
-		case 4:
-			arg3=GETPID; break;
-		case 5:
-			arg3=GETVAL; break;
-		case 6:
-			arg3=GETALL; break;
-		case 3:
-			arg3=GETNCNT; break;
-		case 7:
-			arg3=GETZCNT; break;
-		case 8:
-			arg3=SETVAL; break;
-		case 9:
-			arg3=SETALL; break;
-		}
-		/* sys_semctl(): */
-		/* value to modify semaphore to */
-		arg4.__pad = ptr;
-		ret = sys_semctl((int)arg1, (int)arg2, (int)arg3, arg4);
-		break;
-	case 1:
-		/* sys_semget(): */
-		ret = sys_semget((key_t)arg1, (int)arg2, (int)arg3);
-		break;
-	case 2:
-		/* sys_semop(): */
-		ret = sys_semop((int)arg1, (struct sembuf __user *)(unsigned long)arg2,
-				(unsigned int) arg3);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	};
-	return ret;
-}
-
-struct msgbuf32 {
-	s32 mtype;
-	char mtext[1];
-};
-
-struct ipc_perm32
-{
-	key_t    	  key;
-        compat_uid_t  uid;
-        compat_gid_t  gid;
-        compat_uid_t  cuid;
-        compat_gid_t  cgid;
-        compat_mode_t mode;
-        unsigned short  seq;
-};
-
-struct msqid_ds32
-{
-        struct ipc_perm32 msg_perm;
-        u32 msg_first;
-        u32 msg_last;
-        compat_time_t msg_stime;
-        compat_time_t msg_rtime;
-        compat_time_t msg_ctime;
-        u32 wwait;
-        u32 rwait;
-        unsigned short msg_cbytes;
-        unsigned short msg_qnum;  
-        unsigned short msg_qbytes;
-        compat_ipc_pid_t msg_lspid;
-        compat_ipc_pid_t msg_lrpid;
-};
-
-static inline int sunos_msqid_get(struct msqid_ds32 __user *user,
-				  struct msqid_ds *kern)
-{
-	if (get_user(kern->msg_perm.key, &user->msg_perm.key)		||
-	    __get_user(kern->msg_perm.uid, &user->msg_perm.uid)		||
-	    __get_user(kern->msg_perm.gid, &user->msg_perm.gid)		||
-	    __get_user(kern->msg_perm.cuid, &user->msg_perm.cuid)	||
-	    __get_user(kern->msg_perm.cgid, &user->msg_perm.cgid)	||
-	    __get_user(kern->msg_stime, &user->msg_stime)		||
-	    __get_user(kern->msg_rtime, &user->msg_rtime)		||
-	    __get_user(kern->msg_ctime, &user->msg_ctime)		||
-	    __get_user(kern->msg_ctime, &user->msg_cbytes)		||
-	    __get_user(kern->msg_ctime, &user->msg_qnum)		||
-	    __get_user(kern->msg_ctime, &user->msg_qbytes)		||
-	    __get_user(kern->msg_ctime, &user->msg_lspid)		||
-	    __get_user(kern->msg_ctime, &user->msg_lrpid))
-		return -EFAULT;
-	return 0;
-}
-
-static inline int sunos_msqid_put(struct msqid_ds32 __user *user,
-				  struct msqid_ds *kern)
-{
-	if (put_user(kern->msg_perm.key, &user->msg_perm.key)		||
-	    __put_user(kern->msg_perm.uid, &user->msg_perm.uid)		||
-	    __put_user(kern->msg_perm.gid, &user->msg_perm.gid)		||
-	    __put_user(kern->msg_perm.cuid, &user->msg_perm.cuid)	||
-	    __put_user(kern->msg_perm.cgid, &user->msg_perm.cgid)	||
-	    __put_user(kern->msg_stime, &user->msg_stime)		||
-	    __put_user(kern->msg_rtime, &user->msg_rtime)		||
-	    __put_user(kern->msg_ctime, &user->msg_ctime)		||
-	    __put_user(kern->msg_ctime, &user->msg_cbytes)		||
-	    __put_user(kern->msg_ctime, &user->msg_qnum)		||
-	    __put_user(kern->msg_ctime, &user->msg_qbytes)		||
-	    __put_user(kern->msg_ctime, &user->msg_lspid)		||
-	    __put_user(kern->msg_ctime, &user->msg_lrpid))
-		return -EFAULT;
-	return 0;
-}
-
-static inline int sunos_msgbuf_get(struct msgbuf32 __user *user, struct msgbuf *kern, int len)
-{
-	if (get_user(kern->mtype, &user->mtype)	||
-	    __copy_from_user(kern->mtext, &user->mtext, len))
-		return -EFAULT;
-	return 0;
-}
-
-static inline int sunos_msgbuf_put(struct msgbuf32 __user *user, struct msgbuf *kern, int len)
-{
-	if (put_user(kern->mtype, &user->mtype)	||
-	    __copy_to_user(user->mtext, kern->mtext, len))
-		return -EFAULT;
-	return 0;
-}
-
-asmlinkage int sunos_msgsys(int op, u32 arg1, u32 arg2, u32 arg3, u32 arg4)
-{
-	struct sparc_stackf32 __user *sp;
-	struct msqid_ds kds;
-	struct msgbuf *kmbuf;
-	mm_segment_t old_fs = get_fs();
-	u32 arg5;
-	int rval;
-
-	switch(op) {
-	case 0:
-		rval = sys_msgget((key_t)arg1, (int)arg2);
-		break;
-	case 1:
-		if (!sunos_msqid_get((struct msqid_ds32 __user *)(unsigned long)arg3, &kds)) {
-			set_fs(KERNEL_DS);
-			rval = sys_msgctl((int)arg1, (int)arg2,
-					  (struct msqid_ds __user *)(unsigned long)arg3);
-			set_fs(old_fs);
-			if (!rval)
-				rval = sunos_msqid_put((struct msqid_ds32 __user *)(unsigned long)arg3,
-						       &kds);
-		} else
-			rval = -EFAULT;
-		break;
-	case 2:
-		rval = -EFAULT;
-		kmbuf = kmalloc(sizeof(struct msgbuf) + arg3,
-						 GFP_KERNEL);
-		if (!kmbuf)
-			break;
-		sp = (struct sparc_stackf32 __user *)
-			(current_thread_info()->kregs->u_regs[UREG_FP] & 0xffffffffUL);
-		if (get_user(arg5, &sp->xxargs[0])) {
-			rval = -EFAULT;
-			kfree(kmbuf);
-			break;
-		}
-		set_fs(KERNEL_DS);
-		rval = sys_msgrcv((int)arg1, (struct msgbuf __user *) kmbuf,
-				  (size_t)arg3,
-				  (long)arg4, (int)arg5);
-		set_fs(old_fs);
-		if (!rval)
-			rval = sunos_msgbuf_put((struct msgbuf32 __user *)(unsigned long)arg2,
-						kmbuf, arg3);
-		kfree(kmbuf);
-		break;
-	case 3:
-		rval = -EFAULT;
-		kmbuf = kmalloc(sizeof(struct msgbuf) + arg3,
-						 GFP_KERNEL);
-		if (!kmbuf || sunos_msgbuf_get((struct msgbuf32 __user *)(unsigned long)arg2,
-					       kmbuf, arg3))
-			break;
-		set_fs(KERNEL_DS);
-		rval = sys_msgsnd((int)arg1, (struct msgbuf __user *) kmbuf,
-				  (size_t)arg3, (int)arg4);
-		set_fs(old_fs);
-		kfree(kmbuf);
-		break;
-	default:
-		rval = -EINVAL;
-		break;
-	}
-	return rval;
-}
-
-struct shmid_ds32 {
-        struct ipc_perm32       shm_perm;
-        int                     shm_segsz;
-        compat_time_t         shm_atime;
-        compat_time_t         shm_dtime;
-        compat_time_t         shm_ctime;
-        compat_ipc_pid_t    shm_cpid; 
-        compat_ipc_pid_t    shm_lpid; 
-        unsigned short          shm_nattch;
-};
-                                                        
-static inline int sunos_shmid_get(struct shmid_ds32 __user *user,
-				  struct shmid_ds *kern)
-{
-	if (get_user(kern->shm_perm.key, &user->shm_perm.key)		||
-	    __get_user(kern->shm_perm.uid, &user->shm_perm.uid)		||
-	    __get_user(kern->shm_perm.gid, &user->shm_perm.gid)		||
-	    __get_user(kern->shm_perm.cuid, &user->shm_perm.cuid)	||
-	    __get_user(kern->shm_perm.cgid, &user->shm_perm.cgid)	||
-	    __get_user(kern->shm_segsz, &user->shm_segsz)		||
-	    __get_user(kern->shm_atime, &user->shm_atime)		||
-	    __get_user(kern->shm_dtime, &user->shm_dtime)		||
-	    __get_user(kern->shm_ctime, &user->shm_ctime)		||
-	    __get_user(kern->shm_cpid, &user->shm_cpid)			||
-	    __get_user(kern->shm_lpid, &user->shm_lpid)			||
-	    __get_user(kern->shm_nattch, &user->shm_nattch))
-		return -EFAULT;
-	return 0;
-}
-
-static inline int sunos_shmid_put(struct shmid_ds32 __user *user,
-				  struct shmid_ds *kern)
-{
-	if (put_user(kern->shm_perm.key, &user->shm_perm.key)		||
-	    __put_user(kern->shm_perm.uid, &user->shm_perm.uid)		||
-	    __put_user(kern->shm_perm.gid, &user->shm_perm.gid)		||
-	    __put_user(kern->shm_perm.cuid, &user->shm_perm.cuid)	||
-	    __put_user(kern->shm_perm.cgid, &user->shm_perm.cgid)	||
-	    __put_user(kern->shm_segsz, &user->shm_segsz)		||
-	    __put_user(kern->shm_atime, &user->shm_atime)		||
-	    __put_user(kern->shm_dtime, &user->shm_dtime)		||
-	    __put_user(kern->shm_ctime, &user->shm_ctime)		||
-	    __put_user(kern->shm_cpid, &user->shm_cpid)			||
-	    __put_user(kern->shm_lpid, &user->shm_lpid)			||
-	    __put_user(kern->shm_nattch, &user->shm_nattch))
-		return -EFAULT;
-	return 0;
-}
-
-asmlinkage int sunos_shmsys(int op, u32 arg1, u32 arg2, u32 arg3)
-{
-	struct shmid_ds ksds;
-	unsigned long raddr;
-	mm_segment_t old_fs = get_fs();
-	int rval;
-
-	switch(op) {
-	case 0:
-		/* do_shmat(): attach a shared memory area */
-		rval = do_shmat((int)arg1,(char __user *)(unsigned long)arg2,(int)arg3,&raddr);
-		if (!rval)
-			rval = (int) raddr;
-		break;
-	case 1:
-		/* sys_shmctl(): modify shared memory area attr. */
-		if (!sunos_shmid_get((struct shmid_ds32 __user *)(unsigned long)arg3, &ksds)) {
-			set_fs(KERNEL_DS);
-			rval = sys_shmctl((int) arg1,(int) arg2,
-					  (struct shmid_ds __user *) &ksds);
-			set_fs(old_fs);
-			if (!rval)
-				rval = sunos_shmid_put((struct shmid_ds32 __user *)(unsigned long)arg3,
-						       &ksds);
-		} else
-			rval = -EFAULT;
-		break;
-	case 2:
-		/* sys_shmdt(): detach a shared memory area */
-		rval = sys_shmdt((char __user *)(unsigned long)arg1);
-		break;
-	case 3:
-		/* sys_shmget(): get a shared memory area */
-		rval = sys_shmget((key_t)arg1,(int)arg2,(int)arg3);
-		break;
-	default:
-		rval = -EINVAL;
-		break;
-	};
-	return rval;
-}
-
-extern asmlinkage long sparc32_open(const char __user * filename, int flags, int mode);
-
-asmlinkage int sunos_open(u32 fname, int flags, int mode)
-{
-	const char __user *filename = compat_ptr(fname);
-
-	return sparc32_open(filename, flags, mode);
-}
-
-#define SUNOS_EWOULDBLOCK 35
-
-/* see the sunos man page read(2v) for an explanation
-   of this garbage. We use O_NDELAY to mark
-   file descriptors that have been set non-blocking 
-   using 4.2BSD style calls. (tridge) */
-
-static inline int check_nonblock(int ret, int fd)
-{
-	if (ret == -EAGAIN) {
-		struct file * file = fget(fd);
-		if (file) {
-			if (file->f_flags & O_NDELAY)
-				ret = -SUNOS_EWOULDBLOCK;
-			fput(file);
-		}
-	}
-	return ret;
-}
-
-asmlinkage int sunos_read(unsigned int fd, char __user *buf, u32 count)
-{
-	int ret;
-
-	ret = check_nonblock(sys_read(fd, buf, count), fd);
-	return ret;
-}
-
-asmlinkage int sunos_readv(u32 fd, void __user *vector, s32 count)
-{
-	int ret;
-
-	ret = check_nonblock(compat_sys_readv(fd, vector, count), fd);
-	return ret;
-}
-
-asmlinkage int sunos_write(unsigned int fd, char __user *buf, u32 count)
-{
-	int ret;
-
-	ret = check_nonblock(sys_write(fd, buf, count), fd);
-	return ret;
-}
-
-asmlinkage int sunos_writev(u32 fd, void __user *vector, s32 count)
-{
-	int ret;
-
-	ret = check_nonblock(compat_sys_writev(fd, vector, count), fd);
-	return ret;
-}
-
-asmlinkage int sunos_recv(u32 __fd, void __user *ubuf, int size, unsigned flags)
-{
-	int ret, fd = (int) __fd;
-
-	ret = check_nonblock(sys_recv(fd, ubuf, size, flags), fd);
-	return ret;
-}
-
-asmlinkage int sunos_send(u32 __fd, void __user *buff, int len, unsigned flags)
-{
-	int ret, fd = (int) __fd;
-
-	ret = check_nonblock(sys_send(fd, buff, len, flags), fd);
-	return ret;
-}
-
-asmlinkage int sunos_accept(u32 __fd, struct sockaddr __user *sa, int __user *addrlen)
-{
-	int ret, fd = (int) __fd;
-
-	while (1) {
-		ret = check_nonblock(sys_accept(fd, sa, addrlen), fd);
-		if (ret != -ENETUNREACH && ret != -EHOSTUNREACH)
-			break;
-	}
-	return ret;
-}
-
-#define SUNOS_SV_INTERRUPT 2
-
-asmlinkage int sunos_sigaction (int sig,
-				struct old_sigaction32 __user *act,
-				struct old_sigaction32 __user *oact)
-{
-	struct k_sigaction new_ka, old_ka;
-	int ret;
-
-	if (act) {
-		compat_old_sigset_t mask;
-		u32 u_handler;
-
-		if (get_user(u_handler, &act->sa_handler) ||
-		    __get_user(new_ka.sa.sa_flags, &act->sa_flags))
-			return -EFAULT;
-		new_ka.sa.sa_handler = compat_ptr(u_handler);
-		__get_user(mask, &act->sa_mask);
-		new_ka.sa.sa_restorer = NULL;
-		new_ka.ka_restorer = NULL;
-		siginitset(&new_ka.sa.sa_mask, mask);
-		new_ka.sa.sa_flags ^= SUNOS_SV_INTERRUPT;
-	}
-
-	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
-	if (!ret && oact) {
-		old_ka.sa.sa_flags ^= SUNOS_SV_INTERRUPT;
-		if (put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
-		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags))
-			return -EFAULT;
-		__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
-	}
-
-	return ret;
-}
-
-asmlinkage int sunos_setsockopt(u32 __fd, u32 __level, u32 __optname,
-				char __user *optval, u32 __optlen)
-{
-	int fd = (int) __fd;
-	int level = (int) __level;
-	int optname = (int) __optname;
-	int optlen = (int) __optlen;
-	int tr_opt = optname;
-	int ret;
-
-	if (level == SOL_IP) {
-		/* Multicast socketopts (ttl, membership) */
-		if (tr_opt >=2 && tr_opt <= 6)
-			tr_opt += 30;
-	}
-	ret = sys_setsockopt(fd, level, tr_opt,
-			     optval, optlen);
-	return ret;
-}
-
-asmlinkage int sunos_getsockopt(u32 __fd, u32 __level, u32 __optname,
-				char __user *optval, int __user *optlen)
-{
-	int fd = (int) __fd;
-	int level = (int) __level;
-	int optname = (int) __optname;
-	int tr_opt = optname;
-	int ret;
-
-	if (level == SOL_IP) {
-		/* Multicast socketopts (ttl, membership) */
-		if (tr_opt >=2 && tr_opt <= 6)
-			tr_opt += 30;
-	}
-	ret = compat_sys_getsockopt(fd, level, tr_opt,
-				    optval, optlen);
-	return ret;
-}
diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S
index 6b9b718e24af..a4fef2ba1ae1 100644
--- a/arch/sparc64/kernel/systbls.S
+++ b/arch/sparc64/kernel/systbls.S
@@ -155,125 +155,3 @@ sys_call_table:
 	.word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait
 /*310*/	.word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 	.word sys_timerfd_settime, sys_timerfd_gettime
-
-#if defined(CONFIG_SUNOS_EMUL) || defined(CONFIG_SOLARIS_EMUL) || \
-    defined(CONFIG_SOLARIS_EMUL_MODULE)
-	/* Now the 32-bit SunOS syscall table. */
-
-	.align 4
-	.globl sunos_sys_table
-sunos_sys_table:
-/*0*/	.word sunos_indir, sys32_exit, sys_fork
-	.word sunos_read, sunos_write, sunos_open
-	.word sys_close, sunos_wait4, sys_creat
-	.word sys_link, sys_unlink, sunos_execv
-	.word sys_chdir, sunos_nosys, sys32_mknod
-	.word sys_chmod, sys32_lchown16, sunos_brk
-	.word sunos_nosys, sys32_lseek, sunos_getpid
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_getuid, sunos_nosys, sys_ptrace
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sys_access, sunos_nosys, sunos_nosys
-	.word sys_sync, sys_kill, compat_sys_newstat
-	.word sunos_nosys, compat_sys_newlstat, sys_dup
-	.word sys_pipe, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_getgid
-	.word sunos_nosys, sunos_nosys
-/*50*/	.word sunos_nosys, sys_acct, sunos_nosys
-	.word sunos_mctl, sunos_ioctl, sys_reboot
-	.word sunos_nosys, sys_symlink, sys_readlink
-	.word sys32_execve, sys_umask, sys_chroot
-	.word compat_sys_newfstat, sunos_nosys, sys_getpagesize
-	.word sys_msync, sys_vfork, sunos_nosys
-	.word sunos_nosys, sunos_sbrk, sunos_sstk
-	.word sunos_mmap, sunos_vadvise, sys_munmap
-	.word sys_mprotect, sys_madvise, sys_vhangup
-	.word sunos_nosys, sys_mincore, sys32_getgroups16
-	.word sys32_setgroups16, sys_getpgrp, sunos_setpgrp
-	.word compat_sys_setitimer, sunos_nosys, sys_swapon
-	.word compat_sys_getitimer, sys_gethostname, sys_sethostname
-	.word sunos_getdtablesize, sys_dup2, sunos_nop
-	.word compat_sys_fcntl, sunos_select, sunos_nop
-	.word sys_fsync, sys32_setpriority, sys32_socket
-	.word sys32_connect, sunos_accept
-/*100*/	.word sys_getpriority, sunos_send, sunos_recv
-	.word sunos_nosys, sys32_bind, sunos_setsockopt
-	.word sys32_listen, sunos_nosys, sunos_sigaction
-	.word sunos_sigblock, sunos_sigsetmask, sys_sigpause
-	.word sys32_sigstack, sys32_recvmsg, sys32_sendmsg
-	.word sunos_nosys, sys32_gettimeofday, compat_sys_getrusage
-	.word sunos_getsockopt, sunos_nosys, sunos_readv
-	.word sunos_writev, sys32_settimeofday, sys32_fchown16
-	.word sys_fchmod, sys32_recvfrom, sys32_setreuid16
-	.word sys32_setregid16, sys_rename, sys_truncate
-	.word sys_ftruncate, sys_flock, sunos_nosys
-	.word sys32_sendto, sys32_shutdown, sys32_socketpair
-	.word sys_mkdir, sys_rmdir, sys32_utimes
-	.word sys32_sigreturn, sunos_nosys, sys32_getpeername
-	.word sunos_gethostid, sunos_nosys, compat_sys_getrlimit
-	.word compat_sys_setrlimit, sunos_killpg, sunos_nosys
-	.word sunos_nosys, sunos_nosys
-/*150*/	.word sys32_getsockname, sunos_nosys, sunos_nosys
-	.word sys_poll, sunos_nosys, sunos_nosys
-	.word sunos_getdirentries, compat_sys_statfs, compat_sys_fstatfs
-	.word sys_oldumount, sunos_nosys, sunos_nosys
-	.word sys_getdomainname, sys_setdomainname
-	.word sunos_nosys, sys_quotactl, sunos_nosys
-	.word sunos_nosys, sys_ustat, sunos_semsys
-	.word sunos_nosys, sunos_shmsys, sunos_audit
-	.word sunos_nosys, sunos_getdents, sys_setsid
-	.word sys_fchdir, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, compat_sys_sigpending, sunos_nosys
-	.word sys_setpgid, sunos_pathconf, sunos_fpathconf
-	.word sunos_sysconf, sunos_uname, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-/*200*/	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys
-/*250*/	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys
-/*260*/	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys
-/*270*/	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys
-/*280*/	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys
-/*290*/	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys
-/*300*/	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys
-/*310*/	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys
-
-#endif
diff --git a/arch/sparc64/kernel/systbls.h b/arch/sparc64/kernel/systbls.h
index 8a0d20a35d0c..bc9f5dac4069 100644
--- a/arch/sparc64/kernel/systbls.h
+++ b/arch/sparc64/kernel/systbls.h
@@ -27,8 +27,6 @@ extern asmlinkage unsigned long sys64_mremap(unsigned long addr,
 					     unsigned long new_addr);
 extern asmlinkage unsigned long c_sys_nis_syscall(struct pt_regs *regs);
 extern asmlinkage long sys_getdomainname(char __user *name, int len);
-extern asmlinkage long solaris_syscall(struct pt_regs *regs);
-extern asmlinkage long sunos_syscall(struct pt_regs *regs);
 extern asmlinkage long sys_utrap_install(utrap_entry_t type,
 					 utrap_handler_t new_p,
 					 utrap_handler_t new_d,
diff --git a/arch/sparc64/kernel/ttable.S b/arch/sparc64/kernel/ttable.S
index 7575aa371da8..b0de4c00b11a 100644
--- a/arch/sparc64/kernel/ttable.S
+++ b/arch/sparc64/kernel/ttable.S
@@ -117,16 +117,13 @@ tl0_f4o:	FILL_4_OTHER
 tl0_f5o:	FILL_5_OTHER
 tl0_f6o:	FILL_6_OTHER
 tl0_f7o:	FILL_7_OTHER
-tl0_sunos:	SUNOS_SYSCALL_TRAP
+tl0_resv100:	BTRAP(0x100)
 tl0_bkpt:	BREAKPOINT_TRAP
 tl0_divz:	TRAP(do_div0)
 tl0_flushw:	FLUSH_WINDOW_TRAP
-tl0_resv104:	BTRAP(0x104) BTRAP(0x105) BTRAP(0x106) BTRAP(0x107)
-		.globl tl0_solaris
-tl0_solaris:	SOLARIS_SYSCALL_TRAP
-tl0_resv109:	BTRAP(0x109)
-tl0_resv10a:	BTRAP(0x10a) BTRAP(0x10b) BTRAP(0x10c) BTRAP(0x10d) BTRAP(0x10e)
-tl0_resv10f:	BTRAP(0x10f)
+tl0_resv104:	BTRAP(0x104) BTRAP(0x105) BTRAP(0x106) BTRAP(0x107) BTRAP(0x108)
+tl0_resv109:	BTRAP(0x109) BTRAP(0x10a) BTRAP(0x10b) BTRAP(0x10c) BTRAP(0x10d)
+tl0_resv10e:	BTRAP(0x10e) BTRAP(0x10f)
 tl0_linux32:	LINUX_32BIT_SYSCALL_TRAP
 tl0_oldlinux64:	LINUX_64BIT_SYSCALL_TRAP
 tl0_resv112:	TRAP_UTRAP(UT_TRAP_INSTRUCTION_18,0x112) TRAP_UTRAP(UT_TRAP_INSTRUCTION_19,0x113)
@@ -139,8 +136,7 @@ tl0_resv11e:	TRAP_UTRAP(UT_TRAP_INSTRUCTION_30,0x11e) TRAP_UTRAP(UT_TRAP_INSTRUC
 tl0_getcc:	GETCC_TRAP
 tl0_setcc:	SETCC_TRAP
 tl0_getpsr:	TRAP(do_getpsr)
-tl0_resv123:	BTRAP(0x123) BTRAP(0x124) BTRAP(0x125) BTRAP(0x126)
-tl0_solindir:	INDIRECT_SOLARIS_SYSCALL(156)
+tl0_resv123:	BTRAP(0x123) BTRAP(0x124) BTRAP(0x125) BTRAP(0x126) BTRAP(0x127)
 tl0_resv128:	BTRAP(0x128) BTRAP(0x129) BTRAP(0x12a) BTRAP(0x12b) BTRAP(0x12c)
 tl0_resv12d:	BTRAP(0x12d) BTRAP(0x12e) BTRAP(0x12f) BTRAP(0x130) BTRAP(0x131)
 tl0_resv132:	BTRAP(0x132) BTRAP(0x133) BTRAP(0x134) BTRAP(0x135) BTRAP(0x136)
diff --git a/arch/sparc64/solaris/Makefile b/arch/sparc64/solaris/Makefile
deleted file mode 100644
index 8c8663033bfb..000000000000
--- a/arch/sparc64/solaris/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# Makefile for the Solaris binary emulation.
-#
-
-EXTRA_AFLAGS := -ansi
-
-solaris-objs := entry64.o fs.o misc.o signal.o systbl.o socket.o \
-		ioctl.o ipc.o socksys.o timod.o
-
-obj-$(CONFIG_SOLARIS_EMUL) += solaris.o
diff --git a/arch/sparc64/solaris/conv.h b/arch/sparc64/solaris/conv.h
deleted file mode 100644
index 50e58232cf2b..000000000000
--- a/arch/sparc64/solaris/conv.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* $Id: conv.h,v 1.4 1998/08/15 20:42:51 davem Exp $
- * conv.h: Utility macros for Solaris emulation
- *
- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- */
- 
-/* #define DEBUG_SOLARIS */
-#define DEBUG_SOLARIS_KMALLOC
-
-#ifndef __ASSEMBLY__
-
-#include <asm/unistd.h>
-
-/* Use this to get at 32-bit user passed pointers. */
-#define A(__x)				\
-({	unsigned long __ret;		\
-	__asm__ ("srl	%0, 0, %0"	\
-		 : "=r" (__ret)		\
-		 : "0" (__x));		\
-	(void __user *)__ret;		\
-})
-
-extern unsigned sys_call_table[];
-extern unsigned sys_call_table32[];
-extern unsigned sunos_sys_table[];
-
-#define SYS(name) ((long)sys_call_table[__NR_##name])
-#define SUNOS(x) ((long)sunos_sys_table[x])
-
-#ifdef DEBUG_SOLARIS
-#define SOLD(s) printk("%s,%d,%s(): %s\n",__FILE__,__LINE__,__func__,(s))
-#define SOLDD(s) printk("solaris: "); printk s
-#else
-#define SOLD(s)
-#define SOLDD(s)
-#endif
-
-#endif /* __ASSEMBLY__ */
diff --git a/arch/sparc64/solaris/entry64.S b/arch/sparc64/solaris/entry64.S
deleted file mode 100644
index f170324e8bf2..000000000000
--- a/arch/sparc64/solaris/entry64.S
+++ /dev/null
@@ -1,223 +0,0 @@
-/* $Id: entry64.S,v 1.7 2002/02/09 19:49:31 davem Exp $
- * entry64.S:  Solaris syscall emulation entry point.
- *
- * Copyright (C) 1996,1997,1998 Jakub Jelinek   (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 1995,1997 David S. Miller (davem@caip.rutgers.edu)
- * Copyright (C) 1996 Miguel de Icaza      (miguel@nuclecu.unam.mx)
- */
-
-#include <linux/errno.h>
-
-#include <asm/head.h>
-#include <asm/asi.h>
-#include <asm/smp.h>
-#include <asm/ptrace.h>
-#include <asm/page.h>
-#include <asm/signal.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/thread_info.h>
-
-#include "conv.h"
-
-#define NR_SYSCALLS	256
-
-	.text
-solaris_syscall_trace:
-	add		%sp, PTREGS_OFF, %o0
-	call		syscall_trace
-	 mov		0, %o1
-	srl		%i0, 0, %o0
-	mov		%i4, %o4
-	srl		%i1, 0, %o1
-	mov		%i5, %o5
-	andcc		%l3, 1, %g0
-	be,pt		%icc, 2f
-	 srl		%i2, 0, %o2
-	b,pt		%xcc, 2f
-	 add		%sp, PTREGS_OFF, %o0
-
-solaris_sucks:
-/* Solaris is a big system which needs to be able to do all the things
- * in Inf+1 different ways */
-	add		%i6, 0x5c, %o0
-	mov		%i0, %g1
-	mov		%i1, %i0
-	mov		%i2, %i1
-	srl		%o0, 0, %o0
-	mov		%i3, %i2
-	movrz		%g1, 256, %g1 /* Ensure we don't loop forever */
-	mov		%i4, %i3
-	mov		%i5, %i4
-	ba,pt		%xcc, solaris_sparc_syscall
-exen:	 lduwa		[%o0] ASI_S, %i5
-
-exenf:	ba,pt		%xcc, solaris_sparc_syscall
-	 clr		%i5
-
-/* For shared binaries, binfmt_elf32 already sets up personality
-   and exec_domain. This is to handle static binaries as well */
-solaris_reg:
-	call		solaris_register
-	 nop
-	ba,pt		%xcc, 1f
-	 mov		%i4, %o4
-
-linux_syscall_for_solaris:
-	sethi		%hi(sys_call_table32), %l6
-	or		%l6, %lo(sys_call_table32), %l6
-	sll		%l3, 2, %l4
-	ba,pt		%xcc, 10f
-	 lduw		[%l6 + %l4], %l3
-
-	/* Solaris system calls enter here... */
-	.align	32
-	.globl	solaris_sparc_syscall, entry64_personality_patch
-solaris_sparc_syscall:
-entry64_personality_patch:
-	ldub		[%g4 + 0x0], %l0
-	cmp		%g1, 255
-	bg,pn		%icc, solaris_unimplemented
-	 srl		%g1, 0, %g1
-	sethi		%hi(solaris_sys_table), %l7
-	or		%l7, %lo(solaris_sys_table), %l7
-	brz,pn		%g1, solaris_sucks
-	 mov		%i4, %o4
-	sll		%g1, 2, %l4
-	cmp		%l0, 1
-	bne,pn		%icc, solaris_reg
-1:	 srl		%i0, 0, %o0
-	lduw		[%l7 + %l4], %l3
-	srl		%i1, 0, %o1
-	ldx		[%g6 + TI_FLAGS], %l5
-	cmp		%l3, NR_SYSCALLS
-	bleu,a,pn	%xcc, linux_syscall_for_solaris
-	 nop
-	andcc		%l3, 1, %g0
-	bne,a,pn	%icc, 10f
-	 add		%sp, PTREGS_OFF, %o0
-10:	srl		%i2, 0, %o2
-	mov		%i5, %o5
-	andn		%l3, 3, %l7
-	andcc		%l5, _TIF_SYSCALL_TRACE, %g0				
-	bne,pn		%icc, solaris_syscall_trace		
-	 mov		%i0, %l5
-2:	call		%l7
-	 srl		%i3, 0, %o3
-ret_from_solaris:
-	stx		%o0, [%sp + PTREGS_OFF + PT_V9_I0]
-	ldx		[%g6 + TI_FLAGS], %l6
-	sra		%o0, 0, %o0
-	mov		%ulo(TSTATE_XCARRY | TSTATE_ICARRY), %g2
-	ldx		[%sp + PTREGS_OFF + PT_V9_TSTATE], %g3
-	cmp		%o0, -ERESTART_RESTARTBLOCK
-	sllx		%g2, 32, %g2
-	bgeu,pn		%xcc, 1f
-	 andcc		%l6, _TIF_SYSCALL_TRACE, %l6	
-
-	/* System call success, clear Carry condition code. */
-	andn		%g3, %g2, %g3
-	stx		%g3, [%sp + PTREGS_OFF + PT_V9_TSTATE]	
-	bne,pn		%icc, solaris_syscall_trace2
-	 ldx		[%sp + PTREGS_OFF + PT_V9_TNPC], %l1
-	andcc		%l1, 1, %g0
-	bne,pn		%icc, 2f
-	 clr		%l6
-	add		%l1, 0x4, %l2				         
-	stx		%l1, [%sp + PTREGS_OFF + PT_V9_TPC]	 ! pc = npc
-	call		rtrap
-	 stx		%l2, [%sp + PTREGS_OFF + PT_V9_TNPC] !npc = npc+4
-
-	/* When tnpc & 1, this comes from setcontext and we don't want to advance pc */
-2:	andn		%l1, 3, %l1
-	call		rtrap
-	 stx		%l1, [%sp + PTREGS_OFF + PT_V9_TNPC] !npc = npc&~3
-
-1:
-	/* System call failure, set Carry condition code.
-	 * Also, get abs(errno) to return to the process.
-	 */
-	sub		%g0, %o0, %o0
-	or		%g3, %g2, %g3
-	cmp		%o0, ERANGE	/* 0-ERANGE are identity mapped */
-	bleu,pt		%icc, 1f
-	 cmp		%o0, EMEDIUMTYPE
-	bgu,pn		%icc, 1f
-	 sethi		%hi(solaris_err_table), %l6
-	sll		%o0, 2, %o0
-	or		%l6, %lo(solaris_err_table), %l6
-	ldsw		[%l6 + %o0], %o0
-1:	stx		%o0, [%sp + PTREGS_OFF + PT_V9_I0]
-	mov		1, %l6
-	stx		%g3, [%sp + PTREGS_OFF + PT_V9_TSTATE]
-	bne,pn		%icc, solaris_syscall_trace2
-	 ldx		[%sp + PTREGS_OFF + PT_V9_TNPC], %l1
-	andcc		%l1, 1, %g0
-	bne,pn		%icc, 2b
-	 add		%l1, 0x4, %l2
-	stx		%l1, [%sp + PTREGS_OFF + PT_V9_TPC]  ! pc = npc
-	call		rtrap
-	 stx		%l2, [%sp + PTREGS_OFF + PT_V9_TNPC] !npc = npc+4 
-
-solaris_syscall_trace2:
-	add		%sp, PTREGS_OFF, %o0
-	call		syscall_trace
-	 mov		1, %o1
-	add		%l1, 0x4, %l2			/* npc = npc+4 */
-	andcc		%l1, 1, %g0
-	bne,pn		%icc, 2b
-	 nop
-	stx		%l1, [%sp + PTREGS_OFF + PT_V9_TPC]
-	call		rtrap
-	 stx		%l2, [%sp + PTREGS_OFF + PT_V9_TNPC]
-
-	/* This one is tricky, so that's why we do it in assembly */
-	.globl		solaris_sigsuspend
-solaris_sigsuspend:
-	call		do_sol_sigsuspend
-	 nop
-	brlz,pn		%o0, ret_from_solaris
-	 nop
-	call		sys_sigsuspend
-	 stx		%o0, [%sp + PTREGS_OFF + PT_V9_I0]
-	b,pt		%xcc, ret_from_solaris
-	 nop
-
-	.globl		solaris_getpid
-solaris_getpid:
-	call		sys_getppid
-	 nop
-	call		sys_getpid
-	 stx		%o0, [%sp + PTREGS_OFF + PT_V9_I1]
-	b,pt		%xcc, ret_from_solaris
-	 nop
-
-	.globl	solaris_getuid
-solaris_getuid:
-	call		sys_geteuid
-	 nop
-	call		sys_getuid
-	 stx		%o1, [%sp + PTREGS_OFF + PT_V9_I1]
-	b,pt		%xcc, ret_from_solaris
-	 nop
-
-	.globl	solaris_getgid
-solaris_getgid:
-	call		sys_getegid
-	 nop
-	call		sys_getgid
-	 stx		%o1, [%sp + PTREGS_OFF + PT_V9_I1]
-	b,pt		%xcc, ret_from_solaris
-	 nop
-
-	.globl		solaris_unimplemented
-solaris_unimplemented:
-	call		do_sol_unimplemented
-	 add		%sp, PTREGS_OFF, %o0
-	ba,pt		%xcc, ret_from_solaris
-	 nop
-
-	.section	__ex_table,"a"
-	.align		4
-	.word		exen, exenf
-
diff --git a/arch/sparc64/solaris/fs.c b/arch/sparc64/solaris/fs.c
deleted file mode 100644
index 7d035f0d3ae1..000000000000
--- a/arch/sparc64/solaris/fs.c
+++ /dev/null
@@ -1,745 +0,0 @@
-/* $Id: fs.c,v 1.27 2002/02/08 03:57:14 davem Exp $
- * fs.c: fs related syscall emulation for Solaris
- *
- * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- *
- * 1999-08-19 Implemented solaris F_FREESP (truncate)
- *            fcntl, by Jason Rappleye (rappleye@ccr.buffalo.edu)
- */
-
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/namei.h>
-#include <linux/mm.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/smp_lock.h>
-#include <linux/limits.h>
-#include <linux/resource.h>
-#include <linux/quotaops.h>
-#include <linux/mount.h>
-#include <linux/vfs.h>
-
-#include <asm/uaccess.h>
-#include <asm/string.h>
-#include <asm/ptrace.h>
-
-#include "conv.h"
-
-#define R3_VERSION	1
-#define R4_VERSION	2
-
-typedef struct {
-	s32	tv_sec;
-	s32	tv_nsec;
-} timestruct_t;
-
-struct sol_stat {
-	u32		st_dev;
-	s32		st_pad1[3];     /* network id */
-	u32		st_ino;
-	u32		st_mode;
-	u32		st_nlink;
-	u32		st_uid;
-	u32		st_gid;
-	u32		st_rdev;
-	s32		st_pad2[2];
-	s32		st_size;
-	s32		st_pad3;	/* st_size, off_t expansion */
-	timestruct_t	st_atime;
-	timestruct_t	st_mtime;
-	timestruct_t	st_ctime;
-	s32		st_blksize;
-	s32		st_blocks;
-	char		st_fstype[16];
-	s32		st_pad4[8];     /* expansion area */
-};
-
-struct sol_stat64 {
-	u32		st_dev;
-	s32		st_pad1[3];     /* network id */
-	u64		st_ino;
-	u32		st_mode;
-	u32		st_nlink;
-	u32		st_uid;
-	u32		st_gid;
-	u32		st_rdev;
-	s32		st_pad2[2];
-	s64		st_size;
-	timestruct_t	st_atime;
-	timestruct_t	st_mtime;
-	timestruct_t	st_ctime;
-	s64		st_blksize;
-	s32		st_blocks;
-	char		st_fstype[16];
-	s32		st_pad4[4];     /* expansion area */
-};
-
-#define UFSMAGIC (((unsigned)'u'<<24)||((unsigned)'f'<<16)||((unsigned)'s'<<8))
-
-static inline int putstat(struct sol_stat __user *ubuf, struct kstat *kbuf)
-{
-	u32 ino;
-
-	if (kbuf->size > MAX_NON_LFS ||
-	    !sysv_valid_dev(kbuf->dev) ||
-	    !sysv_valid_dev(kbuf->rdev))
-		return -EOVERFLOW;
-	ino = kbuf->ino;
-	if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
-		return -EOVERFLOW;
-	if (put_user (sysv_encode_dev(kbuf->dev), &ubuf->st_dev)	||
-	    __put_user (ino, &ubuf->st_ino)				||
-	    __put_user (kbuf->mode, &ubuf->st_mode)		||
-	    __put_user (kbuf->nlink, &ubuf->st_nlink)	||
-	    __put_user (kbuf->uid, &ubuf->st_uid)		||
-	    __put_user (kbuf->gid, &ubuf->st_gid)		||
-	    __put_user (sysv_encode_dev(kbuf->rdev), &ubuf->st_rdev)	||
-	    __put_user (kbuf->size, &ubuf->st_size)		||
-	    __put_user (kbuf->atime.tv_sec, &ubuf->st_atime.tv_sec)	||
-	    __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime.tv_nsec)	||
-	    __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime.tv_sec)	||
-	    __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime.tv_nsec)	||
-	    __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime.tv_sec)	||
-	    __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime.tv_nsec)	||
-	    __put_user (kbuf->blksize, &ubuf->st_blksize)	||
-	    __put_user (kbuf->blocks, &ubuf->st_blocks)	||
-	    __put_user (UFSMAGIC, (unsigned __user *)ubuf->st_fstype))
-		return -EFAULT;
-	return 0;
-}
-
-static inline int putstat64(struct sol_stat64 __user *ubuf, struct kstat *kbuf)
-{
-	if (!sysv_valid_dev(kbuf->dev) || !sysv_valid_dev(kbuf->rdev))
-		return -EOVERFLOW;
-	if (put_user (sysv_encode_dev(kbuf->dev), &ubuf->st_dev)	||
-	    __put_user (kbuf->ino, &ubuf->st_ino)		||
-	    __put_user (kbuf->mode, &ubuf->st_mode)		||
-	    __put_user (kbuf->nlink, &ubuf->st_nlink)	||
-	    __put_user (kbuf->uid, &ubuf->st_uid)		||
-	    __put_user (kbuf->gid, &ubuf->st_gid)		||
-	    __put_user (sysv_encode_dev(kbuf->rdev), &ubuf->st_rdev)	||
-	    __put_user (kbuf->size, &ubuf->st_size)		||
-	    __put_user (kbuf->atime.tv_sec, &ubuf->st_atime.tv_sec)	||
-	    __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime.tv_nsec)	||
-	    __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime.tv_sec)	||
-	    __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime.tv_nsec)	||
-	    __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime.tv_sec)	||
-	    __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime.tv_nsec)	||
-	    __put_user (kbuf->blksize, &ubuf->st_blksize)	||
-	    __put_user (kbuf->blocks, &ubuf->st_blocks)	||
-	    __put_user (UFSMAGIC, (unsigned __user *)ubuf->st_fstype))
-		return -EFAULT;
-	return 0;
-}
-
-asmlinkage int solaris_stat(u32 filename, u32 statbuf)
-{
-	struct kstat s;
-	int ret = vfs_stat(A(filename), &s);
-	if (!ret)
-		return putstat(A(statbuf), &s);
-	return ret;
-}
-
-asmlinkage int solaris_xstat(int vers, u32 filename, u32 statbuf)
-{
-	/* Solaris doesn't bother with looking at vers, so we do neither */
-	return solaris_stat(filename, statbuf);
-}
-
-asmlinkage int solaris_stat64(u32 filename, u32 statbuf)
-{
-	struct kstat s;
-	int ret = vfs_stat(A(filename), &s);
-	if (!ret)
-		return putstat64(A(statbuf), &s);
-	return ret;
-}
-
-asmlinkage int solaris_lstat(u32 filename, u32 statbuf)
-{
-	struct kstat s;
-	int ret = vfs_lstat(A(filename), &s);
-	if (!ret)
-		return putstat(A(statbuf), &s);
-	return ret;
-}
-
-asmlinkage int solaris_lxstat(int vers, u32 filename, u32 statbuf)
-{
-	return solaris_lstat(filename, statbuf);
-}
-
-asmlinkage int solaris_lstat64(u32 filename, u32 statbuf)
-{
-	struct kstat s;
-	int ret = vfs_lstat(A(filename), &s);
-	if (!ret)
-		return putstat64(A(statbuf), &s);
-	return ret;
-}
-
-asmlinkage int solaris_fstat(unsigned int fd, u32 statbuf)
-{
-	struct kstat s;
-	int ret = vfs_fstat(fd, &s);
-	if (!ret)
-		return putstat(A(statbuf), &s);
-	return ret;
-}
-
-asmlinkage int solaris_fxstat(int vers, u32 fd, u32 statbuf)
-{
-	return solaris_fstat(fd, statbuf);
-}
-
-asmlinkage int solaris_fstat64(unsigned int fd, u32 statbuf)
-{
-	struct kstat s;
-	int ret = vfs_fstat(fd, &s);
-	if (!ret)
-		return putstat64(A(statbuf), &s);
-	return ret;
-}
-
-asmlinkage int solaris_mknod(u32 path, u32 mode, s32 dev)
-{
-	int (*sys_mknod)(const char __user *,int,unsigned) = 
-		(int (*)(const char __user *,int,unsigned))SYS(mknod);
-	int major = sysv_major(dev);
-	int minor = sysv_minor(dev);
-
-	/* minor is guaranteed to be OK for MKDEV, major might be not */
-	if (major > 0xfff)
-		return -EINVAL;
-	return sys_mknod(A(path), mode, new_encode_dev(MKDEV(major,minor)));
-}
-
-asmlinkage int solaris_xmknod(int vers, u32 path, u32 mode, s32 dev)
-{
-	return solaris_mknod(path, mode, dev);
-}
-
-asmlinkage int solaris_getdents64(unsigned int fd, void __user *dirent, unsigned int count)
-{
-	int (*sys_getdents)(unsigned int, void __user *, unsigned int) =
-		(int (*)(unsigned int, void __user *, unsigned int))SYS(getdents);
-		
-	return sys_getdents(fd, dirent, count);
-}
-
-/* This statfs thingie probably will go in the near future, but... */
-
-struct sol_statfs {
-	short	f_type;
-	s32	f_bsize;
-	s32	f_frsize;
-	s32	f_blocks;
-	s32	f_bfree;
-	u32	f_files;
-	u32	f_ffree;
-	char	f_fname[6];
-	char	f_fpack[6];
-};
-
-asmlinkage int solaris_statfs(u32 path, u32 buf, int len, int fstype)
-{
-	int ret;
-	struct statfs s;
-	mm_segment_t old_fs = get_fs();
-	int (*sys_statfs)(const char __user *,struct statfs __user *) = 
-		(int (*)(const char __user *,struct statfs __user *))SYS(statfs);
-	struct sol_statfs __user *ss = A(buf);
-	
-	if (len != sizeof(struct sol_statfs)) return -EINVAL;
-	if (!fstype) {
-		/* FIXME: mixing userland and kernel pointers */
-		set_fs (KERNEL_DS);
-		ret = sys_statfs(A(path), &s);
-		set_fs (old_fs);
-		if (!ret) {
-			if (put_user (s.f_type, &ss->f_type)		||
-			    __put_user (s.f_bsize, &ss->f_bsize)	||
-			    __put_user (0, &ss->f_frsize)		||
-			    __put_user (s.f_blocks, &ss->f_blocks)	||
-			    __put_user (s.f_bfree, &ss->f_bfree)	||
-			    __put_user (s.f_files, &ss->f_files)	||
-			    __put_user (s.f_ffree, &ss->f_ffree)	||
-			    __clear_user (&ss->f_fname, 12))
-				return -EFAULT;
-		}
-		return ret;
-	}
-/* Linux can't stat unmounted filesystems so we
- * simply lie and claim 100MB of 1GB is free. Sorry.
- */
-	if (put_user (fstype, &ss->f_type)		||
-	    __put_user (1024, &ss->f_bsize)		||
-	    __put_user (0, &ss->f_frsize)		||
-	    __put_user (1024*1024, &ss->f_blocks)	||
-	    __put_user (100*1024, &ss->f_bfree)		||
-	    __put_user (60000, &ss->f_files)		||
-	    __put_user (50000, &ss->f_ffree)		||
-	    __clear_user (&ss->f_fname, 12))
-		return -EFAULT;
-	return 0;
-}
-
-asmlinkage int solaris_fstatfs(u32 fd, u32 buf, int len, int fstype)
-{
-	int ret;
-	struct statfs s;
-	mm_segment_t old_fs = get_fs();
-	int (*sys_fstatfs)(unsigned,struct statfs __user *) = 
-		(int (*)(unsigned,struct statfs __user *))SYS(fstatfs);
-	struct sol_statfs __user *ss = A(buf);
-	
-	if (len != sizeof(struct sol_statfs)) return -EINVAL;
-	if (!fstype) {
-		set_fs (KERNEL_DS);
-		ret = sys_fstatfs(fd, &s);
-		set_fs (old_fs);
-		if (!ret) {
-			if (put_user (s.f_type, &ss->f_type)		||
-			    __put_user (s.f_bsize, &ss->f_bsize)	||
-			    __put_user (0, &ss->f_frsize)		||
-			    __put_user (s.f_blocks, &ss->f_blocks)	||
-			    __put_user (s.f_bfree, &ss->f_bfree)	||
-			    __put_user (s.f_files, &ss->f_files)	||
-			    __put_user (s.f_ffree, &ss->f_ffree)	||
-			    __clear_user (&ss->f_fname, 12))
-				return -EFAULT;
-		}
-		return ret;
-	}
-	/* Otherwise fstatfs is the same as statfs */
-	return solaris_statfs(0, buf, len, fstype);
-}
-
-struct sol_statvfs {
-	u32	f_bsize;
-	u32	f_frsize;
-	u32	f_blocks;
-	u32	f_bfree;
-	u32	f_bavail;
-	u32	f_files;
-	u32	f_ffree;
-	u32	f_favail;
-	u32	f_fsid;
-	char	f_basetype[16];
-	u32	f_flag;
-	u32	f_namemax;
-	char	f_fstr[32];
-	u32	f_filler[16];
-};
-
-struct sol_statvfs64 {
-	u32	f_bsize;
-	u32	f_frsize;
-	u64	f_blocks;
-	u64	f_bfree;
-	u64	f_bavail;
-	u64	f_files;
-	u64	f_ffree;
-	u64	f_favail;
-	u32	f_fsid;
-	char	f_basetype[16];
-	u32	f_flag;
-	u32	f_namemax;
-	char	f_fstr[32];
-	u32	f_filler[16];
-};
-
-static int report_statvfs(struct vfsmount *mnt, struct inode *inode, u32 buf)
-{
-	struct kstatfs s;
-	int error;
-	struct sol_statvfs __user *ss = A(buf);
-
-	error = vfs_statfs(mnt->mnt_root, &s);
-	if (!error) {
-		const char *p = mnt->mnt_sb->s_type->name;
-		int i = 0;
-		int j = strlen (p);
-		
-		if (j > 15) j = 15;
-		if (IS_RDONLY(inode)) i = 1;
-		if (mnt->mnt_flags & MNT_NOSUID) i |= 2;
-		if (!sysv_valid_dev(inode->i_sb->s_dev))
-			return -EOVERFLOW;
-		if (put_user (s.f_bsize, &ss->f_bsize)		||
-		    __put_user (0, &ss->f_frsize)		||
-		    __put_user (s.f_blocks, &ss->f_blocks)	||
-		    __put_user (s.f_bfree, &ss->f_bfree)	||
-		    __put_user (s.f_bavail, &ss->f_bavail)	||
-		    __put_user (s.f_files, &ss->f_files)	||
-		    __put_user (s.f_ffree, &ss->f_ffree)	||
-		    __put_user (s.f_ffree, &ss->f_favail)	||
-		    __put_user (sysv_encode_dev(inode->i_sb->s_dev), &ss->f_fsid) ||
-		    __copy_to_user (ss->f_basetype,p,j)		||
-		    __put_user (0, (char __user *)&ss->f_basetype[j])	||
-		    __put_user (s.f_namelen, &ss->f_namemax)	||
-		    __put_user (i, &ss->f_flag)			||		    
-		    __clear_user (&ss->f_fstr, 32))
-			return -EFAULT;
-	}
-	return error;
-}
-
-static int report_statvfs64(struct vfsmount *mnt, struct inode *inode, u32 buf)
-{
-	struct kstatfs s;
-	int error;
-	struct sol_statvfs64 __user *ss = A(buf);
-			
-	error = vfs_statfs(mnt->mnt_root, &s);
-	if (!error) {
-		const char *p = mnt->mnt_sb->s_type->name;
-		int i = 0;
-		int j = strlen (p);
-		
-		if (j > 15) j = 15;
-		if (IS_RDONLY(inode)) i = 1;
-		if (mnt->mnt_flags & MNT_NOSUID) i |= 2;
-		if (!sysv_valid_dev(inode->i_sb->s_dev))
-			return -EOVERFLOW;
-		if (put_user (s.f_bsize, &ss->f_bsize)		||
-		    __put_user (0, &ss->f_frsize)		||
-		    __put_user (s.f_blocks, &ss->f_blocks)	||
-		    __put_user (s.f_bfree, &ss->f_bfree)	||
-		    __put_user (s.f_bavail, &ss->f_bavail)	||
-		    __put_user (s.f_files, &ss->f_files)	||
-		    __put_user (s.f_ffree, &ss->f_ffree)	||
-		    __put_user (s.f_ffree, &ss->f_favail)	||
-		    __put_user (sysv_encode_dev(inode->i_sb->s_dev), &ss->f_fsid) ||
-		    __copy_to_user (ss->f_basetype,p,j)		||
-		    __put_user (0, (char __user *)&ss->f_basetype[j])	||
-		    __put_user (s.f_namelen, &ss->f_namemax)	||
-		    __put_user (i, &ss->f_flag)			||		    
-		    __clear_user (&ss->f_fstr, 32))
-			return -EFAULT;
-	}
-	return error;
-}
-
-asmlinkage int solaris_statvfs(u32 path, u32 buf)
-{
-	struct nameidata nd;
-	int error;
-
-	error = user_path_walk(A(path),&nd);
-	if (!error) {
-		struct inode *inode = nd.path.dentry->d_inode;
-		error = report_statvfs(nd.path.mnt, inode, buf);
-		path_put(&nd.path);
-	}
-	return error;
-}
-
-asmlinkage int solaris_fstatvfs(unsigned int fd, u32 buf)
-{
-	struct file * file;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (file) {
-		error = report_statvfs(file->f_path.mnt, file->f_path.dentry->d_inode, buf);
-		fput(file);
-	}
-
-	return error;
-}
-
-asmlinkage int solaris_statvfs64(u32 path, u32 buf)
-{
-	struct nameidata nd;
-	int error;
-
-	lock_kernel();
-	error = user_path_walk(A(path), &nd);
-	if (!error) {
-		struct inode *inode = nd.path.dentry->d_inode;
-		error = report_statvfs64(nd.path.mnt, inode, buf);
-		path_put(&nd.path);
-	}
-	unlock_kernel();
-	return error;
-}
-
-asmlinkage int solaris_fstatvfs64(unsigned int fd, u32 buf)
-{
-	struct file * file;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (file) {
-		lock_kernel();
-		error = report_statvfs64(file->f_path.mnt, file->f_path.dentry->d_inode, buf);
-		unlock_kernel();
-		fput(file);
-	}
-	return error;
-}
-
-extern asmlinkage long sparc32_open(const char * filename, int flags, int mode);
-
-asmlinkage int solaris_open(u32 fname, int flags, u32 mode)
-{
-	const char *filename = (const char *)(long)fname;
-	int fl = flags & 0xf;
-
-	/* Translate flags first. */
-	if (flags & 0x2000) fl |= O_LARGEFILE;
-	if (flags & 0x8050) fl |= O_SYNC;
-	if (flags & 0x80) fl |= O_NONBLOCK;
-	if (flags & 0x100) fl |= O_CREAT;
-	if (flags & 0x200) fl |= O_TRUNC;
-	if (flags & 0x400) fl |= O_EXCL;
-	if (flags & 0x800) fl |= O_NOCTTY;
-	flags = fl;
-
-	return sparc32_open(filename, flags, mode);
-}
-
-#define SOL_F_SETLK	6
-#define SOL_F_SETLKW	7
-#define SOL_F_FREESP    11
-#define SOL_F_ISSTREAM  13
-#define SOL_F_GETLK     14
-#define SOL_F_PRIV      15
-#define SOL_F_NPRIV     16
-#define SOL_F_QUOTACTL  17
-#define SOL_F_BLOCKS    18
-#define SOL_F_BLKSIZE   19
-#define SOL_F_GETOWN    23
-#define SOL_F_SETOWN    24
-
-struct sol_flock {
-	short	l_type;
-	short	l_whence;
-	u32	l_start;
-	u32	l_len;
-	s32	l_sysid;
-	s32	l_pid;
-	s32	l_pad[4];
-};
-
-asmlinkage int solaris_fcntl(unsigned fd, unsigned cmd, u32 arg)
-{
-	int (*sys_fcntl)(unsigned,unsigned,unsigned long) = 
-		(int (*)(unsigned,unsigned,unsigned long))SYS(fcntl);
-	int ret, flags;
-
-	switch (cmd) {
-	case F_DUPFD:
-	case F_GETFD:
-	case F_SETFD: return sys_fcntl(fd, cmd, (unsigned long)arg);
-	case F_GETFL:
-		flags = sys_fcntl(fd, cmd, 0);
-		ret = flags & 0xf;
-		if (flags & O_SYNC) ret |= 0x8050;
-		if (flags & O_NONBLOCK) ret |= 0x80;
-		return ret;
-	case F_SETFL:
-		flags = arg & 0xf;
-		if (arg & 0x8050) flags |= O_SYNC;
-		if (arg & 0x80) flags |= O_NONBLOCK;
-		return sys_fcntl(fd, cmd, (long)flags);
-	case SOL_F_GETLK:
-	case SOL_F_SETLK:
-	case SOL_F_SETLKW:
-		{
-			struct flock f;
-			struct sol_flock __user *p = A(arg);
-			mm_segment_t old_fs = get_fs();
-
-			switch (cmd) {
-			case SOL_F_GETLK: cmd = F_GETLK; break;
-			case SOL_F_SETLK: cmd = F_SETLK; break;
-			case SOL_F_SETLKW: cmd = F_SETLKW; break;
-			}
-
-			if (get_user (f.l_type, &p->l_type) ||
-			    __get_user (f.l_whence, &p->l_whence) ||
-			    __get_user (f.l_start, &p->l_start) ||
-			    __get_user (f.l_len, &p->l_len) ||
-			    __get_user (f.l_pid, &p->l_sysid))
-				return -EFAULT;
-
-			set_fs(KERNEL_DS);
-			ret = sys_fcntl(fd, cmd, (unsigned long)&f);
-			set_fs(old_fs);
-
-			if (__put_user (f.l_type, &p->l_type) ||
-			    __put_user (f.l_whence, &p->l_whence) ||
-			    __put_user (f.l_start, &p->l_start) ||
-			    __put_user (f.l_len, &p->l_len) ||
-			    __put_user (f.l_pid, &p->l_pid) ||
-			    __put_user (0, &p->l_sysid))
-				return -EFAULT;
-
-			return ret;
-		}
-	case SOL_F_FREESP:
-	        { 
-		    int length;
-		    int (*sys_newftruncate)(unsigned int, unsigned long)=
-			    (int (*)(unsigned int, unsigned long))SYS(ftruncate);
-
-		    if (get_user(length, &((struct sol_flock __user *)A(arg))->l_start))
-			    return -EFAULT;
-
-		    return sys_newftruncate(fd, length);
-		}
-	};
-	return -EINVAL;
-}
-
-asmlinkage int solaris_ulimit(int cmd, int val)
-{
-	switch (cmd) {
-	case 1: /* UL_GETFSIZE - in 512B chunks */
-		return current->signal->rlim[RLIMIT_FSIZE].rlim_cur >> 9;
-	case 2: /* UL_SETFSIZE */
-		if ((unsigned long)val > (LONG_MAX>>9)) return -ERANGE;
-		val <<= 9;
-		task_lock(current->group_leader);
-		if (val > current->signal->rlim[RLIMIT_FSIZE].rlim_max) {
-			if (!capable(CAP_SYS_RESOURCE)) {
-				task_unlock(current->group_leader);
-				return -EPERM;
-			}
-			current->signal->rlim[RLIMIT_FSIZE].rlim_max = val;
-		}
-		current->signal->rlim[RLIMIT_FSIZE].rlim_cur = val;
-		task_unlock(current->group_leader);
-		return 0;
-	case 3: /* UL_GMEMLIM */
-		return current->signal->rlim[RLIMIT_DATA].rlim_cur;
-	case 4: /* UL_GDESLIM */
-		return sysctl_nr_open;
-	}
-	return -EINVAL;
-}
-
-/* At least at the time I'm writing this, Linux doesn't have ACLs, so we
-   just fake this */
-asmlinkage int solaris_acl(u32 filename, int cmd, int nentries, u32 aclbufp)
-{
-	return -ENOSYS;
-}
-
-asmlinkage int solaris_facl(unsigned int fd, int cmd, int nentries, u32 aclbufp)
-{
-	return -ENOSYS;
-}
-
-asmlinkage int solaris_pread(unsigned int fd, char __user *buf, u32 count, u32 pos)
-{
-	ssize_t (*sys_pread64)(unsigned int, char __user *, size_t, loff_t) =
-		(ssize_t (*)(unsigned int, char __user *, size_t, loff_t))SYS(pread64);
-
-	return sys_pread64(fd, buf, count, (loff_t)pos);
-}
-
-asmlinkage int solaris_pwrite(unsigned int fd, char __user *buf, u32 count, u32 pos)
-{
-	ssize_t (*sys_pwrite64)(unsigned int, char __user *, size_t, loff_t) =
-		(ssize_t (*)(unsigned int, char __user *, size_t, loff_t))SYS(pwrite64);
-
-	return sys_pwrite64(fd, buf, count, (loff_t)pos);
-}
-
-/* POSIX.1 names */
-#define _PC_LINK_MAX    1
-#define _PC_MAX_CANON   2
-#define _PC_MAX_INPUT   3
-#define _PC_NAME_MAX    4
-#define _PC_PATH_MAX    5
-#define _PC_PIPE_BUF    6
-#define _PC_NO_TRUNC    7
-#define _PC_VDISABLE    8
-#define _PC_CHOWN_RESTRICTED    9
-/* POSIX.4 names */
-#define _PC_ASYNC_IO    10
-#define _PC_PRIO_IO     11
-#define _PC_SYNC_IO     12
-#define _PC_LAST        12
-
-/* This is not a real and complete implementation yet, just to keep
- * the easy Solaris binaries happy.
- */
-asmlinkage int solaris_fpathconf(int fd, int name)
-{
-	int ret;
-
-	switch(name) {
-	case _PC_LINK_MAX:
-		ret = LINK_MAX;
-		break;
-	case _PC_MAX_CANON:
-		ret = MAX_CANON;
-		break;
-	case _PC_MAX_INPUT:
-		ret = MAX_INPUT;
-		break;
-	case _PC_NAME_MAX:
-		ret = NAME_MAX;
-		break;
-	case _PC_PATH_MAX:
-		ret = PATH_MAX;
-		break;
-	case _PC_PIPE_BUF:
-		ret = PIPE_BUF;
-		break;
-	case _PC_CHOWN_RESTRICTED:
-		ret = 1;
-		break;
-	case _PC_NO_TRUNC:
-	case _PC_VDISABLE:
-		ret = 0;
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-	return ret;
-}
-
-asmlinkage int solaris_pathconf(u32 path, int name)
-{
-	return solaris_fpathconf(0, name);
-}
-
-/* solaris_llseek returns long long - quite difficult */
-asmlinkage long solaris_llseek(struct pt_regs *regs, u32 off_hi, u32 off_lo, int whence)
-{
-	int (*sys_llseek)(unsigned int, unsigned long, unsigned long, loff_t __user *, unsigned int) =
-		(int (*)(unsigned int, unsigned long, unsigned long, loff_t __user *, unsigned int))SYS(_llseek);
-	int ret;
-	mm_segment_t old_fs = get_fs();
-	loff_t retval;
-	
-	set_fs(KERNEL_DS);
-	ret = sys_llseek((unsigned int)regs->u_regs[UREG_I0], off_hi, off_lo, &retval, whence);
-	set_fs(old_fs);
-	if (ret < 0) return ret;
-	regs->u_regs[UREG_I1] = (u32)retval;
-	return (retval >> 32);
-}
-
-/* Have to mask out all but lower 3 bits */
-asmlinkage int solaris_access(u32 filename, long mode)
-{
-	int (*sys_access)(const char __user *, int) = 
-		(int (*)(const char __user *, int))SYS(access);
-		
-	return sys_access(A(filename), mode & 7);
-}
diff --git a/arch/sparc64/solaris/ioctl.c b/arch/sparc64/solaris/ioctl.c
deleted file mode 100644
index 8ad10a6d993b..000000000000
--- a/arch/sparc64/solaris/ioctl.c
+++ /dev/null
@@ -1,825 +0,0 @@
-/* $Id: ioctl.c,v 1.17 2002/02/08 03:57:14 davem Exp $
- * ioctl.c: Solaris ioctl emulation.
- *
- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 1997,1998 Patrik Rak (prak3264@ss1000.ms.mff.cuni.cz)
- *
- * Streams & timod emulation based on code
- * Copyright (C) 1995, 1996 Mike Jagdis (jaggy@purplet.demon.co.uk)
- *
- * 1999-08-19 Implemented solaris 'm' (mag tape) and
- *            'O' (openprom) ioctls, by Jason Rappleye
- *             (rappleye@ccr.buffalo.edu)
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/smp_lock.h>
-#include <linux/syscalls.h>
-#include <linux/ioctl.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/netdevice.h>
-#include <linux/mtio.h>
-#include <linux/time.h>
-#include <linux/rcupdate.h>
-#include <linux/compat.h>
-
-#include <net/sock.h>
-#include <net/net_namespace.h>
-
-#include <asm/uaccess.h>
-#include <asm/termios.h>
-#include <asm/openpromio.h>
-
-#include "conv.h"
-#include "socksys.h"
-
-extern asmlinkage int compat_sys_ioctl(unsigned int fd, unsigned int cmd,
-	u32 arg);
-asmlinkage int solaris_ioctl(unsigned int fd, unsigned int cmd, u32 arg);
-
-extern int timod_putmsg(unsigned int fd, char __user *ctl_buf, int ctl_len,
-			char __user *data_buf, int data_len, int flags);
-extern int timod_getmsg(unsigned int fd, char __user *ctl_buf, int ctl_maxlen, int __user *ctl_len,
-			char __user *data_buf, int data_maxlen, int __user *data_len, int *flags);
-
-/* termio* stuff {{{ */
-
-struct solaris_termios {
-	u32	c_iflag;
-	u32	c_oflag;
-	u32	c_cflag;
-	u32	c_lflag;
-	u8	c_cc[19];
-};
-
-struct solaris_termio {
-	u16	c_iflag;
-	u16	c_oflag;
-	u16	c_cflag;
-	u16	c_lflag;
-	s8	c_line;
-	u8	c_cc[8];
-};
-
-struct solaris_termiox {
-	u16	x_hflag;
-	u16	x_cflag;
-	u16	x_rflag[5];
-	u16	x_sflag;
-};
-
-static u32 solaris_to_linux_cflag(u32 cflag)
-{
-	cflag &= 0x7fdff000;
-	if (cflag & 0x200000) {
-		int baud = cflag & 0xf;
-		cflag &= ~0x20000f;
-		switch (baud) {
-		case 0: baud = B57600; break;
-		case 1: baud = B76800; break;
-		case 2: baud = B115200; break;
-		case 3: baud = B153600; break;
-		case 4: baud = B230400; break;
-		case 5: baud = B307200; break;
-		case 6: baud = B460800; break;
-		}
-		cflag |= CBAUDEX | baud;
-	}
-	return cflag;
-}
-
-static u32 linux_to_solaris_cflag(u32 cflag)
-{
-	cflag &= ~(CMSPAR | CIBAUD);
-	if (cflag & CBAUDEX) {
-		int baud = cflag & CBAUD;
-		cflag &= ~CBAUD;
-		switch (baud) {
-		case B57600: baud = 0; break;
-		case B76800: baud = 1; break;
-		case B115200: baud = 2; break;
-		case B153600: baud = 3; break;
-		case B230400: baud = 4; break;
-		case B307200: baud = 5; break;
-		case B460800: baud = 6; break;
-		case B614400: baud = 7; break;
-		case B921600: baud = 8; break;
-#if 0		
-		case B1843200: baud = 9; break;
-#endif
-		}
-		cflag |= 0x200000 | baud;
-	}
-	return cflag;
-}
-
-static inline int linux_to_solaris_termio(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	struct solaris_termio __user *p = A(arg);
-	int ret;
-	
-	ret = sys_ioctl(fd, cmd, (unsigned long)p);
-	if (!ret) {
-		u32 cflag;
-		
-		if (__get_user (cflag, &p->c_cflag))
-			return -EFAULT;
-		cflag = linux_to_solaris_cflag(cflag);
-		if (__put_user (cflag, &p->c_cflag))
-			return -EFAULT;
-	}
-	return ret;
-}
-
-static int solaris_to_linux_termio(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	int ret;
-	struct solaris_termio s;
-	mm_segment_t old_fs = get_fs();
-	
-	if (copy_from_user (&s, (struct solaris_termio __user *)A(arg), sizeof(struct solaris_termio)))
-		return -EFAULT;
-	s.c_cflag = solaris_to_linux_cflag(s.c_cflag);
-	set_fs(KERNEL_DS);
-	ret = sys_ioctl(fd, cmd, (unsigned long)&s);
-	set_fs(old_fs);
-	return ret;
-}
-
-static inline int linux_to_solaris_termios(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	int ret;
-	struct solaris_termios s;
-	mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);	
-	ret = sys_ioctl(fd, cmd, (unsigned long)&s);
-	set_fs(old_fs);
-	if (!ret) {
-		struct solaris_termios __user *p = A(arg);
-		if (put_user (s.c_iflag, &p->c_iflag) ||
-		    __put_user (s.c_oflag, &p->c_oflag) ||
-		    __put_user (linux_to_solaris_cflag(s.c_cflag), &p->c_cflag) ||
-		    __put_user (s.c_lflag, &p->c_lflag) ||
-		    __copy_to_user (p->c_cc, s.c_cc, 16) ||
-		    __clear_user (p->c_cc + 16, 2))
-			return -EFAULT;
-	}
-	return ret;
-}
-
-static int solaris_to_linux_termios(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	int ret;
-	struct solaris_termios s;
-	struct solaris_termios __user *p = A(arg);
-	mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret = sys_ioctl(fd, TCGETS, (unsigned long)&s);
-	set_fs(old_fs);
-	if (ret) return ret;
-	if (put_user (s.c_iflag, &p->c_iflag) ||
-	    __put_user (s.c_oflag, &p->c_oflag) ||
-	    __put_user (s.c_cflag, &p->c_cflag) ||
-	    __put_user (s.c_lflag, &p->c_lflag) ||
-	    __copy_from_user (s.c_cc, p->c_cc, 16))
-		return -EFAULT;
-	s.c_cflag = solaris_to_linux_cflag(s.c_cflag);
-	set_fs(KERNEL_DS);
-	ret = sys_ioctl(fd, cmd, (unsigned long)&s);
-	set_fs(old_fs);
-	return ret;
-}
-
-static inline int solaris_T(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	switch (cmd & 0xff) {
-	case 1: /* TCGETA */
-		return linux_to_solaris_termio(fd, TCGETA, arg);
-	case 2: /* TCSETA */
-		return solaris_to_linux_termio(fd, TCSETA, arg);
-	case 3: /* TCSETAW */
-		return solaris_to_linux_termio(fd, TCSETAW, arg);
-	case 4: /* TCSETAF */
-		return solaris_to_linux_termio(fd, TCSETAF, arg);
-	case 5: /* TCSBRK */
-		return sys_ioctl(fd, TCSBRK, arg);
-	case 6: /* TCXONC */
-		return sys_ioctl(fd, TCXONC, arg);
-	case 7: /* TCFLSH */
-		return sys_ioctl(fd, TCFLSH, arg);
-	case 13: /* TCGETS */
-		return linux_to_solaris_termios(fd, TCGETS, arg);
-	case 14: /* TCSETS */
-		return solaris_to_linux_termios(fd, TCSETS, arg);
-	case 15: /* TCSETSW */
-		return solaris_to_linux_termios(fd, TCSETSW, arg);
-	case 16: /* TCSETSF */
-		return solaris_to_linux_termios(fd, TCSETSF, arg);
-	case 103: /* TIOCSWINSZ */
-		return sys_ioctl(fd, TIOCSWINSZ, arg);
-	case 104: /* TIOCGWINSZ */
-		return sys_ioctl(fd, TIOCGWINSZ, arg);
-	}
-	return -ENOSYS;
-}
-
-static inline int solaris_t(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	switch (cmd & 0xff) {
-	case 20: /* TIOCGPGRP */
-		return sys_ioctl(fd, TIOCGPGRP, arg);
-	case 21: /* TIOCSPGRP */
-		return sys_ioctl(fd, TIOCSPGRP, arg);
-	}
-	return -ENOSYS;
-}
-
-/* }}} */
-
-/* A pseudo STREAMS support {{{ */
-
-struct strioctl {
-	int cmd, timeout, len;
-	u32 data;
-};
-
-struct solaris_si_sockparams {
-	int sp_family;
-	int sp_type;
-	int sp_protocol;
-};
-
-struct solaris_o_si_udata {
-	int tidusize;
-	int addrsize;
-	int optsize;
-	int etsdusize;
-	int servtype;
-	int so_state;
-	int so_options;
-	int tsdusize;
-};
-
-struct solaris_si_udata {
-	int tidusize;
-	int addrsize;
-	int optsize;
-	int etsdusize;
-	int servtype;
-	int so_state;
-	int so_options;
-	int tsdusize;
-	struct solaris_si_sockparams sockparams;
-};
-
-#define SOLARIS_MODULE_TIMOD    0
-#define SOLARIS_MODULE_SOCKMOD  1
-#define SOLARIS_MODULE_MAX      2
-
-static struct module_info {
-        const char *name;
-        /* can be expanded further if needed */
-} module_table[ SOLARIS_MODULE_MAX + 1 ] = {
-        /* the ordering here must match the module numbers above! */
-        { "timod" },
-        { "sockmod" },
-        { NULL }
-};
-
-static inline int solaris_sockmod(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	struct inode *ino;
-	struct fdtable *fdt;
-	/* I wonder which of these tests are superfluous... --patrik */
-	rcu_read_lock();
-	fdt = files_fdtable(current->files);
-	if (! fdt->fd[fd] ||
-	    ! fdt->fd[fd]->f_path.dentry ||
-	    ! (ino = fdt->fd[fd]->f_path.dentry->d_inode) ||
-	    ! S_ISSOCK(ino->i_mode)) {
-		rcu_read_unlock();
-		return TBADF;
-	}
-	rcu_read_unlock();
-	
-	switch (cmd & 0xff) {
-	case 109: /* SI_SOCKPARAMS */
-	{
-		struct solaris_si_sockparams si;
-		if (copy_from_user (&si, A(arg), sizeof(si)))
-			return (EFAULT << 8) | TSYSERR;
-
-		/* Should we modify socket ino->socket_i.ops and type? */
-		return 0;
-	}
-	case 110: /* SI_GETUDATA */
-	{
-		int etsdusize, servtype;
-		struct solaris_si_udata __user *p = A(arg);
-		switch (SOCKET_I(ino)->type) {
-		case SOCK_STREAM:
-			etsdusize = 1;
-			servtype = 2;
-			break;
-		default:
-			etsdusize = -2;
-			servtype = 3;
-			break;
-		}
-		if (put_user(16384, &p->tidusize) ||
-		    __put_user(sizeof(struct sockaddr), &p->addrsize) ||
-		    __put_user(-1, &p->optsize) ||
-		    __put_user(etsdusize, &p->etsdusize) ||
-		    __put_user(servtype, &p->servtype) ||
-		    __put_user(0, &p->so_state) ||
-		    __put_user(0, &p->so_options) ||
-		    __put_user(16384, &p->tsdusize) ||
-		    __put_user(SOCKET_I(ino)->ops->family, &p->sockparams.sp_family) ||
-		    __put_user(SOCKET_I(ino)->type, &p->sockparams.sp_type) ||
-		    __put_user(SOCKET_I(ino)->ops->family, &p->sockparams.sp_protocol))
-			return (EFAULT << 8) | TSYSERR;
-		return 0;
-	}
-	case 101: /* O_SI_GETUDATA */
-	{
-		int etsdusize, servtype;
-		struct solaris_o_si_udata __user *p = A(arg);
-		switch (SOCKET_I(ino)->type) {
-		case SOCK_STREAM:
-			etsdusize = 1;
-			servtype = 2;
-			break;
-		default:
-			etsdusize = -2;
-			servtype = 3;
-			break;
-		}
-		if (put_user(16384, &p->tidusize) ||
-		    __put_user(sizeof(struct sockaddr), &p->addrsize) ||
-		    __put_user(-1, &p->optsize) ||
-		    __put_user(etsdusize, &p->etsdusize) ||
-		    __put_user(servtype, &p->servtype) ||
-		    __put_user(0, &p->so_state) ||
-		    __put_user(0, &p->so_options) ||
-		    __put_user(16384, &p->tsdusize))
-			return (EFAULT << 8) | TSYSERR;
-		return 0;
-	}
-	case 102: /* SI_SHUTDOWN */
-	case 103: /* SI_LISTEN */
-	case 104: /* SI_SETMYNAME */
-	case 105: /* SI_SETPEERNAME */
-	case 106: /* SI_GETINTRANSIT */
-	case 107: /* SI_TCL_LINK */
-	case 108: /* SI_TCL_UNLINK */
-		;
-	}
-	return TNOTSUPPORT;
-}
-
-static inline int solaris_timod(unsigned int fd, unsigned int cmd, u32 arg,
-                                    int len, int __user *len_p)
-{
-	int ret;
-		
-	switch (cmd & 0xff) {
-	case 141: /* TI_OPTMGMT */
-	{
-		int i;
-		u32 prim;
-		SOLD("TI_OPMGMT entry");
-		ret = timod_putmsg(fd, A(arg), len, NULL, -1, 0);
-		SOLD("timod_putmsg() returned");
-		if (ret)
-			return (-ret << 8) | TSYSERR;
-		i = MSG_HIPRI;
-		SOLD("calling timod_getmsg()");
-		ret = timod_getmsg(fd, A(arg), len, len_p, NULL, -1, NULL, &i);
-		SOLD("timod_getmsg() returned");
-		if (ret)
-			return (-ret << 8) | TSYSERR;
-		SOLD("ret ok");
-		if (get_user(prim, (u32 __user *)A(arg)))
-			return (EFAULT << 8) | TSYSERR;
-		SOLD("got prim");
-		if (prim == T_ERROR_ACK) {
-			u32 tmp, tmp2;
-			SOLD("prim is T_ERROR_ACK");
-			if (get_user(tmp, (u32 __user *)A(arg)+3) ||
-			    get_user(tmp2, (u32 __user *)A(arg)+2))
-				return (EFAULT << 8) | TSYSERR;
-			return (tmp2 << 8) | tmp;
-		}
-		SOLD("TI_OPMGMT return 0");
-		return 0;
-	}
-	case 142: /* TI_BIND */
-	{
-		int i;
-		u32 prim;
-		SOLD("TI_BIND entry");
-		ret = timod_putmsg(fd, A(arg), len, NULL, -1, 0);
-		SOLD("timod_putmsg() returned");
-		if (ret)
-			return (-ret << 8) | TSYSERR;
-		len = 1024; /* Solaris allows arbitrary return size */
-		i = MSG_HIPRI;
-		SOLD("calling timod_getmsg()");
-		ret = timod_getmsg(fd, A(arg), len, len_p, NULL, -1, NULL, &i);
-		SOLD("timod_getmsg() returned");
-		if (ret)
-			return (-ret << 8) | TSYSERR;
-		SOLD("ret ok");
-		if (get_user(prim, (u32 __user *)A(arg)))
-			return (EFAULT << 8) | TSYSERR;
-		SOLD("got prim");
-		if (prim == T_ERROR_ACK) {
-			u32 tmp, tmp2;
-			SOLD("prim is T_ERROR_ACK");
-			if (get_user(tmp, (u32 __user *)A(arg)+3) ||
-			    get_user(tmp2, (u32 __user *)A(arg)+2))
-				return (EFAULT << 8) | TSYSERR;
-			return (tmp2 << 8) | tmp;
-		}
-		SOLD("no ERROR_ACK requested");
-		if (prim != T_OK_ACK)
-			return TBADSEQ;
-		SOLD("OK_ACK requested");
-		i = MSG_HIPRI;
-		SOLD("calling timod_getmsg()");
-		ret = timod_getmsg(fd, A(arg), len, len_p, NULL, -1, NULL, &i);
-		SOLD("timod_getmsg() returned");
-		if (ret)
-			return (-ret << 8) | TSYSERR;
-		SOLD("TI_BIND return ok");
-		return 0;
-	}
-	case 140: /* TI_GETINFO */
-	case 143: /* TI_UNBIND */
-	case 144: /* TI_GETMYNAME */
-	case 145: /* TI_GETPEERNAME */
-	case 146: /* TI_SETMYNAME */
-	case 147: /* TI_SETPEERNAME */
-		;
-	}
-	return TNOTSUPPORT;
-}
-
-static inline int solaris_S(struct file *filp, unsigned int fd, unsigned int cmd, u32 arg)
-{
-	char *p;
-	int ret;
-	mm_segment_t old_fs;
-	struct strioctl si;
-	struct inode *ino;
-        struct sol_socket_struct *sock;
-        struct module_info *mi;
-
-        ino = filp->f_path.dentry->d_inode;
-        if (!S_ISSOCK(ino->i_mode))
-		return -EBADF;
-        sock = filp->private_data;
-        if (! sock) {
-                printk("solaris_S: NULL private_data\n");
-                return -EBADF;
-        }
-        if (sock->magic != SOLARIS_SOCKET_MAGIC) {
-                printk("solaris_S: invalid magic\n");
-                return -EBADF;
-        }
-        
-
-	switch (cmd & 0xff) {
-	case 1: /* I_NREAD */
-		return -ENOSYS;
-	case 2: /* I_PUSH */
-        {
-		p = getname (A(arg));
-		if (IS_ERR (p))
-			return PTR_ERR(p);
-                ret = -EINVAL;
-                for (mi = module_table; mi->name; mi++) {
-                        if (strcmp(mi->name, p) == 0) {
-                                sol_module m;
-                                if (sock->modcount >= MAX_NR_STREAM_MODULES) {
-                                        ret = -ENXIO;
-                                        break;
-                                }
-                                m = (sol_module) (mi - module_table);
-                                sock->module[sock->modcount++] = m;
-                                ret = 0;
-                                break;
-                        }
-                }
-		putname (p);
-		return ret;
-        }
-	case 3: /* I_POP */
-                if (sock->modcount <= 0) return -EINVAL;
-                sock->modcount--;
-		return 0;
-        case 4: /* I_LOOK */
-        {
-        	const char *p;
-                if (sock->modcount <= 0) return -EINVAL;
-                p = module_table[(unsigned)sock->module[sock->modcount]].name;
-                if (copy_to_user (A(arg), p, strlen(p)))
-                	return -EFAULT;
-                return 0;
-        }
-	case 5: /* I_FLUSH */
-		return 0;
-	case 8: /* I_STR */
-		if (copy_from_user(&si, A(arg), sizeof(struct strioctl)))
-			return -EFAULT;
-                /* We ignore what module is actually at the top of stack. */
-		switch ((si.cmd >> 8) & 0xff) {
-		case 'I':
-                        return solaris_sockmod(fd, si.cmd, si.data);
-		case 'T':
-                        return solaris_timod(fd, si.cmd, si.data, si.len,
-				&((struct strioctl __user *)A(arg))->len);
-		default:
-			return solaris_ioctl(fd, si.cmd, si.data);
-		}
-	case 9: /* I_SETSIG */
-		return sys_ioctl(fd, FIOSETOWN, current->pid);
-	case 10: /* I_GETSIG */
-		old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		sys_ioctl(fd, FIOGETOWN, (unsigned long)&ret);
-		set_fs(old_fs);
-		if (ret == current->pid) return 0x3ff;
-		else return -EINVAL;
-	case 11: /* I_FIND */
-        {
-                int i;
-		p = getname (A(arg));
-		if (IS_ERR (p))
-			return PTR_ERR(p);
-                ret = 0;
-                for (i = 0; i < sock->modcount; i++) {
-                        unsigned m = sock->module[i];
-                        if (strcmp(module_table[m].name, p) == 0) {
-                                ret = 1;
-                                break;
-                        } 
-                }
-		putname (p);
-		return ret;
-        }
-	case 19: /* I_SWROPT */
-	case 32: /* I_SETCLTIME */
-		return 0;	/* Lie */
-	}
-	return -ENOSYS;
-}
-
-static inline int solaris_s(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	switch (cmd & 0xff) {
-	case 0: /* SIOCSHIWAT */
-	case 2: /* SIOCSLOWAT */
-		return 0; /* We don't support them */
-	case 1: /* SIOCGHIWAT */
-	case 3: /* SIOCGLOWAT */
-		if (put_user (0, (u32 __user *)A(arg)))
-			return -EFAULT;
-		return 0; /* Lie */
-	case 7: /* SIOCATMARK */
-		return sys_ioctl(fd, SIOCATMARK, arg);
-	case 8: /* SIOCSPGRP */
-		return sys_ioctl(fd, SIOCSPGRP, arg);
-	case 9: /* SIOCGPGRP */
-		return sys_ioctl(fd, SIOCGPGRP, arg);
-	}
-	return -ENOSYS;
-}
-
-static inline int solaris_r(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	switch (cmd & 0xff) {
-	case 10: /* SIOCADDRT */
-		return compat_sys_ioctl(fd, SIOCADDRT, arg);
-	case 11: /* SIOCDELRT */
-		return compat_sys_ioctl(fd, SIOCDELRT, arg);
-	}
-	return -ENOSYS;
-}
-
-static inline int solaris_i(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	switch (cmd & 0xff) {
-	case 12: /* SIOCSIFADDR */
-		return compat_sys_ioctl(fd, SIOCSIFADDR, arg);
-	case 13: /* SIOCGIFADDR */
-		return compat_sys_ioctl(fd, SIOCGIFADDR, arg);
-	case 14: /* SIOCSIFDSTADDR */
-		return compat_sys_ioctl(fd, SIOCSIFDSTADDR, arg);
-	case 15: /* SIOCGIFDSTADDR */
-		return compat_sys_ioctl(fd, SIOCGIFDSTADDR, arg);
-	case 16: /* SIOCSIFFLAGS */
-		return compat_sys_ioctl(fd, SIOCSIFFLAGS, arg);
-	case 17: /* SIOCGIFFLAGS */
-		return compat_sys_ioctl(fd, SIOCGIFFLAGS, arg);
-	case 18: /* SIOCSIFMEM */
-		return compat_sys_ioctl(fd, SIOCSIFMEM, arg);
-	case 19: /* SIOCGIFMEM */
-		return compat_sys_ioctl(fd, SIOCGIFMEM, arg);
-	case 20: /* SIOCGIFCONF */
-		return compat_sys_ioctl(fd, SIOCGIFCONF, arg);
-	case 21: /* SIOCSIFMTU */
-		return compat_sys_ioctl(fd, SIOCSIFMTU, arg);
-	case 22: /* SIOCGIFMTU */
-		return compat_sys_ioctl(fd, SIOCGIFMTU, arg);
-	case 23: /* SIOCGIFBRDADDR */
-		return compat_sys_ioctl(fd, SIOCGIFBRDADDR, arg);
-	case 24: /* SIOCSIFBRDADDR */
-		return compat_sys_ioctl(fd, SIOCSIFBRDADDR, arg);
-	case 25: /* SIOCGIFNETMASK */
-		return compat_sys_ioctl(fd, SIOCGIFNETMASK, arg);
-	case 26: /* SIOCSIFNETMASK */
-		return compat_sys_ioctl(fd, SIOCSIFNETMASK, arg);
-	case 27: /* SIOCGIFMETRIC */
-		return compat_sys_ioctl(fd, SIOCGIFMETRIC, arg);
-	case 28: /* SIOCSIFMETRIC */
-		return compat_sys_ioctl(fd, SIOCSIFMETRIC, arg);
-	case 30: /* SIOCSARP */
-		return compat_sys_ioctl(fd, SIOCSARP, arg);
-	case 31: /* SIOCGARP */
-		return compat_sys_ioctl(fd, SIOCGARP, arg);
-	case 32: /* SIOCDARP */
-		return compat_sys_ioctl(fd, SIOCDARP, arg);
-	case 52: /* SIOCGETNAME */
-	case 53: /* SIOCGETPEER */
-		{
-			struct sockaddr uaddr;
-			int uaddr_len = sizeof(struct sockaddr), ret;
-			long args[3];
-			mm_segment_t old_fs = get_fs();
-			int (*sys_socketcall)(int, unsigned long *) =
-				(int (*)(int, unsigned long *))SYS(socketcall);
-			
-			args[0] = fd; args[1] = (long)&uaddr; args[2] = (long)&uaddr_len;
-			set_fs(KERNEL_DS);
-			ret = sys_socketcall(((cmd & 0xff) == 52) ? SYS_GETSOCKNAME : SYS_GETPEERNAME,
-					args);
-			set_fs(old_fs);
-			if (ret >= 0) {
-				if (copy_to_user(A(arg), &uaddr, uaddr_len))
-					return -EFAULT;
-			}
-			return ret;
-		}
-#if 0		
-	case 86: /* SIOCSOCKSYS */
-		return socksys_syscall(fd, arg);
-#endif		
-	case 87: /* SIOCGIFNUM */
-		{
-			struct net_device *d;
-			int i = 0;
-			
-			read_lock_bh(&dev_base_lock);
-			for_each_netdev(&init_net, d)
-				i++;
-			read_unlock_bh(&dev_base_lock);
-
-			if (put_user (i, (int __user *)A(arg)))
-				return -EFAULT;
-			return 0;
-		}
-	}
-	return -ENOSYS;
-}
-
-static int solaris_m(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	int ret;
-
-	switch (cmd & 0xff) {
-	case 1: /* MTIOCTOP */
-		ret = sys_ioctl(fd, MTIOCTOP, (unsigned long)&arg);
-		break;
-	case 2: /* MTIOCGET */
-		ret = sys_ioctl(fd, MTIOCGET, (unsigned long)&arg);
-		break;
-	case 3: /* MTIOCGETDRIVETYPE */
-	case 4: /* MTIOCPERSISTENT */
-	case 5: /* MTIOCPERSISTENTSTATUS */
-	case 6: /* MTIOCLRERR */
-	case 7: /* MTIOCGUARANTEEDORDER */
-	case 8: /* MTIOCRESERVE */
-	case 9: /* MTIOCRELEASE */
-	case 10: /* MTIOCFORCERESERVE */
-	case 13: /* MTIOCSTATE */
-	case 14: /* MTIOCREADIGNOREILI */
-	case 15: /* MTIOCREADIGNOREEOFS */
-	case 16: /* MTIOCSHORTFMK */
-	default:
-		ret = -ENOSYS; /* linux doesn't support these */
-		break;
-	};
-
-	return ret;
-}
-
-static int solaris_O(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	int ret = -EINVAL;
-
-	switch (cmd & 0xff) {
-	case 1: /* OPROMGETOPT */
-		ret = sys_ioctl(fd, OPROMGETOPT, arg);
-		break;
-	case 2: /* OPROMSETOPT */
-		ret = sys_ioctl(fd, OPROMSETOPT, arg);
-		break;
-	case 3: /* OPROMNXTOPT */
-		ret = sys_ioctl(fd, OPROMNXTOPT, arg);
-		break;
-	case 4: /* OPROMSETOPT2 */
-		ret = sys_ioctl(fd, OPROMSETOPT2, arg);
-		break;
-	case 5: /* OPROMNEXT */
-		ret = sys_ioctl(fd, OPROMNEXT, arg);
-		break;
-	case 6: /* OPROMCHILD */
-		ret = sys_ioctl(fd, OPROMCHILD, arg);
-		break;
-	case 7: /* OPROMGETPROP */
-		ret = sys_ioctl(fd, OPROMGETPROP, arg);
-		break;
-	case 8: /* OPROMNXTPROP */
-		ret = sys_ioctl(fd, OPROMNXTPROP, arg);
-		break;
-	case 9: /* OPROMU2P */
-		ret = sys_ioctl(fd, OPROMU2P, arg);
-		break;
-	case 10: /* OPROMGETCONS */
-		ret = sys_ioctl(fd, OPROMGETCONS, arg);
-		break;
-	case 11: /* OPROMGETFBNAME */
-		ret = sys_ioctl(fd, OPROMGETFBNAME, arg);
-		break;
-	case 12: /* OPROMGETBOOTARGS */
-		ret = sys_ioctl(fd, OPROMGETBOOTARGS, arg);
-		break;
-	case 13: /* OPROMGETVERSION */
-	case 14: /* OPROMPATH2DRV */
-	case 15: /* OPROMDEV2PROMNAME */
-	case 16: /* OPROMPROM2DEVNAME */
-	case 17: /* OPROMGETPROPLEN */
-	default:
-		ret = -EINVAL;
-		break;
-	};
-	return ret;
-}
-
-/* }}} */
-
-asmlinkage int solaris_ioctl(unsigned int fd, unsigned int cmd, u32 arg)
-{
-	struct file *filp;
-	int error = -EBADF;
-
-	filp = fget(fd);
-	if (!filp)
-		goto out;
-
-	lock_kernel();
-	error = -EFAULT;
-	switch ((cmd >> 8) & 0xff) {
-	case 'S': error = solaris_S(filp, fd, cmd, arg); break;
-	case 'T': error = solaris_T(fd, cmd, arg); break;
-	case 'i': error = solaris_i(fd, cmd, arg); break;
-	case 'r': error = solaris_r(fd, cmd, arg); break;
-	case 's': error = solaris_s(fd, cmd, arg); break;
-	case 't': error = solaris_t(fd, cmd, arg); break;
-	case 'f': error = sys_ioctl(fd, cmd, arg); break;
-	case 'm': error = solaris_m(fd, cmd, arg); break;
-	case 'O': error = solaris_O(fd, cmd, arg); break;
-	default:
-		error = -ENOSYS;
-		break;
-	}
-	unlock_kernel();
-	fput(filp);
-out:
-	if (error == -ENOSYS) {
-		unsigned char c = cmd>>8;
-		
-		if (c < ' ' || c > 126) c = '.';
-		printk("solaris_ioctl: Unknown cmd fd(%d) cmd(%08x '%c') arg(%08x)\n",
-		       (int)fd, (unsigned int)cmd, c, (unsigned int)arg);
-		error = -EINVAL;
-	}
-	return error;
-}
diff --git a/arch/sparc64/solaris/ipc.c b/arch/sparc64/solaris/ipc.c
deleted file mode 100644
index 499135fa7060..000000000000
--- a/arch/sparc64/solaris/ipc.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/* $Id: ipc.c,v 1.5 1999/12/09 00:41:00 davem Exp $
- * ipc.c: Solaris IPC emulation
- *
- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/wait.h>
-#include <linux/mm.h>
-#include <linux/shm.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/ipc.h>
-
-#include <asm/uaccess.h>
-#include <asm/string.h>
-
-#include "conv.h"
-
-struct solaris_ipc_perm {
-	s32	uid;
-	s32	gid;
-	s32	cuid;
-	s32	cgid;
-	u32	mode;
-	u32	seq;
-	int	key;
-	s32	pad[4];
-};
-
-struct solaris_shmid_ds {
-	struct solaris_ipc_perm	shm_perm;
-	int			shm_segsz;
-	u32			shm_amp;
-	unsigned short		shm_lkcnt;
-	char			__padxx[2];
-	s32			shm_lpid;
-	s32			shm_cpid;
-	u32			shm_nattch;
-	u32			shm_cnattch;
-	s32			shm_atime;
-	s32			shm_pad1;
-	s32			shm_dtime;
-	s32			shm_pad2;
-	s32			shm_ctime;
-	s32			shm_pad3;
-	unsigned short		shm_cv;
-	char			shm_pad4[2];
-	u32			shm_sptas;
-	s32			shm_pad5[2];
-};
-
-asmlinkage long solaris_shmsys(int cmd, u32 arg1, u32 arg2, u32 arg3)
-{
-	int (*sys_ipc)(unsigned,int,int,unsigned long,void __user *,long) = 
-		(int (*)(unsigned,int,int,unsigned long,void __user *,long))SYS(ipc);
-	mm_segment_t old_fs;
-	unsigned long raddr;
-	int ret;
-		
-	switch (cmd) {
-	case 0: /* shmat */
-		old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		ret = sys_ipc(SHMAT, arg1, arg3 & ~0x4000, (unsigned long)&raddr, A(arg2), 0);
-		set_fs(old_fs);
-		if (ret >= 0) return (u32)raddr;
-		else return ret;
-	case 1: /* shmctl */
-		switch (arg2) {
-		case 3: /* SHM_LOCK */
-		case 4: /* SHM_UNLOCK */
-			return sys_ipc(SHMCTL, arg1, (arg2 == 3) ? SHM_LOCK : SHM_UNLOCK, 0, NULL, 0);
-		case 10: /* IPC_RMID */
-			return sys_ipc(SHMCTL, arg1, IPC_RMID, 0, NULL, 0);
-		case 11: /* IPC_SET */
-			{
-				struct shmid_ds s;
-				struct solaris_shmid_ds __user *p = A(arg3);
-				
-				if (get_user (s.shm_perm.uid, &p->shm_perm.uid) ||
-				    __get_user (s.shm_perm.gid, &p->shm_perm.gid) || 
-				    __get_user (s.shm_perm.mode, &p->shm_perm.mode))
-					return -EFAULT;
-				old_fs = get_fs();
-				set_fs(KERNEL_DS);
-				ret = sys_ipc(SHMCTL, arg1, IPC_SET, 0, &s, 0);
-				set_fs(old_fs);
-				return ret;
-			}
-		case 12: /* IPC_STAT */
-			{
-				struct shmid_ds s;
-				struct solaris_shmid_ds __user *p = A(arg3);
-				
-				old_fs = get_fs();
-				set_fs(KERNEL_DS);
-				ret = sys_ipc(SHMCTL, arg1, IPC_SET, 0, &s, 0);
-				set_fs(old_fs);
-				if (put_user (s.shm_perm.uid, &(p->shm_perm.uid)) ||
-				    __put_user (s.shm_perm.gid, &(p->shm_perm.gid)) || 
-				    __put_user (s.shm_perm.cuid, &(p->shm_perm.cuid)) ||
-				    __put_user (s.shm_perm.cgid, &(p->shm_perm.cgid)) || 
-				    __put_user (s.shm_perm.mode, &(p->shm_perm.mode)) ||
-				    __put_user (s.shm_perm.seq, &(p->shm_perm.seq)) ||
-				    __put_user (s.shm_perm.key, &(p->shm_perm.key)) ||
-				    __put_user (s.shm_segsz, &(p->shm_segsz)) ||
-				    __put_user (s.shm_lpid, &(p->shm_lpid)) ||
-				    __put_user (s.shm_cpid, &(p->shm_cpid)) ||
-				    __put_user (s.shm_nattch, &(p->shm_nattch)) ||
-				    __put_user (s.shm_atime, &(p->shm_atime)) ||
-				    __put_user (s.shm_dtime, &(p->shm_dtime)) ||
-				    __put_user (s.shm_ctime, &(p->shm_ctime)))
-					return -EFAULT;
-				return ret;
-			}
-		default: return -EINVAL;
-		}
-	case 2: /* shmdt */
-		return sys_ipc(SHMDT, 0, 0, 0, A(arg1), 0);
-	case 3: /* shmget */
-		return sys_ipc(SHMGET, arg1, arg2, arg3, NULL, 0);
-	}
-	return -EINVAL;
-}
diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c
deleted file mode 100644
index d3e48e9701bf..000000000000
--- a/arch/sparc64/solaris/misc.c
+++ /dev/null
@@ -1,786 +0,0 @@
-/* $Id: misc.c,v 1.36 2002/02/09 19:49:31 davem Exp $
- * misc.c: Miscellaneous syscall emulation for Solaris
- *
- * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- */
-
-#include <linux/module.h> 
-#include <linux/types.h>
-#include <linux/utsname.h>
-#include <linux/limits.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/tty.h>
-#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/timex.h>
-#include <linux/major.h>
-#include <linux/compat.h>
-
-#include <asm/uaccess.h>
-#include <asm/string.h>
-#include <asm/oplib.h>
-#include <asm/idprom.h>
-#include <asm/smp.h>
-#include <asm/prom.h>
-
-#include "conv.h"
-
-/* Conversion from Linux to Solaris errnos. 0-34 are identity mapped.
-   Some Linux errnos (EPROCLIM, EDOTDOT, ERREMOTE, EUCLEAN, ENOTNAM, 
-   ENAVAIL, EISNAM, EREMOTEIO, ENOMEDIUM, EMEDIUMTYPE) have no Solaris
-   equivalents. I return EINVAL in that case, which is very wrong. If
-   someone suggest a better value for them, you're welcomed.
-   On the other side, Solaris ECANCELED and ENOTSUP have no Linux equivalents,
-   but that doesn't matter here. --jj */
-int solaris_err_table[] = {
-/* 0 */  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
-/* 10 */  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-/* 20 */  20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
-/* 30 */  30, 31, 32, 33, 34, 22, 150, 149, 95, 96,
-/* 40 */  97, 98, 99, 120, 121, 122, 123, 124, 125, 126, 
-/* 50 */ 127, 128, 129, 130, 131, 132, 133, 134, 143, 144,
-/* 60 */ 145, 146, 90, 78, 147, 148, 93, 22, 94, 49,
-/* 70 */ 151, 66, 60, 62, 63, 35, 77, 36, 45, 46, 
-/* 80 */ 64, 22, 67, 68, 69, 70, 71, 74, 22, 82, 
-/* 90 */ 89, 92, 79, 81, 37, 38, 39, 40, 41, 42,
-/* 100 */ 43, 44, 50, 51, 52, 53, 54, 55, 56, 57,
-/* 110 */ 87, 61, 84, 65, 83, 80, 91, 22, 22, 22,
-/* 120 */ 22, 22, 88, 86, 85, 22, 22,
-};
-
-#define SOLARIS_NR_OPEN	256
-
-static u32 do_solaris_mmap(u32 addr, u32 len, u32 prot, u32 flags, u32 fd, u64 off)
-{
-	struct file *file = NULL;
-	unsigned long retval, ret_type;
-
-	/* Do we need it here? */
-	set_personality(PER_SVR4);
-	if (flags & MAP_NORESERVE) {
-		static int cnt;
-		
-		if (cnt < 5) {
-			printk("%s:  unimplemented Solaris MAP_NORESERVE mmap() flag\n",
-			       current->comm);
-			cnt++;
-		}
-		flags &= ~MAP_NORESERVE;
-	}
-	retval = -EBADF;
-	if(!(flags & MAP_ANONYMOUS)) {
-		if(fd >= SOLARIS_NR_OPEN)
-			goto out;
- 		file = fget(fd);
-		if (!file)
-			goto out;
-		else {
-			struct inode * inode = file->f_path.dentry->d_inode;
-			if(imajor(inode) == MEM_MAJOR &&
-			   iminor(inode) == 5) {
-				flags |= MAP_ANONYMOUS;
-				fput(file);
-				file = NULL;
-			}
-		}
-	}
-
-	retval = -EINVAL;
-	len = PAGE_ALIGN(len);
-	if(!(flags & MAP_FIXED))
-		addr = 0;
-	else if (len > STACK_TOP32 || addr > STACK_TOP32 - len)
-		goto out_putf;
-	ret_type = flags & _MAP_NEW;
-	flags &= ~_MAP_NEW;
-
-	down_write(&current->mm->mmap_sem);
-	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-	retval = do_mmap(file,
-			 (unsigned long) addr, (unsigned long) len,
-			 (unsigned long) prot, (unsigned long) flags, off);
-	up_write(&current->mm->mmap_sem);
-	if(!ret_type)
-		retval = ((retval < STACK_TOP32) ? 0 : retval);
-	                        
-out_putf:
-	if (file)
-		fput(file);
-out:
-	return (u32) retval;
-}
-
-asmlinkage u32 solaris_mmap(u32 addr, u32 len, u32 prot, u32 flags, u32 fd, u32 off)
-{
-	return do_solaris_mmap(addr, len, prot, flags, fd, (u64) off);
-}
-
-asmlinkage u32 solaris_mmap64(struct pt_regs *regs, u32 len, u32 prot, u32 flags, u32 fd, u32 offhi)
-{
-	u32 offlo;
-	
-	if (regs->u_regs[UREG_G1]) {
-		if (get_user (offlo, (u32 __user *)(long)((u32)regs->u_regs[UREG_I6] + 0x5c)))
-			return -EFAULT;
-	} else {
-		if (get_user (offlo, (u32 __user *)(long)((u32)regs->u_regs[UREG_I6] + 0x60)))
-			return -EFAULT;
-	}
-	return do_solaris_mmap((u32)regs->u_regs[UREG_I0], len, prot, flags, fd, (((u64)offhi)<<32)|offlo);
-}
-
-asmlinkage int solaris_brk(u32 brk)
-{
-	int (*sunos_brk)(u32) = (int (*)(u32))SUNOS(17);
-	
-	return sunos_brk(brk);
-}
-
-static int __set_utsfield(char __user *to, int to_size,
-			  const char *from, int from_size,
-			  int dotchop, int countfrom)
-{
-	int len = countfrom ? (to_size > from_size ?
-			       from_size : to_size) : to_size;
-	int off;
-
-	if (copy_to_user(to, from, len))
-		return -EFAULT;
-
-	off = len < to_size? len: len - 1;
-	if (dotchop) {
-		const char *p = strnchr(from, len, '.');
-		if (p) off = p - from;
-	}
-
-	if (__put_user('\0', to + off))
-		return -EFAULT;
-
-	return 0;
-}
-
-#define set_utsfield(to, from, dotchop, countfrom) \
-	__set_utsfield((to), sizeof(to), \
-		       (from), sizeof(from), \
-		       (dotchop), (countfrom))
-
-struct sol_uname {
-	char sysname[9];
-	char nodename[9];
-	char release[9];
-	char version[9];
-	char machine[9];
-};
-
-struct sol_utsname {
-	char sysname[257];
-	char nodename[257];
-	char release[257];
-	char version[257];
-	char machine[257];
-};
-
-static char *machine(void)
-{
-	switch (sparc_cpu_model) {
-	case sun4: return "sun4";
-	case sun4c: return "sun4c";
-	case sun4e: return "sun4e";
-	case sun4m: return "sun4m";
-	case sun4d: return "sun4d";
-	case sun4u: return "sun4u";
-	default: return "sparc";
-	}
-}
-
-static char *platform(char *buffer, int sz)
-{
-	struct device_node *dp = of_find_node_by_path("/");
-	int len;
-
-	*buffer = 0;
-	len = strlen(dp->name);
-	if (len > sz)
-		len = sz;
-	memcpy(buffer, dp->name, len);
-	buffer[len] = 0;
-	if (*buffer) {
-		char *p;
-
-		for (p = buffer; *p; p++)
-			if (*p == '/' || *p == ' ') *p = '_';
-		return buffer;
-	}
-
-	return "sun4u";
-}
-
-static char *serial(char *buffer, int sz)
-{
-	struct device_node *dp = of_find_node_by_path("/options");
-	int len;
-
-	*buffer = 0;
-	if (dp) {
-		const char *val =
-			of_get_property(dp, "system-board-serial#", &len);
-
-		if (val && len > 0) {
-			if (len > sz)
-				len = sz;
-			memcpy(buffer, val, len);
-			buffer[len] = 0;
-		}
-	}
-	if (!*buffer)
-		return "4512348717234";
-	else
-		return buffer;
-}
-
-asmlinkage int solaris_utssys(u32 buf, u32 flags, int which, u32 buf2)
-{
-	struct sol_uname __user *v = A(buf);
-	int err;
-
-	switch (which) {
-	case 0:	/* old uname */
-		/* Let's cheat */
-		err  = set_utsfield(v->sysname, "SunOS", 1, 0);
-		down_read(&uts_sem);
-		err |= set_utsfield(v->nodename, utsname()->nodename,
-				    1, 1);
-		up_read(&uts_sem);
-		err |= set_utsfield(v->release, "2.6", 0, 0);
-		err |= set_utsfield(v->version, "Generic", 0, 0);
-		err |= set_utsfield(v->machine, machine(), 0, 0);
-		return (err ? -EFAULT : 0);
-	case 2: /* ustat */
-		return -ENOSYS;
-	case 3: /* fusers */
-		return -ENOSYS;
-	default:
-		return -ENOSYS;
-	}
-}
-
-asmlinkage int solaris_utsname(u32 buf)
-{
-	struct sol_utsname __user *v = A(buf);
-	int err;
-
-	/* Why should we not lie a bit? */
-	down_read(&uts_sem);
-	err  = set_utsfield(v->sysname, "SunOS", 0, 0);
-	err |= set_utsfield(v->nodename, utsname()->nodename, 1, 1);
-	err |= set_utsfield(v->release, "5.6", 0, 0);
-	err |= set_utsfield(v->version, "Generic", 0, 0);
-	err |= set_utsfield(v->machine, machine(), 0, 0);
-	up_read(&uts_sem);
-
-	return (err ? -EFAULT : 0);
-}
-
-#define SI_SYSNAME		1       /* return name of operating system */
-#define SI_HOSTNAME		2       /* return name of node */
-#define SI_RELEASE		3       /* return release of operating system */
-#define SI_VERSION		4       /* return version field of utsname */
-#define SI_MACHINE		5       /* return kind of machine */
-#define SI_ARCHITECTURE		6       /* return instruction set arch */
-#define SI_HW_SERIAL		7       /* return hardware serial number */
-#define SI_HW_PROVIDER		8       /* return hardware manufacturer */
-#define SI_SRPC_DOMAIN		9       /* return secure RPC domain */
-#define SI_PLATFORM		513     /* return platform identifier */
-
-asmlinkage int solaris_sysinfo(int cmd, u32 buf, s32 count)
-{
-	char *p, *q, *r;
-	char buffer[256];
-	int len;
-	
-	/* Again, we cheat :)) */
-	switch (cmd) {
-	case SI_SYSNAME: r = "SunOS"; break;
-	case SI_HOSTNAME:
-		r = buffer + 256;
-		down_read(&uts_sem);
-		for (p = utsname()->nodename, q = buffer;
-		     q < r && *p && *p != '.'; *q++ = *p++);
-		up_read(&uts_sem);
-		*q = 0;
-		r = buffer;
-		break;
-	case SI_RELEASE: r = "5.6"; break;
-	case SI_MACHINE: r = machine(); break;
-	case SI_ARCHITECTURE: r = "sparc"; break;
-	case SI_HW_PROVIDER: r = "Sun_Microsystems"; break;
-	case SI_HW_SERIAL: r = serial(buffer, sizeof(buffer)); break;
-	case SI_PLATFORM: r = platform(buffer, sizeof(buffer)); break;
-	case SI_SRPC_DOMAIN: r = ""; break;
-	case SI_VERSION: r = "Generic"; break;
-	default: return -EINVAL;
-	}
-	len = strlen(r) + 1;
-	if (count < len) {
-		if (copy_to_user(A(buf), r, count - 1) ||
-		    __put_user(0, (char __user *)A(buf) + count - 1))
-			return -EFAULT;
-	} else {
-		if (copy_to_user(A(buf), r, len))
-			return -EFAULT;
-	}
-	return len;
-}
-
-#define	SOLARIS_CONFIG_NGROUPS			2
-#define	SOLARIS_CONFIG_CHILD_MAX		3
-#define	SOLARIS_CONFIG_OPEN_FILES		4
-#define	SOLARIS_CONFIG_POSIX_VER		5
-#define	SOLARIS_CONFIG_PAGESIZE			6
-#define	SOLARIS_CONFIG_CLK_TCK			7
-#define	SOLARIS_CONFIG_XOPEN_VER		8
-#define	SOLARIS_CONFIG_PROF_TCK			10
-#define	SOLARIS_CONFIG_NPROC_CONF		11
-#define	SOLARIS_CONFIG_NPROC_ONLN		12
-#define	SOLARIS_CONFIG_AIO_LISTIO_MAX		13
-#define	SOLARIS_CONFIG_AIO_MAX			14
-#define	SOLARIS_CONFIG_AIO_PRIO_DELTA_MAX	15
-#define	SOLARIS_CONFIG_DELAYTIMER_MAX		16
-#define	SOLARIS_CONFIG_MQ_OPEN_MAX		17
-#define	SOLARIS_CONFIG_MQ_PRIO_MAX		18
-#define	SOLARIS_CONFIG_RTSIG_MAX		19
-#define	SOLARIS_CONFIG_SEM_NSEMS_MAX		20
-#define	SOLARIS_CONFIG_SEM_VALUE_MAX		21
-#define	SOLARIS_CONFIG_SIGQUEUE_MAX		22
-#define	SOLARIS_CONFIG_SIGRT_MIN		23
-#define	SOLARIS_CONFIG_SIGRT_MAX		24
-#define	SOLARIS_CONFIG_TIMER_MAX		25
-#define	SOLARIS_CONFIG_PHYS_PAGES		26
-#define	SOLARIS_CONFIG_AVPHYS_PAGES		27
-
-asmlinkage int solaris_sysconf(int id)
-{
-	switch (id) {
-	case SOLARIS_CONFIG_NGROUPS:	return NGROUPS_MAX;
-	case SOLARIS_CONFIG_CHILD_MAX:
-		return current->signal->rlim[RLIMIT_NPROC].rlim_cur;
-	case SOLARIS_CONFIG_OPEN_FILES:
-		return current->signal->rlim[RLIMIT_NOFILE].rlim_cur;
-	case SOLARIS_CONFIG_POSIX_VER:	return 199309;
-	case SOLARIS_CONFIG_PAGESIZE:	return PAGE_SIZE;
-	case SOLARIS_CONFIG_XOPEN_VER:	return 3;
-	case SOLARIS_CONFIG_CLK_TCK:
-	case SOLARIS_CONFIG_PROF_TCK:
-		return sparc64_get_clock_tick(smp_processor_id());
-#ifdef CONFIG_SMP	
-	case SOLARIS_CONFIG_NPROC_CONF:	return NR_CPUS;
-	case SOLARIS_CONFIG_NPROC_ONLN:	return num_online_cpus();
-#else
-	case SOLARIS_CONFIG_NPROC_CONF:	return 1;
-	case SOLARIS_CONFIG_NPROC_ONLN:	return 1;
-#endif
-	case SOLARIS_CONFIG_SIGRT_MIN:		return 37;
-	case SOLARIS_CONFIG_SIGRT_MAX:		return 44;
-	case SOLARIS_CONFIG_PHYS_PAGES:
-	case SOLARIS_CONFIG_AVPHYS_PAGES:
-		{
-			struct sysinfo s;
-			
-			si_meminfo(&s);
-			if (id == SOLARIS_CONFIG_PHYS_PAGES)
-				return s.totalram >>= PAGE_SHIFT;
-			else
-				return s.freeram >>= PAGE_SHIFT;
-		}
-	/* XXX support these as well -jj */
-	case SOLARIS_CONFIG_AIO_LISTIO_MAX:	return -EINVAL;
-	case SOLARIS_CONFIG_AIO_MAX:		return -EINVAL;
-	case SOLARIS_CONFIG_AIO_PRIO_DELTA_MAX:	return -EINVAL;
-	case SOLARIS_CONFIG_DELAYTIMER_MAX:	return -EINVAL;
-	case SOLARIS_CONFIG_MQ_OPEN_MAX:	return -EINVAL;
-	case SOLARIS_CONFIG_MQ_PRIO_MAX:	return -EINVAL;
-	case SOLARIS_CONFIG_RTSIG_MAX:		return -EINVAL;
-	case SOLARIS_CONFIG_SEM_NSEMS_MAX:	return -EINVAL;
-	case SOLARIS_CONFIG_SEM_VALUE_MAX:	return -EINVAL;
-	case SOLARIS_CONFIG_SIGQUEUE_MAX:	return -EINVAL;
-	case SOLARIS_CONFIG_TIMER_MAX:		return -EINVAL;
-	default: return -EINVAL;
-	}
-}
-
-asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid)
-{
-	int ret;
-	
-	switch (cmd) {
-	case 0: /* getpgrp */
-		return task_pgrp_vnr(current);
-	case 1: /* setpgrp */
-		{
-			int (*sys_setpgid)(pid_t,pid_t) =
-				(int (*)(pid_t,pid_t))SYS(setpgid);
-				
-			/* can anyone explain me the difference between
-			   Solaris setpgrp and setsid? */
-			ret = sys_setpgid(0, 0);
-			if (ret) return ret;
-			proc_clear_tty(current);
-			return task_pgrp_vnr(current);
-		}
-	case 2: /* getsid */
-		{
-			int (*sys_getsid)(pid_t) = (int (*)(pid_t))SYS(getsid);
-			return sys_getsid(pid);
-		}
-	case 3: /* setsid */
-		{
-			int (*sys_setsid)(void) = (int (*)(void))SYS(setsid);
-			return sys_setsid();
-		}
-	case 4: /* getpgid */
-		{
-			int (*sys_getpgid)(pid_t) = (int (*)(pid_t))SYS(getpgid);
-			return sys_getpgid(pid);
-		}
-	case 5: /* setpgid */
-		{
-			int (*sys_setpgid)(pid_t,pid_t) = 
-				(int (*)(pid_t,pid_t))SYS(setpgid);
-			return sys_setpgid(pid,pgid);
-		}
-	}
-	return -EINVAL;
-}
-
-asmlinkage int solaris_gettimeofday(u32 tim)
-{
-	int (*sys_gettimeofday)(struct timeval *, struct timezone *) =
-		(int (*)(struct timeval *, struct timezone *))SYS(gettimeofday);
-		
-	return sys_gettimeofday((struct timeval *)(u64)tim, NULL);
-}
-
-#define RLIM_SOL_INFINITY32	0x7fffffff
-#define RLIM_SOL_SAVED_MAX32	0x7ffffffe
-#define RLIM_SOL_SAVED_CUR32	0x7ffffffd
-#define RLIM_SOL_INFINITY	((u64)-3)
-#define RLIM_SOL_SAVED_MAX	((u64)-2)
-#define RLIM_SOL_SAVED_CUR	((u64)-1)
-#define RESOURCE32(x) ((x > RLIM_INFINITY32) ? RLIM_INFINITY32 : x)
-#define RLIMIT_SOL_NOFILE	5
-#define RLIMIT_SOL_VMEM		6
-
-struct rlimit32 {
-	u32	rlim_cur;
-	u32	rlim_max;
-};
-
-asmlinkage int solaris_getrlimit(unsigned int resource, struct rlimit32 __user *rlim)
-{
-	struct rlimit r;
-	int ret;
-	mm_segment_t old_fs = get_fs ();
-	int (*sys_getrlimit)(unsigned int, struct rlimit *) =
-		(int (*)(unsigned int, struct rlimit *))SYS(getrlimit);
-
-	if (resource > RLIMIT_SOL_VMEM)
-		return -EINVAL;	
-	switch (resource) {
-	case RLIMIT_SOL_NOFILE: resource = RLIMIT_NOFILE; break;
-	case RLIMIT_SOL_VMEM: resource = RLIMIT_AS; break;
-	default: break;
-	}
-	set_fs (KERNEL_DS);
-	ret = sys_getrlimit(resource, &r);
-	set_fs (old_fs);
-	if (!ret) {
-		if (r.rlim_cur == RLIM_INFINITY)
-			r.rlim_cur = RLIM_SOL_INFINITY32;
-		else if ((u64)r.rlim_cur > RLIM_SOL_INFINITY32)
-			r.rlim_cur = RLIM_SOL_SAVED_CUR32;
-		if (r.rlim_max == RLIM_INFINITY)
-			r.rlim_max = RLIM_SOL_INFINITY32;
-		else if ((u64)r.rlim_max > RLIM_SOL_INFINITY32)
-			r.rlim_max = RLIM_SOL_SAVED_MAX32;
-		ret = put_user (r.rlim_cur, &rlim->rlim_cur);
-		ret |= __put_user (r.rlim_max, &rlim->rlim_max);
-	}
-	return ret;
-}
-
-asmlinkage int solaris_setrlimit(unsigned int resource, struct rlimit32 __user *rlim)
-{
-	struct rlimit r, rold;
-	int ret;
-	mm_segment_t old_fs = get_fs ();
-	int (*sys_getrlimit)(unsigned int, struct rlimit __user *) =
-		(int (*)(unsigned int, struct rlimit __user *))SYS(getrlimit);
-	int (*sys_setrlimit)(unsigned int, struct rlimit __user *) =
-		(int (*)(unsigned int, struct rlimit __user *))SYS(setrlimit);
-
-	if (resource > RLIMIT_SOL_VMEM)
-		return -EINVAL;	
-	switch (resource) {
-	case RLIMIT_SOL_NOFILE: resource = RLIMIT_NOFILE; break;
-	case RLIMIT_SOL_VMEM: resource = RLIMIT_AS; break;
-	default: break;
-	}
-	if (get_user (r.rlim_cur, &rlim->rlim_cur) ||
-	    __get_user (r.rlim_max, &rlim->rlim_max))
-		return -EFAULT;
-	set_fs (KERNEL_DS);
-	ret = sys_getrlimit(resource, &rold);
-	if (!ret) {
-		if (r.rlim_cur == RLIM_SOL_INFINITY32)
-			r.rlim_cur = RLIM_INFINITY;
-		else if (r.rlim_cur == RLIM_SOL_SAVED_CUR32)
-			r.rlim_cur = rold.rlim_cur;
-		else if (r.rlim_cur == RLIM_SOL_SAVED_MAX32)
-			r.rlim_cur = rold.rlim_max;
-		if (r.rlim_max == RLIM_SOL_INFINITY32)
-			r.rlim_max = RLIM_INFINITY;
-		else if (r.rlim_max == RLIM_SOL_SAVED_CUR32)
-			r.rlim_max = rold.rlim_cur;
-		else if (r.rlim_max == RLIM_SOL_SAVED_MAX32)
-			r.rlim_max = rold.rlim_max;
-		ret = sys_setrlimit(resource, &r);
-	}
-	set_fs (old_fs);
-	return ret;
-}
-
-asmlinkage int solaris_getrlimit64(unsigned int resource, struct rlimit __user *rlim)
-{
-	struct rlimit r;
-	int ret;
-	mm_segment_t old_fs = get_fs ();
-	int (*sys_getrlimit)(unsigned int, struct rlimit __user *) =
-		(int (*)(unsigned int, struct rlimit __user *))SYS(getrlimit);
-
-	if (resource > RLIMIT_SOL_VMEM)
-		return -EINVAL;	
-	switch (resource) {
-	case RLIMIT_SOL_NOFILE: resource = RLIMIT_NOFILE; break;
-	case RLIMIT_SOL_VMEM: resource = RLIMIT_AS; break;
-	default: break;
-	}
-	set_fs (KERNEL_DS);
-	ret = sys_getrlimit(resource, &r);
-	set_fs (old_fs);
-	if (!ret) {
-		if (r.rlim_cur == RLIM_INFINITY)
-			r.rlim_cur = RLIM_SOL_INFINITY;
-		if (r.rlim_max == RLIM_INFINITY)
-			r.rlim_max = RLIM_SOL_INFINITY;
-		ret = put_user (r.rlim_cur, &rlim->rlim_cur);
-		ret |= __put_user (r.rlim_max, &rlim->rlim_max);
-	}
-	return ret;
-}
-
-asmlinkage int solaris_setrlimit64(unsigned int resource, struct rlimit __user *rlim)
-{
-	struct rlimit r, rold;
-	int ret;
-	mm_segment_t old_fs = get_fs ();
-	int (*sys_getrlimit)(unsigned int, struct rlimit __user *) =
-		(int (*)(unsigned int, struct rlimit __user *))SYS(getrlimit);
-	int (*sys_setrlimit)(unsigned int, struct rlimit __user *) =
-		(int (*)(unsigned int, struct rlimit __user *))SYS(setrlimit);
-
-	if (resource > RLIMIT_SOL_VMEM)
-		return -EINVAL;	
-	switch (resource) {
-	case RLIMIT_SOL_NOFILE: resource = RLIMIT_NOFILE; break;
-	case RLIMIT_SOL_VMEM: resource = RLIMIT_AS; break;
-	default: break;
-	}
-	if (get_user (r.rlim_cur, &rlim->rlim_cur) ||
-	    __get_user (r.rlim_max, &rlim->rlim_max))
-		return -EFAULT;
-	set_fs (KERNEL_DS);
-	ret = sys_getrlimit(resource, &rold);
-	if (!ret) {
-		if (r.rlim_cur == RLIM_SOL_INFINITY)
-			r.rlim_cur = RLIM_INFINITY;
-		else if (r.rlim_cur == RLIM_SOL_SAVED_CUR)
-			r.rlim_cur = rold.rlim_cur;
-		else if (r.rlim_cur == RLIM_SOL_SAVED_MAX)
-			r.rlim_cur = rold.rlim_max;
-		if (r.rlim_max == RLIM_SOL_INFINITY)
-			r.rlim_max = RLIM_INFINITY;
-		else if (r.rlim_max == RLIM_SOL_SAVED_CUR)
-			r.rlim_max = rold.rlim_cur;
-		else if (r.rlim_max == RLIM_SOL_SAVED_MAX)
-			r.rlim_max = rold.rlim_max;
-		ret = sys_setrlimit(resource, &r);
-	}
-	set_fs (old_fs);
-	return ret;
-}
-
-struct sol_ntptimeval {
-	struct compat_timeval time;
-	s32 maxerror;
-	s32 esterror;
-};
-
-struct sol_timex {
-	u32 modes;
-	s32 offset;
-	s32 freq;
-	s32 maxerror;
-	s32 esterror;
-	s32 status;
-	s32 constant;
-	s32 precision;
-	s32 tolerance;
-	s32 ppsfreq;
-	s32 jitter;
-	s32 shift;
-	s32 stabil;
-	s32 jitcnt;
-	s32 calcnt;
-	s32 errcnt;
-	s32 stbcnt;
-};
-
-asmlinkage int solaris_ntp_gettime(struct sol_ntptimeval __user *ntp)
-{
-	int (*sys_adjtimex)(struct timex __user *) =
-		(int (*)(struct timex __user *))SYS(adjtimex);
-	struct timex t;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-	
-	set_fs(KERNEL_DS);
-	t.modes = 0;
-	ret = sys_adjtimex(&t);
-	set_fs(old_fs);
-	if (ret < 0)
-		return ret;
-	ret = put_user (t.time.tv_sec, &ntp->time.tv_sec);
-	ret |= __put_user (t.time.tv_usec, &ntp->time.tv_usec);
-	ret |= __put_user (t.maxerror, &ntp->maxerror);
-	ret |= __put_user (t.esterror, &ntp->esterror);
-	return ret;	                        
-}
-
-asmlinkage int solaris_ntp_adjtime(struct sol_timex __user *txp)
-{
-	int (*sys_adjtimex)(struct timex __user *) =
-		(int (*)(struct timex __user *))SYS(adjtimex);
-	struct timex t;
-	int ret, err;
-	mm_segment_t old_fs = get_fs();
-
-	ret = get_user (t.modes, &txp->modes);
-	ret |= __get_user (t.offset, &txp->offset);
-	ret |= __get_user (t.freq, &txp->freq);
-	ret |= __get_user (t.maxerror, &txp->maxerror);
-	ret |= __get_user (t.esterror, &txp->esterror);
-	ret |= __get_user (t.status, &txp->status);
-	ret |= __get_user (t.constant, &txp->constant);
-	set_fs(KERNEL_DS);
-	ret = sys_adjtimex(&t);
-	set_fs(old_fs);
-	if (ret < 0)
-		return ret;
-	err = put_user (t.offset, &txp->offset);
-	err |= __put_user (t.freq, &txp->freq);
-	err |= __put_user (t.maxerror, &txp->maxerror);
-	err |= __put_user (t.esterror, &txp->esterror);
-	err |= __put_user (t.status, &txp->status);
-	err |= __put_user (t.constant, &txp->constant);
-	err |= __put_user (t.precision, &txp->precision);
-	err |= __put_user (t.tolerance, &txp->tolerance);
-	err |= __put_user (t.ppsfreq, &txp->ppsfreq);
-	err |= __put_user (t.jitter, &txp->jitter);
-	err |= __put_user (t.shift, &txp->shift);
-	err |= __put_user (t.stabil, &txp->stabil);
-	err |= __put_user (t.jitcnt, &txp->jitcnt);
-	err |= __put_user (t.calcnt, &txp->calcnt);
-	err |= __put_user (t.errcnt, &txp->errcnt);
-	err |= __put_user (t.stbcnt, &txp->stbcnt);
-	if (err)
-		return -EFAULT;
-	return ret;
-}
-
-asmlinkage int do_sol_unimplemented(struct pt_regs *regs)
-{
-	printk ("Unimplemented Solaris syscall %d %08x %08x %08x %08x\n", 
-			(int)regs->u_regs[UREG_G1], 
-			(int)regs->u_regs[UREG_I0],
-			(int)regs->u_regs[UREG_I1],
-			(int)regs->u_regs[UREG_I2],
-			(int)regs->u_regs[UREG_I3]);
-	return -ENOSYS;
-}
-
-asmlinkage void solaris_register(void)
-{
-	set_personality(PER_SVR4);
-}
-
-extern long solaris_to_linux_signals[], linux_to_solaris_signals[];
-
-struct exec_domain solaris_exec_domain = {
-	.name =		"Solaris",
-	.handler =	NULL,
-	.pers_low =	1,		/* PER_SVR4 personality */
-	.pers_high =	1,
-	.signal_map =	solaris_to_linux_signals,
-	.signal_invmap =linux_to_solaris_signals,
-	.module =	THIS_MODULE,
-	.next =		NULL
-};
-
-extern int init_socksys(void);
-
-MODULE_AUTHOR("Jakub Jelinek (jj@ultra.linux.cz), Patrik Rak (prak3264@ss1000.ms.mff.cuni.cz)");
-MODULE_DESCRIPTION("Solaris binary emulation module");
-MODULE_LICENSE("GPL");
-
-extern u32 tl0_solaris[8];
-#define update_ttable(x) 										\
-	tl0_solaris[3] = (((long)(x) - (long)tl0_solaris - 3) >> 2) | 0x40000000;			\
-	wmb();		\
-	__asm__ __volatile__ ("flush %0" : : "r" (&tl0_solaris[3]))
-
-extern u32 solaris_sparc_syscall[];
-extern u32 solaris_syscall[];
-extern void cleanup_socksys(void);
-
-extern u32 entry64_personality_patch;
-
-static int __init solaris_init(void)
-{
-	int ret;
-
-	SOLDD(("Solaris module at %p\n", solaris_sparc_syscall));
-	register_exec_domain(&solaris_exec_domain);
-	if ((ret = init_socksys())) {
-		unregister_exec_domain(&solaris_exec_domain);
-		return ret;
-	}
-	update_ttable(solaris_sparc_syscall);
-	entry64_personality_patch |=
-		(offsetof(struct task_struct, personality) +
-		 (sizeof(unsigned long) - 1));
-	wmb();
-	__asm__ __volatile__("flush %0"
-			     : : "r" (&entry64_personality_patch));
-	return 0;
-}
-
-static void __exit solaris_exit(void)
-{
-	update_ttable(solaris_syscall);
-	cleanup_socksys();
-	unregister_exec_domain(&solaris_exec_domain);
-}
-
-module_init(solaris_init);
-module_exit(solaris_exit);
diff --git a/arch/sparc64/solaris/signal.c b/arch/sparc64/solaris/signal.c
deleted file mode 100644
index de10c9716cfb..000000000000
--- a/arch/sparc64/solaris/signal.c
+++ /dev/null
@@ -1,429 +0,0 @@
-/* $Id: signal.c,v 1.7 2000/09/05 21:44:54 davem Exp $
- * signal.c: Signal emulation for Solaris
- *
- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- */
-
-#include <linux/types.h>
-#include <linux/errno.h>
-
-#include <asm/uaccess.h>
-#include <asm/svr4.h>
-#include <asm/string.h>
-
-#include "conv.h"
-#include "signal.h"
-
-#define _S(nr) (1L<<((nr)-1))
-
-#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP)))
-
-long linux_to_solaris_signals[] = {
-        0,
-	SOLARIS_SIGHUP,		SOLARIS_SIGINT,	
-	SOLARIS_SIGQUIT,	SOLARIS_SIGILL,
-	SOLARIS_SIGTRAP,	SOLARIS_SIGIOT,
-	SOLARIS_SIGEMT,		SOLARIS_SIGFPE,
-	SOLARIS_SIGKILL,	SOLARIS_SIGBUS,
-	SOLARIS_SIGSEGV,	SOLARIS_SIGSYS,
-	SOLARIS_SIGPIPE,	SOLARIS_SIGALRM,
-	SOLARIS_SIGTERM,	SOLARIS_SIGURG,
-	SOLARIS_SIGSTOP,	SOLARIS_SIGTSTP,
-	SOLARIS_SIGCONT,	SOLARIS_SIGCLD,
-	SOLARIS_SIGTTIN,	SOLARIS_SIGTTOU,
-	SOLARIS_SIGPOLL,	SOLARIS_SIGXCPU,
-	SOLARIS_SIGXFSZ,	SOLARIS_SIGVTALRM,
-	SOLARIS_SIGPROF,	SOLARIS_SIGWINCH,
-	SOLARIS_SIGUSR1,	SOLARIS_SIGUSR1,
-	SOLARIS_SIGUSR2,	-1,
-};
-
-long solaris_to_linux_signals[] = {
-        0,
-        SIGHUP,		SIGINT,		SIGQUIT,	SIGILL,
-        SIGTRAP,	SIGIOT,		SIGEMT,		SIGFPE,
-        SIGKILL,	SIGBUS,		SIGSEGV,	SIGSYS,
-        SIGPIPE,	SIGALRM,	SIGTERM,	SIGUSR1,
-        SIGUSR2,	SIGCHLD,	-1,		SIGWINCH,
-        SIGURG,		SIGPOLL,	SIGSTOP,	SIGTSTP,
-        SIGCONT,	SIGTTIN,	SIGTTOU,	SIGVTALRM,
-        SIGPROF,	SIGXCPU,	SIGXFSZ,        -1,
-	-1,		-1,		-1,		-1,
-	-1,		-1,		-1,		-1,
-	-1,		-1,		-1,		-1,
-};
-
-static inline long mapsig(long sig)
-{
-	if ((unsigned long)sig > SOLARIS_NSIGNALS)
-		return -EINVAL;
-	return solaris_to_linux_signals[sig];
-}
-
-asmlinkage int solaris_kill(int pid, int sig)
-{
-	int (*sys_kill)(int,int) = 
-		(int (*)(int,int))SYS(kill);
-	int s = mapsig(sig);
-	
-	if (s < 0) return s;
-	return sys_kill(pid, s);
-}
-
-static long sig_handler(int sig, u32 arg, int one_shot)
-{
-	struct sigaction sa, old;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-	int (*sys_sigaction)(int,struct sigaction __user *,struct sigaction __user *) = 
-		(int (*)(int,struct sigaction __user *,struct sigaction __user *))SYS(sigaction);
-	
-	sigemptyset(&sa.sa_mask);
-	sa.sa_restorer = NULL;
-	sa.sa_handler = (__sighandler_t)A(arg);
-	sa.sa_flags = 0;
-	if (one_shot) sa.sa_flags = SA_ONESHOT | SA_NOMASK;
-	set_fs (KERNEL_DS);
-	ret = sys_sigaction(sig, (void __user *)&sa, (void __user *)&old);
-	set_fs (old_fs);
-	if (ret < 0) return ret;
-	return (u32)(unsigned long)old.sa_handler;
-}
-
-static inline long solaris_signal(int sig, u32 arg)
-{
-	return sig_handler (sig, arg, 1);
-}
-
-static long solaris_sigset(int sig, u32 arg)
-{
-	if (arg != 2) /* HOLD */ {
-		spin_lock_irq(&current->sighand->siglock);
-		sigdelsetmask(&current->blocked, _S(sig));
-		recalc_sigpending();
-		spin_unlock_irq(&current->sighand->siglock);
-		return sig_handler (sig, arg, 0);
-	} else {
-		spin_lock_irq(&current->sighand->siglock);
-		sigaddsetmask(&current->blocked, (_S(sig) & ~_BLOCKABLE));
-		recalc_sigpending();
-		spin_unlock_irq(&current->sighand->siglock);
-		return 0;
-	}
-}
-
-static inline long solaris_sighold(int sig)
-{
-	return solaris_sigset(sig, 2);
-}
-
-static inline long solaris_sigrelse(int sig)
-{
-	spin_lock_irq(&current->sighand->siglock);
-	sigdelsetmask(&current->blocked, _S(sig));
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
-	return 0;
-}
-
-static inline long solaris_sigignore(int sig)
-{
-	return sig_handler(sig, (u32)(unsigned long)SIG_IGN, 0);
-}
-
-static inline long solaris_sigpause(int sig)
-{
-	printk ("Need to support solaris sigpause\n");
-	return -ENOSYS;
-}
-
-asmlinkage long solaris_sigfunc(int sig, u32 arg)
-{
-	int func = sig & ~0xff;
-	
-	sig = mapsig(sig & 0xff); 
-	if (sig < 0) return sig; 
-	switch (func) {
-	case 0: return solaris_signal(sig, arg); 
-	case 0x100: return solaris_sigset(sig, arg); 
-	case 0x200: return solaris_sighold(sig);
-	case 0x400: return solaris_sigrelse(sig); 
-	case 0x800: return solaris_sigignore(sig); 
-	case 0x1000: return solaris_sigpause(sig);
-	}
-	return -EINVAL;
-}
-
-typedef struct {
-	u32 __sigbits[4];
-} sol_sigset_t;
-
-static inline int mapin(u32 *p, sigset_t *q)
-{
-	int i;
-	u32 x;
-	int sig;
-	
-	sigemptyset(q);
-	x = p[0];
-	for (i = 1; i <= SOLARIS_NSIGNALS; i++) {
-		if (x & 1) {
-			sig = solaris_to_linux_signals[i];
-			if (sig == -1)
-				return -EINVAL;
-			sigaddsetmask(q, (1L << (sig - 1)));
-		}
-		x >>= 1;
-		if (i == 32)
-			x = p[1];
-	}
-	return 0;
-}
-
-static inline int mapout(sigset_t *q, u32 *p)
-{
-	int i;
-	int sig;
-	
-	p[0] = 0;
-	p[1] = 0;
-	for (i = 1; i <= 32; i++) {
-		if (sigismember(q, sigmask(i))) {
-			sig = linux_to_solaris_signals[i];
-			if (sig == -1)
-				return -EINVAL;
-			if (sig > 32)
-				p[1] |= 1L << (sig - 33);
-			else
-				p[0] |= 1L << (sig - 1);
-		}
-	}
-	return 0;
-}
-
-asmlinkage int solaris_sigprocmask(int how, u32 in, u32 out)
-{
-	sigset_t in_s, *ins, out_s, *outs;
-	mm_segment_t old_fs = get_fs();
-	int ret;
-	int (*sys_sigprocmask)(int,sigset_t __user *,sigset_t __user *) = 
-		(int (*)(int,sigset_t __user *,sigset_t __user *))SYS(sigprocmask);
-	
-	ins = NULL; outs = NULL;
-	if (in) {
-		u32 tmp[2];
-		
-		if (copy_from_user (tmp, (void __user *)A(in), 2*sizeof(u32)))
-			return -EFAULT;
-		ins = &in_s;
-		if (mapin (tmp, ins)) return -EINVAL;
-	}
-	if (out) outs = &out_s;
-	set_fs (KERNEL_DS);
-	ret = sys_sigprocmask((how == 3) ? SIG_SETMASK : how,
-				(void __user *)ins, (void __user *)outs);
-	set_fs (old_fs);
-	if (ret) return ret;
-	if (out) {
-		u32 tmp[4];
-		
-		tmp[2] = 0; tmp[3] = 0;
-		if (mapout (outs, tmp)) return -EINVAL;
-		if (copy_to_user((void __user *)A(out), tmp, 4*sizeof(u32)))
-			return -EFAULT;
-	}
-	return 0;
-}
-
-asmlinkage long do_sol_sigsuspend(u32 mask)
-{
-	sigset_t s;
-	u32 tmp[2];
-		
-	if (copy_from_user (tmp, (sol_sigset_t __user *)A(mask), 2*sizeof(u32)))
-		return -EFAULT;
-	if (mapin (tmp, &s)) return -EINVAL;
-	return (long)s.sig[0];
-}
-
-struct sol_sigaction {
-	int	sa_flags;
-	u32	sa_handler;
-	u32	sa_mask[4];
-	int	sa_resv[2];
-};
-
-asmlinkage int solaris_sigaction(int sig, u32 act, u32 old)
-{
-	u32 tmp, tmp2[4];
-	struct sigaction s, s2;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-	struct sol_sigaction __user *p = (void __user *)A(old);
-	int (*sys_sigaction)(int,struct sigaction __user *,struct sigaction __user *) = 
-		(int (*)(int,struct sigaction __user *,struct sigaction __user *))SYS(sigaction);
-	
-	sig = mapsig(sig); 
-	if (sig < 0) {
-		/* We cheat a little bit for Solaris only signals */
-		if (old && clear_user(p, sizeof(struct sol_sigaction)))
-			return -EFAULT;
-		return 0;
-	}
-	if (act) {
-		if (get_user (tmp, &p->sa_flags))
-			return -EFAULT;
-		s.sa_flags = 0;
-		if (tmp & SOLARIS_SA_ONSTACK) s.sa_flags |= SA_STACK;
-		if (tmp & SOLARIS_SA_RESTART) s.sa_flags |= SA_RESTART;
-		if (tmp & SOLARIS_SA_NODEFER) s.sa_flags |= SA_NOMASK;
-		if (tmp & SOLARIS_SA_RESETHAND) s.sa_flags |= SA_ONESHOT;
-		if (tmp & SOLARIS_SA_NOCLDSTOP) s.sa_flags |= SA_NOCLDSTOP;
-		if (get_user (tmp, &p->sa_handler) ||
-		    copy_from_user (tmp2, &p->sa_mask, 2*sizeof(u32)))
-			return -EFAULT;
-		s.sa_handler = (__sighandler_t)A(tmp);
-		if (mapin (tmp2, &s.sa_mask)) return -EINVAL;
-		s.sa_restorer = NULL;
-	}
-	set_fs(KERNEL_DS);
-	ret = sys_sigaction(sig, act ? (void __user *)&s : NULL,
-				 old ? (void __user *)&s2 : NULL);
-	set_fs(old_fs);
-	if (ret) return ret;
-	if (old) {
-		if (mapout (&s2.sa_mask, tmp2)) return -EINVAL;
-		tmp = 0; tmp2[2] = 0; tmp2[3] = 0;
-		if (s2.sa_flags & SA_STACK) tmp |= SOLARIS_SA_ONSTACK;
-		if (s2.sa_flags & SA_RESTART) tmp |= SOLARIS_SA_RESTART;
-		if (s2.sa_flags & SA_NOMASK) tmp |= SOLARIS_SA_NODEFER;
-		if (s2.sa_flags & SA_ONESHOT) tmp |= SOLARIS_SA_RESETHAND;
-		if (s2.sa_flags & SA_NOCLDSTOP) tmp |= SOLARIS_SA_NOCLDSTOP;
-		if (put_user (tmp, &p->sa_flags) ||
-		    __put_user ((u32)(unsigned long)s2.sa_handler, &p->sa_handler) ||
-		    copy_to_user (&p->sa_mask, tmp2, 4*sizeof(u32)))
-			return -EFAULT;
-	}
-	return 0;
-}
-
-asmlinkage int solaris_sigpending(int which, u32 set)
-{
-	sigset_t s;
-	u32 tmp[4];
-	switch (which) {
-	case 1: /* sigpending */
-		spin_lock_irq(&current->sighand->siglock);
-		sigandsets(&s, &current->blocked, &current->pending.signal);
-		recalc_sigpending();
-		spin_unlock_irq(&current->sighand->siglock);
-		break;
-	case 2: /* sigfillset - I just set signals which have linux equivalents */
-		sigfillset(&s);
-		break;
-	default: return -EINVAL;
-	}
-	if (mapout (&s, tmp)) return -EINVAL;
-	tmp[2] = 0; tmp[3] = 0;
-	if (copy_to_user ((u32 __user *)A(set), tmp, sizeof(tmp)))
-		return -EFAULT;
-	return 0;
-}
-
-asmlinkage int solaris_wait(u32 stat_loc)
-{
-	unsigned __user *p = (unsigned __user *)A(stat_loc);
-	int (*sys_wait4)(pid_t,unsigned __user *, int, struct rusage __user *) =
-		(int (*)(pid_t,unsigned __user *, int, struct rusage __user *))SYS(wait4);
-	int ret, status;
-	
-	ret = sys_wait4(-1, p, WUNTRACED, NULL);
-	if (ret >= 0 && stat_loc) {
-		if (get_user (status, p))
-			return -EFAULT;
-		if (((status - 1) & 0xffff) < 0xff)
-			status = linux_to_solaris_signals[status & 0x7f] & 0x7f;
-		else if ((status & 0xff) == 0x7f)
-			status = (linux_to_solaris_signals[(status >> 8) & 0xff] << 8) | 0x7f;
-		if (__put_user (status, p))
-			return -EFAULT;
-	}
-	return ret;
-}
-
-asmlinkage int solaris_waitid(int idtype, s32 pid, u32 info, int options)
-{
-	int (*sys_wait4)(pid_t,unsigned __user *, int, struct rusage __user *) =
-		(int (*)(pid_t,unsigned __user *, int, struct rusage __user *))SYS(wait4);
-	int opts, status, ret;
-	
-	switch (idtype) {
-	case 0: /* P_PID */ break;
-	case 1: /* P_PGID */ pid = -pid; break;
-	case 7: /* P_ALL */ pid = -1; break;
-	default: return -EINVAL;
-	}
-	opts = 0;
-	if (options & SOLARIS_WUNTRACED) opts |= WUNTRACED;
-	if (options & SOLARIS_WNOHANG) opts |= WNOHANG;
-	current->state = TASK_RUNNING;
-	ret = sys_wait4(pid, (unsigned int __user *)A(info), opts, NULL);
-	if (ret < 0) return ret;
-	if (info) {
-		struct sol_siginfo __user *s = (void __user *)A(info);
-	
-		if (get_user (status, (unsigned int __user *)A(info)))
-			return -EFAULT;
-
-		if (__put_user (SOLARIS_SIGCLD, &s->si_signo) ||
-		    __put_user (ret, &s->_data._proc._pid))
-			return -EFAULT;
-
-		switch (status & 0xff) {
-		case 0: ret = SOLARIS_CLD_EXITED;
-			status = (status >> 8) & 0xff;
-			break;
-		case 0x7f:
-			status = (status >> 8) & 0xff;
-			switch (status) {
-			case SIGSTOP:
-			case SIGTSTP: ret = SOLARIS_CLD_STOPPED;
-			default: ret = SOLARIS_CLD_EXITED;
-			}
-			status = linux_to_solaris_signals[status];
-			break;
-		default:
-			if (status & 0x80) ret = SOLARIS_CLD_DUMPED;
-			else ret = SOLARIS_CLD_KILLED;
-			status = linux_to_solaris_signals[status & 0x7f];
-			break;
-		}
-
-		if (__put_user (ret, &s->si_code) ||
-		    __put_user (status, &s->_data._proc._pdata._cld._status))
-			return -EFAULT;
-	}
-	return 0;
-}
-
-extern int svr4_setcontext(svr4_ucontext_t *c, struct pt_regs *regs);
-extern int svr4_getcontext(svr4_ucontext_t *c, struct pt_regs *regs);
-
-asmlinkage int solaris_context(struct pt_regs *regs)
-{
-	switch ((unsigned)regs->u_regs[UREG_I0]) {
-	case 0: /* getcontext */
-		return svr4_getcontext((svr4_ucontext_t *)(long)(u32)regs->u_regs[UREG_I1], regs);
-	case 1: /* setcontext */
-		return svr4_setcontext((svr4_ucontext_t *)(long)(u32)regs->u_regs[UREG_I1], regs);
-	default:
-		return -EINVAL;
-
-	}
-}
-
-asmlinkage int solaris_sigaltstack(u32 ss, u32 oss)
-{
-/* XXX Implement this soon */
-	return 0;
-}
diff --git a/arch/sparc64/solaris/signal.h b/arch/sparc64/solaris/signal.h
deleted file mode 100644
index e91570803050..000000000000
--- a/arch/sparc64/solaris/signal.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* $Id: signal.h,v 1.3 1998/04/12 06:20:33 davem Exp $
- * signal.h: Signal emulation for Solaris
- *
- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- */
-    
-#define SOLARIS_SIGHUP		1
-#define SOLARIS_SIGINT		2
-#define SOLARIS_SIGQUIT		3
-#define SOLARIS_SIGILL		4
-#define SOLARIS_SIGTRAP		5
-#define SOLARIS_SIGIOT		6
-#define SOLARIS_SIGEMT		7
-#define SOLARIS_SIGFPE		8
-#define SOLARIS_SIGKILL		9
-#define SOLARIS_SIGBUS		10
-#define SOLARIS_SIGSEGV		11
-#define SOLARIS_SIGSYS		12
-#define SOLARIS_SIGPIPE		13
-#define SOLARIS_SIGALRM		14
-#define SOLARIS_SIGTERM		15
-#define SOLARIS_SIGUSR1		16
-#define SOLARIS_SIGUSR2		17
-#define SOLARIS_SIGCLD		18
-#define SOLARIS_SIGPWR		19
-#define SOLARIS_SIGWINCH	20
-#define SOLARIS_SIGURG		21
-#define SOLARIS_SIGPOLL		22
-#define SOLARIS_SIGSTOP		23
-#define SOLARIS_SIGTSTP		24
-#define SOLARIS_SIGCONT		25
-#define SOLARIS_SIGTTIN		26
-#define SOLARIS_SIGTTOU		27
-#define SOLARIS_SIGVTALRM	28
-#define SOLARIS_SIGPROF		29
-#define SOLARIS_SIGXCPU		30
-#define SOLARIS_SIGXFSZ		31
-#define SOLARIS_SIGWAITING	32
-#define SOLARIS_SIGLWP		33
-#define SOLARIS_SIGFREEZE	34
-#define SOLARIS_SIGTHAW		35
-#define SOLARIS_SIGCANCEL	36
-#define SOLARIS_SIGRTMIN	37
-#define SOLARIS_SIGRTMAX	44
-#define SOLARIS_NSIGNALS	44
-
-
-#define SOLARIS_SA_ONSTACK	1
-#define SOLARIS_SA_RESETHAND	2
-#define SOLARIS_SA_RESTART	4
-#define SOLARIS_SA_SIGINFO	8
-#define SOLARIS_SA_NODEFER	16
-#define SOLARIS_SA_NOCLDWAIT	0x10000
-#define SOLARIS_SA_NOCLDSTOP	0x20000
-
-struct sol_siginfo {
-	int	si_signo;
-	int	si_code;
-	int	si_errno;
-	union	{
-		char	pad[128-3*sizeof(int)];
-		struct { 
-			s32	_pid;
-			union {
-				struct {
-					s32	_uid;
-					s32	_value;
-				} _kill;
-				struct {
-					s32	_utime;
-					int	_status;
-					s32	_stime;
-				} _cld;
-			} _pdata;
-		} _proc;
-		struct { /* SIGSEGV, SIGBUS, SIGILL and SIGFPE */
-			u32	_addr;
-			int	_trapno;
-		} _fault;
-		struct { /* SIGPOLL, SIGXFSZ */
-			int	_fd;
-			s32	_band;
-		} _file;
-	} _data;
-};
-
-#define SOLARIS_WUNTRACED	0x04
-#define SOLARIS_WNOHANG		0x40
-#define SOLARIS_WEXITED         0x01
-#define SOLARIS_WTRAPPED        0x02
-#define SOLARIS_WSTOPPED        WUNTRACED
-#define SOLARIS_WCONTINUED      0x08
-#define SOLARIS_WNOWAIT         0x80
-
-#define SOLARIS_TRAP_BRKPT      1
-#define SOLARIS_TRAP_TRACE      2
-#define SOLARIS_CLD_EXITED      1
-#define SOLARIS_CLD_KILLED      2
-#define SOLARIS_CLD_DUMPED      3
-#define SOLARIS_CLD_TRAPPED     4
-#define SOLARIS_CLD_STOPPED     5
-#define SOLARIS_CLD_CONTINUED   6
-#define SOLARIS_POLL_IN         1
-#define SOLARIS_POLL_OUT        2
-#define SOLARIS_POLL_MSG        3
-#define SOLARIS_POLL_ERR        4
-#define SOLARIS_POLL_PRI        5
-#define SOLARIS_POLL_HUP        6
diff --git a/arch/sparc64/solaris/socket.c b/arch/sparc64/solaris/socket.c
deleted file mode 100644
index cc69847cf240..000000000000
--- a/arch/sparc64/solaris/socket.c
+++ /dev/null
@@ -1,461 +0,0 @@
-/* $Id: socket.c,v 1.6 2002/02/08 03:57:14 davem Exp $
- * socket.c: Socket syscall emulation for Solaris 2.6+
- *
- * Copyright (C) 1998 Jakub Jelinek (jj@ultra.linux.cz)
- *
- * 1999-08-19 Fixed socketpair code 
- *            Jason Rappleye (rappleye@ccr.buffalo.edu)
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/socket.h>
-#include <linux/file.h>
-#include <linux/net.h>
-#include <linux/compat.h>
-#include <net/compat.h>
-#include <net/sock.h>
-
-#include <asm/uaccess.h>
-#include <asm/string.h>
-#include <asm/oplib.h>
-#include <asm/idprom.h>
-
-#include "conv.h"
-
-#define SOCK_SOL_STREAM		2
-#define SOCK_SOL_DGRAM		1
-#define SOCK_SOL_RAW		4
-#define SOCK_SOL_RDM		5
-#define SOCK_SOL_SEQPACKET	6
-
-#define SOL_SO_SNDLOWAT		0x1003
-#define SOL_SO_RCVLOWAT		0x1004
-#define SOL_SO_SNDTIMEO		0x1005
-#define SOL_SO_RCVTIMEO		0x1006
-#define SOL_SO_STATE		0x2000
-
-#define SOL_SS_NDELAY		0x040
-#define SOL_SS_NONBLOCK		0x080
-#define SOL_SS_ASYNC		0x100
-
-#define SO_STATE		0x000e
-
-static int socket_check(int family, int type)
-{
-	if (family != PF_UNIX && family != PF_INET)
-		return -ESOCKTNOSUPPORT;
-	switch (type) {
-	case SOCK_SOL_STREAM: type = SOCK_STREAM; break;
-	case SOCK_SOL_DGRAM: type = SOCK_DGRAM; break;
-	case SOCK_SOL_RAW: type = SOCK_RAW; break;
-	case SOCK_SOL_RDM: type = SOCK_RDM; break;
-	case SOCK_SOL_SEQPACKET: type = SOCK_SEQPACKET; break;
-	default: return -EINVAL;
-	}
-	return type;
-}
-
-static int solaris_to_linux_sockopt(int optname) 
-{
-	switch (optname) {
-	case SOL_SO_SNDLOWAT: optname = SO_SNDLOWAT; break;
-	case SOL_SO_RCVLOWAT: optname = SO_RCVLOWAT; break;
-	case SOL_SO_SNDTIMEO: optname = SO_SNDTIMEO; break;
-	case SOL_SO_RCVTIMEO: optname = SO_RCVTIMEO; break;
-	case SOL_SO_STATE: optname = SO_STATE; break;
-	};
-	
-	return optname;
-}
-	
-asmlinkage int solaris_socket(int family, int type, int protocol)
-{
-	int (*sys_socket)(int, int, int) =
-		(int (*)(int, int, int))SYS(socket);
-
-	type = socket_check (family, type);
-	if (type < 0) return type;
-	return sys_socket(family, type, protocol);
-}
-
-asmlinkage int solaris_socketpair(int *usockvec)
-{
-	int (*sys_socketpair)(int, int, int, int *) =
-		(int (*)(int, int, int, int *))SYS(socketpair);
-
-	/* solaris socketpair really only takes one arg at the syscall
-	 * level, int * usockvec. The libs apparently take care of 
-	 * making sure that family==AF_UNIX and type==SOCK_STREAM. The 
-	 * pointer we really want ends up residing in the first (and
-	 * supposedly only) argument.
-	 */
-
-	return sys_socketpair(AF_UNIX, SOCK_STREAM, 0, (int *)usockvec);
-}
-
-asmlinkage int solaris_bind(int fd, struct sockaddr *addr, int addrlen)
-{
-	int (*sys_bind)(int, struct sockaddr *, int) =
-		(int (*)(int, struct sockaddr *, int))SUNOS(104);
-
-	return sys_bind(fd, addr, addrlen);
-}
-
-asmlinkage int solaris_setsockopt(int fd, int level, int optname, u32 optval, int optlen)
-{
-	int (*sunos_setsockopt)(int, int, int, u32, int) =
-		(int (*)(int, int, int, u32, int))SUNOS(105);
-
-	optname = solaris_to_linux_sockopt(optname);
-	if (optname < 0)
-		return optname;
-	if (optname == SO_STATE)
-		return 0;
-
-	return sunos_setsockopt(fd, level, optname, optval, optlen);
-}
-
-asmlinkage int solaris_getsockopt(int fd, int level, int optname, u32 optval, u32 optlen)
-{
-	int (*sunos_getsockopt)(int, int, int, u32, u32) =
-		(int (*)(int, int, int, u32, u32))SUNOS(118);
-
-	optname = solaris_to_linux_sockopt(optname);
-	if (optname < 0)
-		return optname;
-
-	if (optname == SO_STATE)
-		optname = SOL_SO_STATE;
-
-	return sunos_getsockopt(fd, level, optname, optval, optlen);
-}
-
-asmlinkage int solaris_connect(int fd, struct sockaddr __user *addr, int addrlen)
-{
-	int (*sys_connect)(int, struct sockaddr __user *, int) =
-		(int (*)(int, struct sockaddr __user *, int))SYS(connect);
-
-	return sys_connect(fd, addr, addrlen);
-}
-
-asmlinkage int solaris_accept(int fd, struct sockaddr __user *addr, int __user *addrlen)
-{
-	int (*sys_accept)(int, struct sockaddr __user *, int __user *) =
-		(int (*)(int, struct sockaddr __user *, int __user *))SYS(accept);
-
-	return sys_accept(fd, addr, addrlen);
-}
-
-asmlinkage int solaris_listen(int fd, int backlog)
-{
-	int (*sys_listen)(int, int) =
-		(int (*)(int, int))SUNOS(106);
-
-	return sys_listen(fd, backlog);
-}
-
-asmlinkage int solaris_shutdown(int fd, int how)
-{
-	int (*sys_shutdown)(int, int) =
-		(int (*)(int, int))SYS(shutdown);
-
-	return sys_shutdown(fd, how);
-}
-
-#define MSG_SOL_OOB		0x1
-#define MSG_SOL_PEEK		0x2
-#define MSG_SOL_DONTROUTE	0x4
-#define MSG_SOL_EOR		0x8
-#define MSG_SOL_CTRUNC		0x10
-#define MSG_SOL_TRUNC		0x20
-#define MSG_SOL_WAITALL		0x40
-#define MSG_SOL_DONTWAIT	0x80
-
-static int solaris_to_linux_msgflags(int flags)
-{
-	int fl = flags & (MSG_OOB|MSG_PEEK|MSG_DONTROUTE);
-	
-	if (flags & MSG_SOL_EOR) fl |= MSG_EOR;
-	if (flags & MSG_SOL_CTRUNC) fl |= MSG_CTRUNC;
-	if (flags & MSG_SOL_TRUNC) fl |= MSG_TRUNC;
-	if (flags & MSG_SOL_WAITALL) fl |= MSG_WAITALL;
-	if (flags & MSG_SOL_DONTWAIT) fl |= MSG_DONTWAIT;
-	return fl;
-}
-
-static int linux_to_solaris_msgflags(int flags)
-{
-	int fl = flags & (MSG_OOB|MSG_PEEK|MSG_DONTROUTE);
-	
-	if (flags & MSG_EOR) fl |= MSG_SOL_EOR;
-	if (flags & MSG_CTRUNC) fl |= MSG_SOL_CTRUNC;
-	if (flags & MSG_TRUNC) fl |= MSG_SOL_TRUNC;
-	if (flags & MSG_WAITALL) fl |= MSG_SOL_WAITALL;
-	if (flags & MSG_DONTWAIT) fl |= MSG_SOL_DONTWAIT;
-	return fl;
-}
-
-asmlinkage int solaris_recvfrom(int s, char __user *buf, int len, int flags, u32 from, u32 fromlen)
-{
-	int (*sys_recvfrom)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int __user *) =
-		(int (*)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int __user *))SYS(recvfrom);
-	
-	return sys_recvfrom(s, buf, len, solaris_to_linux_msgflags(flags), A(from), A(fromlen));
-}
-
-asmlinkage int solaris_recv(int s, char __user *buf, int len, int flags)
-{
-	int (*sys_recvfrom)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int __user *) =
-		(int (*)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int __user *))SYS(recvfrom);
-	
-	return sys_recvfrom(s, buf, len, solaris_to_linux_msgflags(flags), NULL, NULL);
-}
-
-asmlinkage int solaris_sendto(int s, char __user *buf, int len, int flags, u32 to, u32 tolen)
-{
-	int (*sys_sendto)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int __user *) =
-		(int (*)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int __user *))SYS(sendto);
-	
-	return sys_sendto(s, buf, len, solaris_to_linux_msgflags(flags), A(to), A(tolen));
-}
-
-asmlinkage int solaris_send(int s, char *buf, int len, int flags)
-{
-	int (*sys_sendto)(int, void *, size_t, unsigned, struct sockaddr *, int *) =
-		(int (*)(int, void *, size_t, unsigned, struct sockaddr *, int *))SYS(sendto);
-	
-	return sys_sendto(s, buf, len, solaris_to_linux_msgflags(flags), NULL, NULL);
-}
-
-asmlinkage int solaris_getpeername(int fd, struct sockaddr *addr, int *addrlen)
-{
-	int (*sys_getpeername)(int, struct sockaddr *, int *) =
-		(int (*)(int, struct sockaddr *, int *))SYS(getpeername);
-
-	return sys_getpeername(fd, addr, addrlen);
-}
-
-asmlinkage int solaris_getsockname(int fd, struct sockaddr *addr, int *addrlen)
-{
-	int (*sys_getsockname)(int, struct sockaddr *, int *) =
-		(int (*)(int, struct sockaddr *, int *))SYS(getsockname);
-
-	return sys_getsockname(fd, addr, addrlen);
-}
-
-/* XXX This really belongs in some header file... -DaveM */
-#define MAX_SOCK_ADDR	128		/* 108 for Unix domain - 
-					   16 for IP, 16 for IPX,
-					   24 for IPv6,
-					   about 80 for AX.25 */
-
-struct sol_nmsghdr {
-	u32		msg_name;
-	int		msg_namelen;
-	u32		msg_iov;
-	u32		msg_iovlen;
-	u32		msg_control;
-	u32		msg_controllen;
-	u32		msg_flags;
-};
-
-struct sol_cmsghdr {
-	u32		cmsg_len;
-	int		cmsg_level;
-	int		cmsg_type;
-	unsigned char	cmsg_data[0];
-};
-
-static inline int msghdr_from_user32_to_kern(struct msghdr *kmsg,
-					     struct sol_nmsghdr __user *umsg)
-{
-	u32 tmp1, tmp2, tmp3;
-	int err;
-
-	err = get_user(tmp1, &umsg->msg_name);
-	err |= __get_user(tmp2, &umsg->msg_iov);
-	err |= __get_user(tmp3, &umsg->msg_control);
-	if (err)
-		return -EFAULT;
-
-	kmsg->msg_name = A(tmp1);
-	kmsg->msg_iov = A(tmp2);
-	kmsg->msg_control = A(tmp3);
-
-	err = get_user(kmsg->msg_namelen, &umsg->msg_namelen);
-	err |= get_user(kmsg->msg_controllen, &umsg->msg_controllen);
-	err |= get_user(kmsg->msg_flags, &umsg->msg_flags);
-	
-	kmsg->msg_flags = solaris_to_linux_msgflags(kmsg->msg_flags);
-	
-	return err;
-}
-
-asmlinkage int solaris_sendmsg(int fd, struct sol_nmsghdr __user *user_msg, unsigned user_flags)
-{
-	struct socket *sock;
-	char address[MAX_SOCK_ADDR];
-	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
-	unsigned char ctl[sizeof(struct cmsghdr) + 20];
-	unsigned char *ctl_buf = ctl;
-	struct msghdr msg_sys;
-	int err, ctl_len, iov_size, total_len;
-
-	err = -EFAULT;
-	if (msghdr_from_user32_to_kern(&msg_sys, user_msg))
-		goto out;
-
-	sock = sockfd_lookup(fd, &err);
-	if (!sock)
-		goto out;
-
-	/* do not move before msg_sys is valid */
-	err = -EMSGSIZE;
-	if (msg_sys.msg_iovlen > UIO_MAXIOV)
-		goto out_put;
-
-	/* Check whether to allocate the iovec area*/
-	err = -ENOMEM;
-	iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
-	if (msg_sys.msg_iovlen > UIO_FASTIOV) {
-		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
-		if (!iov)
-			goto out_put;
-	}
-
-	err = verify_compat_iovec(&msg_sys, iov, address, VERIFY_READ);
-	if (err < 0)
-		goto out_freeiov;
-	total_len = err;
-
-	err = -ENOBUFS;
-	if (msg_sys.msg_controllen > INT_MAX)
-		goto out_freeiov;
-
-	ctl_len = msg_sys.msg_controllen;
-	if (ctl_len) {
-		struct sol_cmsghdr __user *ucmsg = msg_sys.msg_control;
-		unsigned long *kcmsg;
-		compat_size_t cmlen;
-
-		err = -EINVAL;
-		if (ctl_len <= sizeof(compat_size_t))
-			goto out_freeiov;
-
-		if (ctl_len > sizeof(ctl)) {
-			err = -ENOBUFS;
-			ctl_buf = kmalloc(ctl_len, GFP_KERNEL);
-			if (!ctl_buf)
-				goto out_freeiov;
-		}
-		__get_user(cmlen, &ucmsg->cmsg_len);
-		kcmsg = (unsigned long *) ctl_buf;
-		*kcmsg++ = (unsigned long)cmlen;
-		err = -EFAULT;
-		if (copy_from_user(kcmsg, &ucmsg->cmsg_level,
-				   ctl_len - sizeof(compat_size_t)))
-			goto out_freectl;
-		msg_sys.msg_control = ctl_buf;
-	}
-	msg_sys.msg_flags = solaris_to_linux_msgflags(user_flags);
-
-	if (sock->file->f_flags & O_NONBLOCK)
-		msg_sys.msg_flags |= MSG_DONTWAIT;
-	err = sock_sendmsg(sock, &msg_sys, total_len);
-
-out_freectl:
-	if (ctl_buf != ctl)    
-		sock_kfree_s(sock->sk, ctl_buf, ctl_len);
-out_freeiov:
-	if (iov != iovstack)
-		sock_kfree_s(sock->sk, iov, iov_size);
-out_put:
-	sockfd_put(sock);
-out:       
-	return err;
-}
-
-asmlinkage int solaris_recvmsg(int fd, struct sol_nmsghdr __user *user_msg, unsigned int user_flags)
-{
-	struct socket *sock;
-	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov = iovstack;
-	struct msghdr msg_sys;
-	unsigned long cmsg_ptr;
-	int err, iov_size, total_len, len;
-
-	/* kernel mode address */
-	char addr[MAX_SOCK_ADDR];
-
-	/* user mode address pointers */
-	struct sockaddr __user *uaddr;
-	int __user *uaddr_len;
-
-	if (msghdr_from_user32_to_kern(&msg_sys, user_msg))
-		return -EFAULT;
-
-	sock = sockfd_lookup(fd, &err);
-	if (!sock)
-		goto out;
-
-	err = -EMSGSIZE;
-	if (msg_sys.msg_iovlen > UIO_MAXIOV)
-		goto out_put;
-
-	/* Check whether to allocate the iovec area*/
-	err = -ENOMEM;
-	iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
-	if (msg_sys.msg_iovlen > UIO_FASTIOV) {
-		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
-		if (!iov)
-			goto out_put;
-	}
-
-	/*
-	 *	Save the user-mode address (verify_iovec will change the
-	 *	kernel msghdr to use the kernel address space)
-	 */
-	 
-	uaddr = (void __user *) msg_sys.msg_name;
-	uaddr_len = &user_msg->msg_namelen;
-	err = verify_compat_iovec(&msg_sys, iov, addr, VERIFY_WRITE);
-	if (err < 0)
-		goto out_freeiov;
-	total_len = err;
-
-	cmsg_ptr = (unsigned long) msg_sys.msg_control;
-	msg_sys.msg_flags = MSG_CMSG_COMPAT;
-
-	if (sock->file->f_flags & O_NONBLOCK)
-		user_flags |= MSG_DONTWAIT;
-
-	err = sock_recvmsg(sock, &msg_sys, total_len, user_flags);
-	if(err < 0)
-		goto out_freeiov;
-
-	len = err;
-
-	if (uaddr != NULL) {
-		err = move_addr_to_user(addr, msg_sys.msg_namelen, uaddr, uaddr_len);
-		if (err < 0)
-			goto out_freeiov;
-	}
-	err = __put_user(linux_to_solaris_msgflags(msg_sys.msg_flags), &user_msg->msg_flags);
-	if (err)
-		goto out_freeiov;
-	err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
-			 &user_msg->msg_controllen);
-	if (err)
-		goto out_freeiov;
-	err = len;
-
-out_freeiov:
-	if (iov != iovstack)
-		sock_kfree_s(sock->sk, iov, iov_size);
-out_put:
-	sockfd_put(sock);
-out:
-	return err;
-}
diff --git a/arch/sparc64/solaris/socksys.c b/arch/sparc64/solaris/socksys.c
deleted file mode 100644
index 7736411f244f..000000000000
--- a/arch/sparc64/solaris/socksys.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/* $Id: socksys.c,v 1.21 2002/02/08 03:57:14 davem Exp $
- * socksys.c: /dev/inet/ stuff for Solaris emulation.
- *
- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 1997, 1998 Patrik Rak (prak3264@ss1000.ms.mff.cuni.cz)
- * Copyright (C) 1995, 1996 Mike Jagdis (jaggy@purplet.demon.co.uk)
- */
-
-/*
- *  Dave, _please_ give me specifications on this fscking mess so that I
- * could at least get it into the state when it wouldn't screw the rest of
- * the kernel over.  socksys.c and timod.c _stink_ and we are not talking
- * H2S here, it's isopropilmercaptan in concentrations way over LD50. -- AV
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/ioctl.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/slab.h>
-#include <linux/syscalls.h>
-#include <linux/in.h>
-
-#include <net/sock.h>
-
-#include <asm/uaccess.h>
-#include <asm/termios.h>
-
-#include "conv.h"
-#include "socksys.h"
-
-static int af_inet_protocols[] = {
-IPPROTO_ICMP, IPPROTO_ICMP, IPPROTO_IGMP, IPPROTO_IPIP, IPPROTO_TCP,
-IPPROTO_EGP, IPPROTO_PUP, IPPROTO_UDP, IPPROTO_IDP, IPPROTO_RAW,
-0, 0, 0, 0, 0, 0,
-};
-
-#ifndef DEBUG_SOLARIS_KMALLOC
-
-#define mykmalloc kmalloc
-#define mykfree kfree
-
-#else
-
-extern void * mykmalloc(size_t s, gfp_t gfp);
-extern void mykfree(void *);
-
-#endif
-
-static unsigned int (*sock_poll)(struct file *, poll_table *);
-
-static struct file_operations socksys_file_ops = {
-	/* Currently empty */
-};
-
-static int socksys_open(struct inode * inode, struct file * filp)
-{
-	int family, type, protocol, fd;
-	struct dentry *dentry;
-	int (*sys_socket)(int,int,int) =
-		(int (*)(int,int,int))SUNOS(97);
-        struct sol_socket_struct * sock;
-	
-	family = ((iminor(inode) >> 4) & 0xf);
-	switch (family) {
-	case AF_UNIX:
-		type = SOCK_STREAM;
-		protocol = 0;
-		break;
-	case AF_INET:
-		protocol = af_inet_protocols[iminor(inode) & 0xf];
-		switch (protocol) {
-		case IPPROTO_TCP: type = SOCK_STREAM; break;
-		case IPPROTO_UDP: type = SOCK_DGRAM; break;
-		default: type = SOCK_RAW; break;
-		}
-		break;
-	default:
-		type = SOCK_RAW;
-		protocol = 0;
-		break;
-	}
-
-	fd = sys_socket(family, type, protocol);
-	if (fd < 0)
-		return fd;
-	/*
-	 * N.B. The following operations are not legal!
-	 *
-	 * No shit.  WTF is it supposed to do, anyway?
-	 *
-	 * Try instead:
-	 * d_delete(filp->f_path.dentry), then d_instantiate with sock inode
-	 */
-	dentry = filp->f_path.dentry;
-	filp->f_path.dentry = dget(fcheck(fd)->f_path.dentry);
-	filp->f_path.dentry->d_inode->i_rdev = inode->i_rdev;
-	filp->f_path.dentry->d_inode->i_flock = inode->i_flock;
-	SOCKET_I(filp->f_path.dentry->d_inode)->file = filp;
-	filp->f_op = &socksys_file_ops;
-        sock = (struct sol_socket_struct*) 
-        	mykmalloc(sizeof(struct sol_socket_struct), GFP_KERNEL);
-        if (!sock) return -ENOMEM;
-	SOLDD(("sock=%016lx(%016lx)\n", sock, filp));
-        sock->magic = SOLARIS_SOCKET_MAGIC;
-        sock->modcount = 0;
-        sock->state = TS_UNBND;
-        sock->offset = 0;
-        sock->pfirst = sock->plast = NULL;
-        filp->private_data = sock;
-	SOLDD(("filp->private_data %016lx\n", filp->private_data));
-
-	sys_close(fd);
-	dput(dentry);
-	return 0;
-}
-
-static int socksys_release(struct inode * inode, struct file * filp)
-{
-        struct sol_socket_struct * sock;
-        struct T_primsg *it;
-
-	/* XXX: check this */
-	sock = (struct sol_socket_struct *)filp->private_data;
-	SOLDD(("sock release %016lx(%016lx)\n", sock, filp));
-	it = sock->pfirst;
-	while (it) {
-		struct T_primsg *next = it->next;
-		
-		SOLDD(("socksys_release %016lx->%016lx\n", it, next));
-		mykfree((char*)it);
-		it = next;
-	}
-	filp->private_data = NULL;
-	SOLDD(("socksys_release %016lx\n", sock));
-	mykfree((char*)sock);
-	return 0;
-}
-
-static unsigned int socksys_poll(struct file * filp, poll_table * wait)
-{
-	struct inode *ino;
-	unsigned int mask = 0;
-
-	ino=filp->f_path.dentry->d_inode;
-	if (ino && S_ISSOCK(ino->i_mode)) {
-		struct sol_socket_struct *sock;
-		sock = (struct sol_socket_struct*)filp->private_data;
-		if (sock && sock->pfirst) {
-			mask |= POLLIN | POLLRDNORM;
-			if (sock->pfirst->pri == MSG_HIPRI)
-				mask |= POLLPRI;
-		}
-	}
-	if (sock_poll)
-		mask |= (*sock_poll)(filp, wait);
-	return mask;
-}
-	
-static const struct file_operations socksys_fops = {
-	.open =		socksys_open,
-	.release =	socksys_release,
-};
-
-int __init init_socksys(void)
-{
-	int ret;
-	struct file * file;
-	int (*sys_socket)(int,int,int) =
-		(int (*)(int,int,int))SUNOS(97);
-	int (*sys_close)(unsigned int) = 
-		(int (*)(unsigned int))SYS(close);
-	
-	ret = register_chrdev (30, "socksys", &socksys_fops);
-	if (ret < 0) {
-		printk ("Couldn't register socksys character device\n");
-		return ret;
-	}
-	ret = sys_socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-	if (ret < 0) {
-		printk ("Couldn't create socket\n");
-		return ret;
-	}
-
-	file = fcheck(ret);
-	/* N.B. Is this valid? Suppose the f_ops are in a module ... */
-	socksys_file_ops = *file->f_op;
-	sys_close(ret);
-	sock_poll = socksys_file_ops.poll;
-	socksys_file_ops.poll = socksys_poll;
-	socksys_file_ops.release = socksys_release;
-	return 0;
-}
-
-void __exit cleanup_socksys(void)
-{
-	unregister_chrdev(30, "socksys");
-}
diff --git a/arch/sparc64/solaris/socksys.h b/arch/sparc64/solaris/socksys.h
deleted file mode 100644
index 5d1b78ec1600..000000000000
--- a/arch/sparc64/solaris/socksys.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/* $Id: socksys.h,v 1.2 1998/03/26 08:46:07 jj Exp $
- * socksys.h: Definitions for STREAMS modules emulation code.
- *
- * Copyright (C) 1998 Patrik Rak (prak3264@ss1000.ms.mff.cuni.cz)
- */
-
-#define MSG_HIPRI	0x01
-#define MSG_ANY		0x02
-#define MSG_BAND	0x04
-
-#define MORECTL		1
-#define MOREDATA	2
-
-#define	TBADADDR		1
-#define	TBADOPT			2
-#define	TACCES			3
-#define TBADF			4
-#define TNOADDR			5
-#define TOUTSTATE	        6
-#define TBADSEQ		        7
-#define TSYSERR			8
-#define TLOOK		        9
-#define TBADDATA	       10
-#define TBUFOVFLW	       11
-#define TFLOW		       12
-#define	TNODATA		       13
-#define TNODIS		       14
-#define TNOUDERR	       15
-#define TBADFLAG	       16
-#define TNOREL		       17
-#define TNOTSUPPORT	       18
-#define TSTATECHNG	       19
-
-#define T_CONN_REQ      0
-#define T_CONN_RES      1
-#define T_DISCON_REQ    2
-#define T_DATA_REQ      3
-#define T_EXDATA_REQ    4
-#define T_INFO_REQ      5
-#define T_BIND_REQ      6
-#define T_UNBIND_REQ    7
-#define T_UNITDATA_REQ  8
-#define T_OPTMGMT_REQ   9
-#define T_ORDREL_REQ    10
-
-#define T_CONN_IND      11
-#define T_CONN_CON      12
-#define T_DISCON_IND    13
-#define T_DATA_IND      14
-#define T_EXDATA_IND    15
-#define T_INFO_ACK      16
-#define T_BIND_ACK      17
-#define T_ERROR_ACK     18
-#define T_OK_ACK        19
-#define T_UNITDATA_IND  20
-#define T_UDERROR_IND   21
-#define T_OPTMGMT_ACK   22
-#define T_ORDREL_IND    23
-
-#define T_NEGOTIATE	0x0004
-#define T_FAILURE	0x0040
-
-#define TS_UNBND	0	/* unbound */
-#define	TS_WACK_BREQ	1	/* waiting for T_BIND_REQ ack  */
-#define TS_WACK_UREQ	2	/* waiting for T_UNBIND_REQ ack */
-#define TS_IDLE		3	/* idle */
-#define TS_WACK_OPTREQ	4	/* waiting for T_OPTMGMT_REQ ack */
-#define TS_WACK_CREQ	5	/* waiting for T_CONN_REQ ack */
-#define TS_WCON_CREQ	6	/* waiting for T_CONN_REQ confirmation */
-#define	TS_WRES_CIND	7	/* waiting for T_CONN_IND */
-#define TS_WACK_CRES	8	/* waiting for T_CONN_RES ack */
-#define TS_DATA_XFER	9	/* data transfer */
-#define TS_WIND_ORDREL	10	/* releasing read but not write */
-#define TS_WREQ_ORDREL	11      /* wait to release write but not read */
-#define TS_WACK_DREQ6	12	/* waiting for T_DISCON_REQ ack */
-#define TS_WACK_DREQ7	13	/* waiting for T_DISCON_REQ ack */
-#define TS_WACK_DREQ9	14	/* waiting for T_DISCON_REQ ack */
-#define TS_WACK_DREQ10	15	/* waiting for T_DISCON_REQ ack */
-#define TS_WACK_DREQ11	16	/* waiting for T_DISCON_REQ ack */
-#define TS_NOSTATES	17
-
-struct T_conn_req {
-	s32 PRIM_type; 
-	s32 DEST_length;
-	s32 DEST_offset;
-	s32 OPT_length;
-	s32 OPT_offset;
-};
-
-struct T_bind_req {
-	s32 PRIM_type;
-	s32 ADDR_length;
-	s32 ADDR_offset;
-	u32 CONIND_number;
-};
-
-struct T_unitdata_req {
-	s32 PRIM_type; 
-	s32 DEST_length;
-	s32 DEST_offset;
-	s32 OPT_length;
-	s32 OPT_offset;
-};
-
-struct T_optmgmt_req {
-	s32 PRIM_type; 
-	s32 OPT_length;
-	s32 OPT_offset;
-	s32 MGMT_flags;
-};
-
-struct T_bind_ack {
-	s32 PRIM_type;
-	s32 ADDR_length;
-	s32 ADDR_offset;
-	u32 CONIND_number;
-};
-
-struct T_error_ack {
-	s32 PRIM_type;
-	s32 ERROR_prim;
-	s32 TLI_error;
-	s32 UNIX_error;
-};
-
-struct T_ok_ack {
-	s32 PRIM_type;
-	s32 CORRECT_prim;
-};
-
-struct T_conn_ind {
-	s32 PRIM_type;
-	s32 SRC_length;
-	s32 SRC_offset;
-	s32 OPT_length;
-	s32 OPT_offset;
-	s32 SEQ_number;
-};
-
-struct T_conn_con {
-	s32 PRIM_type;
-	s32 RES_length;
-	s32 RES_offset;
-	s32 OPT_length;
-	s32 OPT_offset;
-};
-
-struct T_discon_ind {
-	s32 PRIM_type;
-	s32 DISCON_reason;
-	s32 SEQ_number;
-};
-
-struct T_unitdata_ind {
-	s32 PRIM_type;
-	s32 SRC_length;
-	s32 SRC_offset;
-	s32 OPT_length;
-	s32 OPT_offset;
-};
-
-struct T_optmgmt_ack {
-	s32 PRIM_type; 
-	s32 OPT_length;
-	s32 OPT_offset;
-	s32 MGMT_flags;
-};
-
-struct opthdr {
-	s32 level;
-	s32 name;
-	s32 len;
-	char value[0];	
-};
-
-struct T_primsg {
-	struct T_primsg *next;
-	unsigned char pri;
-	unsigned char band;
-	int length;
-	s32 type;
-};
-
-struct strbuf {
-	s32 maxlen;
-	s32 len;
-	u32 buf;
-} ;
-
-/* Constants used by STREAMS modules emulation code */
-
-typedef char sol_module;
-
-#define MAX_NR_STREAM_MODULES   16
-
-/* Private data structure assigned to sockets. */
-
-struct sol_socket_struct {
-        int magic;
-        int modcount;
-        sol_module module[MAX_NR_STREAM_MODULES];
-        long state;
-        int offset;
-        struct T_primsg *pfirst, *plast;
-};
-
-#define SOLARIS_SOCKET_MAGIC    0xADDED
-
diff --git a/arch/sparc64/solaris/systbl.S b/arch/sparc64/solaris/systbl.S
deleted file mode 100644
index 7043ca18caf9..000000000000
--- a/arch/sparc64/solaris/systbl.S
+++ /dev/null
@@ -1,285 +0,0 @@
-/* $Id: systbl.S,v 1.11 2000/03/13 21:57:35 davem Exp $
- * systbl.S: System call entry point table for Solaris compatibility.
- *
- * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 1996 Miguel de Icaza (miguel@nuclecu.unam.mx)
- */
-
-#include <asm/unistd.h>
-
-/* Fall back to sys_call_table32 entry */
-#define CHAIN(name)	__NR_##name
-
-/* Pass pt_regs pointer as first argument */
-#define REGS(name)	name+1
-
-/* Hack till all be implemented */
-#define solaris_getpmsg		solaris_unimplemented
-#define solaris_hrtsys		solaris_unimplemented
-#define solaris_msgsys		solaris_unimplemented
-#define solaris_putpmsg		solaris_unimplemented
-#define solaris_semsys		solaris_unimplemented
-
-        .data
-	.globl		solaris_sys_table
-solaris_sys_table:
-	.word solaris_unimplemented	/* nosys		0	*/
-	.word CHAIN(exit)		/* exit		d	1	*/
-	.word CHAIN(fork)		/* fork			2	*/
-	.word CHAIN(read)		/* read		dpd	3	*/
-	.word CHAIN(write)		/* write	dpd	4	*/
-	.word solaris_open		/* open		soo	5	*/
-	.word CHAIN(close)		/* close	d	6	*/
-	.word solaris_wait		/* wait		xxx	7	*/
-	.word CHAIN(creat)		/* creat	so	8	*/
-	.word CHAIN(link)		/* link		ss	9	*/
-	.word CHAIN(unlink)		/* unlink	s	10	*/
-	.word solaris_unimplemented	/* exec		sxx	11	*/
-	.word CHAIN(chdir)		/* chdir	s	12	*/
-	.word CHAIN(time)		/* time			13	*/
-	.word solaris_mknod		/* mknod	sox	14	*/
-	.word CHAIN(chmod)		/* chmod	so	15	*/ 
-	.word CHAIN(chown)		/* chown	sdd	16	*/
-	.word solaris_brk		/* brk/break	x	17	*/
-	.word solaris_stat		/* stat		sp	18	*/
-	.word CHAIN(lseek)		/* seek/lseek	ddd	19	*/
-	.word solaris_getpid		/* getpid		20	*/
-	.word solaris_unimplemented	/* mount		21	*/
-	.word CHAIN(umount)		/* umount	s	22	*/
-	.word CHAIN(setuid)		/* setuid	d	23	*/
-	.word solaris_getuid		/* getuid		24	*/
-	.word CHAIN(stime)		/* stime	d	25	*/
-#if 0
-	.word solaris_ptrace		/* ptrace	xdxx	26	*/
-#else
-	.word CHAIN(ptrace)		/* ptrace	xdxx	26	*/
-#endif
-	.word CHAIN(alarm)		/* alarm	d	27	*/
-	.word solaris_fstat		/* fstat	dp	28	*/
-	.word CHAIN(pause)		/* pause		29	*/
-	.word CHAIN(utime)		/* utime	xx	30	*/
-	.word solaris_unimplemented	/* stty			31	*/
-	.word solaris_unimplemented	/* gtty			32	*/
-	.word solaris_access		/* access	so	33	*/
-	.word CHAIN(nice)		/* nice		d	34	*/
-	.word solaris_statfs		/* statfs	spdd	35	*/
-	.word CHAIN(sync)		/* sync			36	*/
-	.word solaris_kill		/* kill		dd	37	*/
-	.word solaris_fstatfs		/* fstatfs	dpdd	38	*/
-	.word solaris_procids		/* pgrpsys	ddd	39	*/
-	.word solaris_unimplemented	/* xenix		40	*/
-	.word CHAIN(dup)		/* dup		d	41	*/
-	.word CHAIN(pipe)		/* pipe			42	*/
-	.word CHAIN(times)		/* times	p	43	*/
-	.word 44 /*CHAIN(profil)*/	/* prof		xxxx	44	*/
-	.word solaris_unimplemented	/* lock/plock		45	*/
-	.word CHAIN(setgid)		/* setgid	d	46	*/
-	.word solaris_getgid		/* getgid		47	*/
-	.word solaris_sigfunc		/* sigfunc	xx	48	*/
-	.word REGS(solaris_msgsys)	/* msgsys	dxddd	49	*/
-	.word solaris_unimplemented	/* syssun/3b		50	*/
-	.word CHAIN(acct)		/* acct/sysacct	x	51	*/
-	.word solaris_shmsys		/* shmsys	ddxo	52	*/
-	.word REGS(solaris_semsys)	/* semsys	dddx	53	*/
-	.word solaris_ioctl		/* ioctl	dxx	54	*/
-	.word solaris_unimplemented	/* uadmin	xxx	55	*/
-	.word solaris_unimplemented	/* reserved:exch	56	*/
-	.word solaris_utssys		/* utssys	x	57	*/
-	.word CHAIN(fsync)		/* fsync	d	58	*/
-	.word CHAIN(execve)		/* execv	spp	59	*/
-	.word CHAIN(umask)		/* umask	o	60	*/
-	.word CHAIN(chroot)		/* chroot	s	61	*/
-	.word solaris_fcntl		/* fcntl	dxx	62	*/
-	.word solaris_ulimit		/* ulimit	xx	63	*/
-	.word solaris_unimplemented	/* ?			64	*/
-	.word solaris_unimplemented	/* ?			65	*/
-	.word solaris_unimplemented	/* ?			66	*/
-	.word solaris_unimplemented	/* ?			67	*/
-	.word solaris_unimplemented	/* ?			68	*/
-	.word solaris_unimplemented	/* ?			69	*/
-	.word solaris_unimplemented	/* advfs		70	*/
-	.word solaris_unimplemented	/* unadvfs		71	*/
-	.word solaris_unimplemented	/* rmount		72	*/
-	.word solaris_unimplemented	/* rumount		73	*/
-	.word solaris_unimplemented	/* rfstart		74	*/
-	.word solaris_unimplemented	/* ?			75	*/
-	.word solaris_unimplemented	/* rdebug		76	*/
-	.word solaris_unimplemented	/* rfstop		77	*/
-	.word solaris_unimplemented	/* rfsys		78	*/
-	.word CHAIN(rmdir)		/* rmdir	s	79	*/
-	.word CHAIN(mkdir)		/* mkdir	so	80	*/
-	.word CHAIN(getdents)		/* getdents	dxd	81	*/
-	.word solaris_unimplemented	/* libattach		82	*/
-	.word solaris_unimplemented	/* libdetach		83	*/
-	.word CHAIN(sysfs)		/* sysfs	dxx	84	*/
-	.word solaris_getmsg		/* getmsg	dxxx	85	*/
-	.word solaris_putmsg		/* putmsg	dxxd	86	*/
-	.word CHAIN(poll)		/* poll		xdd	87	*/
-	.word solaris_lstat		/* lstat	sp	88	*/
-	.word CHAIN(symlink)		/* symlink	ss	89	*/
-	.word CHAIN(readlink)		/* readlink	spd	90	*/
-	.word CHAIN(setgroups)		/* setgroups	dp	91	*/
-	.word CHAIN(getgroups)		/* getgroups	dp	92	*/
-	.word CHAIN(fchmod)		/* fchmod	do	93	*/
-	.word CHAIN(fchown)		/* fchown	ddd	94	*/
-	.word solaris_sigprocmask	/* sigprocmask	dxx	95	*/
-	.word solaris_sigsuspend	/* sigsuspend	x	96	*/
-	.word solaris_sigaltstack	/* sigaltstack	xx	97	*/
-	.word solaris_sigaction		/* sigaction	dxx	98	*/
-	.word solaris_sigpending	/* sigpending	dd	99	*/
-	.word REGS(solaris_context)	/* context		100	*/
-	.word solaris_unimplemented	/* evsys		101	*/
-	.word solaris_unimplemented	/* evtrapret		102	*/
-	.word solaris_statvfs		/* statvfs	sp	103	*/
-	.word solaris_fstatvfs		/* fstatvfs	dp	104	*/
-	.word solaris_unimplemented	/* unknown		105	*/
-	.word solaris_unimplemented	/* nfssys		106	*/
-	.word solaris_waitid		/* waitid	ddxd	107	*/
-	.word solaris_unimplemented	/* sigsendsys	ddd	108	*/
-	.word REGS(solaris_hrtsys)	/* hrtsys	xxx	109	*/
-	.word solaris_unimplemented	/* acancel	dxd	110	*/
-	.word solaris_unimplemented	/* async		111	*/
-	.word solaris_unimplemented	/* priocntlsys		112	*/
-	.word solaris_pathconf		/* pathconf	sd	113	*/
-	.word CHAIN(mincore)		/* mincore	d	114	*/
-	.word solaris_mmap		/* mmap		xxxxdx	115	*/
-	.word CHAIN(mprotect)		/* mprotect	xdx	116	*/
-	.word CHAIN(munmap)		/* munmap	xd	117	*/
-	.word solaris_fpathconf		/* fpathconf	dd	118	*/
-	.word CHAIN(fork)		/* fork			119	*/
-	.word solaris_unimplemented	/* fchdir	d	120	*/
-	.word CHAIN(readv)		/* readv	dxd	121	*/
-	.word CHAIN(writev)		/* writev	dxd	122	*/
-	.word solaris_xstat		/* xstat	dsx	123	*/
-	.word solaris_lxstat		/* lxstat	dsx	124	*/
-	.word solaris_fxstat		/* fxstat	ddx	125	*/
-	.word solaris_xmknod		/* xmknod	dsox	126	*/
-	.word solaris_unimplemented	/* syslocal	d	127	*/
-	.word solaris_setrlimit		/* setrlimit	dp	128	*/
-	.word solaris_getrlimit		/* getrlimit	dp	129	*/
-	.word CHAIN(chown)		/* lchown	sdd	130	*/
-	.word solaris_unimplemented	/* memcntl		131	*/
-	.word solaris_getpmsg		/* getpmsg	dxxxx	132	*/
-	.word solaris_putpmsg		/* putpmsg	dxxdd	133	*/
-	.word CHAIN(rename)		/* rename	ss	134	*/
-	.word solaris_utsname		/* uname	x	135	*/
-	.word solaris_unimplemented	/* setegid		136	*/
-	.word solaris_sysconf		/* sysconfig	d	137	*/
-	.word solaris_unimplemented	/* adjtime		138	*/
-	.word solaris_sysinfo		/* systeminfo	dsd	139	*/
-	.word solaris_unimplemented	/* ?			140	*/
-	.word solaris_unimplemented	/* seteuid		141	*/
-	.word solaris_unimplemented	/* ?			142	*/
-	.word solaris_unimplemented	/* ?			143	*/
-	.word solaris_unimplemented	/* secsys	dx	144	*/
-	.word solaris_unimplemented	/* filepriv	sdxd	145	*/
-	.word solaris_unimplemented	/* procpriv	dxd	146	*/
-	.word solaris_unimplemented	/* devstat	sdx	147	*/
-	.word solaris_unimplemented	/* aclipc	ddddx	148	*/
-	.word solaris_unimplemented	/* fdevstat	ddx	149	*/
-	.word solaris_unimplemented	/* flvlfile	ddx	150	*/
-	.word solaris_unimplemented	/* lvlfile	sdx	151	*/
-	.word solaris_unimplemented	/* ?			152	*/
-	.word solaris_unimplemented	/* fchroot	d	153	*/
-	.word solaris_unimplemented	/* lvlproc	dx	154	*/
-	.word solaris_unimplemented	/* ?			155	*/
-	.word solaris_gettimeofday	/* gettimeofday	x	156	*/
-	.word CHAIN(getitimer)		/* getitimer	dx	157	*/
-	.word CHAIN(setitimer)		/* setitimer	dxx	158	*/
-	.word solaris_unimplemented	/* lwp-xxx		159	*/
-	.word solaris_unimplemented	/* lwp-xxx		160	*/
-	.word solaris_unimplemented	/* lwp-xxx		161	*/
-	.word solaris_unimplemented	/* lwp-xxx		162	*/
-	.word solaris_unimplemented	/* lwp-xxx		163	*/
-	.word solaris_unimplemented	/* lwp-xxx		164	*/
-	.word solaris_unimplemented	/* lwp-xxx		165	*/
-	.word solaris_unimplemented	/* lwp-xxx		166	*/
-	.word solaris_unimplemented	/* lwp-xxx		167	*/
-	.word solaris_unimplemented	/* lwp-xxx		168	*/
-	.word solaris_unimplemented	/* lwp-xxx		169	*/
-	.word solaris_unimplemented	/* lwp-xxx		170	*/
-	.word solaris_unimplemented	/* lwp-xxx		171	*/
-	.word solaris_unimplemented	/* lwp-xxx		172	*/
-	.word solaris_pread		/* pread	dpdd	173	*/
-	.word solaris_pwrite		/* pwrite	dpdd	174	*/
-	.word REGS(solaris_llseek)	/* llseek	dLd	175	*/
-	.word solaris_unimplemented	/* lwpself		176	*/
-	.word solaris_unimplemented	/* lwpinfo		177	*/
-	.word solaris_unimplemented	/* lwpprivate		178	*/
-	.word solaris_unimplemented	/* processorbind	179	*/
-	.word solaris_unimplemented	/* processorexbind	180	*/
-	.word solaris_unimplemented	/* 			181	*/
-	.word solaris_unimplemented	/* sync_mailbox		182	*/
-	.word solaris_unimplemented	/* prepblock		183	*/
-	.word solaris_unimplemented	/* block		184	*/
-	.word solaris_acl		/* acl		sddp	185	*/
-	.word solaris_unimplemented	/* unblock		186	*/
-	.word solaris_unimplemented	/* cancelblock		187	*/
-	.word solaris_unimplemented	/* ?			188	*/
-	.word solaris_unimplemented	/* xxxxx		189	*/
-	.word solaris_unimplemented	/* xxxxxe		190	*/
-	.word solaris_unimplemented	/*			191	*/
-	.word solaris_unimplemented	/*			192	*/
-	.word solaris_unimplemented	/*			193	*/
-	.word solaris_unimplemented	/*			194	*/
-	.word solaris_unimplemented	/* 			195	*/
-	.word solaris_unimplemented	/* 			196	*/
-	.word solaris_unimplemented	/* 			197	*/
-	.word solaris_unimplemented	/* 			198	*/
-	.word CHAIN(nanosleep)		/* nanosleep	dd	199	*/
-	.word solaris_facl		/* facl		dddp	200	*/
-	.word solaris_unimplemented	/* 			201	*/
-	.word CHAIN(setreuid)		/* setreuid	dd	202	*/
-	.word CHAIN(setregid)		/* setregid	dd	203	*/
-	.word solaris_unimplemented	/* 			204	*/
-	.word solaris_unimplemented	/* 			205	*/
-	.word solaris_unimplemented	/* 			206	*/
-	.word solaris_unimplemented	/* 			207	*/
-	.word solaris_unimplemented	/* 			208	*/
-	.word solaris_unimplemented	/* 			209	*/
-	.word solaris_unimplemented	/* 			210	*/
-	.word solaris_unimplemented	/* 			211	*/
-	.word solaris_unimplemented	/* 			212	*/
-	.word solaris_getdents64	/* getdents64	dpd	213	*/
-	.word REGS(solaris_mmap64)	/* mmap64	xxxxdX	214	*/
-	.word solaris_stat64		/* stat64	sP	215	*/
-	.word solaris_lstat64		/* lstat64	sP	216	*/
-	.word solaris_fstat64		/* fstat64	dP	217	*/
-	.word solaris_statvfs64		/* statvfs64	sP	218	*/
-	.word solaris_fstatvfs64	/* fstatvfs64	dP	219	*/
-	.word solaris_setrlimit64	/* setrlimit64	dP	220	*/
-	.word solaris_getrlimit64	/* getrlimit64	dP	221	*/
-	.word CHAIN(pread64)		/* pread64	dpdD	222	*/
-	.word CHAIN(pwrite64)		/* pwrite64	dpdD	223	*/
-	.word CHAIN(creat)		/* creat64	so	224	*/
-	.word solaris_open		/* open64	soo	225	*/
-	.word solaris_unimplemented	/* 			226	*/
-	.word solaris_unimplemented	/* 			227	*/
-	.word solaris_unimplemented	/* 			228	*/
-	.word solaris_unimplemented	/* 			229	*/
-	.word solaris_socket		/* socket	ddd	230	*/
-	.word solaris_socketpair	/* socketpair	dddp	231	*/
-	.word solaris_bind		/* bind		dpd	232	*/
-	.word solaris_listen		/* listen	dd	233	*/
-	.word solaris_accept		/* accept	dpp	234	*/
-	.word solaris_connect		/* connect	dpd	235	*/
-	.word solaris_shutdown		/* shutdown	dd	236	*/
-	.word solaris_recv		/* recv		dpdd	237	*/
-	.word solaris_recvfrom		/* recvfrom	dpddpp	238	*/
-	.word solaris_recvmsg		/* recvmsg	dpd	239	*/
-	.word solaris_send		/* send		dpdd	240	*/
-	.word solaris_sendmsg		/* sendmsg	dpd	241	*/
-	.word solaris_sendto		/* sendto	dpddpd	242	*/
-	.word solaris_getpeername	/* getpeername	dpp	243	*/
-	.word solaris_getsockname	/* getsockname	dpp	244	*/
-	.word solaris_getsockopt	/* getsockopt	dddpp	245	*/
-	.word solaris_setsockopt	/* setsockopt	dddpp	246	*/
-	.word solaris_unimplemented	/* 			247	*/
-	.word solaris_ntp_gettime	/* ntp_gettime	p	248	*/
-	.word solaris_ntp_adjtime	/* ntp_adjtime	p	249	*/
-	.word solaris_unimplemented	/* 			250	*/
-	.word solaris_unimplemented	/* 			251	*/
-	.word solaris_unimplemented	/* 			252	*/
-	.word solaris_unimplemented	/* 			253	*/
-	.word solaris_unimplemented	/* 			254	*/
-	.word solaris_unimplemented	/* 			255	*/
diff --git a/arch/sparc64/solaris/timod.c b/arch/sparc64/solaris/timod.c
deleted file mode 100644
index 15234fcd191a..000000000000
--- a/arch/sparc64/solaris/timod.c
+++ /dev/null
@@ -1,976 +0,0 @@
-/* $Id: timod.c,v 1.19 2002/02/08 03:57:14 davem Exp $
- * timod.c: timod emulation.
- *
- * Copyright (C) 1998 Patrik Rak (prak3264@ss1000.ms.mff.cuni.cz)
- *
- * Streams & timod emulation based on code
- * Copyright (C) 1995, 1996 Mike Jagdis (jaggy@purplet.demon.co.uk)
- *
- */
- 
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/smp_lock.h>
-#include <linux/ioctl.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/netdevice.h>
-#include <linux/poll.h>
-
-#include <net/sock.h>
-
-#include <asm/uaccess.h>
-#include <asm/termios.h>
-
-#include "conv.h"
-#include "socksys.h"
-
-asmlinkage int solaris_ioctl(unsigned int fd, unsigned int cmd, u32 arg);
-
-static DEFINE_SPINLOCK(timod_pagelock);
-static char * page = NULL ;
-
-#ifndef DEBUG_SOLARIS_KMALLOC
-
-#define mykmalloc kmalloc
-#define mykfree kfree
-
-#else
-
-void * mykmalloc(size_t s, gfp_t gfp)
-{
-	static char * page;
-	static size_t free;
-	void * r;
-	s = ((s + 63) & ~63);
-	if( s > PAGE_SIZE ) {
-		SOLD("too big size, calling real kmalloc");
-		return kmalloc(s, gfp);
-	}
-	if( s > free ) {
-		/* we are wasting memory, but we don't care */
-		page = (char *)__get_free_page(gfp);
-		free = PAGE_SIZE;
-	}
-	r = page;
-	page += s;
-	free -= s;
-	return r;
-}
-
-void mykfree(void *p)
-{
-}
-
-#endif
-
-#ifndef DEBUG_SOLARIS
-
-#define BUF_SIZE	PAGE_SIZE
-#define PUT_MAGIC(a,m)
-#define SCHECK_MAGIC(a,m)
-#define BUF_OFFSET	0
-#define MKCTL_TRAILER	0
-
-#else
-
-#define BUF_SIZE	(PAGE_SIZE-2*sizeof(u64))
-#define BUFPAGE_MAGIC	0xBADC0DEDDEADBABEL
-#define MKCTL_MAGIC	0xDEADBABEBADC0DEDL
-#define PUT_MAGIC(a,m)	do{(*(u64*)(a))=(m);}while(0)
-#define SCHECK_MAGIC(a,m)	do{if((*(u64*)(a))!=(m))printk("%s,%u,%s(): magic %08x at %p corrupted!\n",\
-				__FILE__,__LINE__,__func__,(m),(a));}while(0)
-#define BUF_OFFSET	sizeof(u64)
-#define MKCTL_TRAILER	sizeof(u64)
-
-#endif
-
-static char *getpage( void )
-{
-	char *r;
-	SOLD("getting page");
-	spin_lock(&timod_pagelock);
-	if (page) {
-		r = page;
-		page = NULL;
-		spin_unlock(&timod_pagelock);
-		SOLD("got cached");
-		return r + BUF_OFFSET;
-	}
-	spin_unlock(&timod_pagelock);
-	SOLD("getting new");
-	r = (char *)__get_free_page(GFP_KERNEL);
-	PUT_MAGIC(r,BUFPAGE_MAGIC);
-	PUT_MAGIC(r+PAGE_SIZE-sizeof(u64),BUFPAGE_MAGIC);
-	return r + BUF_OFFSET;
-}
-
-static void putpage(char *p)
-{
-	SOLD("putting page");
-	p = p - BUF_OFFSET;
-	SCHECK_MAGIC(p,BUFPAGE_MAGIC);
-	SCHECK_MAGIC(p+PAGE_SIZE-sizeof(u64),BUFPAGE_MAGIC);
-	spin_lock(&timod_pagelock);
-	if (page) {
-		spin_unlock(&timod_pagelock);
-		free_page((unsigned long)p);
-		SOLD("freed it");
-	} else {
-		page = p;
-		spin_unlock(&timod_pagelock);
-		SOLD("cached it");
-	}
-}
-
-static struct T_primsg *timod_mkctl(int size)
-{
-	struct T_primsg *it;
-
-	SOLD("creating primsg");
-	it = (struct T_primsg *)mykmalloc(size+sizeof(*it)-sizeof(s32)+2*MKCTL_TRAILER, GFP_KERNEL);
-	if (it) {
-		SOLD("got it");
-		it->pri = MSG_HIPRI;
-		it->length = size;
-		PUT_MAGIC((char*)((u64)(((char *)&it->type)+size+7)&~7),MKCTL_MAGIC);
-	}
-	return it;
-}
-
-static void timod_wake_socket(unsigned int fd)
-{
-	struct socket *sock;
-	struct fdtable *fdt;
-
-	SOLD("wakeing socket");
-	fdt = files_fdtable(current->files);
-	sock = SOCKET_I(fdt->fd[fd]->f_path.dentry->d_inode);
-	wake_up_interruptible(&sock->wait);
-	read_lock(&sock->sk->sk_callback_lock);
-	if (sock->fasync_list && !test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
-		__kill_fasync(sock->fasync_list, SIGIO, POLL_IN);
-	read_unlock(&sock->sk->sk_callback_lock);
-	SOLD("done");
-}
-
-static void timod_queue(unsigned int fd, struct T_primsg *it)
-{
-	struct sol_socket_struct *sock;
-	struct fdtable *fdt;
-
-	SOLD("queuing primsg");
-	fdt = files_fdtable(current->files);
-	sock = (struct sol_socket_struct *)fdt->fd[fd]->private_data;
-	it->next = sock->pfirst;
-	sock->pfirst = it;
-	if (!sock->plast)
-		sock->plast = it;
-	timod_wake_socket(fd);
-	SOLD("done");
-}
-
-static void timod_queue_end(unsigned int fd, struct T_primsg *it)
-{
-	struct sol_socket_struct *sock;
-	struct fdtable *fdt;
-
-	SOLD("queuing primsg at end");
-	fdt = files_fdtable(current->files);
-	sock = (struct sol_socket_struct *)fdt->fd[fd]->private_data;
-	it->next = NULL;
-	if (sock->plast)
-		sock->plast->next = it;
-	else
-		sock->pfirst = it;
-	sock->plast = it;
-	SOLD("done");
-}
-
-static void timod_error(unsigned int fd, int prim, int terr, int uerr)
-{
-	struct T_primsg *it;
-	
-	SOLD("making error");
-	it = timod_mkctl(sizeof(struct T_error_ack));
-	if (it) {
-		struct T_error_ack *err = (struct T_error_ack *)&it->type;
-		
-		SOLD("got it");
-		err->PRIM_type = T_ERROR_ACK;
-		err->ERROR_prim = prim;
-		err->TLI_error = terr;
-		err->UNIX_error = uerr; /* FIXME: convert this */
-		timod_queue(fd, it);
-	}
-	SOLD("done");
-}
-
-static void timod_ok(unsigned int fd, int prim)
-{
-	struct T_primsg *it;
-	struct T_ok_ack *ok;
-	
-	SOLD("creating ok ack");
-	it = timod_mkctl(sizeof(*ok));
-	if (it) {
-		SOLD("got it");
-		ok = (struct T_ok_ack *)&it->type;
-		ok->PRIM_type = T_OK_ACK;
-		ok->CORRECT_prim = prim;
-		timod_queue(fd, it);
-	}
-	SOLD("done");
-}
-
-static int timod_optmgmt(unsigned int fd, int flag, char __user *opt_buf, int opt_len, int do_ret)
-{
-	int error, failed;
-	int ret_space, ret_len;
-	long args[5];
-	char *ret_pos,*ret_buf;
-	int (*sys_socketcall)(int, unsigned long *) =
-		(int (*)(int, unsigned long *))SYS(socketcall);
-	mm_segment_t old_fs = get_fs();
-
-	SOLD("entry");
-	SOLDD(("fd %u flg %u buf %p len %u doret %u",fd,flag,opt_buf,opt_len,do_ret));
-	if (!do_ret && (!opt_buf || opt_len <= 0))
-		return 0;
-	SOLD("getting page");
-	ret_pos = ret_buf = getpage();
-	ret_space = BUF_SIZE;
-	ret_len = 0;
-	
-	error = failed = 0;
-	SOLD("looping");
-	while(opt_len >= sizeof(struct opthdr)) {
-		struct opthdr *opt;
-		int orig_opt_len; 
-		SOLD("loop start");
-		opt = (struct opthdr *)ret_pos; 
-		if (ret_space < sizeof(struct opthdr)) {
-			failed = TSYSERR;
-			break;
-		}
-		SOLD("getting opthdr");
-		if (copy_from_user(opt, opt_buf, sizeof(struct opthdr)) ||
-			opt->len > opt_len) {
-			failed = TBADOPT;
-			break;
-		}
-		SOLD("got opthdr");
-		if (flag == T_NEGOTIATE) {
-			char *buf;
-			
-			SOLD("handling T_NEGOTIATE");
-			buf = ret_pos + sizeof(struct opthdr);
-			if (ret_space < opt->len + sizeof(struct opthdr) ||
-				copy_from_user(buf, opt_buf+sizeof(struct opthdr), opt->len)) {
-				failed = TSYSERR;
-				break;
-			}
-			SOLD("got optdata");
-			args[0] = fd;
-			args[1] = opt->level;
-			args[2] = opt->name;
-			args[3] = (long)buf;
-			args[4] = opt->len;
-			SOLD("calling SETSOCKOPT");
-			set_fs(KERNEL_DS);
-			error = sys_socketcall(SYS_SETSOCKOPT, args);
-			set_fs(old_fs);
-			if (error) {
-				failed = TBADOPT;
-				break;
-			}
-			SOLD("SETSOCKOPT ok");
-		}
-		orig_opt_len = opt->len;
-		opt->len = ret_space - sizeof(struct opthdr);
-		if (opt->len < 0) {
-			failed = TSYSERR;
-			break;
-		}
-		args[0] = fd;
-		args[1] = opt->level;
-		args[2] = opt->name;
-		args[3] = (long)(ret_pos+sizeof(struct opthdr));
-		args[4] = (long)&opt->len;
-		SOLD("calling GETSOCKOPT");
-		set_fs(KERNEL_DS);
-		error = sys_socketcall(SYS_GETSOCKOPT, args);
-		set_fs(old_fs);
-		if (error) {
-			failed = TBADOPT;
-			break;
-		}
-		SOLD("GETSOCKOPT ok");
-		ret_space -= sizeof(struct opthdr) + opt->len;
-		ret_len += sizeof(struct opthdr) + opt->len;
-		ret_pos += sizeof(struct opthdr) + opt->len;
-		opt_len -= sizeof(struct opthdr) + orig_opt_len;
-		opt_buf += sizeof(struct opthdr) + orig_opt_len;
-		SOLD("loop end");
-	}
-	SOLD("loop done");
-	if (do_ret) {
-		SOLD("generating ret msg");
-		if (failed)
-			timod_error(fd, T_OPTMGMT_REQ, failed, -error);
-		else {
-			struct T_primsg *it;
-			it = timod_mkctl(sizeof(struct T_optmgmt_ack) + ret_len);
-			if (it) {
-				struct T_optmgmt_ack *ack =
-					(struct T_optmgmt_ack *)&it->type;
-				SOLD("got primsg");
-				ack->PRIM_type = T_OPTMGMT_ACK;
-				ack->OPT_length = ret_len;
-				ack->OPT_offset = sizeof(struct T_optmgmt_ack);
-				ack->MGMT_flags = (failed ? T_FAILURE : flag);
-				memcpy(((char*)ack)+sizeof(struct T_optmgmt_ack),
-					ret_buf, ret_len);
-				timod_queue(fd, it);
-			}
-		}
-	}
-	SOLDD(("put_page %p\n", ret_buf));
-	putpage(ret_buf);
-	SOLD("done");	
-	return 0;
-}
-
-int timod_putmsg(unsigned int fd, char __user *ctl_buf, int ctl_len,
-			char __user *data_buf, int data_len, int flags)
-{
-	int ret, error, terror;
-	char *buf;
-	struct file *filp;
-	struct inode *ino;
-	struct fdtable *fdt;
-	struct sol_socket_struct *sock;
-	mm_segment_t old_fs = get_fs();
-	long args[6];
-	int (*sys_socketcall)(int, unsigned long __user *) =
-		(int (*)(int, unsigned long __user *))SYS(socketcall);
-	int (*sys_sendto)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int) =
-		(int (*)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int))SYS(sendto);
-
-	fdt = files_fdtable(current->files);
-	filp = fdt->fd[fd];
-	ino = filp->f_path.dentry->d_inode;
-	sock = (struct sol_socket_struct *)filp->private_data;
-	SOLD("entry");
-	if (get_user(ret, (int __user *)A(ctl_buf)))
-		return -EFAULT;
-	switch (ret) {
-	case T_BIND_REQ:
-	{
-		struct T_bind_req req;
-		
-		SOLDD(("bind %016lx(%016lx)\n", sock, filp));
-		SOLD("T_BIND_REQ");
-		if (sock->state != TS_UNBND) {
-			timod_error(fd, T_BIND_REQ, TOUTSTATE, 0);
-			return 0;
-		}
-		SOLD("state ok");
-		if (copy_from_user(&req, ctl_buf, sizeof(req))) {
-			timod_error(fd, T_BIND_REQ, TSYSERR, EFAULT);
-			return 0;
-		}
-		SOLD("got ctl req");
-		if (req.ADDR_offset && req.ADDR_length) {
-			if (req.ADDR_length > BUF_SIZE) {
-				timod_error(fd, T_BIND_REQ, TSYSERR, EFAULT);
-				return 0;
-			}
-			SOLD("req size ok");
-			buf = getpage();
-			if (copy_from_user(buf, ctl_buf + req.ADDR_offset, req.ADDR_length)) {
-				timod_error(fd, T_BIND_REQ, TSYSERR, EFAULT);
-				putpage(buf);
-				return 0;
-			}
-			SOLD("got ctl data");
-			args[0] = fd;
-			args[1] = (long)buf;
-			args[2] = req.ADDR_length;
-			SOLD("calling BIND");
-			set_fs(KERNEL_DS);
-			error = sys_socketcall(SYS_BIND, args);
-			set_fs(old_fs);
-			putpage(buf);
-			SOLD("BIND returned");
-		} else 
-			error = 0;
-		if (!error) {
-			struct T_primsg *it;
-			if (req.CONIND_number) {
-	  			args[0] = fd;
-  				args[1] = req.CONIND_number;
-  				SOLD("calling LISTEN");
-  				set_fs(KERNEL_DS);
-	  			error = sys_socketcall(SYS_LISTEN, args);
-  				set_fs(old_fs);
-  				SOLD("LISTEN done");
-  			}
-			it = timod_mkctl(sizeof(struct T_bind_ack)+sizeof(struct sockaddr));
-			if (it) {
-				struct T_bind_ack *ack;
-
-				ack = (struct T_bind_ack *)&it->type;
-				ack->PRIM_type = T_BIND_ACK;
-				ack->ADDR_offset = sizeof(*ack);
-				ack->ADDR_length = sizeof(struct sockaddr);
-				ack->CONIND_number = req.CONIND_number;
-				args[0] = fd;
-				args[1] = (long)(ack+sizeof(*ack));
-				args[2] = (long)&ack->ADDR_length;
-				set_fs(KERNEL_DS);
-				sys_socketcall(SYS_GETSOCKNAME,args);
-				set_fs(old_fs);
-				sock->state = TS_IDLE;
-				timod_ok(fd, T_BIND_REQ);
-				timod_queue_end(fd, it);
-				SOLD("BIND done");
-				return 0;
-			}
-		}
-		SOLD("some error");
-		switch (error) {
-			case -EINVAL:
-				terror = TOUTSTATE;
-				error = 0;
-				break;
-			case -EACCES:
-				terror = TACCES;
-				error = 0;
-				break;
-			case -EADDRNOTAVAIL:
-			case -EADDRINUSE:
-				terror = TNOADDR;
-				error = 0;
-				break;
-			default:
-				terror = TSYSERR;
-				break;
-		}
-		timod_error(fd, T_BIND_REQ, terror, -error);
-		SOLD("BIND done");
-		return 0;
-	}
-	case T_CONN_REQ:
-	{
-		struct T_conn_req req;
-		unsigned short oldflags;
-		struct T_primsg *it;
-		SOLD("T_CONN_REQ");
-		if (sock->state != TS_UNBND && sock->state != TS_IDLE) {
-			timod_error(fd, T_CONN_REQ, TOUTSTATE, 0);
-			return 0;
-		}
-		SOLD("state ok");
-		if (copy_from_user(&req, ctl_buf, sizeof(req))) {
-			timod_error(fd, T_CONN_REQ, TSYSERR, EFAULT);
-			return 0;
-		}
-		SOLD("got ctl req");
-		if (ctl_len > BUF_SIZE) {
-			timod_error(fd, T_CONN_REQ, TSYSERR, EFAULT);
-			return 0;
-		}
-		SOLD("req size ok");
-		buf = getpage();
-		if (copy_from_user(buf, ctl_buf, ctl_len)) {
-			timod_error(fd, T_CONN_REQ, TSYSERR, EFAULT);
-			putpage(buf);
-			return 0;
-		}
-#ifdef DEBUG_SOLARIS		
-		{
-			char * ptr = buf;
-			int len = ctl_len;
-			printk("returned data (%d bytes): ",len);
-			while( len-- ) {
-				if (!(len & 7))
-					printk(" ");
-				printk("%02x",(unsigned char)*ptr++);
-			}
-			printk("\n");
-		}
-#endif
-		SOLD("got ctl data");
-		args[0] = fd;
-		args[1] = (long)buf+req.DEST_offset;
-		args[2] = req.DEST_length;
-		oldflags = filp->f_flags;
-		filp->f_flags &= ~O_NONBLOCK;
-		SOLD("calling CONNECT");
-		set_fs(KERNEL_DS);
-		error = sys_socketcall(SYS_CONNECT, args);
-		set_fs(old_fs);
-		filp->f_flags = oldflags;
-		SOLD("CONNECT done");
-		if (!error) {
-			struct T_conn_con *con;
-			SOLD("no error");
-			it = timod_mkctl(ctl_len);
-			if (!it) {
-				putpage(buf);
-				return -ENOMEM;
-			}
-			con = (struct T_conn_con *)&it->type;
-#ifdef DEBUG_SOLARIS			
-			{
-				char * ptr = buf;
-				int len = ctl_len;
-				printk("returned data (%d bytes): ",len);
-				while( len-- ) {
-					if (!(len & 7))
-						printk(" ");
-					printk("%02x",(unsigned char)*ptr++);
-				}
-				printk("\n");
-			}
-#endif
-			memcpy(con, buf, ctl_len);
-			SOLD("copied ctl_buf");
-			con->PRIM_type = T_CONN_CON;
-			sock->state = TS_DATA_XFER;
-		} else {
-			struct T_discon_ind *dis;
-			SOLD("some error");
-			it = timod_mkctl(sizeof(*dis));
-			if (!it) {
-				putpage(buf);
-				return -ENOMEM;
-			}
-			SOLD("got primsg");
-			dis = (struct T_discon_ind *)&it->type;
-			dis->PRIM_type = T_DISCON_IND;
-			dis->DISCON_reason = -error;	/* FIXME: convert this as in iABI_errors() */
-			dis->SEQ_number = 0;
-		}
-		putpage(buf);
-		timod_ok(fd, T_CONN_REQ);
-		it->pri = 0;
-		timod_queue_end(fd, it);
-		SOLD("CONNECT done");
-		return 0;
-	}
-	case T_OPTMGMT_REQ:
-	{
-		struct T_optmgmt_req req;
-		SOLD("OPTMGMT_REQ");
-		if (copy_from_user(&req, ctl_buf, sizeof(req)))
-			return -EFAULT;
-		SOLD("got req");
-		return timod_optmgmt(fd, req.MGMT_flags,
-				req.OPT_offset > 0 ? ctl_buf + req.OPT_offset : NULL,
-				req.OPT_length, 1);
-	}
-	case T_UNITDATA_REQ:
-	{
-		struct T_unitdata_req req;
-		
-		int err;
-		SOLD("T_UNITDATA_REQ");
-		if (sock->state != TS_IDLE && sock->state != TS_DATA_XFER) {
-			timod_error(fd, T_CONN_REQ, TOUTSTATE, 0);
-			return 0;
-		}
-		SOLD("state ok");
-		if (copy_from_user(&req, ctl_buf, sizeof(req))) {
-			timod_error(fd, T_CONN_REQ, TSYSERR, EFAULT);
-			return 0;
-		}
-		SOLD("got ctl req");
-#ifdef DEBUG_SOLARIS		
-		{
-			char * ptr = ctl_buf+req.DEST_offset;
-			int len = req.DEST_length;
-			printk("socket address (%d bytes): ",len);
-			while( len-- ) {
-				char c;
-				if (get_user(c,ptr))
-					printk("??");
-				else
-					printk("%02x",(unsigned char)c);
-				ptr++;
-			}
-			printk("\n");
-		}
-#endif		
-		err = sys_sendto(fd, data_buf, data_len, 0, req.DEST_length > 0 ? (struct sockaddr __user *)(ctl_buf+req.DEST_offset) : NULL, req.DEST_length);
-		if (err == data_len)
-			return 0;
-		if(err >= 0) {
-			printk("timod: sendto failed to send all the data\n");
-			return 0;
-		}
-		timod_error(fd, T_CONN_REQ, TSYSERR, -err);
-		return 0;
-	}
-	default:
-		printk(KERN_INFO "timod_putmsg: unsupported command %u.\n", ret);
-		break;
-	}
-	return -EINVAL;
-}
-
-int timod_getmsg(unsigned int fd, char __user *ctl_buf, int ctl_maxlen, s32 __user *ctl_len,
-			char __user *data_buf, int data_maxlen, s32 __user *data_len, int *flags_p)
-{
-	int error;
-	int oldflags;
-	struct file *filp;
-	struct inode *ino;
-	struct fdtable *fdt;
-	struct sol_socket_struct *sock;
-	struct T_unitdata_ind udi;
-	mm_segment_t old_fs = get_fs();
-	long args[6];
-	char __user *tmpbuf;
-	int tmplen;
-	int (*sys_socketcall)(int, unsigned long __user *) =
-		(int (*)(int, unsigned long __user *))SYS(socketcall);
-	int (*sys_recvfrom)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int __user *);
-	
-	SOLD("entry");
-	SOLDD(("%u %p %d %p %p %d %p %d\n", fd, ctl_buf, ctl_maxlen, ctl_len, data_buf, data_maxlen, data_len, *flags_p));
-	fdt = files_fdtable(current->files);
-	filp = fdt->fd[fd];
-	ino = filp->f_path.dentry->d_inode;
-	sock = (struct sol_socket_struct *)filp->private_data;
-	SOLDD(("%p %p\n", sock->pfirst, sock->pfirst ? sock->pfirst->next : NULL));
-	if ( ctl_maxlen > 0 && !sock->pfirst && SOCKET_I(ino)->type == SOCK_STREAM
-		&& sock->state == TS_IDLE) {
-		SOLD("calling LISTEN");
-		args[0] = fd;
-		args[1] = -1;
-		set_fs(KERNEL_DS);
-		sys_socketcall(SYS_LISTEN, args);
-		set_fs(old_fs);
-		SOLD("LISTEN done");
-	}
-	if (!(filp->f_flags & O_NONBLOCK)) {
-		struct poll_wqueues wait_table;
-		poll_table *wait;
-
-		poll_initwait(&wait_table);
-		wait = &wait_table.pt;
-		for(;;) {
-			SOLD("loop");
-			set_current_state(TASK_INTERRUPTIBLE);
-			/* ! ( l<0 || ( l>=0 && ( ! pfirst || (flags == HIPRI && pri != HIPRI) ) ) ) */ 
-			/* ( ! l<0 && ! ( l>=0 && ( ! pfirst || (flags == HIPRI && pri != HIPRI) ) ) ) */ 
-			/* ( l>=0 && ( ! l>=0 || ! ( ! pfirst || (flags == HIPRI && pri != HIPRI) ) ) ) */ 
-			/* ( l>=0 && ( l<0 || ( pfirst && ! (flags == HIPRI && pri != HIPRI) ) ) ) */ 
-			/* ( l>=0 && ( l<0 || ( pfirst && (flags != HIPRI || pri == HIPRI) ) ) ) */ 
-			/* ( l>=0 && ( pfirst && (flags != HIPRI || pri == HIPRI) ) ) */ 
-			if (ctl_maxlen >= 0 && sock->pfirst && (*flags_p != MSG_HIPRI || sock->pfirst->pri == MSG_HIPRI))
-				break;
-			SOLD("cond 1 passed");
-			if (
-			#if 1
-				*flags_p != MSG_HIPRI &&
-			#endif
-				((filp->f_op->poll(filp, wait) & POLLIN) ||
-				(filp->f_op->poll(filp, NULL) & POLLIN) ||
-				signal_pending(current))
-			) {
-				break;
-			}
-			if( *flags_p == MSG_HIPRI ) {
-				SOLD("avoiding lockup");
-				break ;
-			}
-			if(wait_table.error) {
-				SOLD("wait-table error");
-				poll_freewait(&wait_table);
-				return wait_table.error;
-			}
-			SOLD("scheduling");
-			schedule();
-		}
-		SOLD("loop done");
-		current->state = TASK_RUNNING;
-		poll_freewait(&wait_table);
-		if (signal_pending(current)) {
-			SOLD("signal pending");
-			return -EINTR;
-		}
-	}
-	if (ctl_maxlen >= 0 && sock->pfirst) {
-		struct T_primsg *it = sock->pfirst;
-		int l = min_t(int, ctl_maxlen, it->length);
-		SCHECK_MAGIC((char*)((u64)(((char *)&it->type)+sock->offset+it->length+7)&~7),MKCTL_MAGIC);
-		SOLD("purting ctl data");
-		if(copy_to_user(ctl_buf,
-			(char*)&it->type + sock->offset, l))
-			return -EFAULT;
-		SOLD("pur it");
-		if(put_user(l, ctl_len))
-			return -EFAULT;
-		SOLD("set ctl_len");
-		*flags_p = it->pri;
-		it->length -= l;
-		if (it->length) {
-			SOLD("more ctl");
-			sock->offset += l;
-			return MORECTL;
-		} else {
-			SOLD("removing message");
-			sock->pfirst = it->next;
-			if (!sock->pfirst)
-				sock->plast = NULL;
-			SOLDD(("getmsg kfree %016lx->%016lx\n", it, sock->pfirst));
-			mykfree(it);
-			sock->offset = 0;
-			SOLD("ctl done");
-			return 0;
-		}
-	}
-	*flags_p = 0;
-	if (ctl_maxlen >= 0) {
-		SOLD("ACCEPT perhaps?");
-		if (SOCKET_I(ino)->type == SOCK_STREAM && sock->state == TS_IDLE) {
-			struct T_conn_ind ind;
-			char *buf = getpage();
-			int len = BUF_SIZE;
-
-			SOLD("trying ACCEPT");
-			if (put_user(ctl_maxlen - sizeof(ind), ctl_len))
-				return -EFAULT;
-			args[0] = fd;
-			args[1] = (long)buf;
-			args[2] = (long)&len;
-			oldflags = filp->f_flags;
-			filp->f_flags |= O_NONBLOCK;
-			SOLD("calling ACCEPT");
-			set_fs(KERNEL_DS);
-			error = sys_socketcall(SYS_ACCEPT, args);
-			set_fs(old_fs);
-			filp->f_flags = oldflags;
-			if (error < 0) {
-				SOLD("some error");
-				putpage(buf);
-				return error;
-			}
-			if (error) {
-				SOLD("connect");
-				putpage(buf);
-				if (sizeof(ind) > ctl_maxlen) {
-					SOLD("generating CONN_IND");
-					ind.PRIM_type = T_CONN_IND;
-					ind.SRC_length = len;
-					ind.SRC_offset = sizeof(ind);
-					ind.OPT_length = ind.OPT_offset = 0;
-					ind.SEQ_number = error;
-					if(copy_to_user(ctl_buf, &ind, sizeof(ind))||
-					   put_user(sizeof(ind)+ind.SRC_length,ctl_len))
-						return -EFAULT;
-					SOLD("CONN_IND created");
-				}
-				if (data_maxlen >= 0)
-					put_user(0, data_len);
-				SOLD("CONN_IND done");
-				return 0;
-			}
-			if (len>ctl_maxlen) {
-				SOLD("data don't fit");
-				putpage(buf);
-				return -EFAULT;		/* XXX - is this ok ? */
-			}
-			if(copy_to_user(ctl_buf,buf,len) || put_user(len,ctl_len)){
-				SOLD("can't copy data");
-				putpage(buf);
-				return -EFAULT;
-			}
-			SOLD("ACCEPT done");
-			putpage(buf);
-		}
-	}
-	SOLD("checking data req");
-	if (data_maxlen <= 0) {
-		if (data_maxlen == 0)
-			put_user(0, data_len);
-		if (ctl_maxlen >= 0)
-			put_user(0, ctl_len);
-		return -EAGAIN;
-	}
-	SOLD("wants data");
-	if (ctl_maxlen > sizeof(udi) && sock->state == TS_IDLE) {
-		SOLD("udi fits");
-		tmpbuf = ctl_buf + sizeof(udi);
-		tmplen = ctl_maxlen - sizeof(udi);
-	} else {
-		SOLD("udi does not fit");
-		tmpbuf = NULL;
-		tmplen = 0;
-	}
-	if (put_user(tmplen, ctl_len))
-		return -EFAULT;
-	SOLD("set ctl_len");
-	oldflags = filp->f_flags;
-	filp->f_flags |= O_NONBLOCK;
-	SOLD("calling recvfrom");
-	sys_recvfrom = (int (*)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int __user *))SYS(recvfrom);
-	error = sys_recvfrom(fd, data_buf, data_maxlen, 0, (struct sockaddr __user *)tmpbuf, ctl_len);
-	filp->f_flags = oldflags;
-	if (error < 0)
-		return error;
-	SOLD("error >= 0" ) ;
-	if (error && ctl_maxlen > sizeof(udi) && sock->state == TS_IDLE) {
-		SOLD("generating udi");
-		udi.PRIM_type = T_UNITDATA_IND;
-		if (get_user(udi.SRC_length, ctl_len))
-			return -EFAULT;
-		udi.SRC_offset = sizeof(udi);
-		udi.OPT_length = udi.OPT_offset = 0;
-		if (copy_to_user(ctl_buf, &udi, sizeof(udi)) ||
-		    put_user(sizeof(udi)+udi.SRC_length, ctl_len))
-			return -EFAULT;
-		SOLD("udi done");
-	} else {
-		if (put_user(0, ctl_len))
-			return -EFAULT;
-	}
-	put_user(error, data_len);
-	SOLD("done");
-	return 0;
-}
-
-asmlinkage int solaris_getmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3)
-{
-	struct file *filp;
-	struct inode *ino;
-	struct strbuf __user *ctlptr;
-	struct strbuf __user *datptr;
-	struct strbuf ctl, dat;
-	int __user *flgptr;
-	int flags;
-	int error = -EBADF;
-	struct fdtable *fdt;
-
-	SOLD("entry");
-	lock_kernel();
-	if (fd >= sysctl_nr_open)
-		goto out;
-
-	fdt = files_fdtable(current->files);
-	filp = fdt->fd[fd];
-	if(!filp) goto out;
-
-	ino = filp->f_path.dentry->d_inode;
-	if (!ino || !S_ISSOCK(ino->i_mode))
-		goto out;
-
-	ctlptr = (struct strbuf __user *)A(arg1);
-	datptr = (struct strbuf __user *)A(arg2);
-	flgptr = (int __user *)A(arg3);
-
-	error = -EFAULT;
-
-	if (ctlptr) {
-		if (copy_from_user(&ctl,ctlptr,sizeof(struct strbuf)) || 
-		    put_user(-1,&ctlptr->len))
-			goto out;
-	} else
-		ctl.maxlen = -1;
-
-	if (datptr) {
-		if (copy_from_user(&dat,datptr,sizeof(struct strbuf)) || 
-		    put_user(-1,&datptr->len))
-			goto out;
-	} else
-		dat.maxlen = -1;
-
-	if (get_user(flags,flgptr))
-		goto out;
-
-	switch (flags) {
-	case 0:
-	case MSG_HIPRI:
-	case MSG_ANY:
-	case MSG_BAND:
-		break;
-	default:
-		error = -EINVAL;
-		goto out;
-	}
-
-	error = timod_getmsg(fd,A(ctl.buf),ctl.maxlen,&ctlptr->len,
-				A(dat.buf),dat.maxlen,&datptr->len,&flags);
-
-	if (!error && put_user(flags,flgptr))
-		error = -EFAULT;
-out:
-	unlock_kernel();
-	SOLD("done");
-	return error;
-}
-
-asmlinkage int solaris_putmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3)
-{
-	struct file *filp;
-	struct inode *ino;
-	struct strbuf __user *ctlptr;
-	struct strbuf __user *datptr;
-	struct strbuf ctl, dat;
-	int flags = (int) arg3;
-	int error = -EBADF;
-	struct fdtable *fdt;
-
-	SOLD("entry");
-	lock_kernel();
-	if (fd >= sysctl_nr_open)
-		goto out;
-
-	fdt = files_fdtable(current->files);
-	filp = fdt->fd[fd];
-	if(!filp) goto out;
-
-	ino = filp->f_path.dentry->d_inode;
-	if (!ino) goto out;
-
-	if (!S_ISSOCK(ino->i_mode) &&
-		(imajor(ino) != 30 || iminor(ino) != 1))
-		goto out;
-
-	ctlptr = A(arg1);
-	datptr = A(arg2);
-
-	error = -EFAULT;
-
-	if (ctlptr) {
-		if (copy_from_user(&ctl,ctlptr,sizeof(ctl)))
-			goto out;
-		if (ctl.len < 0 && flags) {
-			error = -EINVAL;
-			goto out;
-		}
-	} else {
-		ctl.len = 0;
-		ctl.buf = 0;
-	}
-
-	if (datptr) {
-		if (copy_from_user(&dat,datptr,sizeof(dat)))
-			goto out;
-	} else {
-		dat.len = 0;
-		dat.buf = 0;
-	}
-
-	error = timod_putmsg(fd,A(ctl.buf),ctl.len,
-				A(dat.buf),dat.len,flags);
-out:
-	unlock_kernel();
-	SOLD("done");
-	return error;
-}
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index b5c3b6114add..853845abcca6 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -62,7 +62,7 @@ config BINFMT_SHARED_FLAT
 config BINFMT_AOUT
 	tristate "Kernel support for a.out and ECOFF binaries"
 	depends on ARCH_SUPPORTS_AOUT && \
-		(X86_32 || ALPHA || ARM || M68K || SPARC32)
+		(X86_32 || ALPHA || ARM || M68K)
 	---help---
 	  A.out (Assembler.OUTput) is a set of formats for libraries and
 	  executables used in the earliest versions of UNIX.  Linux used
diff --git a/include/asm-sparc/Kbuild b/include/asm-sparc/Kbuild
index c6a55cf0d337..671223718f0a 100644
--- a/include/asm-sparc/Kbuild
+++ b/include/asm-sparc/Kbuild
@@ -5,7 +5,6 @@ header-y += asi.h
 header-y += bpp.h
 header-y += jsflash.h
 header-y += openpromio.h
-header-y += pconf.h
 header-y += reg.h
 header-y += traps.h
 header-y += vfc_ioctls.h
diff --git a/include/asm-sparc/a.out-core.h b/include/asm-sparc/a.out-core.h
deleted file mode 100644
index e8fd338ed0b2..000000000000
--- a/include/asm-sparc/a.out-core.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* a.out coredump register dumper
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#ifndef _ASM_A_OUT_CORE_H
-#define _ASM_A_OUT_CORE_H
-
-#ifdef __KERNEL__
-
-#include <linux/user.h>
-
-/*
- * fill in the user structure for an a.out core dump
- */
-static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
-{
-	unsigned long first_stack_page;
-
-	dump->magic = SUNOS_CORE_MAGIC;
-	dump->len = sizeof(struct user);
-	dump->regs.psr = regs->psr;
-	dump->regs.pc = regs->pc;
-	dump->regs.npc = regs->npc;
-	dump->regs.y = regs->y;
-	/* fuck me plenty */
-	memcpy(&dump->regs.regs[0], &regs->u_regs[1], (sizeof(unsigned long) * 15));
-	dump->uexec = current->thread.core_exec;
-	dump->u_tsize = (((unsigned long) current->mm->end_code) -
-		((unsigned long) current->mm->start_code)) & ~(PAGE_SIZE - 1);
-	dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1)));
-	dump->u_dsize -= dump->u_tsize;
-	dump->u_dsize &= ~(PAGE_SIZE - 1);
-	first_stack_page = (regs->u_regs[UREG_FP] & ~(PAGE_SIZE - 1));
-	dump->u_ssize = (TASK_SIZE - first_stack_page) & ~(PAGE_SIZE - 1);
-	memcpy(&dump->fpu.fpstatus.fregs.regs[0], &current->thread.float_regs[0], (sizeof(unsigned long) * 32));
-	dump->fpu.fpstatus.fsr = current->thread.fsr;
-	dump->fpu.fpstatus.flags = dump->fpu.fpstatus.extra = 0;
-	dump->fpu.fpstatus.fpq_count = current->thread.fpqdepth;
-	memcpy(&dump->fpu.fpstatus.fpq[0], &current->thread.fpqueue[0],
-	       ((sizeof(unsigned long) * 2) * 16));
-	dump->sigcode = 0;
-}
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_A_OUT_CORE_H */
diff --git a/include/asm-sparc/a.out.h b/include/asm-sparc/a.out.h
deleted file mode 100644
index 2f1c3748a068..000000000000
--- a/include/asm-sparc/a.out.h
+++ /dev/null
@@ -1,97 +0,0 @@
-#ifndef __SPARC_A_OUT_H__
-#define __SPARC_A_OUT_H__
-
-#define SPARC_PGSIZE    0x2000        /* Thanks to the sun4 architecture... */
-#define SEGMENT_SIZE    SPARC_PGSIZE  /* whee... */
-
-#ifndef __ASSEMBLY__
-
-struct exec {
-	unsigned char a_dynamic:1;      /* A __DYNAMIC is in this image */
-	unsigned char a_toolversion:7;
-	unsigned char a_machtype;
-	unsigned short a_info;
-	unsigned int a_text;		/* length of text, in bytes */
-	unsigned int a_data;		/* length of data, in bytes */
-	unsigned int a_bss;		/* length of bss, in bytes */
-	unsigned int a_syms;		/* length of symbol table, in bytes */
-	unsigned int a_entry;		/* where program begins */
-	unsigned int a_trsize;
-	unsigned int a_drsize;
-};
-
-#endif /* !__ASSEMBLY__ */
-
-/* Where in the file does the text information begin? */
-#define N_TXTOFF(x)     (N_MAGIC(x) == ZMAGIC ? 0 : sizeof (struct exec))
-
-/* Where do the Symbols start? */
-#define N_SYMOFF(x)     (N_TXTOFF(x) + (x).a_text +   \
-                         (x).a_data + (x).a_trsize +  \
-                         (x).a_drsize)
-
-/* Where does text segment go in memory after being loaded? */
-#define N_TXTADDR(x)    (unsigned long)(((N_MAGIC(x) == ZMAGIC) && \
-	                 ((x).a_entry < SPARC_PGSIZE)) ?   \
-                          0 : SPARC_PGSIZE)
-
-/* And same for the data segment.. */
-#define N_DATADDR(x) (N_MAGIC(x)==OMAGIC ?         \
-                      (N_TXTADDR(x) + (x).a_text)  \
-		       : (unsigned long) (_N_SEGMENT_ROUND (_N_TXTENDADDR(x))))
-
-#define N_TRSIZE(a)	((a).a_trsize)
-#define N_DRSIZE(a)	((a).a_drsize)
-#define N_SYMSIZE(a)	((a).a_syms)
-
-#ifndef __ASSEMBLY__
-
-/*
- * Sparc relocation types
- */
-enum reloc_type
-{
-	RELOC_8,
-	RELOC_16,
-	RELOC_32,	/* simplest relocs */
-	RELOC_DISP8,
-	RELOC_DISP16,
-	RELOC_DISP32,	/* Disp's (pc-rel) */
-	RELOC_WDISP30,
-	RELOC_WDISP22,  /* SR word disp's */
-	RELOC_HI22,
-	RELOC_22,	/* SR 22-bit relocs */
-	RELOC_13,
-	RELOC_LO10,	/* SR 13&10-bit relocs */
-	RELOC_SFA_BASE,
-	RELOC_SFA_OFF13, /* SR S.F.A. relocs */
-	RELOC_BASE10,
-	RELOC_BASE13,
-	RELOC_BASE22,	/* base_relative pic */
-	RELOC_PC10,
-	RELOC_PC22,	/* special pc-rel pic */
-	RELOC_JMP_TBL,	/* jmp_tbl_rel in pic */
-	RELOC_SEGOFF16,	/* ShLib offset-in-seg */
-	RELOC_GLOB_DAT,
-	RELOC_JMP_SLOT,
-	RELOC_RELATIVE 	/* rtld relocs */
-};
-
-/*
- * Format of a relocation datum.
- */
-struct relocation_info /* used when header.a_machtype == M_SPARC */
-{
-        unsigned int    r_address;  /* relocation addr */
-        unsigned int    r_index:24; /* segment index or symbol index */
-        unsigned int    r_extern:1; /* if F, r_index==SEG#; if T, SYM idx */
-        unsigned int    r_pad:2;    /* <unused> */
-        enum reloc_type r_type:5;   /* type of relocation to perform */
-        int             r_addend;   /* addend for relocation value */
-};
-
-#define N_RELOCATION_INFO_DECLARED 1
-
-#endif /* !(__ASSEMBLY__) */
-
-#endif /* __SPARC_A_OUT_H__ */
diff --git a/include/asm-sparc/head.h b/include/asm-sparc/head.h
index 1a03c28da92d..fcdba5116339 100644
--- a/include/asm-sparc/head.h
+++ b/include/asm-sparc/head.h
@@ -46,45 +46,12 @@
         b linux_sparc_syscall; \
         rd %psr, %l0;
 
-/* Software trap for SunOS4.1.x system calls. */
-#define SUNOS_SYSCALL_TRAP \
-        rd %psr, %l0; \
-        sethi %hi(sunos_sys_table), %l7; \
-        b linux_sparc_syscall; \
-        or %l7, %lo(sunos_sys_table), %l7;
-
-#define SUNOS_NO_SYSCALL_TRAP \
-        b sunos_syscall; \
-        rd %psr, %l0; \
-        nop; \
-        nop;
-
-/* Software trap for Slowaris system calls. */
-#define SOLARIS_SYSCALL_TRAP \
-        b solaris_syscall; \
-        rd %psr, %l0; \
-        nop; \
-        nop;
-
-#define INDIRECT_SOLARIS_SYSCALL(x) \
-	mov x, %g1; \
-	b solaris_syscall; \
-	rd %psr, %l0; \
-	nop;
-
 #define BREAKPOINT_TRAP \
 	b breakpoint_trap; \
 	rd %psr,%l0; \
 	nop; \
 	nop;
 
-/* Software trap for Sparc-netbsd system calls. */
-#define NETBSD_SYSCALL_TRAP \
-        sethi %hi(sys_call_table), %l7; \
-        or %l7, %lo(sys_call_table), %l7; \
-        b bsd_syscall; \
-        rd %psr, %l0;
-
 /* The Get Condition Codes software trap for userland. */
 #define GETCC_TRAP \
         b getcc_trap_handler; mov %psr, %l0; nop; nop;
diff --git a/include/asm-sparc/ioctls.h b/include/asm-sparc/ioctls.h
index 058c2064f706..3f4d0087b6a3 100644
--- a/include/asm-sparc/ioctls.h
+++ b/include/asm-sparc/ioctls.h
@@ -43,8 +43,6 @@
 #define __TIOCSETX        _IOW('t', 34, int) /* SunOS Specific */
 #define __TIOCGETX        _IOR('t', 35, int) /* SunOS Specific */
 #define TIOCCONS	_IO('t', 36)
-#define __TIOCSSIZE     _IOW('t', 37, struct sunos_ttysize) /* SunOS Specific */
-#define __TIOCGSIZE     _IOR('t', 38, struct sunos_ttysize) /* SunOS Specific */
 #define TIOCGSOFTCAR	_IOR('t', 100, int)
 #define TIOCSSOFTCAR	_IOW('t', 101, int)
 #define __TIOCUCNTL       _IOW('t', 102, int) /* SunOS Specific */
diff --git a/include/asm-sparc/mman.h b/include/asm-sparc/mman.h
index b7dc40bc68f4..e18be984c01d 100644
--- a/include/asm-sparc/mman.h
+++ b/include/asm-sparc/mman.h
@@ -22,19 +22,6 @@
 #define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
 
-/* XXX Need to add flags to SunOS's mctl, mlockall, and madvise system
- * XXX calls.
- */
-
-/* SunOS sys_mctl() stuff... */
-#define MC_SYNC         1  /* Sync pages in memory with storage (usu. a file) */
-#define MC_LOCK         2  /* Lock pages into core ram, do not allow swapping of them */
-#define MC_UNLOCK       3  /* Unlock pages locked via previous mctl() with MC_LOCK arg */
-#define MC_LOCKAS       5  /* Lock an entire address space of the calling process */
-#define MC_UNLOCKAS     6  /* Unlock entire address space of calling process */
-
-#define MADV_FREE	0x5		/* (Solaris) contents can be freed */
-
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 #define arch_mmap_check	sparc_mmap_check
diff --git a/include/asm-sparc/namei.h b/include/asm-sparc/namei.h
index f2461e8a11ac..618344d89cc4 100644
--- a/include/asm-sparc/namei.h
+++ b/include/asm-sparc/namei.h
@@ -8,19 +8,6 @@
 #ifndef __SPARC_NAMEI_H
 #define __SPARC_NAMEI_H
 
-#define SPARC_BSD_EMUL "/usr/gnemul/sunos/"
-#define SPARC_SOL_EMUL "/usr/gnemul/solaris/"
-
-static inline char * __emul_prefix(void)
-{
-	switch (current->personality) {
-	case PER_SUNOS:
-		return SPARC_BSD_EMUL;
-	case PER_SVR4:
-		return SPARC_SOL_EMUL;
-	default:
-		return NULL;
-	}
-}
+#define __emul_prefix() NULL
 
 #endif /* __SPARC_NAMEI_H */
diff --git a/include/asm-sparc/pconf.h b/include/asm-sparc/pconf.h
deleted file mode 100644
index d73c1f1c49dc..000000000000
--- a/include/asm-sparc/pconf.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* $Id: pconf.h,v 1.3 1996/04/25 06:13:25 davem Exp $
- * pconf.h: pathconf() and fpathconf() defines for SunOS
- *          system call compatibility.
- *
- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
- */
-
-#ifndef _SPARC_PCONF_H
-#define _SPARC_PCONF_H
-
-#include <linux/fs.h>
-#include <linux/limits.h>
-
-#define _PCONF_LINK       1 /* Max number of links to an object        */
-#define _PCONF_CANON      2 /* TTY input buffer line size              */
-#define _PCONF_INPUT      3 /* Biggest packet a tty can imbibe at once */
-#define _PCONF_NAME       4 /* Filename length max                     */
-#define _PCONF_PATH       5 /* Max size of a pathname                  */
-#define _PCONF_PIPE       6 /* Buffer size for a pipe                  */
-#define _PCONF_CHRESTRICT 7 /* Can only root chown files?              */
-#define _PCONF_NOTRUNC    8 /* Are pathnames truncated if too big?     */
-#define _PCONF_VDISABLE   9 /* Magic char to disable special tty chars */
-#define _PCONF_MAXPCONF   9
-
-#endif /* !(_SPARC_PCONF_H) */
diff --git a/include/asm-sparc/processor.h b/include/asm-sparc/processor.h
index 40b1e41fdea7..e3006979709b 100644
--- a/include/asm-sparc/processor.h
+++ b/include/asm-sparc/processor.h
@@ -13,8 +13,6 @@
  */
 #define current_text_addr() ({ void *pc; __asm__("sethi %%hi(1f), %0; or %0, %%lo(1f), %0;\n1:" : "=r" (pc)); pc; })
 
-#include <linux/a.out.h>
-
 #include <asm/psr.h>
 #include <asm/ptrace.h>
 #include <asm/head.h>
@@ -67,7 +65,6 @@ struct thread_struct {
 	struct fpq	fpqueue[16];
 	unsigned long flags;
 	mm_segment_t current_ds;
-	struct exec core_exec;     /* just what it says. */
 	int new_signal;
 };
 
diff --git a/include/asm-sparc/socket.h b/include/asm-sparc/socket.h
index 2e2bd0b7c8e3..a00e15df227c 100644
--- a/include/asm-sparc/socket.h
+++ b/include/asm-sparc/socket.h
@@ -24,9 +24,6 @@
 #define SO_SNDTIMEO     0x4000
 #define SO_ACCEPTCONN	0x8000
 
-/* wha!??? */
-#define SO_DONTLINGER   (~SO_LINGER)  /* Older SunOS compat. hack */
-
 #define SO_SNDBUF	0x1001
 #define SO_RCVBUF	0x1002
 #define SO_SNDBUFFORCE	0x100a
diff --git a/include/asm-sparc/solerrno.h b/include/asm-sparc/solerrno.h
deleted file mode 100644
index 8abce7e4639f..000000000000
--- a/include/asm-sparc/solerrno.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* $Id: solerrno.h,v 1.5 1996/04/25 06:13:32 davem Exp $
- * solerrno.h: Solaris error return codes for compatibility.
- *
- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
- */
-
-#ifndef _SPARC_SOLERRNO_H
-#define _SPARC_SOLERRNO_H
-
-#define SOL_EPERM          1     /* Required superuser access perms  */
-#define SOL_ENOENT         2     /* File or directory does not exist */
-#define SOL_ESRCH          3     /* Process did not exist            */
-#define	SOL_EINTR          4     /* System call was interrupted      */
-#define	SOL_EIO            5     /* An i/o error occurred            */
-#define	SOL_ENXIO          6     /* Device or Address does not exist */
-#define	SOL_E2BIG          7	 /* Too many arguments were given    */
-#define	SOL_ENOEXEC        8     /* Header of executable was munged  */
-#define	SOL_EBADF          9     /* Bogus file number                */
-#define	SOL_ECHILD         10    /* No children of process exist     */
-#define	SOL_EAGAIN         11    /* beep beep, "try again later"     */
-#define	SOL_ENOMEM         12    /* No memory available              */
-#define	SOL_EACCES         13    /* Access not allowed               */
-#define	SOL_EFAULT         14    /* Address passed was invalid       */
-#define	SOL_ENOTBLK        15    /* blkdev op on non-block device    */
-#define	SOL_EBUSY          16    /* Mounted device was busy          */
-#define	SOL_EEXIST         17    /* File specified already exists    */
-#define	SOL_EXDEV          18    /* Link request across diff devices */
-#define	SOL_ENODEV         19    /* Device does not exist on system  */
-#define	SOL_ENOTDIR        20    /* Dir operation on non-directory   */
-#define	SOL_EISDIR         21    /* File was of directory type       */
-#define	SOL_EINVAL         22    /* Argument passed was invalid      */
-#define	SOL_ENFILE         23    /* No more room in file table       */
-#define	SOL_EMFILE         24    /* Proc has too many files open     */
-#define	SOL_ENOTTY         25    /* Ioctl was invalid for req device */
-#define	SOL_ETXTBSY        26    /* Text file in busy state          */
-#define	SOL_EFBIG          27    /* Too big of a file for operation  */
-#define	SOL_ENOSPC         28    /* Disk is full                     */
-#define	SOL_ESPIPE         29    /* Seek attempted on non-seeking dev*/
-#define	SOL_EROFS          30    /* Write attempted on read-only fs  */
-#define	SOL_EMLINK         31    /* Too many links in file search    */
-#define	SOL_EPIPE          32    /* Call a plumber                   */
-#define	SOL_EDOM           33    /* Argument was out of fct domain   */
-#define	SOL_ERANGE         34    /* Could not represent math result  */
-#define	SOL_ENOMSG         35    /* Message of req type doesn't exist */
-#define	SOL_EIDRM          36    /* Identifier has been removed      */
-#define	SOL_ECHRNG         37    /* Req channel number out of range  */
-#define	SOL_EL2NSYNC       38    /* Could not sync at run level 2    */
-#define	SOL_EL3HLT         39    /* Halted at run level 3            */
-#define	SOL_EL3RST         40    /* Reset at run level 3             */
-#define	SOL_ELNRNG         41    /* Out of range link number         */
-#define	SOL_EUNATCH        42    /* Driver for protocol not attached */
-#define	SOL_ENOCSI         43    /* CSI structure not around         */
-#define	SOL_EL2HLT         44    /* Halted at run level 2            */
-#define	SOL_EDEADLK        45    /* Deadlock condition detected      */
-#define	SOL_ENOLCK         46    /* Record locks unavailable         */
-#define	SOL_ECANCELED      47    /* Cancellation of oper. happened   */
-#define	SOL_ENOTSUP        48    /* Attempt of unsupported operation */
-#define	SOL_EDQUOT         49    /* Users disk quota exceeded        */
-#define	SOL_EBADE          50    /* Invalid exchange                 */
-#define	SOL_EBADR          51    /* Request descriptor was invalid   */
-#define	SOL_EXFULL         52    /* Full exchange                    */
-#define	SOL_ENOANO         53    /* ano does not exist               */
-#define	SOL_EBADRQC        54    /* Req code was invalid             */
-#define	SOL_EBADSLT        55    /* Bad slot number                  */
-#define	SOL_EDEADLOCK      56    /* Deadlock in fs error             */
-#define	SOL_EBFONT         57    /* Font file format invalid         */
-/* YOW, I LOVE SYSV STREAMS!!!! */
-#define	SOL_ENOSTR         60    /* Stream-op on non-stream dev      */
-#define	SOL_ENODATA        61    /* No data avail at this time       */
-#define	SOL_ETIME          62    /* Expiration of time occurred      */
-#define	SOL_ENOSR          63    /* Streams resources exhausted      */
-#define	SOL_ENONET         64    /* No network connected             */
-#define	SOL_ENOPKG         65    /* Non-installed package            */
-#define	SOL_EREMOTE        66    /* Object was on remote machine     */
-#define	SOL_ENOLINK        67    /* Cut link                         */
-#define	SOL_EADV           68    /* Error in advertise               */
-#define	SOL_ESRMNT         69    /* Some magic srmount problem       */
-#define	SOL_ECOMM          70    /* During send, comm error occurred */
-#define	SOL_EPROTO         71    /* Protocol botch                   */
-#define	SOL_EMULTIHOP      74    /* Multihop attempted               */
-#define	SOL_EBADMSG        77    /* Message was unreadable           */
-#define	SOL_ENAMETOOLONG   78    /* Too long of a path name          */
-#define	SOL_EOVERFLOW      79    /* Data type too small for datum    */
-#define	SOL_ENOTUNIQ       80    /* Logical name was not unique      */
-#define	SOL_EBADFD         81    /* Op cannot be performed on fd     */
-#define	SOL_EREMCHG        82    /* Remote address is now different  */
-#define	SOL_ELIBACC        83    /* Shared lib could not be accessed */
-#define	SOL_ELIBBAD        84    /* ShLib is corrupted in some way   */
-#define	SOL_ELIBSCN        85    /* A.out ShLib problems             */
-#define	SOL_ELIBMAX        86    /* Exceeded ShLib linkage limit     */
-#define	SOL_ELIBEXEC       87    /* Execution of ShLib attempted     */
-#define	SOL_EILSEQ         88    /* Bad byte sequence found          */
-#define	SOL_ENOSYS         89    /* Invalid filesystem operation     */
-#define	SOL_ELOOP          90    /* Detected loop in symbolic links  */
-#define	SOL_ERESTART       91    /* System call is restartable       */
-#define	SOL_ESTRPIPE       92    /* Do not sleep in head of stream   */
-#define	SOL_ENOTEMPTY      93    /* Rmdir of non-empty directory     */
-#define	SOL_EUSERS         94    /* Over abundance of users for ufs  */
-#define	SOL_ENOTSOCK       95    /* Sock-op on non-sock              */
-#define	SOL_EDESTADDRREQ   96    /* No dest addr given, but needed   */
-#define	SOL_EMSGSIZE       97    /* Msg too big                      */
-#define	SOL_EPROTOTYPE     98    /* Bad socket protocol              */
-#define	SOL_ENOPROTOOPT    99    /* Unavailable protocol             */
-#define	SOL_EPROTONOSUPPORT 120  /* Unsupported protocol             */
-#define	SOL_ESOCKTNOSUPPORT 121  /* Unsupported socket type          */
-#define	SOL_EOPNOTSUPP     122   /* Unsupported sock-op              */
-#define	SOL_EPFNOSUPPORT   123   /* Unsupported protocol family      */
-#define	SOL_EAFNOSUPPORT   124   /* Unsup addr family for protocol   */
-#define	SOL_EADDRINUSE     125   /* Req addr is already in use       */
-#define	SOL_EADDRNOTAVAIL  126   /* Req addr not available right now */
-#define	SOL_ENETDOWN       127   /* Your subnet is on fire           */
-#define	SOL_ENETUNREACH    128   /* Someone playing with gateway and */
-                                 /* did not tell you he was going to */
-#define	SOL_ENETRESET      129   /* Buy less-buggy ethernet cards    */
-#define	SOL_ECONNABORTED   130   /* Aborted connection due to sw     */
-#define	SOL_ECONNRESET     131   /* Your peers reset your connection */
-#define	SOL_ENOBUFS        132   /* No buffer space available        */
-#define	SOL_EISCONN        133   /* Connect on already connected     */
-                                 /* socket attempted                 */
-#define	SOL_ENOTCONN       134   /* Comm on non-connected socket     */
-#define	SOL_ESHUTDOWN      143   /* Op attempted after sock-shutdown */
-#define	SOL_ETOOMANYREFS   144   /* Reference limit exceeded         */
-#define	SOL_ETIMEDOUT      145   /* Timed out connection             */
-#define	SOL_ECONNREFUSED   146   /* Connection refused by remote host*/
-#define	SOL_EHOSTDOWN      147   /* Remote host is up in flames      */
-#define	SOL_EHOSTUNREACH   148   /* Make a left at Easton Ave.....   */
-#define	SOL_EWOULDBLOCK    EAGAIN /* Just an alias */
-#define	SOL_EALREADY       149   /* Operation is already occurring   */
-#define	SOL_EINPROGRESS    150   /* Operation is happening now       */
-#define	SOL_ESTALE         151   /* Fungus growth on NFS file handle */
-
-#endif /* !(_SPARC_SOLERRNO_H) */
diff --git a/include/asm-sparc/svr4.h b/include/asm-sparc/svr4.h
deleted file mode 100644
index da1f1c980e2d..000000000000
--- a/include/asm-sparc/svr4.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Solaris/SPARC constants and definitions -- 
- * (C) 1996 Miguel de Icaza
- *
- * This file is not meant to be included by user level applications
- * but the solaris syscall emulator
- */
-
-#ifndef _SPARC_SVR4_H
-#define _SPARC_SVR4_H
-
-/* Signals as used by svr4 */
-typedef struct {                /* signal set type */
-	ulong sigbits[4];
-} svr4_sigset_t;
-
-/* Values for siginfo.code */
-#define SVR4_SINOINFO 32767
-/* Siginfo, sucker expects bunch of information on those parameters */
-typedef union {
-	char total_size [128];
-	struct {
-		int signo;
-		int code;
-		int error;
-		union {
-		} data; 
-	} siginfo;
-} svr4_siginfo_t;
-
-/* Context definition */
-
-/* Location of the user stored registers into a greg_t */
-enum {
-	SVR4_PSR, SVR4_PC, SVR4_NPC, SVR4_Y,
-	SVR4_G1,  SVR4_G2, SVR4_G3,  SVR4_G4,
-	SVR4_G5,  SVR4_G6, SVR4_G7,  SVR4_O0,
-	SVR4_O1,  SVR4_O2, SVR4_O3,  SVR4_O4,
-	SVR4_O5,  SVR4_O6, SVR4_O7
-};
-
-/* sizeof (regs) / sizeof (greg_t), defined in the ABI */
-#define SVR4_NREGS  19
-#define SVR4_MAXWIN 31
-
-typedef struct {
-	uint rwin_lo[8];
-	uint rwin_in[8];
-} svr4_rwindow_t;
-
-typedef struct {
-	int            count;
-	int            __user *winptr [SVR4_MAXWIN]; /* pointer to the windows */
-	svr4_rwindow_t win[SVR4_MAXWIN];      /* the windows */
-} svr4_gwindows_t;
-
-typedef int svr4_gregset_t[SVR4_NREGS];
-
-typedef struct {
-	double   fpu_regs[32];
-	void     *fp_q;
-	unsigned fp_fsr;
-	u_char   fp_nqel;
-	u_char   fp_nqsize;
-	u_char   inuse;		/* if fpu is in use */
-} svr4_fregset_t;
-
-typedef struct {
-	uint    id;		/* if this holds "xrs" string => ptr is valid */
-	caddr_t ptr;
-} svr4_xrs_t;
-
-/* Machine dependent context */
-typedef struct {
-	svr4_gregset_t   greg;	/* registers 0..19 (see top) */
-	svr4_gwindows_t  __user *gwin;	/* may point to register windows */
-	svr4_fregset_t   freg;	/* floating point registers */
-	svr4_xrs_t       xrs;	/* mhm? */
-	long             pad[19];
-} svr4_mcontext_t;
-
-/* flags for stack_t.flags */
-enum svr4_stack_flags {
-	SVR4_SS_ONSTACK,
-	SVR4_SS_DISABLE,
-};
-
-/* signal stack exection place, unsupported */
-typedef struct svr4_stack_t {
-        char __user *sp;
-        int  size;
-        int  flags;
-} svr4_stack_t;
-
-/* Context used by getcontext and setcontext */
-typedef struct svr4_ucontext_t {
-	u_long               flags; /* context flags, indicate what is loaded */
-	struct svr4_ucontext *link;
-	svr4_sigset_t        sigmask;
-	svr4_stack_t         stack;
-	svr4_mcontext_t      mcontext;
-	long                 pad[23];
-} svr4_ucontext_t;                          
-
-/* windows hold the windows as they were at signal time,
- * ucontext->mcontext holds a pointer to them.
- * addresses for uc and si are passed as parameters to svr4 signal
- * handler
- */
-
-/* This is the signal frame that is passed to the signal handler */
-typedef struct {
-	svr4_gwindows_t gw;	/* windows */
-	svr4_ucontext_t uc;	/* machine context */
-	svr4_siginfo_t  si;	/* siginfo */
-} svr4_signal_frame_t;
-
-#define SVR4_SF_ALIGNED (((sizeof (svr4_signal_frame_t) + 7) & (~7)))
-
-#endif /* include control */
diff --git a/include/asm-sparc/termios.h b/include/asm-sparc/termios.h
index 4333232abb9f..733d40504e1e 100644
--- a/include/asm-sparc/termios.h
+++ b/include/asm-sparc/termios.h
@@ -33,11 +33,6 @@ struct ltchars {
 };
 #endif /* __KERNEL__ */
 
-struct sunos_ttysize {
-	int st_lines;   /* Lines on the terminal */
-	int st_columns; /* Columns on the terminal */
-};
-
 struct winsize {
 	unsigned short ws_row;
 	unsigned short ws_col;
diff --git a/include/asm-sparc/user.h b/include/asm-sparc/user.h
index b5f1abf733d5..3400ea87f148 100644
--- a/include/asm-sparc/user.h
+++ b/include/asm-sparc/user.h
@@ -1,60 +1,6 @@
-/* $Id: user.h,v 1.5 1998/02/23 01:49:22 rth Exp $
- * asm-sparc/user.h: Core file definitions for the Sparc.
- *
- * Keep in sync with reg.h.  Actually, we could get rid of this
- * one, since we won't a.out core dump that much anyways - miguel.
- * Copyright (C) 1995 (davem@caip.rutgers.edu)
- */
 #ifndef _SPARC_USER_H
 #define _SPARC_USER_H
 
-#include <asm/a.out.h>
-struct sunos_regs {
-	unsigned long psr, pc, npc, y;
-	unsigned long regs[15];
-};
-
-struct sunos_fpqueue {
-	unsigned long *addr;
-	unsigned long inst;
-};
-
-struct sunos_fp {
-	union {
-		unsigned long regs[32];
-		double reg_dbls[16];
-	} fregs;
-	unsigned long fsr;
-	unsigned long flags;
-	unsigned long extra;
-	unsigned long fpq_count;
-	struct sunos_fpqueue fpq[16];
-};
-
-struct sunos_fpu {
-	struct sunos_fp fpstatus;
-};
-
-/* The SunOS core file header layout. */
-struct user {
-	unsigned long magic;
-	unsigned long len;
-	struct sunos_regs regs;
-	struct exec uexec;
-	int           signal;
-	size_t        u_tsize; /* all of these in bytes! */
-	size_t        u_dsize;
-	size_t        u_ssize;
-	char          u_comm[17];
-	struct sunos_fpu fpu;
-	unsigned long sigcode;   /* Special sigcontext subcode, if any */
-};
-
-#define NBPG                   0x2000
-#define UPAGES                 1
-#define HOST_TEXT_START_ADDR   (u.start_code)
-#define HOST_DATA_START_ADDR   (u.uexec.a_data)
-#define HOST_STACK_END_ADDR    (- u.u_ssize * NBPG)
-#define SUNOS_CORE_MAGIC       0x080456
+/* Nothing to define.  */
 
 #endif /* !(_SPARC_USER_H) */
diff --git a/include/asm-sparc64/Kbuild b/include/asm-sparc64/Kbuild
index a90dc82129d1..dce1cf9a9313 100644
--- a/include/asm-sparc64/Kbuild
+++ b/include/asm-sparc64/Kbuild
@@ -12,7 +12,6 @@ header-y += display7seg.h
 header-y += envctrl.h
 header-y += openprom.h
 header-y += openpromio.h
-header-y += pconf.h
 header-y += psrcompat.h
 header-y += pstate.h
 header-y += reg.h
diff --git a/include/asm-sparc64/a.out-core.h b/include/asm-sparc64/a.out-core.h
deleted file mode 100644
index 3499b3c425ca..000000000000
--- a/include/asm-sparc64/a.out-core.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* a.out coredump register dumper
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#ifndef _ASM_A_OUT_CORE_H
-#define _ASM_A_OUT_CORE_H
-
-#ifdef __KERNEL__
-
-#include <linux/user.h>
-
-/*
- * fill in the user structure for an a.out core dump
- */
-static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
-{
-	/* Only should be used for SunOS and ancient a.out
-	 * SparcLinux binaries...  Not worth implementing.
-	 */
-	memset(dump, 0, sizeof(struct user));
-}
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_A_OUT_CORE_H */
diff --git a/include/asm-sparc64/a.out.h b/include/asm-sparc64/a.out.h
deleted file mode 100644
index 44208c2a188e..000000000000
--- a/include/asm-sparc64/a.out.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-sparc/a.out.h>
diff --git a/include/asm-sparc64/ioctls.h b/include/asm-sparc64/ioctls.h
index 083c9a0f37de..c1be40647c99 100644
--- a/include/asm-sparc64/ioctls.h
+++ b/include/asm-sparc64/ioctls.h
@@ -44,8 +44,6 @@
 #define __TIOCSETX        _IOW('t', 34, int) /* SunOS Specific */
 #define __TIOCGETX        _IOR('t', 35, int) /* SunOS Specific */
 #define TIOCCONS	_IO('t', 36)
-#define __TIOCSSIZE     _IOW('t', 37, struct sunos_ttysize) /* SunOS Specific */
-#define __TIOCGSIZE     _IOR('t', 38, struct sunos_ttysize) /* SunOS Specific */
 #define TIOCGSOFTCAR	_IOR('t', 100, int)
 #define TIOCSSOFTCAR	_IOW('t', 101, int)
 #define __TIOCUCNTL       _IOW('t', 102, int) /* SunOS Specific */
diff --git a/include/asm-sparc64/mman.h b/include/asm-sparc64/mman.h
index 8cc1860be630..e584563b56eb 100644
--- a/include/asm-sparc64/mman.h
+++ b/include/asm-sparc64/mman.h
@@ -22,19 +22,6 @@
 #define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
 
-/* XXX Need to add flags to SunOS's mctl, mlockall, and madvise system
- * XXX calls.
- */
-
-/* SunOS sys_mctl() stuff... */
-#define MC_SYNC         1  /* Sync pages in memory with storage (usu. a file) */
-#define MC_LOCK         2  /* Lock pages into core ram, do not allow swapping of them */
-#define MC_UNLOCK       3  /* Unlock pages locked via previous mctl() with MC_LOCK arg */
-#define MC_LOCKAS       5  /* Lock an entire address space of the calling process */
-#define MC_UNLOCKAS     6  /* Unlock entire address space of calling process */
-
-#define MADV_FREE	0x5		/* (Solaris) contents can be freed */
-
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 #define arch_mmap_check	sparc64_mmap_check
diff --git a/include/asm-sparc64/namei.h b/include/asm-sparc64/namei.h
index ccda19e28695..275161f21213 100644
--- a/include/asm-sparc64/namei.h
+++ b/include/asm-sparc64/namei.h
@@ -8,19 +8,6 @@
 #ifndef __SPARC64_NAMEI_H
 #define __SPARC64_NAMEI_H
 
-#define SPARC_BSD_EMUL "/usr/gnemul/sunos/"
-#define SPARC_SOL_EMUL "/usr/gnemul/solaris/"
-
-static inline char * __emul_prefix(void)
-{
-	switch (current->personality) {
-	case PER_SUNOS:
-		return SPARC_BSD_EMUL;
-	case PER_SVR4:
-		return SPARC_SOL_EMUL;
-	default:
-		return NULL;
-	}
-}
+#define __emul_prefix() NULL
 
 #endif /* __SPARC64_NAMEI_H */
diff --git a/include/asm-sparc64/pconf.h b/include/asm-sparc64/pconf.h
deleted file mode 100644
index aad106a70908..000000000000
--- a/include/asm-sparc64/pconf.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* $Id: pconf.h,v 1.1 1996/12/02 00:09:10 davem Exp $
- * pconf.h: pathconf() and fpathconf() defines for SunOS
- *          system call compatibility.
- *
- * Copyright (C) 1995, 1996 David S. Miller (davem@caip.rutgers.edu)
- */
-
-#ifndef _SPARC64_PCONF_H
-#define _SPARC64_PCONF_H
-
-#include <linux/fs.h>
-#include <linux/limits.h>
-
-#define _PCONF_LINK       1 /* Max number of links to an object        */
-#define _PCONF_CANON      2 /* TTY input buffer line size              */
-#define _PCONF_INPUT      3 /* Biggest packet a tty can imbibe at once */
-#define _PCONF_NAME       4 /* Filename length max                     */
-#define _PCONF_PATH       5 /* Max size of a pathname                  */
-#define _PCONF_PIPE       6 /* Buffer size for a pipe                  */
-#define _PCONF_CHRESTRICT 7 /* Can only root chown files?              */
-#define _PCONF_NOTRUNC    8 /* Are pathnames truncated if too big?     */
-#define _PCONF_VDISABLE   9 /* Magic char to disable special tty chars */
-#define _PCONF_MAXPCONF   9
-
-#endif /* !(_SPARC64_PCONF_H) */
diff --git a/include/asm-sparc64/socket.h b/include/asm-sparc64/socket.h
index 44a625af6e31..8cf071fae3eb 100644
--- a/include/asm-sparc64/socket.h
+++ b/include/asm-sparc64/socket.h
@@ -24,9 +24,6 @@
 #define SO_SNDTIMEO     0x4000
 #define SO_ACCEPTCONN	0x8000
 
-/* wha!??? */
-#define SO_DONTLINGER   (~SO_LINGER)  /* Older SunOS compat. hack */
-
 #define SO_SNDBUF	0x1001
 #define SO_RCVBUF	0x1002
 #define SO_SNDBUFFORCE	0x100a
diff --git a/include/asm-sparc64/solerrno.h b/include/asm-sparc64/solerrno.h
deleted file mode 100644
index a2ea6fcf3446..000000000000
--- a/include/asm-sparc64/solerrno.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* $Id: solerrno.h,v 1.1 1996/12/26 14:22:40 davem Exp $
- * solerrno.h: Solaris error return codes for compatibility.
- *
- * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
- */
-
-#ifndef _SPARC64_SOLERRNO_H
-#define _SPARC64_SOLERRNO_H
-
-#define SOL_EPERM          1     /* Required superuser access perms  */
-#define SOL_ENOENT         2     /* File or directory does not exist */
-#define SOL_ESRCH          3     /* Process did not exist            */
-#define	SOL_EINTR          4     /* System call was interrupted      */
-#define	SOL_EIO            5     /* An i/o error occurred            */
-#define	SOL_ENXIO          6     /* Device or Address does not exist */
-#define	SOL_E2BIG          7	 /* Too many arguments were given    */
-#define	SOL_ENOEXEC        8     /* Header of executable was munged  */
-#define	SOL_EBADF          9     /* Bogus file number                */
-#define	SOL_ECHILD         10    /* No children of process exist     */
-#define	SOL_EAGAIN         11    /* beep beep, "try again later"     */
-#define	SOL_ENOMEM         12    /* No memory available              */
-#define	SOL_EACCES         13    /* Access not allowed               */
-#define	SOL_EFAULT         14    /* Address passed was invalid       */
-#define	SOL_ENOTBLK        15    /* blkdev op on non-block device    */
-#define	SOL_EBUSY          16    /* Mounted device was busy          */
-#define	SOL_EEXIST         17    /* File specified already exists    */
-#define	SOL_EXDEV          18    /* Link request across diff devices */
-#define	SOL_ENODEV         19    /* Device does not exist on system  */
-#define	SOL_ENOTDIR        20    /* Dir operation on non-directory   */
-#define	SOL_EISDIR         21    /* File was of directory type       */
-#define	SOL_EINVAL         22    /* Argument passed was invalid      */
-#define	SOL_ENFILE         23    /* No more room in file table       */
-#define	SOL_EMFILE         24    /* Proc has too many files open     */
-#define	SOL_ENOTTY         25    /* Ioctl was invalid for req device */
-#define	SOL_ETXTBSY        26    /* Text file in busy state          */
-#define	SOL_EFBIG          27    /* Too big of a file for operation  */
-#define	SOL_ENOSPC         28    /* Disk is full                     */
-#define	SOL_ESPIPE         29    /* Seek attempted on non-seeking dev*/
-#define	SOL_EROFS          30    /* Write attempted on read-only fs  */
-#define	SOL_EMLINK         31    /* Too many links in file search    */
-#define	SOL_EPIPE          32    /* Call a plumber                   */
-#define	SOL_EDOM           33    /* Argument was out of fct domain   */
-#define	SOL_ERANGE         34    /* Could not represent math result  */
-#define	SOL_ENOMSG         35    /* Message of req type doesn't exist */
-#define	SOL_EIDRM          36    /* Identifier has been removed      */
-#define	SOL_ECHRNG         37    /* Req channel number out of range  */
-#define	SOL_EL2NSYNC       38    /* Could not sync at run level 2    */
-#define	SOL_EL3HLT         39    /* Halted at run level 3            */
-#define	SOL_EL3RST         40    /* Reset at run level 3             */
-#define	SOL_ELNRNG         41    /* Out of range link number         */
-#define	SOL_EUNATCH        42    /* Driver for protocol not attached */
-#define	SOL_ENOCSI         43    /* CSI structure not around         */
-#define	SOL_EL2HLT         44    /* Halted at run level 2            */
-#define	SOL_EDEADLK        45    /* Deadlock condition detected      */
-#define	SOL_ENOLCK         46    /* Record locks unavailable         */
-#define	SOL_ECANCELED      47    /* Cancellation of oper. happened   */
-#define	SOL_ENOTSUP        48    /* Attempt of unsupported operation */
-#define	SOL_EDQUOT         49    /* Users disk quota exceeded        */
-#define	SOL_EBADE          50    /* Invalid exchange                 */
-#define	SOL_EBADR          51    /* Request descriptor was invalid   */
-#define	SOL_EXFULL         52    /* Full exchange                    */
-#define	SOL_ENOANO         53    /* ano does not exist               */
-#define	SOL_EBADRQC        54    /* Req code was invalid             */
-#define	SOL_EBADSLT        55    /* Bad slot number                  */
-#define	SOL_EDEADLOCK      56    /* Deadlock in fs error             */
-#define	SOL_EBFONT         57    /* Font file format invalid         */
-/* YOW, I LOVE SYSV STREAMS!!!! */
-#define	SOL_ENOSTR         60    /* Stream-op on non-stream dev      */
-#define	SOL_ENODATA        61    /* No data avail at this time       */
-#define	SOL_ETIME          62    /* Expiration of time occurred      */
-#define	SOL_ENOSR          63    /* Streams resources exhausted      */
-#define	SOL_ENONET         64    /* No network connected             */
-#define	SOL_ENOPKG         65    /* Non-installed package            */
-#define	SOL_EREMOTE        66    /* Object was on remote machine     */
-#define	SOL_ENOLINK        67    /* Cut link                         */
-#define	SOL_EADV           68    /* Error in advertise               */
-#define	SOL_ESRMNT         69    /* Some magic srmount problem       */
-#define	SOL_ECOMM          70    /* During send, comm error occurred */
-#define	SOL_EPROTO         71    /* Protocol botch                   */
-#define	SOL_EMULTIHOP      74    /* Multihop attempted               */
-#define	SOL_EBADMSG        77    /* Message was unreadable           */
-#define	SOL_ENAMETOOLONG   78    /* Too long of a path name          */
-#define	SOL_EOVERFLOW      79    /* Data type too small for datum    */
-#define	SOL_ENOTUNIQ       80    /* Logical name was not unique      */
-#define	SOL_EBADFD         81    /* Op cannot be performed on fd     */
-#define	SOL_EREMCHG        82    /* Remote address is now different  */
-#define	SOL_ELIBACC        83    /* Shared lib could not be accessed */
-#define	SOL_ELIBBAD        84    /* ShLib is corrupted in some way   */
-#define	SOL_ELIBSCN        85    /* A.out ShLib problems             */
-#define	SOL_ELIBMAX        86    /* Exceeded ShLib linkage limit     */
-#define	SOL_ELIBEXEC       87    /* Execution of ShLib attempted     */
-#define	SOL_EILSEQ         88    /* Bad byte sequence found          */
-#define	SOL_ENOSYS         89    /* Invalid filesystem operation     */
-#define	SOL_ELOOP          90    /* Detected loop in symbolic links  */
-#define	SOL_ERESTART       91    /* System call is restartable       */
-#define	SOL_ESTRPIPE       92    /* Do not sleep in head of stream   */
-#define	SOL_ENOTEMPTY      93    /* Rmdir of non-empty directory     */
-#define	SOL_EUSERS         94    /* Over abundance of users for ufs  */
-#define	SOL_ENOTSOCK       95    /* Sock-op on non-sock              */
-#define	SOL_EDESTADDRREQ   96    /* No dest addr given, but needed   */
-#define	SOL_EMSGSIZE       97    /* Msg too big                      */
-#define	SOL_EPROTOTYPE     98    /* Bad socket protocol              */
-#define	SOL_ENOPROTOOPT    99    /* Unavailable protocol             */
-#define	SOL_EPROTONOSUPPORT 120  /* Unsupported protocol             */
-#define	SOL_ESOCKTNOSUPPORT 121  /* Unsupported socket type          */
-#define	SOL_EOPNOTSUPP     122   /* Unsupported sock-op              */
-#define	SOL_EPFNOSUPPORT   123   /* Unsupported protocol family      */
-#define	SOL_EAFNOSUPPORT   124   /* Unsup addr family for protocol   */
-#define	SOL_EADDRINUSE     125   /* Req addr is already in use       */
-#define	SOL_EADDRNOTAVAIL  126   /* Req addr not available right now */
-#define	SOL_ENETDOWN       127   /* Your subnet is on fire           */
-#define	SOL_ENETUNREACH    128   /* Someone playing with gateway and */
-                                 /* did not tell you he was going to */
-#define	SOL_ENETRESET      129   /* Buy less-buggy ethernet cards    */
-#define	SOL_ECONNABORTED   130   /* Aborted connection due to sw     */
-#define	SOL_ECONNRESET     131   /* Your peers reset your connection */
-#define	SOL_ENOBUFS        132   /* No buffer space available        */
-#define	SOL_EISCONN        133   /* Connect on already connected     */
-                                 /* socket attempted                 */
-#define	SOL_ENOTCONN       134   /* Comm on non-connected socket     */
-#define	SOL_ESHUTDOWN      143   /* Op attempted after sock-shutdown */
-#define	SOL_ETOOMANYREFS   144   /* Reference limit exceeded         */
-#define	SOL_ETIMEDOUT      145   /* Timed out connection             */
-#define	SOL_ECONNREFUSED   146   /* Connection refused by remote host*/
-#define	SOL_EHOSTDOWN      147   /* Remote host is up in flames      */
-#define	SOL_EHOSTUNREACH   148   /* Make a left at Easton Ave.....   */
-#define	SOL_EWOULDBLOCK    EAGAIN /* Just an alias */
-#define	SOL_EALREADY       149   /* Operation is already occurring   */
-#define	SOL_EINPROGRESS    150   /* Operation is happening now       */
-#define	SOL_ESTALE         151   /* Fungus growth on NFS file handle */
-
-#endif /* !(_SPARC64_SOLERRNO_H) */
diff --git a/include/asm-sparc64/svr4.h b/include/asm-sparc64/svr4.h
deleted file mode 100644
index c96d5f116e1c..000000000000
--- a/include/asm-sparc64/svr4.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Solaris/SPARC constants and definitions -- 
- * (C) 1996 Miguel de Icaza
- *
- * This file is not meant to be included by user level applications
- * but the solaris syscall emulator
- */
-
-#ifndef _SPARC64_SVR4_H
-#define _SPARC64_SVR4_H
-
-/* Signals as used by svr4 */
-typedef struct {                /* signal set type */
-	uint sigbits[4];
-} svr4_sigset_t;
-
-/* Values for siginfo.code */
-#define SVR4_SINOINFO 32767
-/* Siginfo, sucker expects bunch of information on those parameters */
-typedef union {
-	char total_size [128];
-	struct {
-		int signo;
-		int code;
-		int error;
-		union {
-		} data; 
-	} siginfo;
-} svr4_siginfo_t;
-
-/* Context definition */
-
-/* Location of the user stored registers into a greg_t */
-enum {
-	SVR4_PSR, SVR4_PC, SVR4_NPC, SVR4_Y,
-	SVR4_G1,  SVR4_G2, SVR4_G3,  SVR4_G4,
-	SVR4_G5,  SVR4_G6, SVR4_G7,  SVR4_O0,
-	SVR4_O1,  SVR4_O2, SVR4_O3,  SVR4_O4,
-	SVR4_O5,  SVR4_O6, SVR4_O7
-};
-
-/* sizeof (regs) / sizeof (greg_t), defined in the ABI */
-#define SVR4_NREGS  19
-#define SVR4_MAXWIN 31
-
-typedef struct {
-	u32 rwin_lo[8];
-	u32 rwin_in[8];
-} svr4_rwindow_t;
-
-typedef struct {
-	int            count;
-	u32            winptr [SVR4_MAXWIN]; /* pointer to the windows */
-
-	svr4_rwindow_t win[SVR4_MAXWIN];      /* the windows */
-} svr4_gwindows_t;
-
-typedef int svr4_gregset_t[SVR4_NREGS];
-
-typedef struct {
-	u64   	 fpu_regs[32];
-	u32	 fp_q;
-	u32      fp_fsr;
-	u_char   fp_nqel;
-	u_char   fp_nqsize;
-	u_char   inuse;		/* if fpu is in use */
-} svr4_fregset_t;
-
-typedef struct {
-	u32    id;		/* if this holds "xrs" string => ptr is valid */
-	u32    ptr;
-} svr4_xrs_t;
-
-/* Machine dependent context */
-typedef struct {
-	svr4_gregset_t   greg;	/* registers 0..19 (see top) */
-	u32		 gwin;	/* may point to register windows */
-	svr4_fregset_t   freg;	/* floating point registers */
-	svr4_xrs_t       xrs;	/* mhm? */
-	int              pad[19];
-} svr4_mcontext_t;
-
-/* flags for stack_t.flags */
-enum svr4_stack_flags {
-	SVR4_SS_ONSTACK,
-	SVR4_SS_DISABLE,
-};
-
-/* signal stack execution place, unsupported */
-typedef struct svr4_stack_t {
-        u32  sp;
-        int  size;
-        int  flags;
-} svr4_stack_t;
-
-/* Context used by getcontext and setcontext */
-typedef struct svr4_ucontext_t {
-	u32		flags; /* context flags, indicate what is loaded */
-	u32		link;
-	svr4_sigset_t	sigmask;
-	svr4_stack_t	stack;
-	svr4_mcontext_t	mcontext;
-	int		pad[23];
-} svr4_ucontext_t;                          
-
-/* windows hold the windows as they were at signal time,
- * ucontext->mcontext holds a pointer to them.
- * addresses for uc and si are passed as parameters to svr4 signal
- * handler
- */
-
-/* This is the signal frame that is passed to the signal handler */
-typedef struct {
-	svr4_gwindows_t gw;	/* windows */
-	svr4_ucontext_t uc;	/* machine context */
-	svr4_siginfo_t  si;	/* siginfo */
-} svr4_signal_frame_t;
-
-#define SVR4_SF_ALIGNED (((sizeof (svr4_signal_frame_t) + 7) & (~7)))
-
-#endif /* include control */
diff --git a/include/asm-sparc64/termios.h b/include/asm-sparc64/termios.h
index ef527211f8a8..cacbea171ad7 100644
--- a/include/asm-sparc64/termios.h
+++ b/include/asm-sparc64/termios.h
@@ -33,11 +33,6 @@ struct ltchars {
 };
 #endif /* __KERNEL__ */
 
-struct sunos_ttysize {
-	int st_lines;   /* Lines on the terminal */
-	int st_columns; /* Columns on the terminal */
-};
-
 struct winsize {
 	unsigned short ws_row;
 	unsigned short ws_col;
diff --git a/include/asm-sparc64/ttable.h b/include/asm-sparc64/ttable.h
index bbb9c8f13d61..7208a777750e 100644
--- a/include/asm-sparc64/ttable.h
+++ b/include/asm-sparc64/ttable.h
@@ -99,14 +99,6 @@
 	 or	%l7, %lo(systbl), %l7;			\
 	nop; nop;
 	
-#define INDIRECT_SOLARIS_SYSCALL(num)			\
-	sethi	%hi(109f), %g7;				\
-	ba,pt	%xcc, etrap;				\
-109:	 or	%g7, %lo(109b), %g7;			\
-	ba,pt	%xcc, tl0_solaris + 0xc;		\
-	 mov	num, %g1;				\
-	nop;nop;nop;
-	
 #define TRAP_UTRAP(handler,lvl)				\
 	mov	handler, %g3;				\
 	ba,pt	%xcc, utrap_trap;			\
@@ -117,11 +109,6 @@
 	nop;						\
 	nop;
 
-#ifdef CONFIG_SUNOS_EMUL
-#define SUNOS_SYSCALL_TRAP SYSCALL_TRAP(linux_sparc_syscall32, sunos_sys_table)
-#else
-#define SUNOS_SYSCALL_TRAP TRAP(sunos_syscall)
-#endif
 #ifdef CONFIG_COMPAT
 #define	LINUX_32BIT_SYSCALL_TRAP SYSCALL_TRAP(linux_sparc_syscall32, sys_call_table32)
 #else
@@ -130,11 +117,6 @@
 #define LINUX_64BIT_SYSCALL_TRAP SYSCALL_TRAP(linux_sparc_syscall, sys_call_table64)
 #define GETCC_TRAP TRAP(getcc)
 #define SETCC_TRAP TRAP(setcc)
-#ifdef CONFIG_SOLARIS_EMUL
-#define SOLARIS_SYSCALL_TRAP TRAP(solaris_sparc_syscall)
-#else
-#define SOLARIS_SYSCALL_TRAP TRAP(solaris_syscall)
-#endif
 #define BREAKPOINT_TRAP TRAP(breakpoint_trap)
 
 #ifdef CONFIG_TRACE_IRQFLAGS
diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h
index 77559da0ea3f..13be4453a1f0 100644
--- a/include/asm-sparc64/unistd.h
+++ b/include/asm-sparc64/unistd.h
@@ -338,16 +338,6 @@
 #define NR_SYSCALLS		317
 
 #ifdef __KERNEL__
-/* sysconf options, for SunOS compatibility */
-#define   _SC_ARG_MAX             1
-#define   _SC_CHILD_MAX           2
-#define   _SC_CLK_TCK             3
-#define   _SC_NGROUPS_MAX         4
-#define   _SC_OPEN_MAX            5
-#define   _SC_JOB_CONTROL         6
-#define   _SC_SAVED_IDS           7
-#define   _SC_VERSION             8
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
diff --git a/include/asm-sparc64/user.h b/include/asm-sparc64/user.h
index 02b138943837..29fc6e906c29 100644
--- a/include/asm-sparc64/user.h
+++ b/include/asm-sparc64/user.h
@@ -1,60 +1 @@
-/* $Id: user.h,v 1.1 1996/12/26 14:22:44 davem Exp $
- * asm-sparc64/user.h: Core file definitions for the Sparc.
- *
- * Keep in sync with reg.h.  Actually, we could get rid of this
- * one, since we won't a.out core dump that much anyways - miguel.
- * Copyright (C) 1995 (davem@caip.rutgers.edu)
- */
-#ifndef _SPARC64_USER_H
-#define _SPARC64_USER_H
-
-#include <linux/a.out.h>
-struct sunos_regs {
-	unsigned int psr, pc, npc, y;
-	unsigned int regs[15];
-};
-
-struct sunos_fpqueue {
-	unsigned int *addr;
-	unsigned int inst;
-};
-
-struct sunos_fp {
-	union {
-		unsigned int regs[32];
-		double reg_dbls[16];
-	} fregs;
-	unsigned int fsr;
-	unsigned int flags;
-	unsigned int extra;
-	unsigned int fpq_count;
-	struct sunos_fpqueue fpq[16];
-};
-
-struct sunos_fpu {
-	struct sunos_fp fpstatus;
-};
-
-/* The SunOS core file header layout. */
-struct user {
-	unsigned int magic;
-	unsigned int len;
-	struct sunos_regs regs;
-	struct exec uexec;
-	int           signal;
-	size_t        u_tsize; /* all of these in bytes! */
-	size_t        u_dsize;
-	size_t        u_ssize;
-	char          u_comm[17];
-	struct sunos_fpu fpu;
-	unsigned int  sigcode;   /* Special sigcontext subcode, if any */
-};
-
-#define NBPG                   PAGE_SIZE /* XXX 4096 maybe? */
-#define UPAGES                 1
-#define HOST_TEXT_START_ADDR   (u.start_code)
-#define HOST_DATA_START_ADDR   (u.start_data)
-#define HOST_STACK_END_ADDR    (u.start_stack + u.u_ssize * NBPG)
-#define SUNOS_CORE_MAGIC       0x080456
-
-#endif /* !(_SPARC64_USER_H) */
+#include <asm-sparc/user.h>
diff --git a/net/core/sock.c b/net/core/sock.c
index 54c836a2216b..bf6f83e48e87 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -450,15 +450,6 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	 *	Options without arguments
 	 */
 
-#ifdef SO_DONTLINGER		/* Compatibility item... */
-	if (optname == SO_DONTLINGER) {
-		lock_sock(sk);
-		sock_reset_flag(sk, SOCK_LINGER);
-		release_sock(sk);
-		return 0;
-	}
-#endif
-
 	if (optname == SO_BINDTODEVICE)
 		return sock_bindtodevice(sk, optval, optlen);
 
-- 
cgit v1.2.3


From f5264481c8049673e2cc8c7aca410931f571ba2d Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Mon, 21 Apr 2008 22:15:06 +0000
Subject: trivial: small cleanups

These are small cleanups all over the tree.

Trivial style and comment changes to
  fs/select.c, kernel/signal.c, kernel/stop_machine.c & mm/pdflush.c

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
---
 fs/select.c           | 2 +-
 kernel/signal.c       | 4 ++--
 kernel/stop_machine.c | 3 +--
 mm/pdflush.c          | 4 ++--
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index 5633fe980781..00f58c5c7e05 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -260,7 +260,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 		wait = NULL;
 		if (retval || !*timeout || signal_pending(current))
 			break;
-		if(table.error) {
+		if (table.error) {
 			retval = table.error;
 			break;
 		}
diff --git a/kernel/signal.c b/kernel/signal.c
index cc8303cd093d..64ad0ed15992 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -220,7 +220,7 @@ void flush_signals(struct task_struct *t)
 	unsigned long flags;
 
 	spin_lock_irqsave(&t->sighand->siglock, flags);
-	clear_tsk_thread_flag(t,TIF_SIGPENDING);
+	clear_tsk_thread_flag(t, TIF_SIGPENDING);
 	flush_sigqueue(&t->pending);
 	flush_sigqueue(&t->signal->shared_pending);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -424,7 +424,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	}
 	if (signr &&
 	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
-	     info->si_sys_private){
+	     info->si_sys_private) {
 		/*
 		 * Release the siglock to ensure proper locking order
 		 * of timer locks outside of siglocks.  Note, we leave
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6f4e0e13f70c..316283cb60cb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -135,8 +135,7 @@ static void restart_machine(void)
 	preempt_enable_no_resched();
 }
 
-struct stop_machine_data
-{
+struct stop_machine_data {
 	int (*fn)(void *);
 	void *data;
 	struct completion done;
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 8f6ee073c0e3..3931f716454a 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -17,8 +17,8 @@
 #include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/fs.h>		// Needed by writeback.h
-#include <linux/writeback.h>	// Prototypes pdflush_operation()
+#include <linux/fs.h>		/* Needed by writeback.h	  */
+#include <linux/writeback.h>	/* Prototypes pdflush_operation() */
 #include <linux/kthread.h>
 #include <linux/cpuset.h>
 #include <linux/freezer.h>
-- 
cgit v1.2.3


From 1cc8dcf569a3fcefb7ae32652225f2bd3e85257e Mon Sep 17 00:00:00 2001
From: Benoit Boissinot <benoit.boissinot@ens-lyon.org>
Date: Mon, 21 Apr 2008 22:45:55 +0000
Subject: ext*: spelling fix prefered -> preferred

Spelling fix: prefered -> preferred

Signed-off-by: Benoit Boissinot <benoit.boissinot@ens-lyon.org>
Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
---
 fs/ext2/ialloc.c | 2 +-
 fs/ext2/inode.c  | 4 ++--
 fs/ext3/ialloc.c | 2 +-
 fs/ext3/inode.c  | 6 +++---
 fs/ext4/ialloc.c | 2 +-
 fs/ext4/inode.c  | 6 +++---
 6 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5deb8b74e649..08f647d8188d 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -253,7 +253,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
  * it has too few free inodes left (min_inodes) or 
  * it has too few free blocks left (min_blocks) or 
  * it's already running too large debt (max_debt). 
- * Parent's group is prefered, if it doesn't satisfy these 
+ * Parent's group is preferred, if it doesn't satisfy these 
  * conditions we search cyclically through the rest. If none 
  * of the groups look good we just look for a group with more 
  * free inodes than average (starting at parent's group). 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c62006805427..b8a2990bab83 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -239,7 +239,7 @@ no_block:
  *	@inode: owner
  *	@ind: descriptor of indirect block.
  *
- *	This function returns the prefered place for block allocation.
+ *	This function returns the preferred place for block allocation.
  *	It is used when heuristic for sequential allocation fails.
  *	Rules are:
  *	  + if there is a block to the left of our position - allocate near it.
@@ -283,7 +283,7 @@ static unsigned long ext2_find_near(struct inode *inode, Indirect *ind)
 }
 
 /**
- *	ext2_find_goal - find a prefered place for allocation.
+ *	ext2_find_goal - find a preferred place for allocation.
  *	@inode: owner
  *	@block:  block we want
  *	@partial: pointer to the last triple within a chain
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4f4020c54683..96dd5573e49b 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -239,7 +239,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
  * it has too few free inodes left (min_inodes) or
  * it has too few free blocks left (min_blocks) or
  * it's already running too large debt (max_debt).
- * Parent's group is prefered, if it doesn't satisfy these
+ * Parent's group is preferred, if it doesn't satisfy these
  * conditions we search cyclically through the rest. If none
  * of the groups look good we just look for a group with more
  * free inodes than average (starting at parent's group).
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index eb95670a27eb..c683609b0e3a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -392,7 +392,7 @@ no_block:
  *	@inode: owner
  *	@ind: descriptor of indirect block.
  *
- *	This function returns the prefered place for block allocation.
+ *	This function returns the preferred place for block allocation.
  *	It is used when heuristic for sequential allocation fails.
  *	Rules are:
  *	  + if there is a block to the left of our position - allocate near it.
@@ -436,12 +436,12 @@ static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
 }
 
 /**
- *	ext3_find_goal - find a prefered place for allocation.
+ *	ext3_find_goal - find a preferred place for allocation.
  *	@inode: owner
  *	@block:  block we want
  *	@partial: pointer to the last triple within a chain
  *
- *	Normally this function find the prefered place for block allocation,
+ *	Normally this function find the preferred place for block allocation,
  *	returns it.
  */
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8036b9b5376b..486e46a3918d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -305,7 +305,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
  * it has too few free inodes left (min_inodes) or
  * it has too few free blocks left (min_blocks) or
  * it's already running too large debt (max_debt).
- * Parent's group is prefered, if it doesn't satisfy these
+ * Parent's group is preferred, if it doesn't satisfy these
  * conditions we search cyclically through the rest. If none
  * of the groups look good we just look for a group with more
  * free inodes than average (starting at parent's group).
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 945cbf6cb1fc..8fab233cb05f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -382,7 +382,7 @@ no_block:
  *	@inode: owner
  *	@ind: descriptor of indirect block.
  *
- *	This function returns the prefered place for block allocation.
+ *	This function returns the preferred place for block allocation.
  *	It is used when heuristic for sequential allocation fails.
  *	Rules are:
  *	  + if there is a block to the left of our position - allocate near it.
@@ -432,12 +432,12 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 }
 
 /**
- *	ext4_find_goal - find a prefered place for allocation.
+ *	ext4_find_goal - find a preferred place for allocation.
  *	@inode: owner
  *	@block:  block we want
  *	@partial: pointer to the last triple within a chain
  *
- *	Normally this function find the prefered place for block allocation,
+ *	Normally this function find the preferred place for block allocation,
  *	returns it.
  */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
-- 
cgit v1.2.3


From 6d59e7f582ef1c1988542d0fc3b36d0087b757ce Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 22 Mar 2008 15:48:17 -0400
Subject: [PATCH] move a bunch of declarations to fs/internal.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h          | 11 +++++++++++
 fs/pnode.c             |  1 +
 fs/super.c             |  1 +
 include/linux/dcache.h |  1 -
 include/linux/fs.h     |  6 ------
 include/linux/mount.h  |  2 --
 6 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index 392e8ccd6fc4..80aa9a023372 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,3 +43,14 @@ extern void __init chrdev_init(void);
  * namespace.c
  */
 extern int copy_mount_options(const void __user *, unsigned long *);
+
+extern void free_vfsmnt(struct vfsmount *);
+extern struct vfsmount *alloc_vfsmnt(const char *);
+extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
+extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
+				struct vfsmount *);
+extern void release_mounts(struct list_head *);
+extern void umount_tree(struct vfsmount *, int, struct list_head *);
+extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+
+extern void __init mnt_init(void);
diff --git a/fs/pnode.c b/fs/pnode.c
index 1d8f5447f3f7..a9e0d6fadbcd 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -9,6 +9,7 @@
 #include <linux/mnt_namespace.h>
 #include <linux/mount.h>
 #include <linux/fs.h>
+#include "internal.h"
 #include "pnode.h"
 
 /* return the next shared peer mount of @p */
diff --git a/fs/super.c b/fs/super.c
index 1f8f05ede437..4798350b2bc9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -39,6 +39,7 @@
 #include <linux/mutex.h>
 #include <linux/file.h>
 #include <asm/uaccess.h>
+#include "internal.h"
 
 
 LIST_HEAD(super_blocks);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6bd646096fa6..fabd16d03a27 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -359,7 +359,6 @@ static inline int d_mountpoint(struct dentry *dentry)
 }
 
 extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *);
-extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
 
 extern int sysctl_vfs_cache_pressure;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0c609e71c379..cc2be2cf7d41 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -305,7 +305,6 @@ struct vfsmount;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
-extern void __init mnt_init(void);
 extern void __init files_init(unsigned long);
 
 struct buffer_head;
@@ -1536,12 +1535,7 @@ extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 #define kern_mount(type) kern_mount_data(type, NULL)
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
-extern void umount_tree(struct vfsmount *, int, struct list_head *);
-extern void release_mounts(struct list_head *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
-extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
-extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
-				  struct vfsmount *);
 extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *);
 extern void drop_collected_mounts(struct vfsmount *);
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index d6600e3f7e45..87b24cea1863 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -94,8 +94,6 @@ static inline void mntput(struct vfsmount *mnt)
 	}
 }
 
-extern void free_vfsmnt(struct vfsmount *mnt);
-extern struct vfsmount *alloc_vfsmnt(const char *name);
 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
 				      const char *name, void *data);
 
-- 
cgit v1.2.3


From 1a60a280778ff90270fc7390d9ec102f713a5a29 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 22 Mar 2008 16:19:49 -0400
Subject: [PATCH] lock exclusively in collect_mounts() and
 drop_collected_mounts()

Taking namespace_sem shared there isn't worth the trouble, especially with
vfsmount ID allocation about to be added.  That way we know that umount_tree(),
copy_tree() and clone_mnt() are _always_ serialized by namespace_sem.
umount_tree() still needs vfsmount_lock (it manipulates hash chains, among
other things), but that's a separate story.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 678f7ce060f2..af2fb3707d0a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1091,20 +1091,20 @@ Enomem:
 struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct vfsmount *tree;
-	down_read(&namespace_sem);
+	down_write(&namespace_sem);
 	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
-	up_read(&namespace_sem);
+	up_write(&namespace_sem);
 	return tree;
 }
 
 void drop_collected_mounts(struct vfsmount *mnt)
 {
 	LIST_HEAD(umount_list);
-	down_read(&namespace_sem);
+	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
 	umount_tree(mnt, 0, &umount_list);
 	spin_unlock(&vfsmount_lock);
-	up_read(&namespace_sem);
+	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 }
 
-- 
cgit v1.2.3


From b5266eb4c8d1a2887a19aaec8144ee4ad1b054c3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 22 Mar 2008 17:48:24 -0400
Subject: [PATCH] switch a bunch of LSM hooks from nameidata to path

Namely, ones from namespace.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c             | 11 +++++-----
 include/linux/security.h   | 52 +++++++++++++++++++++++-----------------------
 security/dummy.c           | 10 ++++-----
 security/security.c        | 20 +++++++++---------
 security/selinux/hooks.c   |  8 +++----
 security/smack/smack_lsm.c |  4 ++--
 6 files changed, 53 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index af2fb3707d0a..87d2d82010bb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1220,7 +1220,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 	if (IS_DEADDIR(nd->path.dentry->d_inode))
 		goto out_unlock;
 
-	err = security_sb_check_sb(mnt, nd);
+	err = security_sb_check_sb(mnt, &nd->path);
 	if (err)
 		goto out_unlock;
 
@@ -1230,7 +1230,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 out_unlock:
 	mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
 	if (!err)
-		security_sb_post_addmount(mnt, nd);
+		security_sb_post_addmount(mnt, &nd->path);
 	return err;
 }
 
@@ -1746,7 +1746,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (retval)
 		return retval;
 
-	retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
+	retval = security_sb_mount(dev_name, &nd.path,
+				   type_page, flags, data_page);
 	if (retval)
 		goto dput_out;
 
@@ -2007,7 +2008,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	if (error)
 		goto out1;
 
-	error = security_sb_pivotroot(&old_nd, &new_nd);
+	error = security_sb_pivotroot(&old_nd.path, &new_nd.path);
 	if (error) {
 		path_put(&old_nd.path);
 		goto out1;
@@ -2070,7 +2071,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&user_nd.path, &new_nd.path);
-	security_sb_post_pivotroot(&user_nd, &new_nd);
+	security_sb_post_pivotroot(&user_nd.path, &new_nd.path);
 	error = 0;
 	path_put(&root_parent);
 	path_put(&parent_path);
diff --git a/include/linux/security.h b/include/linux/security.h
index fea1f4aa4dd5..53a34539382a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -230,7 +230,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	loopback/bind mount (@flags & MS_BIND), @dev_name identifies the
  *	pathname of the object being mounted.
  *	@dev_name contains the name for object being mounted.
- *	@nd contains the nameidata structure for mount point object.
+ *	@path contains the path for mount point object.
  *	@type contains the filesystem type.
  *	@flags contains the mount flags.
  *	@data contains the filesystem-specific data.
@@ -249,7 +249,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Check permission before the device with superblock @mnt->sb is mounted
  *	on the mount point named by @nd.
  *	@mnt contains the vfsmount for device being mounted.
- *	@nd contains the nameidata object for the mount point.
+ *	@path contains the path for the mount point.
  *	Return 0 if permission is granted.
  * @sb_umount:
  *	Check permission before the @mnt file system is unmounted.
@@ -278,16 +278,16 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	This hook is called any time a mount is successfully grafetd to
  *	the tree.
  *	@mnt contains the mounted filesystem.
- *	@mountpoint_nd contains the nameidata structure for the mount point.
+ *	@mountpoint contains the path for the mount point.
  * @sb_pivotroot:
  *	Check permission before pivoting the root filesystem.
- *	@old_nd contains the nameidata structure for the new location of the current root (put_old).
- *      @new_nd contains the nameidata structure for the new root (new_root).
+ *	@old_path contains the path for the new location of the current root (put_old).
+ *      @new_path contains the path for the new root (new_root).
  *	Return 0 if permission is granted.
  * @sb_post_pivotroot:
  *	Update module state after a successful pivot.
- *	@old_nd contains the nameidata structure for the old root.
- *      @new_nd contains the nameidata structure for the new root.
+ *	@old_path contains the path for the old root.
+ *      @new_path contains the path for the new root.
  * @sb_get_mnt_opts:
  *	Get the security relevant mount options used for a superblock
  *	@sb the superblock to get security mount options from
@@ -1315,20 +1315,20 @@ struct security_operations {
 	int (*sb_copy_data)(char *orig, char *copy);
 	int (*sb_kern_mount) (struct super_block *sb, void *data);
 	int (*sb_statfs) (struct dentry *dentry);
-	int (*sb_mount) (char *dev_name, struct nameidata * nd,
+	int (*sb_mount) (char *dev_name, struct path *path,
 			 char *type, unsigned long flags, void *data);
-	int (*sb_check_sb) (struct vfsmount * mnt, struct nameidata * nd);
+	int (*sb_check_sb) (struct vfsmount * mnt, struct path *path);
 	int (*sb_umount) (struct vfsmount * mnt, int flags);
 	void (*sb_umount_close) (struct vfsmount * mnt);
 	void (*sb_umount_busy) (struct vfsmount * mnt);
 	void (*sb_post_remount) (struct vfsmount * mnt,
 				 unsigned long flags, void *data);
 	void (*sb_post_addmount) (struct vfsmount * mnt,
-				  struct nameidata * mountpoint_nd);
-	int (*sb_pivotroot) (struct nameidata * old_nd,
-			     struct nameidata * new_nd);
-	void (*sb_post_pivotroot) (struct nameidata * old_nd,
-				   struct nameidata * new_nd);
+				  struct path *mountpoint);
+	int (*sb_pivotroot) (struct path *old_path,
+			     struct path *new_path);
+	void (*sb_post_pivotroot) (struct path *old_path,
+				   struct path *new_path);
 	int (*sb_get_mnt_opts) (const struct super_block *sb,
 				struct security_mnt_opts *opts);
 	int (*sb_set_mnt_opts) (struct super_block *sb,
@@ -1593,16 +1593,16 @@ void security_sb_free(struct super_block *sb);
 int security_sb_copy_data(char *orig, char *copy);
 int security_sb_kern_mount(struct super_block *sb, void *data);
 int security_sb_statfs(struct dentry *dentry);
-int security_sb_mount(char *dev_name, struct nameidata *nd,
+int security_sb_mount(char *dev_name, struct path *path,
                        char *type, unsigned long flags, void *data);
-int security_sb_check_sb(struct vfsmount *mnt, struct nameidata *nd);
+int security_sb_check_sb(struct vfsmount *mnt, struct path *path);
 int security_sb_umount(struct vfsmount *mnt, int flags);
 void security_sb_umount_close(struct vfsmount *mnt);
 void security_sb_umount_busy(struct vfsmount *mnt);
 void security_sb_post_remount(struct vfsmount *mnt, unsigned long flags, void *data);
-void security_sb_post_addmount(struct vfsmount *mnt, struct nameidata *mountpoint_nd);
-int security_sb_pivotroot(struct nameidata *old_nd, struct nameidata *new_nd);
-void security_sb_post_pivotroot(struct nameidata *old_nd, struct nameidata *new_nd);
+void security_sb_post_addmount(struct vfsmount *mnt, struct path *mountpoint);
+int security_sb_pivotroot(struct path *old_path, struct path *new_path);
+void security_sb_post_pivotroot(struct path *old_path, struct path *new_path);
 int security_sb_get_mnt_opts(const struct super_block *sb,
 				struct security_mnt_opts *opts);
 int security_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts);
@@ -1872,7 +1872,7 @@ static inline int security_sb_statfs (struct dentry *dentry)
 	return 0;
 }
 
-static inline int security_sb_mount (char *dev_name, struct nameidata *nd,
+static inline int security_sb_mount (char *dev_name, struct path *path,
 				    char *type, unsigned long flags,
 				    void *data)
 {
@@ -1880,7 +1880,7 @@ static inline int security_sb_mount (char *dev_name, struct nameidata *nd,
 }
 
 static inline int security_sb_check_sb (struct vfsmount *mnt,
-					struct nameidata *nd)
+					struct path *path)
 {
 	return 0;
 }
@@ -1901,17 +1901,17 @@ static inline void security_sb_post_remount (struct vfsmount *mnt,
 { }
 
 static inline void security_sb_post_addmount (struct vfsmount *mnt,
-					      struct nameidata *mountpoint_nd)
+					      struct path *mountpoint)
 { }
 
-static inline int security_sb_pivotroot (struct nameidata *old_nd,
-					 struct nameidata *new_nd)
+static inline int security_sb_pivotroot (struct path *old_path,
+					 struct path *new_path)
 {
 	return 0;
 }
 
-static inline void security_sb_post_pivotroot (struct nameidata *old_nd,
-					       struct nameidata *new_nd)
+static inline void security_sb_post_pivotroot (struct path *old_path,
+					       struct path *new_path)
 { }
 static inline int security_sb_get_mnt_opts(const struct super_block *sb,
 					   struct security_mnt_opts *opts)
diff --git a/security/dummy.c b/security/dummy.c
index 98d5f969cdc8..b0232bbf427b 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -196,13 +196,13 @@ static int dummy_sb_statfs (struct dentry *dentry)
 	return 0;
 }
 
-static int dummy_sb_mount (char *dev_name, struct nameidata *nd, char *type,
+static int dummy_sb_mount (char *dev_name, struct path *path, char *type,
 			   unsigned long flags, void *data)
 {
 	return 0;
 }
 
-static int dummy_sb_check_sb (struct vfsmount *mnt, struct nameidata *nd)
+static int dummy_sb_check_sb (struct vfsmount *mnt, struct path *path)
 {
 	return 0;
 }
@@ -229,17 +229,17 @@ static void dummy_sb_post_remount (struct vfsmount *mnt, unsigned long flags,
 }
 
 
-static void dummy_sb_post_addmount (struct vfsmount *mnt, struct nameidata *nd)
+static void dummy_sb_post_addmount (struct vfsmount *mnt, struct path *path)
 {
 	return;
 }
 
-static int dummy_sb_pivotroot (struct nameidata *old_nd, struct nameidata *new_nd)
+static int dummy_sb_pivotroot (struct path *old_path, struct path *new_path)
 {
 	return 0;
 }
 
-static void dummy_sb_post_pivotroot (struct nameidata *old_nd, struct nameidata *new_nd)
+static void dummy_sb_post_pivotroot (struct path *old_path, struct path *new_path)
 {
 	return;
 }
diff --git a/security/security.c b/security/security.c
index 2e250c7028eb..8a285c7b9962 100644
--- a/security/security.c
+++ b/security/security.c
@@ -296,15 +296,15 @@ int security_sb_statfs(struct dentry *dentry)
 	return security_ops->sb_statfs(dentry);
 }
 
-int security_sb_mount(char *dev_name, struct nameidata *nd,
+int security_sb_mount(char *dev_name, struct path *path,
                        char *type, unsigned long flags, void *data)
 {
-	return security_ops->sb_mount(dev_name, nd, type, flags, data);
+	return security_ops->sb_mount(dev_name, path, type, flags, data);
 }
 
-int security_sb_check_sb(struct vfsmount *mnt, struct nameidata *nd)
+int security_sb_check_sb(struct vfsmount *mnt, struct path *path)
 {
-	return security_ops->sb_check_sb(mnt, nd);
+	return security_ops->sb_check_sb(mnt, path);
 }
 
 int security_sb_umount(struct vfsmount *mnt, int flags)
@@ -327,19 +327,19 @@ void security_sb_post_remount(struct vfsmount *mnt, unsigned long flags, void *d
 	security_ops->sb_post_remount(mnt, flags, data);
 }
 
-void security_sb_post_addmount(struct vfsmount *mnt, struct nameidata *mountpoint_nd)
+void security_sb_post_addmount(struct vfsmount *mnt, struct path *mountpoint)
 {
-	security_ops->sb_post_addmount(mnt, mountpoint_nd);
+	security_ops->sb_post_addmount(mnt, mountpoint);
 }
 
-int security_sb_pivotroot(struct nameidata *old_nd, struct nameidata *new_nd)
+int security_sb_pivotroot(struct path *old_path, struct path *new_path)
 {
-	return security_ops->sb_pivotroot(old_nd, new_nd);
+	return security_ops->sb_pivotroot(old_path, new_path);
 }
 
-void security_sb_post_pivotroot(struct nameidata *old_nd, struct nameidata *new_nd)
+void security_sb_post_pivotroot(struct path *old_path, struct path *new_path)
 {
-	security_ops->sb_post_pivotroot(old_nd, new_nd);
+	security_ops->sb_post_pivotroot(old_path, new_path);
 }
 
 int security_sb_get_mnt_opts(const struct super_block *sb,
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 1bf2543ea942..38fbb168dbed 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2392,22 +2392,22 @@ static int selinux_sb_statfs(struct dentry *dentry)
 }
 
 static int selinux_mount(char *dev_name,
-			 struct nameidata *nd,
+			 struct path *path,
 			 char *type,
 			 unsigned long flags,
 			 void *data)
 {
 	int rc;
 
-	rc = secondary_ops->sb_mount(dev_name, nd, type, flags, data);
+	rc = secondary_ops->sb_mount(dev_name, path, type, flags, data);
 	if (rc)
 		return rc;
 
 	if (flags & MS_REMOUNT)
-		return superblock_has_perm(current, nd->path.mnt->mnt_sb,
+		return superblock_has_perm(current, path->mnt->mnt_sb,
 					   FILESYSTEM__REMOUNT, NULL);
 	else
-		return dentry_has_perm(current, nd->path.mnt, nd->path.dentry,
+		return dentry_has_perm(current, path->mnt, path->dentry,
 				       FILE__MOUNTON);
 }
 
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 93f5b0ce662a..4215971434e6 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -315,10 +315,10 @@ static int smack_sb_statfs(struct dentry *dentry)
  * Returns 0 if current can write the floor of the filesystem
  * being mounted on, an error code otherwise.
  */
-static int smack_sb_mount(char *dev_name, struct nameidata *nd,
+static int smack_sb_mount(char *dev_name, struct path *path,
 			  char *type, unsigned long flags, void *data)
 {
-	struct superblock_smack *sbp = nd->path.mnt->mnt_sb->s_security;
+	struct superblock_smack *sbp = path->mnt->mnt_sb->s_security;
 
 	return smk_curacc(sbp->smk_floor, MAY_WRITE);
 }
-- 
cgit v1.2.3


From 8c3ee42e80ccead805806b3cb50b9855ceb957a2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 22 Mar 2008 18:00:39 -0400
Subject: [PATCH] get rid of more nameidata passing in namespace.c

Further reduction of stack footprint (sys_pivot_root());
lose useless BKL in there, while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 53 +++++++++++++++++++++++++----------------------------
 1 file changed, 25 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 87d2d82010bb..1bf302d0478b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1205,32 +1205,32 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	return 0;
 }
 
-static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
+static int graft_tree(struct vfsmount *mnt, struct path *path)
 {
 	int err;
 	if (mnt->mnt_sb->s_flags & MS_NOUSER)
 		return -EINVAL;
 
-	if (S_ISDIR(nd->path.dentry->d_inode->i_mode) !=
+	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
 	      S_ISDIR(mnt->mnt_root->d_inode->i_mode))
 		return -ENOTDIR;
 
 	err = -ENOENT;
-	mutex_lock(&nd->path.dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(nd->path.dentry->d_inode))
+	mutex_lock(&path->dentry->d_inode->i_mutex);
+	if (IS_DEADDIR(path->dentry->d_inode))
 		goto out_unlock;
 
-	err = security_sb_check_sb(mnt, &nd->path);
+	err = security_sb_check_sb(mnt, path);
 	if (err)
 		goto out_unlock;
 
 	err = -ENOENT;
-	if (IS_ROOT(nd->path.dentry) || !d_unhashed(nd->path.dentry))
-		err = attach_recursive_mnt(mnt, &nd->path, NULL);
+	if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
+		err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
-	mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
 	if (!err)
-		security_sb_post_addmount(mnt, &nd->path);
+		security_sb_post_addmount(mnt, path);
 	return err;
 }
 
@@ -1294,7 +1294,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name,
 	if (!mnt)
 		goto out;
 
-	err = graft_tree(mnt, nd);
+	err = graft_tree(mnt, &nd->path);
 	if (err) {
 		LIST_HEAD(umount_list);
 		spin_lock(&vfsmount_lock);
@@ -1501,7 +1501,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 		goto unlock;
 
 	newmnt->mnt_flags = mnt_flags;
-	if ((err = graft_tree(newmnt, nd)))
+	if ((err = graft_tree(newmnt, &nd->path)))
 		goto unlock;
 
 	if (fslist) /* add to the specified expiration list */
@@ -1987,15 +1987,13 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 			       const char __user * put_old)
 {
 	struct vfsmount *tmp;
-	struct nameidata new_nd, old_nd, user_nd;
-	struct path parent_path, root_parent;
+	struct nameidata new_nd, old_nd;
+	struct path parent_path, root_parent, root;
 	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	lock_kernel();
-
 	error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
 			    &new_nd);
 	if (error)
@@ -2015,7 +2013,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	}
 
 	read_lock(&current->fs->lock);
-	user_nd.path = current->fs->root;
+	root = current->fs->root;
 	path_get(&current->fs->root);
 	read_unlock(&current->fs->lock);
 	down_write(&namespace_sem);
@@ -2023,9 +2021,9 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	error = -EINVAL;
 	if (IS_MNT_SHARED(old_nd.path.mnt) ||
 		IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) ||
-		IS_MNT_SHARED(user_nd.path.mnt->mnt_parent))
+		IS_MNT_SHARED(root.mnt->mnt_parent))
 		goto out2;
-	if (!check_mnt(user_nd.path.mnt))
+	if (!check_mnt(root.mnt))
 		goto out2;
 	error = -ENOENT;
 	if (IS_DEADDIR(new_nd.path.dentry->d_inode))
@@ -2035,13 +2033,13 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry))
 		goto out2;
 	error = -EBUSY;
-	if (new_nd.path.mnt == user_nd.path.mnt ||
-	    old_nd.path.mnt == user_nd.path.mnt)
+	if (new_nd.path.mnt == root.mnt ||
+	    old_nd.path.mnt == root.mnt)
 		goto out2; /* loop, on the same file system  */
 	error = -EINVAL;
-	if (user_nd.path.mnt->mnt_root != user_nd.path.dentry)
+	if (root.mnt->mnt_root != root.dentry)
 		goto out2; /* not a mountpoint */
-	if (user_nd.path.mnt->mnt_parent == user_nd.path.mnt)
+	if (root.mnt->mnt_parent == root.mnt)
 		goto out2; /* not attached */
 	if (new_nd.path.mnt->mnt_root != new_nd.path.dentry)
 		goto out2; /* not a mountpoint */
@@ -2063,27 +2061,26 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	} else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry))
 		goto out3;
 	detach_mnt(new_nd.path.mnt, &parent_path);
-	detach_mnt(user_nd.path.mnt, &root_parent);
+	detach_mnt(root.mnt, &root_parent);
 	/* mount old root on put_old */
-	attach_mnt(user_nd.path.mnt, &old_nd.path);
+	attach_mnt(root.mnt, &old_nd.path);
 	/* mount new_root on / */
 	attach_mnt(new_nd.path.mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	spin_unlock(&vfsmount_lock);
-	chroot_fs_refs(&user_nd.path, &new_nd.path);
-	security_sb_post_pivotroot(&user_nd.path, &new_nd.path);
+	chroot_fs_refs(&root, &new_nd.path);
+	security_sb_post_pivotroot(&root, &new_nd.path);
 	error = 0;
 	path_put(&root_parent);
 	path_put(&parent_path);
 out2:
 	mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex);
 	up_write(&namespace_sem);
-	path_put(&user_nd.path);
+	path_put(&root);
 	path_put(&old_nd.path);
 out1:
 	path_put(&new_nd.path);
 out0:
-	unlock_kernel();
 	return error;
 out3:
 	spin_unlock(&vfsmount_lock);
-- 
cgit v1.2.3


From 4e1b36fb485dd81b0818ef1bc8fb5c0f2923a283 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Mar 2008 00:16:03 -0400
Subject: [PATCH] umount_tree() will unhash everything itself

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/pnode.c b/fs/pnode.c
index a9e0d6fadbcd..f968e35d9785 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -212,8 +212,7 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
 out:
 	spin_lock(&vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
-		child = list_entry(tmp_list.next, struct vfsmount, mnt_hash);
-		list_del_init(&child->mnt_hash);
+		child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
 		umount_tree(child, 0, &umount_list);
 	}
 	spin_unlock(&vfsmount_lock);
-- 
cgit v1.2.3


From 521b5d0c40386f4a9805cdec7bd979fc96a86aeb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 28 Mar 2008 00:46:41 -0400
Subject: [PATCH] teach seq_file to discard entries

Allow ->show() return SEQ_SKIP; that will discard all
output from that element and move on.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pnode.h               |  1 +
 fs/seq_file.c            | 16 ++++++++++++----
 include/linux/seq_file.h |  2 ++
 3 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/pnode.h b/fs/pnode.h
index f249be2fee7a..973c3f825e7d 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -35,4 +35,5 @@ int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
 		struct list_head *);
 int propagate_umount(struct list_head *);
 int propagate_mount_busy(struct vfsmount *, int);
+void mnt_release_group_id(struct vfsmount *);
 #endif /* _LINUX_PNODE_H */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 853770274f20..bf2bcfd4bcfb 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -25,6 +25,7 @@
  *	into the buffer.  In case of error ->start() and ->next() return
  *	ERR_PTR(error).  In the end of sequence they return %NULL. ->show()
  *	returns 0 in case of success and negative number in case of error.
+ *	Returning SEQ_SKIP means "discard this element and move on".
  */
 int seq_open(struct file *file, const struct seq_operations *op)
 {
@@ -114,8 +115,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 		if (!p || IS_ERR(p))
 			break;
 		err = m->op->show(m, p);
-		if (err)
+		if (err < 0)
 			break;
+		if (unlikely(err))
+			m->count = 0;
 		if (m->count < m->size)
 			goto Fill;
 		m->op->stop(m, p);
@@ -140,9 +143,10 @@ Fill:
 			break;
 		}
 		err = m->op->show(m, p);
-		if (err || m->count == m->size) {
+		if (m->count == m->size || err) {
 			m->count = offs;
-			break;
+			if (likely(err <= 0))
+				break;
 		}
 		pos = next;
 	}
@@ -199,8 +203,12 @@ static int traverse(struct seq_file *m, loff_t offset)
 		if (IS_ERR(p))
 			break;
 		error = m->op->show(m, p);
-		if (error)
+		if (error < 0)
 			break;
+		if (unlikely(error)) {
+			error = 0;
+			m->count = 0;
+		}
 		if (m->count == m->size)
 			goto Eoverflow;
 		if (pos + m->count > offset) {
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 1da1e6208a0a..d65796dc26d9 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -30,6 +30,8 @@ struct seq_operations {
 	int (*show) (struct seq_file *m, void *v);
 };
 
+#define SEQ_SKIP 1
+
 int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
 loff_t seq_lseek(struct file *, loff_t, int);
-- 
cgit v1.2.3


From 16abef0e9e79643827fd5a2a14a07bced851ae72 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 22 Apr 2008 15:09:22 +0200
Subject: fs: use loff_t type instead of long long

Use offset type consistently.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/read_write.c | 6 +++---
 fs/seq_file.c   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 49a98718ecdf..f0d1240a5c69 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(generic_ro_fops);
 
 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 {
-	long long retval;
+	loff_t retval;
 	struct inode *inode = file->f_mapping->host;
 
 	mutex_lock(&inode->i_mutex);
@@ -60,7 +60,7 @@ EXPORT_SYMBOL(generic_file_llseek);
 
 loff_t remote_llseek(struct file *file, loff_t offset, int origin)
 {
-	long long retval;
+	loff_t retval;
 
 	lock_kernel();
 	switch (origin) {
@@ -91,7 +91,7 @@ EXPORT_SYMBOL(no_llseek);
 
 loff_t default_llseek(struct file *file, loff_t offset, int origin)
 {
-	long long retval;
+	loff_t retval;
 
 	lock_kernel();
 	switch (origin) {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 853770274f20..9943408b315e 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -239,7 +239,7 @@ Eoverflow:
 loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 {
 	struct seq_file *m = (struct seq_file *)file->private_data;
-	long long retval = -EINVAL;
+	loff_t retval = -EINVAL;
 
 	mutex_lock(&m->lock);
 	m->version = file->f_version;
-- 
cgit v1.2.3


From ed1524371716466e9c762808b02601d0d0276a92 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Apr 2008 19:51:27 -0400
Subject: [PATCH] double-free of inode on alloc_file() failure exit in
 create_write_pipe()

Duh...  Fortunately, the bug is quite recent (post-2.6.25) and, embarrassingly,
mine ;-/

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pipe.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 8be381bbcb54..f73492b6817e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -988,7 +988,10 @@ struct file *create_write_pipe(void)
 	return f;
 
  err_dentry:
+	free_pipe_info(inode);
 	dput(dentry);
+	return ERR_PTR(err);
+
  err_inode:
 	free_pipe_info(inode);
 	iput(inode);
-- 
cgit v1.2.3


From 9b4f526cdc0f95f635607dfba6ac788b3deca188 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Apr 2008 01:32:44 -0400
Subject: [PATCH] proc_readfd_common() race fix

Since we drop the rcu_read_lock inside the loop, we can't assume
that files->fdt will remain unchanged (and not freed) between
iterations.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 81d7d145292a..7313c62e3e9d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1626,7 +1626,6 @@ static int proc_readfd_common(struct file * filp, void * dirent,
 	unsigned int fd, ino;
 	int retval;
 	struct files_struct * files;
-	struct fdtable *fdt;
 
 	retval = -ENOENT;
 	if (!p)
@@ -1649,9 +1648,8 @@ static int proc_readfd_common(struct file * filp, void * dirent,
 			if (!files)
 				goto out;
 			rcu_read_lock();
-			fdt = files_fdtable(files);
 			for (fd = filp->f_pos-2;
-			     fd < fdt->max_fds;
+			     fd < files_fdtable(files)->max_fds;
 			     fd++, filp->f_pos++) {
 				char name[PROC_NUMBUF];
 				int len;
-- 
cgit v1.2.3


From 934b25c597c0e98304a7eaec198a87e4633a42bb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Apr 2008 00:04:04 -0400
Subject: [PATCH] remove unused label in xattr.c (noise from ro-bind)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/xattr.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xattr.c b/fs/xattr.c
index f7062da505d4..89a942f07e1b 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -307,7 +307,6 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
 		error = setxattr(dentry, name, value, size, flags);
 		mnt_drop_write(f->f_path.mnt);
 	}
-out_fput:
 	fput(f);
 	return error;
 }
-- 
cgit v1.2.3


From 6092d048183b76bfa3f84b32f8158dd8d10bd811 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Thu, 27 Mar 2008 13:06:20 +0100
Subject: [patch 1/7] vfs: mountinfo: add dentry_path()

[mszeredi@suse.cz] split big patch into managable chunks

Add the following functions:

  dentry_path()
  seq_dentry()

These are similar to d_path() and seq_path().  But instead of
calculating the path within a mount namespace, they calculate the path
from the root of the filesystem to a given dentry, ignoring mounts
completely.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c              | 90 +++++++++++++++++++++++++++++++++++-------------
 fs/seq_file.c            | 65 +++++++++++++++++++++++++---------
 include/linux/dcache.h   |  1 +
 include/linux/seq_file.h |  2 ++
 4 files changed, 118 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 43455776711e..635c2aa427ed 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1746,6 +1746,17 @@ shouldnt_be_hashed:
 	goto shouldnt_be_hashed;
 }
 
+static int prepend(char **buffer, int *buflen, const char *str,
+			  int namelen)
+{
+	*buflen -= namelen;
+	if (*buflen < 0)
+		return -ENAMETOOLONG;
+	*buffer -= namelen;
+	memcpy(*buffer, str, namelen);
+	return 0;
+}
+
 /**
  * d_path - return the path of a dentry
  * @dentry: dentry to report
@@ -1767,17 +1778,11 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
 {
 	char * end = buffer+buflen;
 	char * retval;
-	int namelen;
-
-	*--end = '\0';
-	buflen--;
-	if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
-		buflen -= 10;
-		end -= 10;
-		if (buflen < 0)
+
+	prepend(&end, &buflen, "\0", 1);
+	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+		(prepend(&end, &buflen, " (deleted)", 10) != 0))
 			goto Elong;
-		memcpy(end, " (deleted)", 10);
-	}
 
 	if (buflen < 1)
 		goto Elong;
@@ -1804,13 +1809,10 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
 		}
 		parent = dentry->d_parent;
 		prefetch(parent);
-		namelen = dentry->d_name.len;
-		buflen -= namelen + 1;
-		if (buflen < 0)
+		if ((prepend(&end, &buflen, dentry->d_name.name,
+				dentry->d_name.len) != 0) ||
+		    (prepend(&end, &buflen, "/", 1) != 0))
 			goto Elong;
-		end -= namelen;
-		memcpy(end, dentry->d_name.name, namelen);
-		*--end = '/';
 		retval = end;
 		dentry = parent;
 	}
@@ -1818,12 +1820,10 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
 	return retval;
 
 global_root:
-	namelen = dentry->d_name.len;
-	buflen -= namelen;
-	if (buflen < 0)
+	retval += 1;	/* hit the slash */
+	if (prepend(&retval, &buflen, dentry->d_name.name,
+		    dentry->d_name.len) != 0)
 		goto Elong;
-	retval -= namelen-1;	/* hit the slash */
-	memcpy(retval, dentry->d_name.name, namelen);
 	return retval;
 Elong:
 	return ERR_PTR(-ENAMETOOLONG);
@@ -1859,7 +1859,7 @@ char *d_path(struct path *path, char *buf, int buflen)
 
 	read_lock(&current->fs->lock);
 	root = current->fs->root;
-	path_get(&current->fs->root);
+	path_get(&root);
 	read_unlock(&current->fs->lock);
 	spin_lock(&dcache_lock);
 	res = __d_path(path->dentry, path->mnt, &root, buf, buflen);
@@ -1889,6 +1889,48 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
 	return memcpy(buffer, temp, sz);
 }
 
+/*
+ * Write full pathname from the root of the filesystem into the buffer.
+ */
+char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+	char *end = buf + buflen;
+	char *retval;
+
+	spin_lock(&dcache_lock);
+	prepend(&end, &buflen, "\0", 1);
+	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+		(prepend(&end, &buflen, "//deleted", 9) != 0))
+			goto Elong;
+	if (buflen < 1)
+		goto Elong;
+	/* Get '/' right */
+	retval = end-1;
+	*retval = '/';
+
+	for (;;) {
+		struct dentry *parent;
+		if (IS_ROOT(dentry))
+			break;
+
+		parent = dentry->d_parent;
+		prefetch(parent);
+
+		if ((prepend(&end, &buflen, dentry->d_name.name,
+				dentry->d_name.len) != 0) ||
+		    (prepend(&end, &buflen, "/", 1) != 0))
+			goto Elong;
+
+		retval = end;
+		dentry = parent;
+	}
+	spin_unlock(&dcache_lock);
+	return retval;
+Elong:
+	spin_unlock(&dcache_lock);
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
 /*
  * NOTE! The user-level library version returns a
  * character pointer. The kernel system call just
@@ -1918,9 +1960,9 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
 
 	read_lock(&current->fs->lock);
 	pwd = current->fs->pwd;
-	path_get(&current->fs->pwd);
+	path_get(&pwd);
 	root = current->fs->root;
-	path_get(&current->fs->root);
+	path_get(&root);
 	read_unlock(&current->fs->lock);
 
 	error = -ENOENT;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index cdfd996ca6ef..ae59f5a6c5c1 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -350,28 +350,40 @@ int seq_printf(struct seq_file *m, const char *f, ...)
 }
 EXPORT_SYMBOL(seq_printf);
 
+static char *mangle_path(char *s, char *p, char *esc)
+{
+	while (s <= p) {
+		char c = *p++;
+		if (!c) {
+			return s;
+		} else if (!strchr(esc, c)) {
+			*s++ = c;
+		} else if (s + 4 > p) {
+			break;
+		} else {
+			*s++ = '\\';
+			*s++ = '0' + ((c & 0300) >> 6);
+			*s++ = '0' + ((c & 070) >> 3);
+			*s++ = '0' + (c & 07);
+		}
+	}
+	return NULL;
+}
+
+/*
+ * return the absolute path of 'dentry' residing in mount 'mnt'.
+ */
 int seq_path(struct seq_file *m, struct path *path, char *esc)
 {
 	if (m->count < m->size) {
 		char *s = m->buf + m->count;
 		char *p = d_path(path, s, m->size - m->count);
 		if (!IS_ERR(p)) {
-			while (s <= p) {
-				char c = *p++;
-				if (!c) {
-					p = m->buf + m->count;
-					m->count = s - m->buf;
-					return s - p;
-				} else if (!strchr(esc, c)) {
-					*s++ = c;
-				} else if (s + 4 > p) {
-					break;
-				} else {
-					*s++ = '\\';
-					*s++ = '0' + ((c & 0300) >> 6);
-					*s++ = '0' + ((c & 070) >> 3);
-					*s++ = '0' + (c & 07);
-				}
+			s = mangle_path(s, p, esc);
+			if (s) {
+				p = m->buf + m->count;
+				m->count = s - m->buf;
+				return s - p;
 			}
 		}
 	}
@@ -380,6 +392,27 @@ int seq_path(struct seq_file *m, struct path *path, char *esc)
 }
 EXPORT_SYMBOL(seq_path);
 
+/*
+ * returns the path of the 'dentry' from the root of its filesystem.
+ */
+int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
+{
+	if (m->count < m->size) {
+		char *s = m->buf + m->count;
+		char *p = dentry_path(dentry, s, m->size - m->count);
+		if (!IS_ERR(p)) {
+			s = mangle_path(s, p, esc);
+			if (s) {
+				p = m->buf + m->count;
+				m->count = s - m->buf;
+				return s - p;
+			}
+		}
+	}
+	m->count = m->size;
+	return -1;
+}
+
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
 	return NULL + (*pos == 0);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index fabd16d03a27..63960033b6f5 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -302,6 +302,7 @@ extern int d_validate(struct dentry *, struct dentry *);
 extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 
 extern char *d_path(struct path *, char *, int);
+extern char *dentry_path(struct dentry *, char *, int);
 
 /* Allocation counts.. */
 
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index d65796dc26d9..11676ccef7b3 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -10,6 +10,7 @@ struct seq_operations;
 struct file;
 struct path;
 struct inode;
+struct dentry;
 
 struct seq_file {
 	char *buf;
@@ -44,6 +45,7 @@ int seq_printf(struct seq_file *, const char *, ...)
 	__attribute__ ((format (printf,2,3)));
 
 int seq_path(struct seq_file *, struct path *, char *);
+int seq_dentry(struct seq_file *, struct dentry *, char *);
 
 int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
 int single_release(struct inode *, struct file *);
-- 
cgit v1.2.3


From 9d1bc60138977d9c79471b344a64f2df13b2ccef Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 27 Mar 2008 13:06:21 +0100
Subject: [patch 2/7] vfs: mountinfo: add seq_file_root()

Add a new function:

  seq_file_root()

This is similar to seq_path(), but calculates the path relative to the
given root, instead of current->fs->root.  If the path was unreachable
from root, then modify the root parameter to reflect this.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c              | 24 ++++++++++++++++--------
 fs/seq_file.c            | 30 ++++++++++++++++++++++++++++++
 include/linux/dcache.h   |  1 +
 include/linux/seq_file.h |  2 ++
 4 files changed, 49 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 635c2aa427ed..3ee588d5f585 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1759,10 +1759,8 @@ static int prepend(char **buffer, int *buflen, const char *str,
 
 /**
  * d_path - return the path of a dentry
- * @dentry: dentry to report
- * @vfsmnt: vfsmnt to which the dentry belongs
- * @root: root dentry
- * @rootmnt: vfsmnt to which the root dentry belongs
+ * @path: the dentry/vfsmount to report
+ * @root: root vfsmnt/dentry (may be modified by this function)
  * @buffer: buffer to return value in
  * @buflen: buffer length
  *
@@ -1772,10 +1770,15 @@ static int prepend(char **buffer, int *buflen, const char *str,
  * Returns the buffer or an error code if the path was too long.
  *
  * "buflen" should be positive. Caller holds the dcache_lock.
+ *
+ * If path is not reachable from the supplied root, then the value of
+ * root is changed (without modifying refcounts).
  */
-static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
-		       struct path *root, char *buffer, int buflen)
+char *__d_path(const struct path *path, struct path *root,
+	       char *buffer, int buflen)
 {
+	struct dentry *dentry = path->dentry;
+	struct vfsmount *vfsmnt = path->mnt;
 	char * end = buffer+buflen;
 	char * retval;
 
@@ -1824,6 +1827,8 @@ global_root:
 	if (prepend(&retval, &buflen, dentry->d_name.name,
 		    dentry->d_name.len) != 0)
 		goto Elong;
+	root->mnt = vfsmnt;
+	root->dentry = dentry;
 	return retval;
 Elong:
 	return ERR_PTR(-ENAMETOOLONG);
@@ -1846,6 +1851,7 @@ char *d_path(struct path *path, char *buf, int buflen)
 {
 	char *res;
 	struct path root;
+	struct path tmp;
 
 	/*
 	 * We have various synthetic filesystems that never get mounted.  On
@@ -1862,7 +1868,8 @@ char *d_path(struct path *path, char *buf, int buflen)
 	path_get(&root);
 	read_unlock(&current->fs->lock);
 	spin_lock(&dcache_lock);
-	res = __d_path(path->dentry, path->mnt, &root, buf, buflen);
+	tmp = root;
+	res = __d_path(path, &tmp, buf, buflen);
 	spin_unlock(&dcache_lock);
 	path_put(&root);
 	return res;
@@ -1970,9 +1977,10 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
 	spin_lock(&dcache_lock);
 	if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) {
 		unsigned long len;
+		struct path tmp = root;
 		char * cwd;
 
-		cwd = __d_path(pwd.dentry, pwd.mnt, &root, page, PAGE_SIZE);
+		cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE);
 		spin_unlock(&dcache_lock);
 
 		error = PTR_ERR(cwd);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ae59f5a6c5c1..3f54dbd6c49b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -392,6 +392,36 @@ int seq_path(struct seq_file *m, struct path *path, char *esc)
 }
 EXPORT_SYMBOL(seq_path);
 
+/*
+ * Same as seq_path, but relative to supplied root.
+ *
+ * root may be changed, see __d_path().
+ */
+int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
+		  char *esc)
+{
+	int err = -ENAMETOOLONG;
+	if (m->count < m->size) {
+		char *s = m->buf + m->count;
+		char *p;
+
+		spin_lock(&dcache_lock);
+		p = __d_path(path, root, s, m->size - m->count);
+		spin_unlock(&dcache_lock);
+		err = PTR_ERR(p);
+		if (!IS_ERR(p)) {
+			s = mangle_path(s, p, esc);
+			if (s) {
+				p = m->buf + m->count;
+				m->count = s - m->buf;
+				return 0;
+			}
+		}
+	}
+	m->count = m->size;
+	return err;
+}
+
 /*
  * returns the path of the 'dentry' from the root of its filesystem.
  */
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 63960033b6f5..cfb1627ac51c 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -301,6 +301,7 @@ extern int d_validate(struct dentry *, struct dentry *);
  */
 extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 
+extern char *__d_path(const struct path *path, struct path *root, char *, int);
 extern char *d_path(struct path *, char *, int);
 extern char *dentry_path(struct dentry *, char *, int);
 
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 11676ccef7b3..5b5369c3c209 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -46,6 +46,8 @@ int seq_printf(struct seq_file *, const char *, ...)
 
 int seq_path(struct seq_file *, struct path *, char *);
 int seq_dentry(struct seq_file *, struct dentry *, char *);
+int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
+		  char *esc);
 
 int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
 int single_release(struct inode *, struct file *);
-- 
cgit v1.2.3


From 73cd49ecdde92fdce131938bdaff4993010d181b Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 26 Mar 2008 22:11:34 +0100
Subject: [patch 3/7] vfs: mountinfo: add mount ID

Add a unique ID to each vfsmount using the IDR infrastructure.  The
identifiers are reused after the vfsmount is freed.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 34 ++++++++++++++++++++++++++++++++++
 include/linux/mount.h |  1 +
 2 files changed, 35 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1bf302d0478b..8ca6317cb401 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/ramfs.h>
 #include <linux/log2.h>
+#include <linux/idr.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
@@ -39,6 +40,7 @@
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 
 static int event;
+static DEFINE_IDA(mnt_id_ida);
 
 static struct list_head *mount_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
@@ -58,10 +60,41 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 
 #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
 
+/* allocation is serialized by namespace_sem */
+static int mnt_alloc_id(struct vfsmount *mnt)
+{
+	int res;
+
+retry:
+	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
+	spin_lock(&vfsmount_lock);
+	res = ida_get_new(&mnt_id_ida, &mnt->mnt_id);
+	spin_unlock(&vfsmount_lock);
+	if (res == -EAGAIN)
+		goto retry;
+
+	return res;
+}
+
+static void mnt_free_id(struct vfsmount *mnt)
+{
+	spin_lock(&vfsmount_lock);
+	ida_remove(&mnt_id_ida, mnt->mnt_id);
+	spin_unlock(&vfsmount_lock);
+}
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
+		int err;
+
+		err = mnt_alloc_id(mnt);
+		if (err) {
+			kmem_cache_free(mnt_cache, mnt);
+			return NULL;
+		}
+
 		atomic_set(&mnt->mnt_count, 1);
 		INIT_LIST_HEAD(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
@@ -353,6 +386,7 @@ EXPORT_SYMBOL(simple_set_mnt);
 void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
+	mnt_free_id(mnt);
 	kmem_cache_free(mnt_cache, mnt);
 }
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 87b24cea1863..f0dfb17ffccc 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -56,6 +56,7 @@ struct vfsmount {
 	struct list_head mnt_slave;	/* slave list entry */
 	struct vfsmount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
+	int mnt_id;			/* mount identifier */
 	/*
 	 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
 	 * to let these frequently modified fields in a separate cache line
-- 
cgit v1.2.3


From 719f5d7f0b90ac2c8f8ca4232eb322b266fea01e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 27 Mar 2008 13:06:23 +0100
Subject: [patch 4/7] vfs: mountinfo: add mount peer group ID

Add a unique ID to each peer group using the IDR infrastructure.  The
identifiers are reused after the peer group dissolves.

The IDR structures are protected by holding namepspace_sem for write
while allocating or deallocating IDs.

IDs are allocated when a previously unshared vfsmount becomes the
first member of a peer group.  When a new member is added to an
existing group, the ID is copied from one of the old members.

IDs are freed when the last member of a peer group is unshared.

Setting the MNT_SHARED flag on members of a subtree is done as a
separate step, after all the IDs have been allocated.  This way an
allocation failure can be cleaned up easilty, without affecting the
propagation state.

Based on design sketch by Al Viro.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 93 +++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/pnode.c            |  5 ++-
 include/linux/mount.h |  1 +
 3 files changed, 95 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 8ca6317cb401..cefa1d9939b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,6 +41,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 
 static int event;
 static DEFINE_IDA(mnt_id_ida);
+static DEFINE_IDA(mnt_group_ida);
 
 static struct list_head *mount_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
@@ -83,6 +84,28 @@ static void mnt_free_id(struct vfsmount *mnt)
 	spin_unlock(&vfsmount_lock);
 }
 
+/*
+ * Allocate a new peer group ID
+ *
+ * mnt_group_ida is protected by namespace_sem
+ */
+static int mnt_alloc_group_id(struct vfsmount *mnt)
+{
+	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
+		return -ENOMEM;
+
+	return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id);
+}
+
+/*
+ * Release a peer group ID
+ */
+void mnt_release_group_id(struct vfsmount *mnt)
+{
+	ida_remove(&mnt_group_ida, mnt->mnt_group_id);
+	mnt->mnt_group_id = 0;
+}
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -533,6 +556,17 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
 
 	if (mnt) {
+		if (flag & (CL_SLAVE | CL_PRIVATE))
+			mnt->mnt_group_id = 0; /* not a peer of original */
+		else
+			mnt->mnt_group_id = old->mnt_group_id;
+
+		if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
+			int err = mnt_alloc_group_id(mnt);
+			if (err)
+				goto out_free;
+		}
+
 		mnt->mnt_flags = old->mnt_flags;
 		atomic_inc(&sb->s_active);
 		mnt->mnt_sb = sb;
@@ -562,6 +596,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 		}
 	}
 	return mnt;
+
+ out_free:
+	free_vfsmnt(mnt);
+	return NULL;
 }
 
 static inline void __mntput(struct vfsmount *mnt)
@@ -1142,6 +1180,33 @@ void drop_collected_mounts(struct vfsmount *mnt)
 	release_mounts(&umount_list);
 }
 
+static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
+{
+	struct vfsmount *p;
+
+	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
+		if (p->mnt_group_id && !IS_MNT_SHARED(p))
+			mnt_release_group_id(p);
+	}
+}
+
+static int invent_group_ids(struct vfsmount *mnt, bool recurse)
+{
+	struct vfsmount *p;
+
+	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
+		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
+			int err = mnt_alloc_group_id(p);
+			if (err) {
+				cleanup_group_ids(mnt, p);
+				return err;
+			}
+		}
+	}
+
+	return 0;
+}
+
 /*
  *  @source_mnt : mount tree to be attached
  *  @nd         : place the mount tree @source_mnt is attached
@@ -1212,9 +1277,16 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	struct vfsmount *dest_mnt = path->mnt;
 	struct dentry *dest_dentry = path->dentry;
 	struct vfsmount *child, *p;
+	int err;
 
-	if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
-		return -EINVAL;
+	if (IS_MNT_SHARED(dest_mnt)) {
+		err = invent_group_ids(source_mnt, true);
+		if (err)
+			goto out;
+	}
+	err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
+	if (err)
+		goto out_cleanup_ids;
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1237,6 +1309,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	}
 	spin_unlock(&vfsmount_lock);
 	return 0;
+
+ out_cleanup_ids:
+	if (IS_MNT_SHARED(dest_mnt))
+		cleanup_group_ids(source_mnt, NULL);
+ out:
+	return err;
 }
 
 static int graft_tree(struct vfsmount *mnt, struct path *path)
@@ -1277,6 +1355,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
 	struct vfsmount *m, *mnt = nd->path.mnt;
 	int recurse = flag & MS_REC;
 	int type = flag & ~MS_REC;
+	int err = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -1285,12 +1364,20 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
 		return -EINVAL;
 
 	down_write(&namespace_sem);
+	if (type == MS_SHARED) {
+		err = invent_group_ids(mnt, recurse);
+		if (err)
+			goto out_unlock;
+	}
+
 	spin_lock(&vfsmount_lock);
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
 	spin_unlock(&vfsmount_lock);
+
+ out_unlock:
 	up_write(&namespace_sem);
-	return 0;
+	return err;
 }
 
 /*
diff --git a/fs/pnode.c b/fs/pnode.c
index f968e35d9785..d18d66491a01 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -46,7 +46,11 @@ static int do_make_slave(struct vfsmount *mnt)
 		if (peer_mnt == mnt)
 			peer_mnt = NULL;
 	}
+	if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share))
+		mnt_release_group_id(mnt);
+
 	list_del_init(&mnt->mnt_share);
+	mnt->mnt_group_id = 0;
 
 	if (peer_mnt)
 		master = peer_mnt;
@@ -68,7 +72,6 @@ static int do_make_slave(struct vfsmount *mnt)
 	}
 	mnt->mnt_master = master;
 	CLEAR_MNT_SHARED(mnt);
-	INIT_LIST_HEAD(&mnt->mnt_slave_list);
 	return 0;
 }
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index f0dfb17ffccc..b4836d58f428 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -57,6 +57,7 @@ struct vfsmount {
 	struct vfsmount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	int mnt_id;			/* mount identifier */
+	int mnt_group_id;		/* peer group identifier */
 	/*
 	 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
 	 * to let these frequently modified fields in a separate cache line
-- 
cgit v1.2.3


From a1a2c409b666befc58c2db9c7fbddf200f153470 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 27 Mar 2008 13:06:24 +0100
Subject: [patch 5/7] vfs: mountinfo: allow using process root

Allow /proc/<pid>/mountinfo to use the root of <pid> to calculate
mountpoints.

 - move definition of 'struct proc_mounts' to <linux/mnt_namespace.h>
 - add the process's namespace and root to this structure
 - pass a pointer to 'struct proc_mounts' into seq_operations

In addition the following cleanups are made:

 - use a common open function for /proc/<pid>/{mounts,mountstat}
 - surround namespace.c part of these proc files with #ifdef CONFIG_PROC_FS
 - make the seq_operations structures const

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c                |  14 +++---
 fs/proc/base.c                | 108 ++++++++++++++++++++----------------------
 include/linux/mnt_namespace.h |  11 +++++
 3 files changed, 70 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index cefa1d9939b0..dfdf51e81c1c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -724,20 +724,21 @@ void save_mount_options(struct super_block *sb, char *options)
 }
 EXPORT_SYMBOL(save_mount_options);
 
+#ifdef CONFIG_PROC_FS
 /* iterator */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
-	struct mnt_namespace *n = m->private;
+	struct proc_mounts *p = m->private;
 
 	down_read(&namespace_sem);
-	return seq_list_start(&n->list, *pos);
+	return seq_list_start(&p->ns->list, *pos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct mnt_namespace *n = m->private;
+	struct proc_mounts *p = m->private;
 
-	return seq_list_next(v, &n->list, pos);
+	return seq_list_next(v, &p->ns->list, pos);
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -794,7 +795,7 @@ static int show_vfsmnt(struct seq_file *m, void *v)
 	return err;
 }
 
-struct seq_operations mounts_op = {
+const struct seq_operations mounts_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
@@ -833,12 +834,13 @@ static int show_vfsstat(struct seq_file *m, void *v)
 	return err;
 }
 
-struct seq_operations mountstats_op = {
+const struct seq_operations mountstats_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
 	.show	= show_vfsstat,
 };
+#endif  /* CONFIG_PROC_FS */
 
 /**
  * may_umount_tree - check if a mount tree is busy
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7313c62e3e9d..a04b3db7a296 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -502,17 +502,14 @@ static const struct inode_operations proc_def_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-extern const struct seq_operations mounts_op;
-struct proc_mounts {
-	struct seq_file m;
-	int event;
-};
-
-static int mounts_open(struct inode *inode, struct file *file)
+static int mounts_open_common(struct inode *inode, struct file *file,
+			      const struct seq_operations *op)
 {
 	struct task_struct *task = get_proc_task(inode);
 	struct nsproxy *nsp;
 	struct mnt_namespace *ns = NULL;
+	struct fs_struct *fs = NULL;
+	struct path root;
 	struct proc_mounts *p;
 	int ret = -EINVAL;
 
@@ -525,40 +522,61 @@ static int mounts_open(struct inode *inode, struct file *file)
 				get_mnt_ns(ns);
 		}
 		rcu_read_unlock();
-
+		if (ns)
+			fs = get_fs_struct(task);
 		put_task_struct(task);
 	}
 
-	if (ns) {
-		ret = -ENOMEM;
-		p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
-		if (p) {
-			file->private_data = &p->m;
-			ret = seq_open(file, &mounts_op);
-			if (!ret) {
-				p->m.private = ns;
-				p->event = ns->event;
-				return 0;
-			}
-			kfree(p);
-		}
-		put_mnt_ns(ns);
-	}
+	if (!ns)
+		goto err;
+	if (!fs)
+		goto err_put_ns;
+
+	read_lock(&fs->lock);
+	root = fs->root;
+	path_get(&root);
+	read_unlock(&fs->lock);
+	put_fs_struct(fs);
+
+	ret = -ENOMEM;
+	p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
+	if (!p)
+		goto err_put_path;
+
+	file->private_data = &p->m;
+	ret = seq_open(file, op);
+	if (ret)
+		goto err_free;
+
+	p->m.private = p;
+	p->ns = ns;
+	p->root = root;
+	p->event = ns->event;
+
+	return 0;
+
+ err_free:
+	kfree(p);
+ err_put_path:
+	path_put(&root);
+ err_put_ns:
+	put_mnt_ns(ns);
+ err:
 	return ret;
 }
 
 static int mounts_release(struct inode *inode, struct file *file)
 {
-	struct seq_file *m = file->private_data;
-	struct mnt_namespace *ns = m->private;
-	put_mnt_ns(ns);
+	struct proc_mounts *p = file->private_data;
+	path_put(&p->root);
+	put_mnt_ns(p->ns);
 	return seq_release(inode, file);
 }
 
 static unsigned mounts_poll(struct file *file, poll_table *wait)
 {
 	struct proc_mounts *p = file->private_data;
-	struct mnt_namespace *ns = p->m.private;
+	struct mnt_namespace *ns = p->ns;
 	unsigned res = 0;
 
 	poll_wait(file, &ns->poll, wait);
@@ -573,6 +591,11 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
 	return res;
 }
 
+static int mounts_open(struct inode *inode, struct file *file)
+{
+	return mounts_open_common(inode, file, &mounts_op);
+}
+
 static const struct file_operations proc_mounts_operations = {
 	.open		= mounts_open,
 	.read		= seq_read,
@@ -581,38 +604,9 @@ static const struct file_operations proc_mounts_operations = {
 	.poll		= mounts_poll,
 };
 
-extern const struct seq_operations mountstats_op;
 static int mountstats_open(struct inode *inode, struct file *file)
 {
-	int ret = seq_open(file, &mountstats_op);
-
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		struct nsproxy *nsp;
-		struct mnt_namespace *mnt_ns = NULL;
-		struct task_struct *task = get_proc_task(inode);
-
-		if (task) {
-			rcu_read_lock();
-			nsp = task_nsproxy(task);
-			if (nsp) {
-				mnt_ns = nsp->mnt_ns;
-				if (mnt_ns)
-					get_mnt_ns(mnt_ns);
-			}
-			rcu_read_unlock();
-
-			put_task_struct(task);
-		}
-
-		if (mnt_ns)
-			m->private = mnt_ns;
-		else {
-			seq_release(inode, file);
-			ret = -EINVAL;
-		}
-	}
-	return ret;
+	return mounts_open_common(inode, file, &mountstats_op);
 }
 
 static const struct file_operations proc_mountstats_operations = {
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 8eed44f8ca73..c078aacc8116 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -5,6 +5,7 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/nsproxy.h>
+#include <linux/seq_file.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -14,6 +15,13 @@ struct mnt_namespace {
 	int event;
 };
 
+struct proc_mounts {
+	struct seq_file m; /* must be the first element */
+	struct mnt_namespace *ns;
+	struct path root;
+	int event;
+};
+
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 		struct fs_struct *);
 extern void __put_mnt_ns(struct mnt_namespace *ns);
@@ -37,5 +45,8 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
 	atomic_inc(&ns->count);
 }
 
+extern const struct seq_operations mounts_op;
+extern const struct seq_operations mountstats_op;
+
 #endif
 #endif
-- 
cgit v1.2.3


From 2d4d4864ac08caff5c204a752bd004eed4f08760 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram@us.ibm.com>
Date: Thu, 27 Mar 2008 13:06:25 +0100
Subject: [patch 6/7] vfs: mountinfo: add /proc/<pid>/mountinfo

[mszeredi@suse.cz] rewrite and split big patch into managable chunks

/proc/mounts in its current form lacks important information:

 - propagation state
 - root of mount for bind mounts
 - the st_dev value used within the filesystem
 - identifier for each mount and it's parent

It also suffers from the following problems:

 - not easily extendable
 - ambiguity of mountpoints within a chrooted environment
 - doesn't distinguish between filesystem dependent and independent options
 - doesn't distinguish between per mount and per super block options

This patch introduces /proc/<pid>/mountinfo which attempts to address
all these deficiencies.

Code shared between /proc/<pid>/mounts and /proc/<pid>/mountinfo is
extracted into separate functions.

Thanks to Al Viro for the help in getting the design right.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/proc.txt |  32 ++++++++++
 fs/namespace.c                     | 119 ++++++++++++++++++++++++++++++-------
 fs/proc/base.c                     |  15 +++++
 include/linux/mnt_namespace.h      |   1 +
 4 files changed, 144 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 518ebe609e2b..2cd920f92e5e 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -43,6 +43,7 @@ Table of Contents
   2.13	/proc/<pid>/oom_score - Display current oom-killer score
   2.14	/proc/<pid>/io - Display the IO accounting fields
   2.15	/proc/<pid>/coredump_filter - Core dump filtering settings
+  2.16	/proc/<pid>/mountinfo - Information about mounts
 
 ------------------------------------------------------------------------------
 Preface
@@ -2348,4 +2349,35 @@ For example:
   $ echo 0x7 > /proc/self/coredump_filter
   $ ./some_program
 
+2.16	/proc/<pid>/mountinfo - Information about mounts
+--------------------------------------------------------
+
+This file contains lines of the form:
+
+36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+(1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
+
+(1) mount ID:  unique identifier of the mount (may be reused after umount)
+(2) parent ID:  ID of parent (or of self for the top of the mount tree)
+(3) major:minor:  value of st_dev for files on filesystem
+(4) root:  root of the mount within the filesystem
+(5) mount point:  mount point relative to the process's root
+(6) mount options:  per mount options
+(7) optional fields:  zero or more fields of the form "tag[:value]"
+(8) separator:  marks the end of the optional fields
+(9) filesystem type:  name of filesystem of the form "type[.subtype]"
+(10) mount source:  filesystem specific information or "none"
+(11) super options:  per super block options
+
+Parsers should ignore all unrecognised optional fields.  Currently the
+possible optional fields are:
+
+shared:X  mount is shared in peer group X
+master:X  mount is slave to peer group X
+unbindable  mount is unbindable
+
+For more information on mount propagation see:
+
+  Documentation/filesystems/sharedsubtree.txt
+
 ------------------------------------------------------------------------------
diff --git a/fs/namespace.c b/fs/namespace.c
index dfdf51e81c1c..c807b8d5f891 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -746,20 +746,30 @@ static void m_stop(struct seq_file *m, void *v)
 	up_read(&namespace_sem);
 }
 
-static int show_vfsmnt(struct seq_file *m, void *v)
+struct proc_fs_info {
+	int flag;
+	const char *str;
+};
+
+static void show_sb_opts(struct seq_file *m, struct super_block *sb)
 {
-	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
-	int err = 0;
-	static struct proc_fs_info {
-		int flag;
-		char *str;
-	} fs_info[] = {
+	static const struct proc_fs_info fs_info[] = {
 		{ MS_SYNCHRONOUS, ",sync" },
 		{ MS_DIRSYNC, ",dirsync" },
 		{ MS_MANDLOCK, ",mand" },
 		{ 0, NULL }
 	};
-	static struct proc_fs_info mnt_info[] = {
+	const struct proc_fs_info *fs_infop;
+
+	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
+		if (sb->s_flags & fs_infop->flag)
+			seq_puts(m, fs_infop->str);
+	}
+}
+
+static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
+{
+	static const struct proc_fs_info mnt_info[] = {
 		{ MNT_NOSUID, ",nosuid" },
 		{ MNT_NODEV, ",nodev" },
 		{ MNT_NOEXEC, ",noexec" },
@@ -768,27 +778,37 @@ static int show_vfsmnt(struct seq_file *m, void *v)
 		{ MNT_RELATIME, ",relatime" },
 		{ 0, NULL }
 	};
-	struct proc_fs_info *fs_infop;
+	const struct proc_fs_info *fs_infop;
+
+	for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
+		if (mnt->mnt_flags & fs_infop->flag)
+			seq_puts(m, fs_infop->str);
+	}
+}
+
+static void show_type(struct seq_file *m, struct super_block *sb)
+{
+	mangle(m, sb->s_type->name);
+	if (sb->s_subtype && sb->s_subtype[0]) {
+		seq_putc(m, '.');
+		mangle(m, sb->s_subtype);
+	}
+}
+
+static int show_vfsmnt(struct seq_file *m, void *v)
+{
+	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
+	int err = 0;
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
 
 	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
 	seq_putc(m, ' ');
 	seq_path(m, &mnt_path, " \t\n\\");
 	seq_putc(m, ' ');
-	mangle(m, mnt->mnt_sb->s_type->name);
-	if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) {
-		seq_putc(m, '.');
-		mangle(m, mnt->mnt_sb->s_subtype);
-	}
+	show_type(m, mnt->mnt_sb);
 	seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
-	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
-		if (mnt->mnt_sb->s_flags & fs_infop->flag)
-			seq_puts(m, fs_infop->str);
-	}
-	for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
-		if (mnt->mnt_flags & fs_infop->flag)
-			seq_puts(m, fs_infop->str);
-	}
+	show_sb_opts(m, mnt->mnt_sb);
+	show_mnt_opts(m, mnt);
 	if (mnt->mnt_sb->s_op->show_options)
 		err = mnt->mnt_sb->s_op->show_options(m, mnt);
 	seq_puts(m, " 0 0\n");
@@ -802,6 +822,59 @@ const struct seq_operations mounts_op = {
 	.show	= show_vfsmnt
 };
 
+static int show_mountinfo(struct seq_file *m, void *v)
+{
+	struct proc_mounts *p = m->private;
+	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
+	struct super_block *sb = mnt->mnt_sb;
+	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+	struct path root = p->root;
+	int err = 0;
+
+	seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
+		   MAJOR(sb->s_dev), MINOR(sb->s_dev));
+	seq_dentry(m, mnt->mnt_root, " \t\n\\");
+	seq_putc(m, ' ');
+	seq_path_root(m, &mnt_path, &root, " \t\n\\");
+	if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
+		/*
+		 * Mountpoint is outside root, discard that one.  Ugly,
+		 * but less so than trying to do that in iterator in a
+		 * race-free way (due to renames).
+		 */
+		return SEQ_SKIP;
+	}
+	seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
+	show_mnt_opts(m, mnt);
+
+	/* Tagged fields ("foo:X" or "bar") */
+	if (IS_MNT_SHARED(mnt))
+		seq_printf(m, " shared:%i", mnt->mnt_group_id);
+	if (IS_MNT_SLAVE(mnt))
+		seq_printf(m, " master:%i", mnt->mnt_master->mnt_group_id);
+	if (IS_MNT_UNBINDABLE(mnt))
+		seq_puts(m, " unbindable");
+
+	/* Filesystem specific data */
+	seq_puts(m, " - ");
+	show_type(m, sb);
+	seq_putc(m, ' ');
+	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+	seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
+	show_sb_opts(m, sb);
+	if (sb->s_op->show_options)
+		err = sb->s_op->show_options(m, mnt);
+	seq_putc(m, '\n');
+	return err;
+}
+
+const struct seq_operations mountinfo_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_mountinfo,
+};
+
 static int show_vfsstat(struct seq_file *m, void *v)
 {
 	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
@@ -822,7 +895,7 @@ static int show_vfsstat(struct seq_file *m, void *v)
 
 	/* file system type */
 	seq_puts(m, "with fstype ");
-	mangle(m, mnt->mnt_sb->s_type->name);
+	show_type(m, mnt->mnt_sb);
 
 	/* optional statistics */
 	if (mnt->mnt_sb->s_op->show_stats) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a04b3db7a296..c5e412a00b17 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -604,6 +604,19 @@ static const struct file_operations proc_mounts_operations = {
 	.poll		= mounts_poll,
 };
 
+static int mountinfo_open(struct inode *inode, struct file *file)
+{
+	return mounts_open_common(inode, file, &mountinfo_op);
+}
+
+static const struct file_operations proc_mountinfo_operations = {
+	.open		= mountinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= mounts_release,
+	.poll		= mounts_poll,
+};
+
 static int mountstats_open(struct inode *inode, struct file *file)
 {
 	return mounts_open_common(inode, file, &mountstats_op);
@@ -2303,6 +2316,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	LNK("root",       root),
 	LNK("exe",        exe),
 	REG("mounts",     S_IRUGO, mounts),
+	REG("mountinfo",  S_IRUGO, mountinfo),
 	REG("mountstats", S_IRUSR, mountstats),
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, clear_refs),
@@ -2635,6 +2649,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	LNK("root",      root),
 	LNK("exe",       exe),
 	REG("mounts",    S_IRUGO, mounts),
+	REG("mountinfo",  S_IRUGO, mountinfo),
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",     S_IRUGO, smaps),
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index c078aacc8116..830bbcd449d6 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -46,6 +46,7 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
 }
 
 extern const struct seq_operations mounts_op;
+extern const struct seq_operations mountinfo_op;
 extern const struct seq_operations mountstats_op;
 
 #endif
-- 
cgit v1.2.3


From 97e7e0f71d6d948c25f11f0a33878d9356d9579e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 27 Mar 2008 13:06:26 +0100
Subject: [patch 7/7] vfs: mountinfo: show dominating group id

Show peer group ID of nearest dominating group that has intersection
with the mount's namespace.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/proc.txt |  6 +++++
 fs/namespace.c                     |  9 +++++--
 fs/pnode.c                         | 51 ++++++++++++++++++++++++++++++++++++++
 fs/pnode.h                         |  1 +
 4 files changed, 65 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2cd920f92e5e..2a99116edc47 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -2374,8 +2374,14 @@ possible optional fields are:
 
 shared:X  mount is shared in peer group X
 master:X  mount is slave to peer group X
+propagate_from:X  mount is slave and receives propagation from peer group X (*)
 unbindable  mount is unbindable
 
+(*) X is the closest dominant peer group under the process's root.  If
+X is the immediate master of the mount, or if there's no dominant peer
+group under the same root, then only the "master:X" field is present
+and not the "propagate_from:X" field.
+
 For more information on mount propagation see:
 
   Documentation/filesystems/sharedsubtree.txt
diff --git a/fs/namespace.c b/fs/namespace.c
index c807b8d5f891..0505fb61aa74 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -850,8 +850,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
 	/* Tagged fields ("foo:X" or "bar") */
 	if (IS_MNT_SHARED(mnt))
 		seq_printf(m, " shared:%i", mnt->mnt_group_id);
-	if (IS_MNT_SLAVE(mnt))
-		seq_printf(m, " master:%i", mnt->mnt_master->mnt_group_id);
+	if (IS_MNT_SLAVE(mnt)) {
+		int master = mnt->mnt_master->mnt_group_id;
+		int dom = get_dominating_id(mnt, &p->root);
+		seq_printf(m, " master:%i", master);
+		if (dom && dom != master)
+			seq_printf(m, " propagate_from:%i", dom);
+	}
 	if (IS_MNT_UNBINDABLE(mnt))
 		seq_puts(m, " unbindable");
 
diff --git a/fs/pnode.c b/fs/pnode.c
index d18d66491a01..8d5f392ec3d3 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -28,6 +28,57 @@ static inline struct vfsmount *next_slave(struct vfsmount *p)
 	return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
 }
 
+/*
+ * Return true if path is reachable from root
+ *
+ * namespace_sem is held, and mnt is attached
+ */
+static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry,
+			 const struct path *root)
+{
+	while (mnt != root->mnt && mnt->mnt_parent != mnt) {
+		dentry = mnt->mnt_mountpoint;
+		mnt = mnt->mnt_parent;
+	}
+	return mnt == root->mnt && is_subdir(dentry, root->dentry);
+}
+
+static struct vfsmount *get_peer_under_root(struct vfsmount *mnt,
+					    struct mnt_namespace *ns,
+					    const struct path *root)
+{
+	struct vfsmount *m = mnt;
+
+	do {
+		/* Check the namespace first for optimization */
+		if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root))
+			return m;
+
+		m = next_peer(m);
+	} while (m != mnt);
+
+	return NULL;
+}
+
+/*
+ * Get ID of closest dominating peer group having a representative
+ * under the given root.
+ *
+ * Caller must hold namespace_sem
+ */
+int get_dominating_id(struct vfsmount *mnt, const struct path *root)
+{
+	struct vfsmount *m;
+
+	for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) {
+		struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root);
+		if (d)
+			return d->mnt_group_id;
+	}
+
+	return 0;
+}
+
 static int do_make_slave(struct vfsmount *mnt)
 {
 	struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master;
diff --git a/fs/pnode.h b/fs/pnode.h
index 973c3f825e7d..958665d662af 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -36,4 +36,5 @@ int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
 int propagate_umount(struct list_head *);
 int propagate_mount_busy(struct vfsmount *, int);
 void mnt_release_group_id(struct vfsmount *);
+int get_dominating_id(struct vfsmount *mnt, const struct path *root);
 #endif /* _LINUX_PNODE_H */
-- 
cgit v1.2.3