37 files changed, 625 insertions, 609 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index b0a0ae509c00..61c599b4a1e3 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -127,12 +127,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
 		dprintk(DEBUG_ERROR, "problem initiating session\n");
-		kfree(v9ses);
-		return ERR_PTR(newfid);
+		sb = ERR_PTR(newfid);
+		goto out_free_session;
 	}
 
 	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
-
+	if (IS_ERR(sb))
+		goto out_close_session;
 	v9fs_fill_super(sb, v9ses, flags);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
@@ -185,6 +186,12 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	return sb;
 
+out_close_session:
+	v9fs_session_close(v9ses);
+out_free_session:
+	kfree(v9ses);
+	return sb;
+
 put_back_sb:
 	/* deactivate_super calls v9fs_kill_super which will frees the rest */
 	up_write(&sb->s_umount);
diff --git a/fs/Kconfig b/fs/Kconfig
index e207be68d4ca..2524629dc835 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -799,6 +799,7 @@ config PROC_KCORE
 config PROC_VMCORE
         bool "/proc/vmcore support (EXPERIMENTAL)"
         depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+	default y
         help
         Exports the dump image of crashed kernel in ELF format.
 
@@ -861,7 +862,7 @@ config RAMFS
 
 config CONFIGFS_FS
 	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on SYSFS && EXPERIMENTAL
 	help
 	  configfs is a ram-based filesystem that provides the converse
 	  of sysfs's functionality. Where sysfs is a filesystem-based
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8ed9b06a9828..5638c8f9362f 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -504,7 +504,7 @@ static int populate_groups(struct config_group *group)
 	int ret = 0;
 	int i;
 
-	if (group && group->default_groups) {
+	if (group->default_groups) {
 		/* FYI, we're faking mkdir here
 		 * I'm not sure we need this semaphore, as we're called
 		 * from our parent's mkdir.  That holds our parent's
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 242fe1a66ce5..1b4491cdd115 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -599,7 +599,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
-			epds.events |= POLLERR | POLLHUP | POLLRDHUP;
+			epds.events |= POLLERR | POLLHUP;
 
 			error = ep_insert(ep, &epds, tfile, fd);
 		} else
@@ -613,7 +613,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 		break;
 	case EPOLL_CTL_MOD:
 		if (epi) {
-			epds.events |= POLLERR | POLLHUP | POLLRDHUP;
+			epds.events |= POLLERR | POLLHUP;
 			error = ep_modify(ep, epi, &epds);
 		} else
 			error = -ENOENT;
diff --git a/fs/exec.c b/fs/exec.c
index 0291a68a3626..3234a0c32d54 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -678,6 +678,18 @@ static int de_thread(struct task_struct *tsk)
 		while (leader->exit_state != EXIT_ZOMBIE)
 			yield();
 
+		/*
+		 * The only record we have of the real-time age of a
+		 * process, regardless of execs it's done, is start_time.
+		 * All the past CPU time is accumulated in signal_struct
+		 * from sister threads now dead.  But in this non-leader
+		 * exec, nothing survives from the original leader thread,
+		 * whose birth marks the true age of this process now.
+		 * When we take on its identity by switching to its PID, we
+		 * also take its birthdate (always earlier than our own).
+		 */
+		current->start_time = leader->start_time;
+
 		spin_lock(&leader->proc_lock);
 		spin_lock(&current->proc_lock);
 		proc_dentry1 = proc_pid_unhash(current);
@@ -723,7 +735,12 @@ static int de_thread(struct task_struct *tsk)
 		current->parent = current->real_parent = leader->real_parent;
 		leader->parent = leader->real_parent = child_reaper;
 		current->group_leader = current;
-		leader->group_leader = leader;
+		leader->group_leader = current;
+
+		/* Reduce leader to a thread */
+		detach_pid(leader, PIDTYPE_PGID);
+		detach_pid(leader, PIDTYPE_SID);
+		list_del_init(&leader->tasks);
 
 		add_parent(current);
 		add_parent(leader);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 1041dab6de2f..14f5f6ea3e72 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -974,6 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
 		ext3_warning(sb, __FUNCTION__,
 			     "multiple resizers run on filesystem!");
+		unlock_super(sb);
 		err = -EBUSY;
 		goto exit_put;
 	}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 23d1f52eb1b8..6c740f860665 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -23,13 +23,11 @@ static kmem_cache_t *fuse_req_cachep;
 
 static struct fuse_conn *fuse_get_conn(struct file *file)
 {
-	struct fuse_conn *fc;
-	spin_lock(&fuse_lock);
-	fc = file->private_data;
-	if (fc && !fc->connected)
-		fc = NULL;
-	spin_unlock(&fuse_lock);
-	return fc;
+	/*
+	 * Lockless access is OK, because file->private data is set
+	 * once during mount and is valid until the file is released.
+	 */
+	return file->private_data;
 }
 
 static void fuse_request_init(struct fuse_req *req)
@@ -74,10 +72,8 @@ static void restore_sigs(sigset_t *oldset)
  */
 void fuse_reset_request(struct fuse_req *req)
 {
-	int preallocated = req->preallocated;
 	BUG_ON(atomic_read(&req->count) != 1);
 	fuse_request_init(req);
-	req->preallocated = preallocated;
 }
 
 static void __fuse_get_request(struct fuse_req *req)
@@ -92,80 +88,52 @@ static void __fuse_put_request(struct fuse_req *req)
 	atomic_dec(&req->count);
 }
 
-static struct fuse_req *do_get_request(struct fuse_conn *fc)
+struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 {
 	struct fuse_req *req;
-
-	spin_lock(&fuse_lock);
-	BUG_ON(list_empty(&fc->unused_list));
-	req = list_entry(fc->unused_list.next, struct fuse_req, list);
-	list_del_init(&req->list);
-	spin_unlock(&fuse_lock);
-	fuse_request_init(req);
-	req->preallocated = 1;
-	req->in.h.uid = current->fsuid;
-	req->in.h.gid = current->fsgid;
-	req->in.h.pid = current->pid;
-	return req;
-}
-
-/* This can return NULL, but only in case it's interrupted by a SIGKILL */
-struct fuse_req *fuse_get_request(struct fuse_conn *fc)
-{
-	int intr;
 	sigset_t oldset;
+	int err;
 
-	atomic_inc(&fc->num_waiting);
 	block_sigs(&oldset);
-	intr = down_interruptible(&fc->outstanding_sem);
+	err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked);
 	restore_sigs(&oldset);
-	if (intr) {
-		atomic_dec(&fc->num_waiting);
-		return NULL;
-	}
-	return do_get_request(fc);
-}
+	if (err)
+		return ERR_PTR(-EINTR);
 
-/* Must be called with fuse_lock held */
-static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
-{
-	if (req->preallocated) {
-		atomic_dec(&fc->num_waiting);
-		list_add(&req->list, &fc->unused_list);
-	} else
-		fuse_request_free(req);
+	req = fuse_request_alloc();
+	if (!req)
+		return ERR_PTR(-ENOMEM);
 
-	/* If we are in debt decrease that first */
-	if (fc->outstanding_debt)
-		fc->outstanding_debt--;
-	else
-		up(&fc->outstanding_sem);
+	atomic_inc(&fc->num_waiting);
+	fuse_request_init(req);
+	req->in.h.uid = current->fsuid;
+	req->in.h.gid = current->fsgid;
+	req->in.h.pid = current->pid;
+	return req;
 }
 
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (atomic_dec_and_test(&req->count)) {
-		spin_lock(&fuse_lock);
-		fuse_putback_request(fc, req);
-		spin_unlock(&fuse_lock);
+		atomic_dec(&fc->num_waiting);
+		fuse_request_free(req);
 	}
 }
 
-static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req)
-{
-	if (atomic_dec_and_test(&req->count))
-		fuse_putback_request(fc, req);
-}
-
-void fuse_release_background(struct fuse_req *req)
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
 {
 	iput(req->inode);
 	iput(req->inode2);
 	if (req->file)
 		fput(req->file);
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	list_del(&req->bg_entry);
-	spin_unlock(&fuse_lock);
+	if (fc->num_background == FUSE_MAX_BACKGROUND) {
+		fc->blocked = 0;
+		wake_up_all(&fc->blocked_waitq);
+	}
+	fc->num_background--;
+	spin_unlock(&fc->lock);
 }
 
 /*
@@ -184,23 +152,23 @@ void fuse_release_background(struct fuse_req *req)
  * interrupted and put in the background, it will return with an error
  * and hence never be reset and reused.
  *
- * Called with fuse_lock, unlocks it
+ * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	list_del(&req->list);
 	req->state = FUSE_REQ_FINISHED;
 	if (!req->background) {
+		spin_unlock(&fc->lock);
 		wake_up(&req->waitq);
-		fuse_put_request_locked(fc, req);
-		spin_unlock(&fuse_lock);
+		fuse_put_request(fc, req);
 	} else {
 		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 		req->end = NULL;
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 		down_read(&fc->sbput_sem);
 		if (fc->mounted)
-			fuse_release_background(req);
+			fuse_release_background(fc, req);
 		up_read(&fc->sbput_sem);
 		if (end)
 			end(fc, req);
@@ -242,6 +210,9 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->background = 1;
 	list_add(&req->bg_entry, &fc->background);
+	fc->num_background++;
+	if (fc->num_background == FUSE_MAX_BACKGROUND)
+		fc->blocked = 1;
 	if (req->inode)
 		req->inode = igrab(req->inode);
 	if (req->inode2)
@@ -250,16 +221,16 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req)
 		get_file(req->file);
 }
 
-/* Called with fuse_lock held.  Releases, and then reacquires it. */
+/* Called with fc->lock held.  Releases, and then reacquires it. */
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 {
 	sigset_t oldset;
 
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 	block_sigs(&oldset);
 	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
 	restore_sigs(&oldset);
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	if (req->state == FUSE_REQ_FINISHED && !req->interrupted)
 		return;
 
@@ -273,9 +244,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 		   locked state, there mustn't be any filesystem
 		   operation (e.g. page fault), since that could lead
 		   to deadlock */
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 		wait_event(req->waitq, !req->locked);
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 	}
 	if (req->state == FUSE_REQ_PENDING) {
 		list_del(&req->list);
@@ -304,19 +275,10 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 	req->in.h.unique = fc->reqctr;
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
-	if (!req->preallocated) {
-		/* If request is not preallocated (either FORGET or
-		   RELEASE), then still decrease outstanding_sem, so
-		   user can't open infinite number of files while not
-		   processing the RELEASE requests.  However for
-		   efficiency do it without blocking, so if down()
-		   would block, just increase the debt instead */
-		if (down_trylock(&fc->outstanding_sem))
-			fc->outstanding_debt++;
-	}
 	list_add_tail(&req->list, &fc->pending);
 	req->state = FUSE_REQ_PENDING;
 	wake_up(&fc->waitq);
+	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 }
 
 /*
@@ -325,7 +287,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 void request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	if (!fc->connected)
 		req->out.h.error = -ENOTCONN;
 	else if (fc->conn_error)
@@ -338,15 +300,16 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
 
 		request_wait_answer(fc, req);
 	}
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 }
 
 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
+	background_request(fc, req);
 	if (fc->connected) {
 		queue_request(fc, req);
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 	} else {
 		req->out.h.error = -ENOTCONN;
 		request_end(fc, req);
@@ -362,9 +325,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
-	spin_lock(&fuse_lock);
-	background_request(fc, req);
-	spin_unlock(&fuse_lock);
 	request_send_nowait(fc, req);
 }
 
@@ -373,16 +333,16 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
  * anything that could cause a page-fault.  If the request was already
  * interrupted bail out.
  */
-static int lock_request(struct fuse_req *req)
+static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	int err = 0;
 	if (req) {
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 		if (req->interrupted)
 			err = -ENOENT;
 		else
 			req->locked = 1;
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 	}
 	return err;
 }
@@ -392,18 +352,19 @@ static int lock_request(struct fuse_req *req)
  * requester thread is currently waiting for it to be unlocked, so
  * wake it up.
  */
-static void unlock_request(struct fuse_req *req)
+static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (req) {
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 		req->locked = 0;
 		if (req->interrupted)
 			wake_up(&req->waitq);
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 	}
 }
 
 struct fuse_copy_state {
+	struct fuse_conn *fc;
 	int write;
 	struct fuse_req *req;
 	const struct iovec *iov;
@@ -416,11 +377,12 @@ struct fuse_copy_state {
 	unsigned len;
 };
 
-static void fuse_copy_init(struct fuse_copy_state *cs, int write,
-			   struct fuse_req *req, const struct iovec *iov,
-			   unsigned long nr_segs)
+static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
+			   int write, struct fuse_req *req,
+			   const struct iovec *iov, unsigned long nr_segs)
 {
 	memset(cs, 0, sizeof(*cs));
+	cs->fc = fc;
 	cs->write = write;
 	cs->req = req;
 	cs->iov = iov;
@@ -450,7 +412,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 	unsigned long offset;
 	int err;
 
-	unlock_request(cs->req);
+	unlock_request(cs->fc, cs->req);
 	fuse_copy_finish(cs);
 	if (!cs->seglen) {
 		BUG_ON(!cs->nr_segs);
@@ -473,7 +435,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 	cs->seglen -= cs->len;
 	cs->addr += cs->len;
 
-	return lock_request(cs->req);
+	return lock_request(cs->fc, cs->req);
 }
 
 /* Do as much copy to/from userspace buffer as we can */
@@ -585,9 +547,9 @@ static void request_wait(struct fuse_conn *fc)
 		if (signal_pending(current))
 			break;
 
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 		schedule();
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 	}
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(&fc->waitq, &wait);
@@ -606,18 +568,21 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t *off)
 {
 	int err;
-	struct fuse_conn *fc;
 	struct fuse_req *req;
 	struct fuse_in *in;
 	struct fuse_copy_state cs;
 	unsigned reqsize;
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return -EPERM;
 
  restart:
-	spin_lock(&fuse_lock);
-	fc = file->private_data;
-	err = -EPERM;
-	if (!fc)
+	spin_lock(&fc->lock);
+	err = -EAGAIN;
+	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
+	    list_empty(&fc->pending))
 		goto err_unlock;
+
 	request_wait(fc);
 	err = -ENODEV;
 	if (!fc->connected)
@@ -641,14 +606,14 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 		request_end(fc, req);
 		goto restart;
 	}
-	spin_unlock(&fuse_lock);
-	fuse_copy_init(&cs, 1, req, iov, nr_segs);
+	spin_unlock(&fc->lock);
+	fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);
 	err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
 	if (!err)
 		err = fuse_copy_args(&cs, in->numargs, in->argpages,
 				     (struct fuse_arg *) in->args, 0);
 	fuse_copy_finish(&cs);
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	req->locked = 0;
 	if (!err && req->interrupted)
 		err = -ENOENT;
@@ -663,12 +628,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	else {
 		req->state = FUSE_REQ_SENT;
 		list_move_tail(&req->list, &fc->processing);
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 	}
 	return reqsize;
 
  err_unlock:
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 	return err;
 }
 
@@ -735,9 +700,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	struct fuse_copy_state cs;
 	struct fuse_conn *fc = fuse_get_conn(file);
 	if (!fc)
-		return -ENODEV;
+		return -EPERM;
 
-	fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
+	fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
 	if (nbytes < sizeof(struct fuse_out_header))
 		return -EINVAL;
 
@@ -749,7 +714,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	    oh.len != nbytes)
 		goto err_finish;
 
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	err = -ENOENT;
 	if (!fc->connected)
 		goto err_unlock;
@@ -760,9 +725,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 		goto err_unlock;
 
 	if (req->interrupted) {
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 		fuse_copy_finish(&cs);
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 		request_end(fc, req);
 		return -ENOENT;
 	}
@@ -770,12 +735,12 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	req->out.h = oh;
 	req->locked = 1;
 	cs.req = req;
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 
 	err = copy_out_args(&cs, &req->out, nbytes);
 	fuse_copy_finish(&cs);
 
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	req->locked = 0;
 	if (!err) {
 		if (req->interrupted)
@@ -787,7 +752,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	return err ? err : nbytes;
 
  err_unlock:
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
  err_finish:
 	fuse_copy_finish(&cs);
 	return err;
@@ -804,18 +769,19 @@ static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
 
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
-	struct fuse_conn *fc = fuse_get_conn(file);
 	unsigned mask = POLLOUT | POLLWRNORM;
-
+	struct fuse_conn *fc = fuse_get_conn(file);
 	if (!fc)
-		return -ENODEV;
+		return POLLERR;
 
 	poll_wait(file, &fc->waitq, wait);
 
-	spin_lock(&fuse_lock);
-	if (!list_empty(&fc->pending))
-                mask |= POLLIN | POLLRDNORM;
-	spin_unlock(&fuse_lock);
+	spin_lock(&fc->lock);
+	if (!fc->connected)
+		mask = POLLERR;
+	else if (!list_empty(&fc->pending))
+		mask |= POLLIN | POLLRDNORM;
+	spin_unlock(&fc->lock);
 
 	return mask;
 }
@@ -823,7 +789,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 /*
  * Abort all requests on the given list (pending or processing)
  *
- * This function releases and reacquires fuse_lock
+ * This function releases and reacquires fc->lock
  */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
 {
@@ -832,7 +798,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
 		req = list_entry(head->next, struct fuse_req, list);
 		req->out.h.error = -ECONNABORTED;
 		request_end(fc, req);
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 	}
 }
 
@@ -863,10 +829,10 @@ static void end_io_requests(struct fuse_conn *fc)
 			req->end = NULL;
 			/* The end function will consume this reference */
 			__fuse_get_request(req);
-			spin_unlock(&fuse_lock);
+			spin_unlock(&fc->lock);
 			wait_event(req->waitq, !req->locked);
 			end(fc, req);
-			spin_lock(&fuse_lock);
+			spin_lock(&fc->lock);
 		}
 	}
 }
@@ -893,35 +859,44 @@ static void end_io_requests(struct fuse_conn *fc)
  */
 void fuse_abort_conn(struct fuse_conn *fc)
 {
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	if (fc->connected) {
 		fc->connected = 0;
 		end_io_requests(fc);
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
 		wake_up_all(&fc->waitq);
+		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	}
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 }
 
 static int fuse_dev_release(struct inode *inode, struct file *file)
 {
-	struct fuse_conn *fc;
-
-	spin_lock(&fuse_lock);
-	fc = file->private_data;
+	struct fuse_conn *fc = fuse_get_conn(file);
 	if (fc) {
+		spin_lock(&fc->lock);
 		fc->connected = 0;
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
-	}
-	spin_unlock(&fuse_lock);
-	if (fc)
+		spin_unlock(&fc->lock);
+		fasync_helper(-1, file, 0, &fc->fasync);
 		kobject_put(&fc->kobj);
+	}
 
 	return 0;
 }
 
+static int fuse_dev_fasync(int fd, struct file *file, int on)
+{
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return -EPERM;
+
+	/* No locking - fasync_helper does its own locking */
+	return fasync_helper(fd, file, on, &fc->fasync);
+}
+
 const struct file_operations fuse_dev_operations = {
 	.owner		= THIS_MODULE,
 	.llseek		= no_llseek,
@@ -931,6 +906,7 @@ const struct file_operations fuse_dev_operations = {
 	.writev		= fuse_dev_writev,
 	.poll		= fuse_dev_poll,
 	.release	= fuse_dev_release,
+	.fasync		= fuse_dev_fasync,
 };
 
 static struct miscdevice fuse_miscdevice = {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 256355b80256..8d7546e832e8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -117,8 +117,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 			return 0;
 
 		fc = get_fuse_conn(inode);
-		req = fuse_get_request(fc);
-		if (!req)
+		req = fuse_get_req(fc);
+		if (IS_ERR(req))
 			return 0;
 
 		fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
@@ -188,9 +188,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	if (entry->d_name.len > FUSE_NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return ERR_PTR(-EINTR);
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return ERR_PTR(PTR_ERR(req));
 
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
@@ -244,15 +244,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	struct file *file;
 	int flags = nd->intent.open.flags - 1;
 
-	err = -ENOSYS;
 	if (fc->no_create)
-		goto out;
+		return -ENOSYS;
 
-	err = -EINTR;
-	req = fuse_get_request(fc);
-	if (!req)
-		goto out;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
+	err = -ENOMEM;
 	ff = fuse_file_alloc();
 	if (!ff)
 		goto out_put_request;
@@ -314,7 +313,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	fuse_file_free(ff);
  out_put_request:
 	fuse_put_request(fc, req);
- out:
 	return err;
 }
 
@@ -375,9 +373,9 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
 {
 	struct fuse_mknod_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
@@ -407,9 +405,9 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
 {
 	struct fuse_mkdir_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
@@ -427,9 +425,9 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
 {
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	unsigned len = strlen(link) + 1;
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_SYMLINK;
 	req->in.numargs = 2;
@@ -444,9 +442,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 {
 	int err;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_UNLINK;
 	req->in.h.nodeid = get_node_id(dir);
@@ -476,9 +474,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 {
 	int err;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_RMDIR;
 	req->in.h.nodeid = get_node_id(dir);
@@ -504,9 +502,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	int err;
 	struct fuse_rename_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(olddir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.newdir = get_node_id(newdir);
@@ -553,9 +551,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
 	struct fuse_link_in inarg;
 	struct inode *inode = entry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.oldnodeid = get_node_id(inode);
@@ -583,9 +581,9 @@ int fuse_do_getattr(struct inode *inode)
 	int err;
 	struct fuse_attr_out arg;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_GETATTR;
 	req->in.h.nodeid = get_node_id(inode);
@@ -673,9 +671,9 @@ static int fuse_access(struct inode *inode, int mask)
 	if (fc->no_access)
 		return 0;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mask = mask;
@@ -780,9 +778,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page) {
@@ -809,11 +807,11 @@ static char *read_link(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
+	struct fuse_req *req = fuse_get_req(fc);
 	char *link;
 
-	if (!req)
-		return ERR_PTR(-EINTR);
+	if (IS_ERR(req))
+		return ERR_PTR(PTR_ERR(req));
 
 	link = (char *) __get_free_page(GFP_KERNEL);
 	if (!link) {
@@ -933,9 +931,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 		}
 	}
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	iattr_to_fattr(attr, &inarg);
@@ -995,9 +993,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	if (fc->no_setxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
@@ -1035,9 +1033,9 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
 	if (fc->no_getxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
@@ -1085,9 +1083,9 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
 	if (fc->no_listxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
@@ -1131,9 +1129,9 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 	if (fc->no_removexattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_REMOVEXATTR;
 	req->in.h.nodeid = get_node_id(inode);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 975f2697e866..e4f041a11bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -22,9 +22,9 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
 	struct fuse_req *req;
 	int err;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
@@ -184,9 +184,9 @@ static int fuse_flush(struct file *file)
 	if (fc->no_flush)
 		return 0;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
@@ -223,9 +223,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
 		return 0;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
@@ -297,9 +297,9 @@ static int fuse_readpage(struct file *file, struct page *page)
 	if (is_bad_inode(inode))
 		goto out;
 
-	err = -EINTR;
-	req = fuse_get_request(fc);
-	if (!req)
+	req = fuse_get_req(fc);
+	err = PTR_ERR(req);
+	if (IS_ERR(req))
 		goto out;
 
 	req->out.page_zeroing = 1;
@@ -368,10 +368,10 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
 	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
 		fuse_send_readpages(req, data->file, inode);
-		data->req = req = fuse_get_request(fc);
-		if (!req) {
+		data->req = req = fuse_get_req(fc);
+		if (IS_ERR(req)) {
 			unlock_page(page);
-			return -EINTR;
+			return PTR_ERR(req);
 		}
 	}
 	req->pages[req->num_pages] = page;
@@ -392,13 +392,17 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 
 	data.file = file;
 	data.inode = inode;
-	data.req = fuse_get_request(fc);
-	if (!data.req)
-		return -EINTR;
+	data.req = fuse_get_req(fc);
+	if (IS_ERR(data.req))
+		return PTR_ERR(data.req);
 
 	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
-	if (!err)
-		fuse_send_readpages(data.req, file, inode);
+	if (!err) {
+		if (data.req->num_pages)
+			fuse_send_readpages(data.req, file, inode);
+		else
+			fuse_put_request(fc, data.req);
+	}
 	return err;
 }
 
@@ -451,9 +455,9 @@ static int fuse_commit_write(struct file *file, struct page *page,
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->num_pages = 1;
 	req->pages[0] = page;
@@ -528,9 +532,9 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	while (count) {
 		size_t nres;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index a16a04fcf41e..19c7185a7546 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -18,8 +18,8 @@
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
 
-/** If more requests are outstanding, then the operation will block */
-#define FUSE_MAX_OUTSTANDING 10
+/** Maximum number of outstanding background requests */
+#define FUSE_MAX_BACKGROUND 10
 
 /** It could be as large as PATH_MAX, but would that have any uses? */
 #define FUSE_NAME_MAX 1024
@@ -131,8 +131,8 @@ struct fuse_conn;
  * A request to the client
  */
 struct fuse_req {
-	/** This can be on either unused_list, pending processing or
-	    io lists in fuse_conn */
+	/** This can be on either pending processing or io lists in
+	    fuse_conn */
 	struct list_head list;
 
 	/** Entry on the background list */
@@ -144,15 +144,12 @@ struct fuse_req {
 	/*
 	 * The following bitfields are either set once before the
 	 * request is queued or setting/clearing them is protected by
-	 * fuse_lock
+	 * fuse_conn->lock
 	 */
 
 	/** True if the request has reply */
 	unsigned isreply:1;
 
-	/** The request is preallocated */
-	unsigned preallocated:1;
-
 	/** The request was interrupted */
 	unsigned interrupted:1;
 
@@ -213,6 +210,9 @@ struct fuse_req {
  * unmounted.
  */
 struct fuse_conn {
+	/** Lock protecting accessess to  members of this structure */
+	spinlock_t lock;
+
 	/** The user id for this mount */
 	uid_t user_id;
 
@@ -244,19 +244,20 @@ struct fuse_conn {
 	    interrupted request) */
 	struct list_head background;
 
-	/** Controls the maximum number of outstanding requests */
-	struct semaphore outstanding_sem;
+	/** Number of requests currently in the background */
+	unsigned num_background;
+
+	/** Flag indicating if connection is blocked.  This will be
+	    the case before the INIT reply is received, and if there
+	    are too many outstading backgrounds requests */
+	int blocked;
 
-	/** This counts the number of outstanding requests if
-	    outstanding_sem would go negative */
-	unsigned outstanding_debt;
+	/** waitq for blocked connection */
+	wait_queue_head_t blocked_waitq;
 
 	/** RW semaphore for exclusion with fuse_put_super() */
 	struct rw_semaphore sbput_sem;
 
-	/** The list of unused requests */
-	struct list_head unused_list;
-
 	/** The next unique request id */
 	u64 reqctr;
 
@@ -318,6 +319,9 @@ struct fuse_conn {
 
 	/** kobject */
 	struct kobject kobj;
+
+	/** O_ASYNC requests */
+	struct fasync_struct *fasync;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -349,21 +353,6 @@ static inline u64 get_node_id(struct inode *inode)
 extern const struct file_operations fuse_dev_operations;
 
 /**
- * This is the single global spinlock which protects FUSE's structures
- *
- * The following data is protected by this lock:
- *
- *  - the private_data field of the device file
- *  - the s_fs_info field of the super block
- *  - unused_list, pending, processing lists in fuse_conn
- *  - background list in fuse_conn
- *  - the unique request ID counter reqctr in fuse_conn
- *  - the sb (super_block) field in fuse_conn
- *  - the file (device file) field in fuse_conn
- */
-extern spinlock_t fuse_lock;
-
-/**
  * Get a filled in inode
  */
 struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
@@ -461,11 +450,11 @@ void fuse_reset_request(struct fuse_req *req);
 /**
  * Reserve a preallocated request
  */
-struct fuse_req *fuse_get_request(struct fuse_conn *fc);
+struct fuse_req *fuse_get_req(struct fuse_conn *fc);
 
 /**
- * Decrement reference count of a request.  If count goes to zero put
- * on unused list (preallocated) or free request (not preallocated).
+ * Decrement reference count of a request.  If count goes to zero free
+ * the request.
  */
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 
@@ -487,7 +476,7 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 /**
  * Release inodes and file associated with background request
  */
-void fuse_release_background(struct fuse_req *req);
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
 
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 879e6fba9480..fd34037b0588 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -22,7 +22,6 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
 MODULE_LICENSE("GPL");
 
-spinlock_t fuse_lock;
 static kmem_cache_t *fuse_inode_cachep;
 static struct subsystem connections_subsys;
 
@@ -207,15 +206,17 @@ static void fuse_put_super(struct super_block *sb)
 
 	down_write(&fc->sbput_sem);
 	while (!list_empty(&fc->background))
-		fuse_release_background(list_entry(fc->background.next,
+		fuse_release_background(fc,
+					list_entry(fc->background.next,
 						   struct fuse_req, bg_entry));
 
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	fc->mounted = 0;
 	fc->connected = 0;
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 	up_write(&fc->sbput_sem);
 	/* Flush all readers on this fs */
+	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
 	kobject_del(&fc->kobj);
 	kobject_put(&fc->kobj);
@@ -242,9 +243,9 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
 	struct fuse_statfs_out outarg;
 	int err;
 
-        req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&outarg, 0, sizeof(outarg));
 	req->in.numargs = 0;
@@ -369,15 +370,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 static void fuse_conn_release(struct kobject *kobj)
 {
-	struct fuse_conn *fc = get_fuse_conn_kobj(kobj);
-
-	while (!list_empty(&fc->unused_list)) {
-		struct fuse_req *req;
-		req = list_entry(fc->unused_list.next, struct fuse_req, list);
-		list_del(&req->list);
-		fuse_request_free(req);
-	}
-	kfree(fc);
+	kfree(get_fuse_conn_kobj(kobj));
 }
 
 static struct fuse_conn *new_conn(void)
@@ -386,64 +379,25 @@ static struct fuse_conn *new_conn(void)
 
 	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (fc) {
-		int i;
+		spin_lock_init(&fc->lock);
 		init_waitqueue_head(&fc->waitq);
+		init_waitqueue_head(&fc->blocked_waitq);
 		INIT_LIST_HEAD(&fc->pending);
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
-		INIT_LIST_HEAD(&fc->unused_list);
 		INIT_LIST_HEAD(&fc->background);
-		sema_init(&fc->outstanding_sem, 1); /* One for INIT */
 		init_rwsem(&fc->sbput_sem);
 		kobj_set_kset_s(fc, connections_subsys);
 		kobject_init(&fc->kobj);
 		atomic_set(&fc->num_waiting, 0);
-		for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) {
-			struct fuse_req *req = fuse_request_alloc();
-			if (!req) {
-				kobject_put(&fc->kobj);
-				return NULL;
-			}
-			list_add(&req->list, &fc->unused_list);
-		}
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
 		fc->reqctr = 0;
+		fc->blocked = 1;
 	}
 	return fc;
 }
 
-static struct fuse_conn *get_conn(struct file *file, struct super_block *sb)
-{
-	struct fuse_conn *fc;
-	int err;
-
-	err = -EINVAL;
-	if (file->f_op != &fuse_dev_operations)
-		goto out_err;
-
-	err = -ENOMEM;
-	fc = new_conn();
-	if (!fc)
-		goto out_err;
-
-	spin_lock(&fuse_lock);
-	err = -EINVAL;
-	if (file->private_data)
-		goto out_unlock;
-
-	kobject_get(&fc->kobj);
-	file->private_data = fc;
-	spin_unlock(&fuse_lock);
-	return fc;
-
- out_unlock:
-	spin_unlock(&fuse_lock);
-	kobject_put(&fc->kobj);
- out_err:
-	return ERR_PTR(err);
-}
-
 static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
 {
 	struct fuse_attr attr;
@@ -467,7 +421,6 @@ static struct super_operations fuse_super_operations = {
 
 static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 {
-	int i;
 	struct fuse_init_out *arg = &req->misc.init_out;
 
 	if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
@@ -486,22 +439,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		fc->minor = arg->minor;
 		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
 	}
-
-	/* After INIT reply is received other requests can go
-	   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
-	   up()s on outstanding_sem.  The last up() is done in
-	   fuse_putback_request() */
-	for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
-		up(&fc->outstanding_sem);
-
 	fuse_put_request(fc, req);
+	fc->blocked = 0;
+	wake_up_all(&fc->blocked_waitq);
 }
 
-static void fuse_send_init(struct fuse_conn *fc)
+static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 {
-	/* This is called from fuse_read_super() so there's guaranteed
-	   to be exactly one request available */
-	struct fuse_req *req = fuse_get_request(fc);
 	struct fuse_init_in *arg = &req->misc.init_in;
 
 	arg->major = FUSE_KERNEL_VERSION;
@@ -525,12 +469,9 @@ static void fuse_send_init(struct fuse_conn *fc)
 
 static unsigned long long conn_id(void)
 {
+	/* BKL is held for ->get_sb() */
 	static unsigned long long ctr = 1;
-	unsigned long long val;
-	spin_lock(&fuse_lock);
-	val = ctr++;
-	spin_unlock(&fuse_lock);
-	return val;
+	return ctr++;
 }
 
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -540,6 +481,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	struct fuse_mount_data d;
 	struct file *file;
 	struct dentry *root_dentry;
+	struct fuse_req *init_req;
 	int err;
 
 	if (!parse_fuse_opt((char *) data, &d))
@@ -555,10 +497,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!file)
 		return -EINVAL;
 
-	fc = get_conn(file, sb);
-	fput(file);
-	if (IS_ERR(fc))
-		return PTR_ERR(fc);
+	if (file->f_op != &fuse_dev_operations)
+		return -EINVAL;
+
+	/* Setting file->private_data can't race with other mount()
+	   instances, since BKL is held for ->get_sb() */
+	if (file->private_data)
+		return -EINVAL;
+
+	fc = new_conn();
+	if (!fc)
+		return -ENOMEM;
 
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
@@ -579,27 +528,40 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		goto err;
 	}
 
+	init_req = fuse_request_alloc();
+	if (!init_req)
+		goto err_put_root;
+
 	err = kobject_set_name(&fc->kobj, "%llu", conn_id());
 	if (err)
-		goto err_put_root;
+		goto err_free_req;
 
 	err = kobject_add(&fc->kobj);
 	if (err)
-		goto err_put_root;
+		goto err_free_req;
 
 	sb->s_root = root_dentry;
-	spin_lock(&fuse_lock);
 	fc->mounted = 1;
 	fc->connected = 1;
-	spin_unlock(&fuse_lock);
+	kobject_get(&fc->kobj);
+	file->private_data = fc;
+	/*
+	 * atomic_dec_and_test() in fput() provides the necessary
+	 * memory barrier for file->private_data to be visible on all
+	 * CPUs after this
+	 */
+	fput(file);
 
-	fuse_send_init(fc);
+	fuse_send_init(fc, init_req);
 
 	return 0;
 
+ err_free_req:
+	fuse_request_free(init_req);
  err_put_root:
 	dput(root_dentry);
  err:
+	fput(file);
 	kobject_put(&fc->kobj);
 	return err;
 }
@@ -753,7 +715,6 @@ static int __init fuse_init(void)
 	printk("fuse init (API version %i.%i)\n",
 	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
 
-	spin_lock_init(&fuse_lock);
 	res = fuse_fs_init();
 	if (res)
 		goto err;
diff --git a/fs/inotify.c b/fs/inotify.c
index 367c487c014b..1f50302849c5 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -538,7 +538,7 @@ void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
 	WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED);
 	spin_lock(&entry->d_lock);
 	parent = entry->d_parent;
-	if (inotify_inode_watched(parent->d_inode))
+	if (parent->d_inode && inotify_inode_watched(parent->d_inode))
 		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
 	spin_unlock(&entry->d_lock);
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb852..2c5f1f80bdc2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -899,11 +899,13 @@ static int do_change_type(struct nameidata *nd, int flag)
 /*
  * do loopback mount.
  */
-static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
+static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags)
 {
 	struct nameidata old_nd;
 	struct vfsmount *mnt = NULL;
+	int recurse = flags & MS_REC;
 	int err = mount_is_safe(nd);
+
 	if (err)
 		return err;
 	if (!old_name || !*old_name)
@@ -937,6 +939,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 		spin_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
+	mnt->mnt_flags = mnt_flags;
 
 out:
 	up_write(&namespace_sem);
@@ -1350,7 +1353,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
 	else if (flags & MS_BIND)
-		retval = do_loopback(&nd, dev_name, flags & MS_REC);
+		retval = do_loopback(&nd, dev_name, flags, mnt_flags);
 	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
 		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index cfe9ce881613..6e92b0fe5323 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -14,46 +14,46 @@
 
 int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 {
-	struct svc_cred	*cred = &rqstp->rq_cred;
+	struct svc_cred	cred = rqstp->rq_cred;
 	int i;
 	int ret;
 
 	if (exp->ex_flags & NFSEXP_ALLSQUASH) {
-		cred->cr_uid = exp->ex_anon_uid;
-		cred->cr_gid = exp->ex_anon_gid;
-		put_group_info(cred->cr_group_info);
-		cred->cr_group_info = groups_alloc(0);
+		cred.cr_uid = exp->ex_anon_uid;
+		cred.cr_gid = exp->ex_anon_gid;
+		cred.cr_group_info = groups_alloc(0);
 	} else if (exp->ex_flags & NFSEXP_ROOTSQUASH) {
 		struct group_info *gi;
-		if (!cred->cr_uid)
-			cred->cr_uid = exp->ex_anon_uid;
-		if (!cred->cr_gid)
-			cred->cr_gid = exp->ex_anon_gid;
-		gi = groups_alloc(cred->cr_group_info->ngroups);
+		if (!cred.cr_uid)
+			cred.cr_uid = exp->ex_anon_uid;
+		if (!cred.cr_gid)
+			cred.cr_gid = exp->ex_anon_gid;
+		gi = groups_alloc(cred.cr_group_info->ngroups);
 		if (gi)
-			for (i = 0; i < cred->cr_group_info->ngroups; i++) {
-				if (!GROUP_AT(cred->cr_group_info, i))
+			for (i = 0; i < cred.cr_group_info->ngroups; i++) {
+				if (!GROUP_AT(cred.cr_group_info, i))
 					GROUP_AT(gi, i) = exp->ex_anon_gid;
 				else
-					GROUP_AT(gi, i) = GROUP_AT(cred->cr_group_info, i);
+					GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i);
 			}
-		put_group_info(cred->cr_group_info);
-		cred->cr_group_info = gi;
-	}
+		cred.cr_group_info = gi;
+	} else
+		get_group_info(cred.cr_group_info);
 
-	if (cred->cr_uid != (uid_t) -1)
-		current->fsuid = cred->cr_uid;
+	if (cred.cr_uid != (uid_t) -1)
+		current->fsuid = cred.cr_uid;
 	else
 		current->fsuid = exp->ex_anon_uid;
-	if (cred->cr_gid != (gid_t) -1)
-		current->fsgid = cred->cr_gid;
+	if (cred.cr_gid != (gid_t) -1)
+		current->fsgid = cred.cr_gid;
 	else
 		current->fsgid = exp->ex_anon_gid;
 
-	if (!cred->cr_group_info)
+	if (!cred.cr_group_info)
 		return -ENOMEM;
-	ret = set_current_groups(cred->cr_group_info);
-	if ((cred->cr_uid)) {
+	ret = set_current_groups(cred.cr_group_info);
+	put_group_info(cred.cr_group_info);
+	if ((cred.cr_uid)) {
 		cap_t(current->cap_effective) &= ~CAP_NFSD_MASK;
 	} else {
 		cap_t(current->cap_effective) |= (CAP_NFSD_MASK &
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c340be0a3f59..4e0578121d9a 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -422,7 +422,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
 		goto out;
 	err = path_lookup(buf, 0, &nd);
-	if (err) goto out;
+	if (err) goto out_no_path;
 
 	exp.h.flags = 0;
 	exp.ex_client = dom;
@@ -475,6 +475,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
  out:
 	if (nd.dentry)
 		path_release(&nd);
+ out_no_path:
 	if (dom)
 		auth_domain_put(dom);
 	kfree(buf);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 6d2dfed1de08..f61142afea44 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -682,7 +682,7 @@ static struct svc_procedure		nfsd_procedures3[22] = {
   PROC(lookup,	 dirop,		dirop,		fhandle2, RC_NOCACHE, ST+FH+pAT+pAT),
   PROC(access,	 access,	access,		fhandle,  RC_NOCACHE, ST+pAT+1),
   PROC(readlink, readlink,	readlink,	fhandle,  RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4),
-  PROC(read,	 read,		read,		fhandle,  RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE),
+  PROC(read,	 read,		read,		fhandle,  RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4),
   PROC(write,	 write,		write,		fhandle,  RC_REPLBUFF, ST+WC+4),
   PROC(create,	 create,	create,		fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
   PROC(mkdir,	 mkdir,		create,		fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 7391f4aabedb..edb107e61b91 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -710,9 +710,9 @@ calculate_posix_ace_count(struct nfs4_acl *n4acl)
 		/* Also, the remaining entries are for named users and
 		 * groups, and come in threes (mask, allow, deny): */
 		if (n4acl->naces < 7)
-			return -1;
+			return -EINVAL;
 		if ((n4acl->naces - 7) % 3)
-			return -1;
+			return -EINVAL;
 		return 4 + (n4acl->naces - 7)/3;
 	}
 }
@@ -790,7 +790,7 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
 			continue;
 
 		error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
-				ace->access_mask, ace->whotype, ace->who) == -1;
+				ace->access_mask, ace->whotype, ace->who);
 		if (error < 0)
 			goto out;
 
@@ -866,7 +866,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
 	struct nfs4_ace *ace;
 
 	if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL)
-		return -1;
+		return -ENOMEM;
 
 	ace->type = type;
 	ace->flag = flag;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c872bd07fc10..dbaf3f93f328 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -441,8 +441,9 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 		goto out_clnt;
 	}
 
-	/* the task holds a reference to the nfs4_client struct */
 	cb->cb_client = clnt;
+
+	/* the task holds a reference to the nfs4_client struct */
 	atomic_inc(&clp->cl_count);
 
 	msg.rpc_cred = nfsd4_lookupcred(clp,0);
@@ -460,13 +461,12 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 out_rpciod:
 	atomic_dec(&clp->cl_count);
 	rpciod_down();
+	cb->cb_client = NULL;
 out_clnt:
 	rpc_shutdown_client(clnt);
-	goto out_err;
 out_err:
 	dprintk("NFSD: warning: no callback path to client %.*s\n",
 		(int)clp->cl_name.len, clp->cl_name.data);
-	cb->cb_client = NULL;
 }
 
 static void
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6d63f1d9e5f5..b0e095ea0c03 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -288,8 +288,6 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
 	fh_put(current_fh);
 	status = exp_pseudoroot(rqstp->rq_client, current_fh,
 			      &rqstp->rq_chandle);
-	if (!status)
-		status = nfserrno(nfsd_setuser(rqstp, current_fh->fh_export));
 	return status;
 }
 
@@ -975,7 +973,7 @@ struct nfsd4_voidargs { int dummy; };
  */
 static struct svc_procedure		nfsd_procedures4[2] = {
   PROC(null,	 void,		void,		void,	  RC_NOCACHE, 1),
-  PROC(compound, compound,	compound,	compound, RC_NOCACHE, NFSD_BUFSIZE)
+  PROC(compound, compound,	compound,	compound, RC_NOCACHE, NFSD_BUFSIZE/4)
 };
 
 struct svc_version	nfsd_version4 = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 47ec112b266c..96c7578cbe1e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -147,6 +147,42 @@ get_nfs4_file(struct nfs4_file *fi)
 	kref_get(&fi->fi_ref);
 }
 
+static int num_delegations;
+
+/*
+ * Open owner state (share locks)
+ */
+
+/* hash tables for nfs4_stateowner */
+#define OWNER_HASH_BITS              8
+#define OWNER_HASH_SIZE             (1 << OWNER_HASH_BITS)
+#define OWNER_HASH_MASK             (OWNER_HASH_SIZE - 1)
+
+#define ownerid_hashval(id) \
+        ((id) & OWNER_HASH_MASK)
+#define ownerstr_hashval(clientid, ownername) \
+        (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
+
+static struct list_head	ownerid_hashtbl[OWNER_HASH_SIZE];
+static struct list_head	ownerstr_hashtbl[OWNER_HASH_SIZE];
+
+/* hash table for nfs4_file */
+#define FILE_HASH_BITS                   8
+#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
+#define FILE_HASH_MASK                  (FILE_HASH_SIZE - 1)
+/* hash table for (open)nfs4_stateid */
+#define STATEID_HASH_BITS              10
+#define STATEID_HASH_SIZE              (1 << STATEID_HASH_BITS)
+#define STATEID_HASH_MASK              (STATEID_HASH_SIZE - 1)
+
+#define file_hashval(x) \
+        hash_ptr(x, FILE_HASH_BITS)
+#define stateid_hashval(owner_id, file_id)  \
+        (((owner_id) + (file_id)) & STATEID_HASH_MASK)
+
+static struct list_head file_hashtbl[FILE_HASH_SIZE];
+static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
+
 static struct nfs4_delegation *
 alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
 {
@@ -155,9 +191,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback;
 
 	dprintk("NFSD alloc_init_deleg\n");
+	if (num_delegations > STATEID_HASH_SIZE * 4)
+		return NULL;
 	dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
 	if (dp == NULL)
 		return dp;
+	num_delegations++;
 	INIT_LIST_HEAD(&dp->dl_perfile);
 	INIT_LIST_HEAD(&dp->dl_perclnt);
 	INIT_LIST_HEAD(&dp->dl_recall_lru);
@@ -192,6 +231,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
 		dprintk("NFSD: freeing dp %p\n",dp);
 		put_nfs4_file(dp->dl_file);
 		kmem_cache_free(deleg_slab, dp);
+		num_delegations--;
 	}
 }
 
@@ -330,22 +370,29 @@ put_nfs4_client(struct nfs4_client *clp)
 }
 
 static void
+shutdown_callback_client(struct nfs4_client *clp)
+{
+	struct rpc_clnt *clnt = clp->cl_callback.cb_client;
+
+	/* shutdown rpc client, ending any outstanding recall rpcs */
+	if (clnt) {
+		clp->cl_callback.cb_client = NULL;
+		rpc_shutdown_client(clnt);
+		rpciod_down();
+	}
+}
+
+static void
 expire_client(struct nfs4_client *clp)
 {
 	struct nfs4_stateowner *sop;
 	struct nfs4_delegation *dp;
-	struct nfs4_callback *cb = &clp->cl_callback;
-	struct rpc_clnt *clnt = clp->cl_callback.cb_client;
 	struct list_head reaplist;
 
 	dprintk("NFSD: expire_client cl_count %d\n",
 	                    atomic_read(&clp->cl_count));
 
-	/* shutdown rpc client, ending any outstanding recall rpcs */
-	if (atomic_read(&cb->cb_set) == 1 && clnt) {
-		rpc_shutdown_client(clnt);
-		clnt = clp->cl_callback.cb_client = NULL;
-	}
+	shutdown_callback_client(clp);
 
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
@@ -936,40 +983,6 @@ out:
 	return status;
 }
 
-/* 
- * Open owner state (share locks)
- */
-
-/* hash tables for nfs4_stateowner */
-#define OWNER_HASH_BITS              8
-#define OWNER_HASH_SIZE             (1 << OWNER_HASH_BITS)
-#define OWNER_HASH_MASK             (OWNER_HASH_SIZE - 1)
-
-#define ownerid_hashval(id) \
-        ((id) & OWNER_HASH_MASK)
-#define ownerstr_hashval(clientid, ownername) \
-        (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
-
-static struct list_head	ownerid_hashtbl[OWNER_HASH_SIZE];
-static struct list_head	ownerstr_hashtbl[OWNER_HASH_SIZE];
-
-/* hash table for nfs4_file */
-#define FILE_HASH_BITS                   8
-#define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
-#define FILE_HASH_MASK                  (FILE_HASH_SIZE - 1)
-/* hash table for (open)nfs4_stateid */
-#define STATEID_HASH_BITS              10
-#define STATEID_HASH_SIZE              (1 << STATEID_HASH_BITS)
-#define STATEID_HASH_MASK              (STATEID_HASH_SIZE - 1)
-
-#define file_hashval(x) \
-        hash_ptr(x, FILE_HASH_BITS)
-#define stateid_hashval(owner_id, file_id)  \
-        (((owner_id) + (file_id)) & STATEID_HASH_MASK)
-
-static struct list_head file_hashtbl[FILE_HASH_SIZE];
-static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
-
 /* OPEN Share state helper functions */
 static inline struct nfs4_file *
 alloc_init_file(struct inode *ino)
@@ -1186,8 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop)
 {
 	dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
 
-	unhash_stateowner(sop);
-	list_add_tail(&sop->so_close_lru, &close_lru);
+	list_move_tail(&sop->so_close_lru, &close_lru);
 	sop->so_time = get_seconds();
 }
 
@@ -1916,8 +1928,7 @@ nfs4_laundromat(void)
 		}
 		dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
 			sop->so_id);
-		list_del(&sop->so_close_lru);
-		nfs4_put_stateowner(sop);
+		release_stateowner(sop);
 	}
 	if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
 		clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -2495,36 +2506,27 @@ nfs4_transform_lock_offset(struct file_lock *lock)
 		lock->fl_end = OFFSET_MAX;
 }
 
-static int
-nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval)
-{
-	struct nfs4_stateowner *local = NULL;
-	int status = 0;
-			        
-	if (hashval >= LOCK_HASH_SIZE)
-		goto out;
-	list_for_each_entry(local, &lock_ownerid_hashtbl[hashval], so_idhash) {
-		if (local == sop) {
-			status = 1;
-			goto out;
-		}
-	}
-out:
-	return status;
-}
-
+/* Hack!: For now, we're defining this just so we can use a pointer to it
+ * as a unique cookie to identify our (NFSv4's) posix locks. */
+static struct lock_manager_operations nfsd_posix_mng_ops  = {
+};
 
 static inline void
 nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 {
-	struct nfs4_stateowner *sop = (struct nfs4_stateowner *) fl->fl_owner;
-	unsigned int hval = lockownerid_hashval(sop->so_id);
+	struct nfs4_stateowner *sop;
+	unsigned int hval;
 
-	deny->ld_sop = NULL;
-	if (nfs4_verify_lock_stateowner(sop, hval)) {
+	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
+		sop = (struct nfs4_stateowner *) fl->fl_owner;
+		hval = lockownerid_hashval(sop->so_id);
 		kref_get(&sop->so_ref);
 		deny->ld_sop = sop;
 		deny->ld_clientid = sop->so_client->cl_clientid;
+	} else {
+		deny->ld_sop = NULL;
+		deny->ld_clientid.cl_boot = 0;
+		deny->ld_clientid.cl_id = 0;
 	}
 	deny->ld_start = fl->fl_start;
 	deny->ld_length = ~(u64)0;
@@ -2736,6 +2738,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_file = filp;
 	file_lock.fl_flags = FL_POSIX;
+	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 
 	file_lock.fl_start = lock->lk_offset;
 	if ((lock->lk_length == ~(u64)0) || 
@@ -2841,6 +2844,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 		file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_flags = FL_POSIX;
+	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 
 	file_lock.fl_start = lockt->lt_offset;
 	if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length))
@@ -2900,6 +2904,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_file = filp;
 	file_lock.fl_flags = FL_POSIX; 
+	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 	file_lock.fl_start = locku->lu_offset;
 
 	if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length))
@@ -3211,15 +3216,8 @@ __nfs4_state_shutdown(void)
 	int i;
 	struct nfs4_client *clp = NULL;
 	struct nfs4_delegation *dp = NULL;
-	struct nfs4_stateowner *sop = NULL;
 	struct list_head *pos, *next, reaplist;
 
-	list_for_each_safe(pos, next, &close_lru) {
-		sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
-		list_del(&sop->so_close_lru);
-		nfs4_put_stateowner(sop);
-	}
-
 	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
 		while (!list_empty(&conf_id_hashtbl[i])) {
 			clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
@@ -3244,8 +3242,6 @@ __nfs4_state_shutdown(void)
 	}
 
 	cancel_delayed_work(&laundromat_work);
-	flush_workqueue(laundry_wq);
-	destroy_workqueue(laundry_wq);
 	nfsd4_shutdown_recdir();
 	nfs4_init = 0;
 }
@@ -3253,6 +3249,8 @@ __nfs4_state_shutdown(void)
 void
 nfs4_state_shutdown(void)
 {
+	cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work);
+	destroy_workqueue(laundry_wq);
 	nfs4_lock_state();
 	nfs4_release_reclaim();
 	__nfs4_state_shutdown();
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 03857fd81126..de3998f15f10 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -299,11 +299,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 						buf, dummy32, &ace.who);
 			if (status)
 				goto out_nfserr;
-			if (nfs4_acl_add_ace(*acl, ace.type, ace.flag,
-				 ace.access_mask, ace.whotype, ace.who) != 0) {
-				status = -ENOMEM;
+			status = nfs4_acl_add_ace(*acl, ace.type, ace.flag,
+				 ace.access_mask, ace.whotype, ace.who);
+			if (status)
 				goto out_nfserr;
-			}
 		}
 	} else
 		*acl = NULL;
@@ -2085,27 +2084,20 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
 	WRITE32(eof);
 	WRITE32(maxcount);
 	ADJUST_ARGS();
-	resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base;
-
+	resp->xbuf->head[0].iov_len = (char*)p
+					- (char*)resp->xbuf->head[0].iov_base;
 	resp->xbuf->page_len = maxcount;
 
-	/* read zero bytes -> don't set up tail */
-	if(!maxcount)
-		return 0;        
-
-	/* set up page for remaining responses */
-	svc_take_page(resp->rqstp);
-	resp->xbuf->tail[0].iov_base = 
-		page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
-	resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1;
+	/* Use rest of head for padding and remaining ops: */
+	resp->rqstp->rq_restailpage = 0;
+	resp->xbuf->tail[0].iov_base = p;
 	resp->xbuf->tail[0].iov_len = 0;
-	resp->p = resp->xbuf->tail[0].iov_base;
-	resp->end = resp->p + PAGE_SIZE/4;
-
 	if (maxcount&3) {
-		*(resp->p)++ = 0;
+		RESERVE_SPACE(4);
+		WRITE32(0);
 		resp->xbuf->tail[0].iov_base += maxcount&3;
 		resp->xbuf->tail[0].iov_len = 4 - (maxcount&3);
+		ADJUST_ARGS();
 	}
 	return 0;
 }
@@ -2142,21 +2134,20 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 
 	WRITE32(maxcount);
 	ADJUST_ARGS();
-	resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base;
+	resp->xbuf->head[0].iov_len = (char*)p
+				- (char*)resp->xbuf->head[0].iov_base;
+	resp->xbuf->page_len = maxcount;
 
-	svc_take_page(resp->rqstp);
-	resp->xbuf->tail[0].iov_base = 
-		page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
-	resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1;
+	/* Use rest of head for padding and remaining ops: */
+	resp->rqstp->rq_restailpage = 0;
+	resp->xbuf->tail[0].iov_base = p;
 	resp->xbuf->tail[0].iov_len = 0;
-	resp->p = resp->xbuf->tail[0].iov_base;
-	resp->end = resp->p + PAGE_SIZE/4;
-
-	resp->xbuf->page_len = maxcount;
 	if (maxcount&3) {
-		*(resp->p)++ = 0;
+		RESERVE_SPACE(4);
+		WRITE32(0);
 		resp->xbuf->tail[0].iov_base += maxcount&3;
 		resp->xbuf->tail[0].iov_len = 4 - (maxcount&3);
+		ADJUST_ARGS();
 	}
 	return 0;
 }
@@ -2166,7 +2157,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
 {
 	int maxcount;
 	loff_t offset;
-	u32 *page, *savep;
+	u32 *page, *savep, *tailbase;
 	ENCODE_HEAD;
 
 	if (nfserr)
@@ -2182,6 +2173,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
 	WRITE32(0);
 	ADJUST_ARGS();
 	resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base;
+	tailbase = p;
 
 	maxcount = PAGE_SIZE;
 	if (maxcount > readdir->rd_maxcount)
@@ -2226,14 +2218,12 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
 	*p++ = htonl(readdir->common.err == nfserr_eof);
 	resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
 
-	/* allocate a page for the tail */
-	svc_take_page(resp->rqstp);
-	resp->xbuf->tail[0].iov_base = 
-		page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
-	resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1;
+	/* Use rest of head for padding and remaining ops: */
+	resp->rqstp->rq_restailpage = 0;
+	resp->xbuf->tail[0].iov_base = tailbase;
 	resp->xbuf->tail[0].iov_len = 0;
 	resp->p = resp->xbuf->tail[0].iov_base;
-	resp->end = resp->p + PAGE_SIZE/4;
+	resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4;
 
 	return 0;
 err_no_verf:
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 3e6b75cd90fd..06cd0db0f32b 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -553,7 +553,7 @@ static struct svc_procedure		nfsd_procedures2[18] = {
   PROC(none,	 void,		void,		none,		RC_NOCACHE, ST),
   PROC(lookup,	 diropargs,	diropres,	fhandle,	RC_NOCACHE, ST+FH+AT),
   PROC(readlink, readlinkargs,	readlinkres,	none,		RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4),
-  PROC(read,	 readargs,	readres,	fhandle,	RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE),
+  PROC(read,	 readargs,	readres,	fhandle,	RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4),
   PROC(none,	 void,		void,		none,		RC_NOCACHE, ST),
   PROC(write,	 writeargs,	attrstat,	fhandle,	RC_REPLBUFF, ST+AT),
   PROC(create,	 createargs,	diropres,	fhandle,	RC_REPLBUFF, ST+FH+AT),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 31018333dc38..6aa92d0e6876 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -371,7 +371,6 @@ out_nfserr:
 static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
 {
 	ssize_t buflen;
-	int error;
 
 	buflen = vfs_getxattr(dentry, key, NULL, 0);
 	if (buflen <= 0)
@@ -381,10 +380,7 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
 	if (!*buf)
 		return -ENOMEM;
 
-	error = vfs_getxattr(dentry, key, *buf, buflen);
-	if (error < 0)
-		return error;
-	return buflen;
+	return vfs_getxattr(dentry, key, *buf, buflen);
 }
 #endif
 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index bff0f0d06867..21f38accd039 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -153,6 +153,7 @@ struct o2hb_region {
 struct o2hb_bio_wait_ctxt {
 	atomic_t          wc_num_reqs;
 	struct completion wc_io_complete;
+	int               wc_error;
 };
 
 static void o2hb_write_timeout(void *arg)
@@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
 {
 	atomic_set(&wc->wc_num_reqs, num_ios);
 	init_completion(&wc->wc_io_complete);
+	wc->wc_error = 0;
 }
 
 /* Used in error paths too */
@@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio,
 {
 	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
 
-	if (error)
+	if (error) {
 		mlog(ML_ERROR, "IO Error %d\n", error);
+		wc->wc_error = error;
+	}
 
 	if (bio->bi_size)
 		return 1;
@@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg,
 
 bail_and_wait:
 	o2hb_wait_on_io(reg, &wc);
+	if (wc.wc_error && !status)
+		status = wc.wc_error;
 
 	if (bios) {
 		for(i = 0; i < num_bios; i++)
@@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes,
 	return highest;
 }
 
-static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
+static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 {
 	int i, ret, highest_node, change = 0;
 	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	struct bio *write_bio;
 	struct o2hb_bio_wait_ctxt write_wc;
 
-	if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
-		return;
+	ret = o2nm_configured_node_map(configured_nodes,
+				       sizeof(configured_nodes));
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
 
 	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
 	if (highest_node >= O2NM_MAX_NODES) {
 		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
-		return;
+		return -EINVAL;
 	}
 
 	/* No sense in reading the slots of nodes that don't exist
@@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 	ret = o2hb_read_slots(reg, highest_node + 1);
 	if (ret < 0) {
 		mlog_errno(ret);
-		return;
+		return ret;
 	}
 
 	/* With an up to date view of the slots, we can check that no
@@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
 	if (ret < 0) {
 		mlog_errno(ret);
-		return;
+		return ret;
 	}
 
 	i = -1;
@@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 	 */
 	o2hb_wait_on_io(reg, &write_wc);
 	bio_put(write_bio);
+	if (write_wc.wc_error) {
+		/* Do not re-arm the write timeout on I/O error - we
+		 * can't be sure that the new block ever made it to
+		 * disk */
+		mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
+		     write_wc.wc_error, reg->hr_dev_name);
+		return write_wc.wc_error;
+	}
+
 	o2hb_arm_write_timeout(reg);
 
 	/* let the person who launched us know when things are steady */
@@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 		if (atomic_dec_and_test(&reg->hr_steady_iterations))
 			wake_up(&o2hb_steady_queue);
 	}
+
+	return 0;
 }
 
 /* Subtract b from a, storing the result in a. a *must* have a larger
@@ -913,7 +934,10 @@ static int o2hb_thread(void *data)
 		 * likely to time itself out. */
 		do_gettimeofday(&before_hb);
 
-		o2hb_do_disk_heartbeat(reg);
+		i = 0;
+		do {
+			ret = o2hb_do_disk_heartbeat(reg);
+		} while (ret && ++i < 2);
 
 		do_gettimeofday(&after_hb);
 		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index c3764f4744ee..74ca4e5f9765 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -139,6 +139,10 @@ static void user_ast(void *opaque)
 		return;
 	}
 
+	mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
+			"Lockres %s, requested ivmode. flags 0x%x\n",
+			lockres->l_name, lockres->l_flags);
+
 	/* we're downconverting. */
 	if (lockres->l_requested < lockres->l_level) {
 		if (lockres->l_requested <=
@@ -229,23 +233,42 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
 
 	mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
 
-	if (status != DLM_NORMAL)
+	if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
 		mlog(ML_ERROR, "Dlm returns status %d\n", status);
 
 	spin_lock(&lockres->l_lock);
-	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
+	/* The teardown flag gets set early during the unlock process,
+	 * so test the cancel flag to make sure that this ast isn't
+	 * for a concurrent cancel. */
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
+	    && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
 		lockres->l_level = LKM_IVMODE;
-	else {
+	} else if (status == DLM_CANCELGRANT) {
+		mlog(0, "Lock %s, cancel fails, flags 0x%x\n",
+		     lockres->l_name, lockres->l_flags);
+		/* We tried to cancel a convert request, but it was
+		 * already granted. Don't clear the busy flag - the
+		 * ast should've done this already. */
+		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
+		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+		goto out_noclear;
+	} else {
+		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
+		/* Cancel succeeded, we want to re-queue */
+		mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n",
+		     lockres->l_name, lockres->l_flags);
 		lockres->l_requested = LKM_IVMODE; /* cancel an
 						    * upconvert
 						    * request. */
 		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
 		/* we want the unblock thread to look at it again
 		 * now. */
-		__user_dlm_queue_lockres(lockres);
+		if (lockres->l_flags & USER_LOCK_BLOCKED)
+			__user_dlm_queue_lockres(lockres);
 	}
 
 	lockres->l_flags &= ~USER_LOCK_BUSY;
+out_noclear:
 	spin_unlock(&lockres->l_lock);
 
 	wake_up(&lockres->l_event);
@@ -268,13 +291,26 @@ static void user_dlm_unblock_lock(void *opaque)
 
 	spin_lock(&lockres->l_lock);
 
-	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
-	BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
+	mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
+			"Lockres %s, flags 0x%x\n",
+			lockres->l_name, lockres->l_flags);
 
-	/* notice that we don't clear USER_LOCK_BLOCKED here. That's
-	 * for user_ast to do. */
+	/* notice that we don't clear USER_LOCK_BLOCKED here. If it's
+	 * set, we want user_ast clear it. */
 	lockres->l_flags &= ~USER_LOCK_QUEUED;
 
+	/* It's valid to get here and no longer be blocked - if we get
+	 * several basts in a row, we might be queued by the first
+	 * one, the unblock thread might run and clear the queued
+	 * flag, and finally we might get another bast which re-queues
+	 * us before our ast for the downconvert is called. */
+	if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
+		mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n",
+			lockres->l_name, lockres->l_flags);
+		spin_unlock(&lockres->l_lock);
+		goto drop_ref;
+	}
+
 	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
 		mlog(0, "lock is in teardown so we do nothing\n");
 		spin_unlock(&lockres->l_lock);
@@ -282,7 +318,9 @@ static void user_dlm_unblock_lock(void *opaque)
 	}
 
 	if (lockres->l_flags & USER_LOCK_BUSY) {
-		mlog(0, "BUSY flag detected...\n");
+		mlog(0, "Cancel lock %s, flags 0x%x\n",
+		     lockres->l_name, lockres->l_flags);
+
 		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
 			spin_unlock(&lockres->l_lock);
 			goto drop_ref;
@@ -296,14 +334,7 @@ static void user_dlm_unblock_lock(void *opaque)
 				   LKM_CANCEL,
 				   user_unlock_ast,
 				   lockres);
-		if (status == DLM_CANCELGRANT) {
-			/* If we got this, then the ast was fired
-			 * before we could cancel. We cleanup our
-			 * state, and restart the function. */
-			spin_lock(&lockres->l_lock);
-			lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
-			spin_unlock(&lockres->l_lock);
-		} else if (status != DLM_NORMAL)
+		if (status != DLM_NORMAL)
 			user_log_dlm_error("dlmunlock", status, lockres);
 		goto drop_ref;
 	}
@@ -581,6 +612,14 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
 	mlog(0, "asked to destroy %s\n", lockres->l_name);
 
 	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		mlog(0, "Lock is already torn down\n");
+		spin_unlock(&lockres->l_lock);
+		return 0;
+	}
+
+	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
+
 	while (lockres->l_flags & USER_LOCK_BUSY) {
 		spin_unlock(&lockres->l_lock);
 
@@ -606,7 +645,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
 
 	lockres->l_flags &= ~USER_LOCK_ATTACHED;
 	lockres->l_flags |= USER_LOCK_BUSY;
-	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
 	spin_unlock(&lockres->l_lock);
 
 	mlog(0, "unlocking lockres %s\n", lockres->l_name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 34e903a6a46b..581eb451a41a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -260,6 +260,17 @@ static int ocfs2_truncate_file(struct inode *inode,
 	if (new_i_size == le64_to_cpu(fe->i_size))
 		goto bail;
 
+	/* This forces other nodes to sync and drop their pages. Do
+	 * this even if we have a truncate without allocation change -
+	 * ocfs2 cluster sizes can be much greater than page size, so
+	 * we have to truncate them anyway.  */
+	status = ocfs2_data_lock(inode, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_data_unlock(inode, 1);
+
 	if (le32_to_cpu(fe->i_clusters) ==
 	    ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
 		mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
@@ -272,14 +283,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 		goto bail;
 	}
 
-	/* This forces other nodes to sync and drop their pages */
-	status = ocfs2_data_lock(inode, 1);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-	ocfs2_data_unlock(inode, 1);
-
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
 	 * truncate if necessary. This does the task of marking
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7efa73d44c9a..20d4b2237fce 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 				size_t buflen, loff_t *fpos)
 {
 	ssize_t acc = 0, tmp;
-	size_t tsz, nr_bytes;
-	u64 start;
+	size_t tsz;
+	u64 start, nr_bytes;
 	struct vmcore *curr_m = NULL;
 
 	if (buflen == 0 || *fpos >= vmcore_size)
diff --git a/fs/select.c b/fs/select.c
index 071660fa7b01..a8109baa5e46 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -310,8 +310,9 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			   fd_set __user *exp, s64 *timeout)
 {
 	fd_set_bits fds;
-	char *bits;
-	int ret, size, max_fdset;
+	void *bits;
+	int ret, max_fdset;
+	unsigned int size;
 	struct fdtable *fdt;
 	/* Allocate small arguments on the stack to save memory and be faster */
 	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
@@ -333,20 +334,21 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	 * since we used fdset we need to allocate memory in units of
 	 * long-words. 
 	 */
-	ret = -ENOMEM;
 	size = FDS_BYTES(n);
-	if (6*size < SELECT_STACK_ALLOC)
-		bits = stack_fds;
-	else
+	bits = stack_fds;
+	if (size > sizeof(stack_fds) / 6) {
+		/* Not enough space in on-stack array; must use kmalloc */
+		ret = -ENOMEM;
 		bits = kmalloc(6 * size, GFP_KERNEL);
-	if (!bits)
-		goto out_nofds;
-	fds.in      = (unsigned long *)  bits;
-	fds.out     = (unsigned long *) (bits +   size);
-	fds.ex      = (unsigned long *) (bits + 2*size);
-	fds.res_in  = (unsigned long *) (bits + 3*size);
-	fds.res_out = (unsigned long *) (bits + 4*size);
-	fds.res_ex  = (unsigned long *) (bits + 5*size);
+		if (!bits)
+			goto out_nofds;
+	}
+	fds.in      = bits;
+	fds.out     = bits +   size;
+	fds.ex      = bits + 2*size;
+	fds.res_in  = bits + 3*size;
+	fds.res_out = bits + 4*size;
+	fds.res_ex  = bits + 5*size;
 
 	if ((ret = get_fd_set(n, inp, fds.in)) ||
 	    (ret = get_fd_set(n, outp, fds.out)) ||
diff --git a/fs/sync.c b/fs/sync.c
index 8616006d2094..aab5ffe77e9f 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -61,7 +61,7 @@
  * will be available after a crash.
  */
 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
-					int flags)
+					unsigned int flags)
 {
 	int ret;
 	struct file *file;
@@ -126,7 +126,7 @@ out:
  * `endbyte' is inclusive
  */
 int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
-			int flags)
+			unsigned int flags)
 {
 	int ret;
 	struct address_space *mapping;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 6cbbd165c60d..4d191ef39b67 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -870,12 +870,14 @@ xfs_page_state_convert(
 	pgoff_t                 end_index, last_index, tlast;
 	ssize_t			size, len;
 	int			flags, err, iomap_valid = 0, uptodate = 1;
-	int			page_dirty, count = 0, trylock_flag = 0;
+	int			page_dirty, count = 0;
+	int			trylock = 0;
 	int			all_bh = unmapped;
 
-	/* wait for other IO threads? */
-	if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking))
-		trylock_flag |= BMAPI_TRYLOCK;
+	if (startio) {
+		if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+			trylock |= BMAPI_TRYLOCK;
+	}
 
 	/* Is this page beyond the end of the file? */
 	offset = i_size_read(inode);
@@ -956,15 +958,13 @@ xfs_page_state_convert(
 
 			if (buffer_unwritten(bh)) {
 				type = IOMAP_UNWRITTEN;
-				flags = BMAPI_WRITE|BMAPI_IGNSTATE;
+				flags = BMAPI_WRITE | BMAPI_IGNSTATE;
 			} else if (buffer_delay(bh)) {
 				type = IOMAP_DELAY;
-				flags = BMAPI_ALLOCATE;
-				if (!startio)
-					flags |= trylock_flag;
+				flags = BMAPI_ALLOCATE | trylock;
 			} else {
 				type = IOMAP_NEW;
-				flags = BMAPI_WRITE|BMAPI_MMAP;
+				flags = BMAPI_WRITE | BMAPI_MMAP;
 			}
 
 			if (!iomap_valid) {
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9fb0312665ca..26fed0756f01 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -182,7 +182,7 @@ free_address(
 {
 	a_list_t	*aentry;
 
-	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
+	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
 	if (likely(aentry)) {
 		spin_lock(&as_lock);
 		aentry->next = as_free_head;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 149237304fb6..2e2e275c786f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -673,8 +673,7 @@ xfs_vn_setattr(
 	if (ia_valid & ATTR_ATIME) {
 		vattr.va_mask |= XFS_AT_ATIME;
 		vattr.va_atime = attr->ia_atime;
-		if (ia_valid & ATTR_ATIME_SET)
-			inode->i_atime = attr->ia_atime;
+		inode->i_atime = attr->ia_atime;
 	}
 	if (ia_valid & ATTR_MTIME) {
 		vattr.va_mask |= XFS_AT_MTIME;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 4eeb856183b1..deddbd03c166 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -158,9 +158,10 @@ xfs_ialloc_ag_alloc(
  	 */
 	agi = XFS_BUF_TO_AGI(agbp);
 	newino = be32_to_cpu(agi->agi_newino);
-	if(likely(newino != NULLAGINO)) {
-		args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
-				XFS_IALLOC_BLOCKS(args.mp);
+	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+			XFS_IALLOC_BLOCKS(args.mp);
+	if (likely(newino != NULLAGINO &&
+		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
 		args.fsbno = XFS_AGB_TO_FSB(args.mp,
 				be32_to_cpu(agi->agi_seqno), args.agbno);
 		args.type = XFS_ALLOCTYPE_THIS_BNO;
@@ -182,8 +183,8 @@ xfs_ialloc_ag_alloc(
 		 * Set the alignment for the allocation.
 		 * If stripe alignment is turned on then align at stripe unit
 		 * boundary.
-		 * If the cluster size is smaller than a filesystem block 
-		 * then we're doing I/O for inodes in filesystem block size 
+		 * If the cluster size is smaller than a filesystem block
+		 * then we're doing I/O for inodes in filesystem block size
 		 * pieces, so don't need alignment anyway.
 		 */
 		isaligned = 0;
@@ -192,7 +193,7 @@ xfs_ialloc_ag_alloc(
 			args.alignment = args.mp->m_dalign;
 			isaligned = 1;
 		} else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
-			   args.mp->m_sb.sb_inoalignmt >= 
+			   args.mp->m_sb.sb_inoalignmt >=
 			   XFS_B_TO_FSBT(args.mp,
 			  	XFS_INODE_CLUSTER_SIZE(args.mp)))
 				args.alignment = args.mp->m_sb.sb_inoalignmt;
@@ -220,7 +221,7 @@ xfs_ialloc_ag_alloc(
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 	}
- 
+
 	/*
 	 * If stripe alignment is turned on, then try again with cluster
 	 * alignment.
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index bb33113eef9f..b53854325266 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -421,7 +421,10 @@ finish_inode:
 			ip->i_chash = chlnew;
 			chlnew->chl_ip = ip;
 			chlnew->chl_blkno = ip->i_blkno;
+			if (ch->ch_list)
+				ch->ch_list->chl_prev = chlnew;
 			chlnew->chl_next = ch->ch_list;
+			chlnew->chl_prev = NULL;
 			ch->ch_list = chlnew;
 			chlnew = NULL;
 		}
@@ -723,23 +726,15 @@ xfs_iextract(
 		ASSERT(ip->i_cnext == ip && ip->i_cprev == ip);
 		ASSERT(ip->i_chash != NULL);
 		chm=NULL;
-		for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
-			if (chl->chl_blkno == ip->i_blkno) {
-				if (chm == NULL) {
-					/* first item on the list */
-					ch->ch_list = chl->chl_next;
-				} else {
-					chm->chl_next = chl->chl_next;
-				}
-				kmem_zone_free(xfs_chashlist_zone, chl);
-				break;
-			} else {
-				ASSERT(chl->chl_ip != ip);
-				chm = chl;
-			}
-		}
-		ASSERT_ALWAYS(chl != NULL);
-       } else {
+		chl = ip->i_chash;
+		if (chl->chl_prev)
+			chl->chl_prev->chl_next = chl->chl_next;
+		else
+			ch->ch_list = chl->chl_next;
+		if (chl->chl_next)
+			chl->chl_next->chl_prev = chl->chl_prev;
+		kmem_zone_free(xfs_chashlist_zone, chl);
+	} else {
 		/* delete one inode from a non-empty list */
 		iq = ip->i_cnext;
 		iq->i_cprev = ip->i_cprev;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 48146bdc6bdd..94b60dd03801 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2732,16 +2732,29 @@ xfs_iunpin(
 	ASSERT(atomic_read(&ip->i_pincount) > 0);
 
 	if (atomic_dec_and_test(&ip->i_pincount)) {
-		vnode_t	*vp = XFS_ITOV_NULL(ip);
+		/*
+		 * If the inode is currently being reclaimed, the
+		 * linux inode _and_ the xfs vnode may have been
+		 * freed so we cannot reference either of them safely.
+		 * Hence we should not try to do anything to them
+		 * if the xfs inode is currently in the reclaim
+		 * path.
+		 *
+		 * However, we still need to issue the unpin wakeup
+		 * call as the inode reclaim may be blocked waiting for
+		 * the inode to become unpinned.
+		 */
+		if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) {
+			vnode_t	*vp = XFS_ITOV_NULL(ip);
 
-		/* make sync come back and flush this inode */
-		if (vp) {
-			struct inode	*inode = vn_to_inode(vp);
+			/* make sync come back and flush this inode */
+			if (vp) {
+				struct inode	*inode = vn_to_inode(vp);
 
-			if (!(inode->i_state & I_NEW))
-				mark_inode_dirty_sync(inode);
+				if (!(inode->i_state & I_NEW))
+					mark_inode_dirty_sync(inode);
+			}
 		}
-
 		wake_up(&ip->i_ipin_wait);
 	}
 }
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 39ef9c36ea55..3b544db1790b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -189,6 +189,7 @@ typedef struct xfs_ihash {
  */
 typedef struct xfs_chashlist {
 	struct xfs_chashlist	*chl_next;
+	struct xfs_chashlist	*chl_prev;
 	struct xfs_inode	*chl_ip;
 	xfs_daddr_t		chl_blkno;	/* starting block number of
 						 * the cluster */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 049fabb7f7e0..c0b1c2906880 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -270,7 +270,7 @@ xfs_mount_validate_sb(
 	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
 	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
 	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
-	    (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) {
+	    (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
 		xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed");
 		return XFS_ERROR(EFSCORRUPTED);
 	}