From ecf6f5e7d68471b08603f7c20143ac236602364f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 8 Nov 2010 18:08:14 -0500 Subject: fanotify: deny permissions when no event was sent If no event was sent to userspace we cannot expect userspace to respond to permissions requests. Today such requests just hang forever. This patch will deny any permissions event which was unable to be sent to userspace. Reported-by: Tvrtko Ursulin Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 063224812b7e..045c0794d435 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -106,7 +106,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) return client_fd; } -static ssize_t fill_event_metadata(struct fsnotify_group *group, +static int fill_event_metadata(struct fsnotify_group *group, struct fanotify_event_metadata *metadata, struct fsnotify_event *event) { @@ -257,10 +257,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - fd = fill_event_metadata(group, &fanotify_event_metadata, event); - if (fd < 0) - return fd; + ret = fill_event_metadata(group, &fanotify_event_metadata, event); + if (ret < 0) + goto out; + fd = ret; ret = prepare_for_access_response(group, event, fd); if (ret) goto out_close_fd; @@ -275,6 +276,13 @@ out_kill_access_response: remove_access_response(group, event, fd); out_close_fd: sys_close(fd); +out: +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) { + event->response = FAN_DENY; + wake_up(&group->fanotify_data.access_waitq); + } +#endif return ret; } -- cgit v1.2.3 From b1085ba80cd2784400a7beec3fda5099198ed01c Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Fri, 5 Nov 2010 17:05:27 +0100 Subject: fanotify: if set by user unset FMODE_NONOTIFY before fsnotify_perm() is called Unsetting FMODE_NONOTIFY in fsnotify_open() is too late, since fsnotify_perm() is called before. If FMODE_NONOTIFY is set fsnotify_perm() will skip permission checks, so a user can still disable permission checks by setting this flag in an open() call. This patch corrects this by unsetting the flag before fsnotify_perm is called. Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/namei.c | 3 +++ include/linux/fsnotify.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 5362af9b7372..4ff7ca530533 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1748,6 +1748,9 @@ struct file *do_filp_open(int dfd, const char *pathname, if (!(open_flag & O_CREAT)) mode = 0; + /* Must never be set by userspace */ + open_flag &= ~FMODE_NONOTIFY; + /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 5c185fa27089..b10bcdeaef76 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -235,9 +235,6 @@ static inline void fsnotify_open(struct file *file) if (S_ISDIR(inode->i_mode)) mask |= FS_ISDIR; - /* FMODE_NONOTIFY must never be set from user */ - file->f_mode &= ~FMODE_NONOTIFY; - fsnotify_parent(path, NULL, mask); fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0); } -- cgit v1.2.3 From fa218ab98c31eeacd12b89501e6b99d146ea56cc Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 9 Nov 2010 18:18:16 +0100 Subject: fanotify: correct broken ref counting in case adding a mark failed If adding a mount or inode mark failed fanotify_free_mark() is called explicitly. But at this time the mark has already been put into the destroy list of the fsnotify_mark kernel thread. If the thread is too slow it will try to decrease the reference of a mark, that has already been freed by fanotify_free_mark(). (If its fast enough it will only decrease the marks ref counter from 2 to 1 - note that the counter has been increased to 2 in add_mark() - which has practically no effect.) This patch fixes the ref counting by not calling free_mark() explicitly, but decreasing the ref counter and rely on the fsnotify_mark thread to cleanup in case adding the mark has failed. Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 045c0794d435..c0ca1fa1550c 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -594,11 +594,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; + int ret = 0; fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) { - int ret; - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) return -ENOSPC; @@ -608,17 +607,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, fsnotify_init_mark(fsn_mark, fanotify_free_mark); ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); - if (ret) { - fanotify_free_mark(fsn_mark); - return ret; - } + if (ret) + goto err; } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - fsnotify_put_mark(fsn_mark); + if (added & ~mnt->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); - - return 0; +err: + fsnotify_put_mark(fsn_mark); + return ret; } static int fanotify_add_inode_mark(struct fsnotify_group *group, @@ -627,6 +625,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; + int ret = 0; pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -642,8 +641,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { - int ret; - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) return -ENOSPC; @@ -653,16 +650,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, fsnotify_init_mark(fsn_mark, fanotify_free_mark); ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); - if (ret) { - fanotify_free_mark(fsn_mark); - return ret; - } + if (ret) + goto err; } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - fsnotify_put_mark(fsn_mark); + if (added & ~inode->i_fsnotify_mask) fsnotify_recalc_inode_mask(inode); - return 0; +err: + fsnotify_put_mark(fsn_mark); + return ret; } /* fanotify syscalls */ -- cgit v1.2.3 From 1734dee4e3a296cb72b4819fc2e7ef2440737dff Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 22 Nov 2010 18:46:33 +0100 Subject: fanotify: Dont allow a mask of 0 if setting or removing a mark In mark_remove_from_mask() we destroy marks that have their event mask cleared. Thus we should not allow the creation of those marks in the first place. With this patch we check if the mask given from user is 0 in case of FAN_MARK_ADD. If so we return an error. Same for FAN_MARK_REMOVE since this does not have any effect. Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c0ca1fa1550c..480434c5ee5f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -769,8 +769,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, if (flags & ~FAN_ALL_MARK_FLAGS) return -EINVAL; switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { - case FAN_MARK_ADD: + case FAN_MARK_ADD: /* fallthrough */ case FAN_MARK_REMOVE: + if (!mask) + return -EINVAL; case FAN_MARK_FLUSH: break; default: -- cgit v1.2.3 From 09e5f14e57c70f9d357862bb56e57026c51092a1 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Fri, 19 Nov 2010 10:58:07 +0100 Subject: fanotify: on group destroy allow all waiters to bypass permission check When fanotify_release() is called, there may still be processes waiting for access permission. Currently only processes for which an event has already been queued into the groups access list will be woken up. Processes for which no event has been queued will continue to sleep and thus cause a deadlock when fsnotify_put_group() is called. Furthermore there is a race allowing further processes to be waiting on the access wait queue after wake_up (if they arrive before clear_marks_by_group() is called). This patch corrects this by setting a flag to inform processes that the group is about to be destroyed and thus not to wait for access permission. [additional changelog from eparis] Lets think about the 4 relevant code paths from the PoV of the 'operator' 'listener' 'responder' and 'closer'. Where operator is the process doing an action (like open/read) which could require permission. Listener is the task (or in this case thread) slated with reading from the fanotify file descriptor. The 'responder' is the thread responsible for responding to access requests. 'Closer' is the thread attempting to close the fanotify file descriptor. The 'operator' is going to end up in: fanotify_handle_event() get_response_from_access() (THIS BLOCKS WAITING ON USERSPACE) The 'listener' interesting code path fanotify_read() copy_event_to_user() prepare_for_access_response() (THIS CREATES AN fanotify_response_event) The 'responder' code path: fanotify_write() process_access_response() (REMOVE A fanotify_response_event, SET RESPONSE, WAKE UP 'operator') The 'closer': fanotify_release() (SUPPOSED TO CLEAN UP THE REST OF THIS MESS) What we have today is that in the closer we remove all of the fanotify_response_events and set a bit so no more response events are ever created in prepare_for_access_response(). The bug is that we never wake all of the operators up and tell them to move along. You fix that in fanotify_get_response_from_access(). You also fix other operators which haven't gotten there yet. So I agree that's a good fix. [/additional changelog from eparis] [remove additional changes to minimize patch size] [move initialization so it was inside CONFIG_FANOTIFY_PERMISSION] Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify.c | 6 +++++- fs/notify/fanotify/fanotify_user.c | 5 +++-- include/linux/fsnotify_backend.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index b04f88eed09e..f35794b97e8e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response); + wait_event(group->fanotify_data.access_waitq, event->response || + atomic_read(&group->fanotify_data.bypass_perm)); + + if (!event->response) /* bypass_perm set */ + return 0; /* userspace responded, convert to something usable */ spin_lock(&event->lock); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 480434c5ee5f..01fffe62a2d4 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -200,7 +200,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, mutex_lock(&group->fanotify_data.access_mutex); - if (group->fanotify_data.bypass_perm) { + if (atomic_read(&group->fanotify_data.bypass_perm)) { mutex_unlock(&group->fanotify_data.access_mutex); kmem_cache_free(fanotify_response_event_cache, re); event->response = FAN_ALLOW; @@ -390,7 +390,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) mutex_lock(&group->fanotify_data.access_mutex); - group->fanotify_data.bypass_perm = true; + atomic_inc(&group->fanotify_data.bypass_perm); list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, @@ -703,6 +703,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) mutex_init(&group->fanotify_data.access_mutex); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); + atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0a68f924f06f..7380763595d3 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -166,7 +166,7 @@ struct fsnotify_group { struct mutex access_mutex; struct list_head access_list; wait_queue_head_t access_waitq; - bool bypass_perm; /* protected by access_mutex */ + atomic_t bypass_perm; #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ int f_flags; unsigned int max_marks; -- cgit v1.2.3 From a2ae4cc9a16e211c8a128ba10d22a85431f093ab Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 23 Nov 2010 18:18:37 -0500 Subject: inotify: stop kernel memory leak on file creation failure If inotify_init is unable to allocate a new file for the new inotify group we leak the new group. This patch drops the reference on the group on file allocation failure. Reported-by: Vegard Nossum cc: stable@kernel.org Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 444c305a468c..4cd5d5d78f9f 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) if (ret >= 0) return ret; + fsnotify_put_group(group); atomic_dec(&user->inotify_devs); out_free_uid: free_uid(user); -- cgit v1.2.3 From 26379198937fcc9bbe7be76be695d06df8334eaa Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 23 Nov 2010 23:48:26 -0500 Subject: fanotify: do not leak user reference on allocation failure If fanotify_init is unable to allocate a new fsnotify group it will return but will not drop its reference on the associated user struct. Drop that reference on error. Reported-by: Vegard Nossum Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 01fffe62a2d4..ca54957b1f61 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -692,8 +692,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ group = fsnotify_alloc_group(&fanotify_fsnotify_ops); - if (IS_ERR(group)) + if (IS_ERR(group)) { + free_uid(user); return PTR_ERR(group); + } group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); -- cgit v1.2.3 From fdbf3ceeb659f0b3c0e8dd79b331b7ac05910f1e Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Wed, 24 Nov 2010 18:26:04 +0100 Subject: fanotify: Dont try to open a file descriptor for the overflow event We should not try to open a file descriptor for the overflow event since this will always fail. Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ca54957b1f61..dccd7985e65a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -110,6 +110,8 @@ static int fill_event_metadata(struct fsnotify_group *group, struct fanotify_event_metadata *metadata, struct fsnotify_event *event) { + int ret = 0; + pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, group, metadata, event); @@ -117,9 +119,15 @@ static int fill_event_metadata(struct fsnotify_group *group, metadata->vers = FANOTIFY_METADATA_VERSION; metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); - metadata->fd = create_fd(group, event); + if (unlikely(event->mask & FAN_Q_OVERFLOW)) + metadata->fd = FAN_NOFD; + else { + metadata->fd = create_fd(group, event); + if (metadata->fd < 0) + ret = metadata->fd; + } - return metadata->fd; + return ret; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS @@ -261,7 +269,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (ret < 0) goto out; - fd = ret; + fd = fanotify_event_metadata.fd; ret = prepare_for_access_response(group, event, fd); if (ret) goto out_close_fd; @@ -275,7 +283,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, out_kill_access_response: remove_access_response(group, event, fd); out_close_fd: - sys_close(fd); + if (fd != FAN_NOFD) + sys_close(fd); out: #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (event->mask & FAN_ALL_PERM_EVENTS) { -- cgit v1.2.3 From 388c4bcb4e63e88fb1f312a2f5f9eb2623afcf5b Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Fri, 19 Nov 2010 15:06:50 -0800 Subject: ocfs2/dlm: Migrate lockres with no locks if it has a reference o2dlm was not migrating resources with zero locks because it assumed that that resource would get purged by dlm_thread. However, some usage patterns involve creating and dropping locks at a high rate leading to the migrate thread seeing zero locks but the purge thread seeing an active reference. When this happens, the dlm_thread cannot purge the resource and the migrate thread sees no reason to migrate that resource. The spell is broken when the migrate thread catches the resource with a lock. The fix is to make the migrate thread also consider the reference map. This usage pattern can be triggered by userspace on userdlm locks and flocks. Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmmaster.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index f564b0e5f80d..59f0f6bdfc62 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) */ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, - int *numlocks) + int *numlocks, + int *hasrefs) { int ret; int i; @@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); + *numlocks = 0; + *hasrefs = 0; + ret = -EINVAL; if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { mlog(0, "cannot migrate lockres with unknown owner!\n"); @@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, } *numlocks = count; - mlog(0, "migrateable lockres having %d locks\n", *numlocks); + + count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (count < O2NM_MAX_NODES) + *hasrefs = 1; + + mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, + res->lockname.len, res->lockname.name, *numlocks, *hasrefs); leave: return ret; @@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, const char *name; unsigned int namelen; int mle_added = 0; - int numlocks; + int numlocks, hasrefs; int wake = 0; if (!dlm_grab(dlm)) @@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, name = res->lockname.name; namelen = res->lockname.len; - mlog(0, "migrating %.*s to %u\n", namelen, name, target); + mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); /* * ensure this lockres is a proper candidate for migration */ spin_lock(&res->spinlock); - ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); if (ret < 0) { spin_unlock(&res->spinlock); goto leave; @@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_unlock(&res->spinlock); /* no work to do */ - if (numlocks == 0) { - mlog(0, "no locks were found on this lockres! done!\n"); + if (numlocks == 0 && !hasrefs) goto leave; - } /* * preallocate up front @@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, * find a node to migrate the lockres to */ - mlog(0, "picking a migration node\n"); spin_lock(&dlm->spinlock); /* pick a new node */ if (!test_bit(target, dlm->domain_map) || target >= O2NM_MAX_NODES) { target = dlm_pick_migration_target(dlm, res); } - mlog(0, "node %u chosen for migration\n", target); + mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name, + namelen, name, target); if (target >= O2NM_MAX_NODES || !test_bit(target, dlm->domain_map)) { @@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int ret; int lock_dropped = 0; - int numlocks; + int numlocks, hasrefs; spin_lock(&res->spinlock); if (res->owner != dlm->node_num) { @@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) } /* No need to migrate a lockres having no locks */ - ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); - if (ret >= 0 && numlocks == 0) { + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); + if (ret >= 0 && numlocks == 0 && !hasrefs) { spin_unlock(&res->spinlock); goto leave; } @@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, } queue++; } + + nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (nodenum < O2NM_MAX_NODES) { + spin_unlock(&res->spinlock); + return nodenum; + } spin_unlock(&res->spinlock); mlog(0, "have not found a suitable target yet! checking domain map\n"); -- cgit v1.2.3 From 39c99f12f15c8bf8257985d9b2a2548a03d18c00 Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Tue, 7 Dec 2010 14:35:07 +0800 Subject: Ocfs2: Teach 'coherency=full' O_DIRECT writes to correctly up_read i_alloc_sem. Due to newly-introduced 'coherency=full' O_DIRECT writes also takes the EX rw_lock like buffered writes did(rw_level == 1), it turns out messing the usage of 'level' in ocfs2_dio_end_io() up, which caused i_alloc_sem being failed to get up_read'd correctly. This patch tries to teach ocfs2_dio_end_io to understand well on all locking stuffs by explicitly introducing a new bit for i_alloc_sem in iocb's private data, just like what we did for rw_lock. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/aops.c | 7 +++++-- fs/ocfs2/aops.h | 23 +++++++++++++++++++++-- fs/ocfs2/file.c | 15 +++++++++++++-- 3 files changed, 39 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f1e962cb3b73..0d7c5540ad66 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -573,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + if (ocfs2_iocb_is_sem_locked(iocb)) { + up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } + ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); - if (!level) - up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); if (is_async) diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 76bfdfda691a..eceb456037c1 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -68,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) else clear_bit(1, (unsigned long *)&iocb->private); } + +/* + * Using a named enum representing lock types in terms of #N bit stored in + * iocb->private, which is going to be used for communication bewteen + * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + */ +enum ocfs2_iocb_lock_bits { + OCFS2_IOCB_RW_LOCK = 0, + OCFS2_IOCB_RW_LOCK_LEVEL, + OCFS2_IOCB_SEM, + OCFS2_IOCB_NUM_LOCKS +}; + #define ocfs2_iocb_clear_rw_locked(iocb) \ - clear_bit(0, (unsigned long *)&iocb->private) + clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ - test_bit(1, (unsigned long *)&iocb->private) + test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) +#define ocfs2_iocb_set_sem_locked(iocb) \ + set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_sem_locked(iocb) \ + clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_is_sem_locked(iocb) \ + test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 77b4c04a2809..f6cba566429d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2241,11 +2241,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, mutex_lock(&inode->i_mutex); + ocfs2_iocb_clear_sem_locked(iocb); + relock: /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ if (direct_io) { down_read(&inode->i_alloc_sem); have_alloc_sem = 1; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_sem_locked(iocb); } /* @@ -2382,8 +2386,10 @@ out: ocfs2_rw_unlock(inode, rw_level); out_sems: - if (have_alloc_sem) + if (have_alloc_sem) { up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } mutex_unlock(&inode->i_mutex); @@ -2527,6 +2533,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, goto bail; } + ocfs2_iocb_clear_sem_locked(iocb); + /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. @@ -2534,6 +2542,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, if (filp->f_flags & O_DIRECT) { down_read(&inode->i_alloc_sem); have_alloc_sem = 1; + ocfs2_iocb_set_sem_locked(iocb); ret = ocfs2_rw_lock(inode, 0); if (ret < 0) { @@ -2575,8 +2584,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } bail: - if (have_alloc_sem) + if (have_alloc_sem) { up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); mlog_exit(ret); -- cgit v1.2.3 From 7d13162332f2b67a941d18cee20f1c0413e020de Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 7 Dec 2010 15:27:57 -0500 Subject: fanotify: fill in the metadata_len field on struct fanotify_event_metadata The fanotify_event_metadata now has a field which is supposed to indicate the length of the metadata portion of the event. Fill in that field as well. Based-in-part-on-patch-by: Alexey Zaytsev Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index dccd7985e65a..8b61220cffc5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -116,6 +116,7 @@ static int fill_event_metadata(struct fsnotify_group *group, group, metadata, event); metadata->event_len = FAN_EVENT_METADATA_LEN; + metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); @@ -275,10 +276,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, goto out_close_fd; ret = -EFAULT; - if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN)) + if (copy_to_user(buf, &fanotify_event_metadata, + fanotify_event_metadata.event_len)) goto out_kill_access_response; - return FAN_EVENT_METADATA_LEN; + return fanotify_event_metadata.event_len; out_kill_access_response: remove_access_response(group, event, fd); -- cgit v1.2.3 From ab226e21ad34f6ef52e00d2ab399d2364b4cdfee Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Wed, 15 Dec 2010 20:41:54 -0800 Subject: ceph: fix direct-io on non-page-aligned buffers The user buffer may be 512-byte aligned, not page-aligned. We were assuming the buffer was page-aligned and only accounting for non-page-aligned io offsets. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/file.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8d79b8912e31..e860d8f1bb45 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -282,7 +282,8 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool align_to_pages) + int *checkeof, bool align_to_pages, + unsigned long buf_align) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); @@ -307,7 +308,7 @@ static int striped_read(struct inode *inode, more: if (align_to_pages) - page_align = (pos - io_align) & ~PAGE_MASK; + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; else page_align = pos & ~PAGE_MASK; this_len = left; @@ -376,16 +377,18 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, struct inode *inode = file->f_dentry->d_inode; struct page **pages; u64 off = *poff; - int num_pages = calc_pages_for(off, len); - int ret; + int num_pages, ret; dout("sync_read on file %p %llu~%u %s\n", file, off, len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - if (file->f_flags & O_DIRECT) + if (file->f_flags & O_DIRECT) { + num_pages = calc_pages_for((unsigned long)data, len); pages = ceph_get_direct_page_vector(data, num_pages); - else + } else { + num_pages = calc_pages_for(off, len); pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + } if (IS_ERR(pages)) return PTR_ERR(pages); @@ -400,7 +403,8 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, goto done; ret = striped_read(inode, off, len, pages, num_pages, checkeof, - file->f_flags & O_DIRECT); + file->f_flags & O_DIRECT, + (unsigned long)data & ~PAGE_MASK); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) ret = ceph_copy_page_vector_to_user(pages, data, off, ret); @@ -456,6 +460,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int do_sync = 0; int check_caps = 0; int page_align, io_align; + unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; @@ -471,6 +476,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, pos = *offset; io_align = pos & ~PAGE_MASK; + buf_align = (unsigned long)data & ~PAGE_MASK; ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) @@ -496,12 +502,15 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; - if (file->f_flags & O_DIRECT) + if (file->f_flags & O_DIRECT) { /* write from beginning of first page, regardless of io alignment */ - page_align = (pos - io_align) & ~PAGE_MASK; - else + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + num_pages = calc_pages_for((unsigned long)data, len); + } else { page_align = pos & ~PAGE_MASK; + num_pages = calc_pages_for(pos, len); + } req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, @@ -512,8 +521,6 @@ more: if (!req) return -ENOMEM; - num_pages = calc_pages_for(pos, len); - if (file->f_flags & O_DIRECT) { pages = ceph_get_direct_page_vector(data, num_pages); if (IS_ERR(pages)) { -- cgit v1.2.3 From 947b10ae0aeda89fc066a7470fdba55f72b0b8fc Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 16 Dec 2010 09:57:57 +0900 Subject: nilfs2: fix regression of garbage collection ioctl On 2.6.37-rc1, garbage collection ioctl of nilfs was broken due to the commit 263d90cefc7d82a0 ("nilfs2: remove own inode hash used for GC"), and leading to filesystem corruption. The patch doesn't queue gc-inodes for log writer if they are reused through the vfs inode cache. Here, gc-inode is the inode which buffers blocks to be relocated on GC. That patch queues gc-inodes in nilfs_init_gcinode() function, but this function is not called when they don't have I_NEW flag. Thus, some of live blocks are wrongly overrode without being moved to new logs. This resolves the problem by moving the gc-inode queueing to an outer function to ensure it's done right. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/gcinode.c | 9 --------- fs/nilfs2/ioctl.c | 12 ++++++++++++ 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 33ad25ddd5c4..caf9a6a3fb54 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) int nilfs_init_gcinode(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); - struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); @@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode) ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); - /* - * Add the inode to GC inode list. Garbage Collection - * is serialized and no two processes manipulate the - * list simultaneously. - */ - igrab(inode); - list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes); - return 0; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index e00d9457c256..b185e937a335 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -337,6 +337,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, struct nilfs_argv *argv, void *buf) { size_t nmembs = argv->v_nmembs; + struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; struct inode *inode; struct nilfs_vdesc *vdesc; struct buffer_head *bh, *n; @@ -353,6 +354,17 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, ret = PTR_ERR(inode); goto failed; } + if (list_empty(&NILFS_I(inode)->i_dirty)) { + /* + * Add the inode to GC inode list. Garbage Collection + * is serialized and no two processes manipulate the + * list simultaneously. + */ + igrab(inode); + list_add(&NILFS_I(inode)->i_dirty, + &nilfs->ns_gc_inodes); + } + do { ret = nilfs_ioctl_move_inode_block(inode, vdesc, &buffers); -- cgit v1.2.3 From 41b41a26d4d6e4e3ad877d02377844ab9552dc16 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 9 Dec 2010 18:20:38 -0800 Subject: ocfs2: Adjust masklog flag values Two masklogs had the same flag value. Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/cluster/masklog.c | 3 ++- fs/ocfs2/cluster/masklog.h | 15 ++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index c7fba396392d..6c61771469af 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(QUOTA), define_mask(REFCOUNT), define_mask(BASTS), + define_mask(RESERVATIONS), + define_mask(CLUSTER), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), - define_mask(RESERVATIONS), }; static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index ea2ed9f56c94..34d6544357d9 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -81,7 +81,7 @@ #include /* bits that are frequently given and infrequently matched in the low word */ -/* NOTE: If you add a flag, you need to also update mlog.c! */ +/* NOTE: If you add a flag, you need to also update masklog.c! */ #define ML_ENTRY 0x0000000000000001ULL /* func call entry */ #define ML_EXIT 0x0000000000000002ULL /* func call exit */ #define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ @@ -114,13 +114,14 @@ #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ #define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ #define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ -#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */ +#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */ +#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */ +#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */ + /* bits that are infrequently given and frequently matched in the high word */ -#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ -#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ -#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ -#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ -#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */ +#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ +#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */ +#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) -- cgit v1.2.3 From 8ac33dc86d37ca76d282aa112d4f2794a731064e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 15 Dec 2010 16:30:00 +0800 Subject: ocfs2: Hold ip_lock when set/clear flags for indexed dir. When we set/clear the dyn_features for an inode we hold the ip_lock. So do it when we set/clear OCFS2_INDEXED_DIR_FL also. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c49f6de0e7ab..d417b3f9b0c7 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, di->i_dx_root = cpu_to_le64(dr_blkno); + spin_lock(&OCFS2_I(dir)->ip_lock); OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_journal_dirty(handle, di_bh); @@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, goto out_commit; } + spin_lock(&OCFS2_I(dir)->ip_lock); OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); di->i_dx_root = cpu_to_le64(0ULL); ocfs2_journal_dirty(handle, di_bh); -- cgit v1.2.3 From 92cf765237e2787eb168096305c448caf25ac7f8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Dec 2010 09:53:41 -0800 Subject: ceph: fix null pointer dereference in ceph_init_dentry for nfs reexport The fh_to_dentry etc. methods use ceph_init_dentry(), which assumes that d_parent is defined. It isn't for those callers, so check! Signed-off-by: Sage Weil --- fs/ceph/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 158c700fdca5..d902948a90d8 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -40,7 +40,8 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ + ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) dentry->d_op = &ceph_dentry_ops; else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) dentry->d_op = &ceph_snapdir_dentry_ops; -- cgit v1.2.3 From b6aa5901c7a2bd90d0b6b9866300d2648b2568f3 Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Wed, 15 Dec 2010 20:45:41 -0800 Subject: ceph: mark user pages dirty on direct-io reads For read operation, we have to set the argument _write_ of get_user_pages to 1 since we will write data to pages. Also, we need to SetPageDirty before releasing these pages. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/file.c | 8 ++++---- include/linux/ceph/libceph.h | 6 ++++-- net/ceph/pagevec.c | 11 +++++++---- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e860d8f1bb45..7d0e4a82d898 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -384,7 +384,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, if (file->f_flags & O_DIRECT) { num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, num_pages); + pages = ceph_get_direct_page_vector(data, num_pages, true); } else { num_pages = calc_pages_for(off, len); pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); @@ -413,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, done: if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages, true); else ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); @@ -522,7 +522,7 @@ more: return -ENOMEM; if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages); + pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -572,7 +572,7 @@ more: } if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages, false); else if (file->f_flags & O_SYNC) ceph_release_page_vector(pages, num_pages); diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 9e76d35670d2..72c72bfccb88 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -227,8 +227,10 @@ extern int ceph_open_session(struct ceph_client *client); extern void ceph_release_page_vector(struct page **pages, int num_pages); extern struct page **ceph_get_direct_page_vector(const char __user *data, - int num_pages); -extern void ceph_put_page_vector(struct page **pages, int num_pages); + int num_pages, + bool write_page); +extern void ceph_put_page_vector(struct page **pages, int num_pages, + bool dirty); extern void ceph_release_page_vector(struct page **pages, int num_pages); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_copy_user_to_page_vector(struct page **pages, diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index ac34feeb2b3a..01947a5d03b7 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -13,7 +13,7 @@ * build a vector of user pages */ struct page **ceph_get_direct_page_vector(const char __user *data, - int num_pages) + int num_pages, bool write_page) { struct page **pages; int rc; @@ -24,7 +24,7 @@ struct page **ceph_get_direct_page_vector(const char __user *data, down_read(¤t->mm->mmap_sem); rc = get_user_pages(current, current->mm, (unsigned long)data, - num_pages, 0, 0, pages, NULL); + num_pages, write_page, 0, pages, NULL); up_read(¤t->mm->mmap_sem); if (rc < 0) goto fail; @@ -36,12 +36,15 @@ fail: } EXPORT_SYMBOL(ceph_get_direct_page_vector); -void ceph_put_page_vector(struct page **pages, int num_pages) +void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty) { int i; - for (i = 0; i < num_pages; i++) + for (i = 0; i < num_pages; i++) { + if (dirty) + set_page_dirty_lock(pages[i]); put_page(pages[i]); + } kfree(pages); } EXPORT_SYMBOL(ceph_put_page_vector); -- cgit v1.2.3 From 3cb50ddf97a0a1ca4c68bc12fa1e727a6b45fbf2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Dec 2010 15:53:18 +0000 Subject: Fix btrfs b0rkage Buggered-in: 76dda93c6ae2 ("Btrfs: add snapshot/subvolume destroy ioctl") Signed-off-by: Al Viro Acked-by: Chris Mason Signed-off-by: Linus Torvalds --- fs/btrfs/export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 6f0444473594..659f532d26a0 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -166,7 +166,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; - static struct dentry *dentry; + struct dentry *dentry; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; -- cgit v1.2.3 From 7d8f98769e7f4bc29c38789daeb416c6a7d7c241 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 22 Dec 2010 17:50:30 +0800 Subject: ocfs2: Fix system inodes cache overflow. When we store system inodes cache in ocfs2_super, we use a array for global system inodes. But unfortunately, the range is calculated wrongly which makes it overflow and pollute ocfs2_super->local_system_inodes. This patch fix it by setting the range properly. The corresponding bug is ossbug1303. http://oss.oracle.com/bugzilla/show_bug.cgi?id=1303 Cc: stable@kernel.org Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c2e4f8222e2f..bf2e7764920e 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -350,7 +350,7 @@ enum { #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE NUM_SYSTEM_INODES }; -#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE +#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE #define NUM_LOCAL_SYSTEM_INODES \ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) -- cgit v1.2.3 From eabb26cacdec33ca6f6fcaee762b57c2205169ca Mon Sep 17 00:00:00 2001 From: Prasad Joshi Date: Tue, 21 Dec 2010 17:24:19 -0800 Subject: logfs: fix deadlock in logfs_get_wblocks, hold and wait on super->s_write_mutex do_logfs_journal_wl_pass() should use GFP_NOFS for memory allocation GC code calls btree_insert32 with GFP_KERNEL while holding a mutex super->s_write_mutex. The same mutex is used in address_space_operations->writepage(), and a call to writepage() could be triggered as a result of memory allocation in btree_insert32, causing a deadlock. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=20342 Signed-off-by: Prasad Joshi Cc: Joern Engel Cc: Florian Mickler Cc: "Rafael J. Wysocki" Cc: Maciej Rutecki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/logfs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index f46ee8b0e135..9da29706f91c 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb) super->s_journal_seg[i] = segno; super->s_journal_ec[i] = ec; logfs_set_segment_reserved(sb, segno); - err = btree_insert32(head, segno, (void *)1, GFP_KERNEL); + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); BUG_ON(err); /* mempool should prevent this */ err = logfs_erase_segment(sb, segno, 1); BUG_ON(err); /* FIXME: remount-ro would be nicer */ -- cgit v1.2.3 From f06328d7721ad3852c45eb2a10a0c8f9439b5f33 Mon Sep 17 00:00:00 2001 From: Prasad Joshi Date: Tue, 21 Dec 2010 17:24:20 -0800 Subject: logfs: fix "Kernel BUG at readwrite.c:1193" This happens when __logfs_create() tries to write a new inode to the disk which is full. __logfs_create() associates the transaction pointer with inode. During the logfs_write_inode() function call chain this transaction pointer is moved from inode to page->private using function move_inode_to_page (do_write_inode() -> inode_to_page() -> move_inode_to_page) When the write inode fails, the transaction is aborted and iput is called on the failed inode. During delete_inode the same transaction pointer associated with the page is getting used. Thus causing kernel BUG. The patch checks for error in write_inode() and restores the page->private to NULL. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=20162 Signed-off-by: Prasad Joshi Cc: Joern Engel Cc: Florian Mickler Cc: "Rafael J. Wysocki" Cc: Maciej Rutecki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/logfs/readwrite.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 6127baf0e188..ee99a9f5dfd3 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode) /* FIXME: transaction is part of logfs_block now. Is that enough? */ err = logfs_write_buf(master_inode, page, 0); + if (err) + move_page_to_inode(inode, page); + logfs_put_write_page(page); return err; } -- cgit v1.2.3 From 8a7411a24350bac141271755c66f40c56b0535ae Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 20 Dec 2010 22:30:36 -0500 Subject: ext4: fix on-line resizing regression https://bugzilla.kernel.org/show_bug.cgi?id=25352 This regression was caused by commit a31437b85: "ext4: use sb_issue_zeroout in setup_new_group_blocks", by accidentally dropping the code which reserved the block group descriptor and inode table blocks. Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index dc963929de65..981c8477adab 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -232,6 +232,8 @@ static int setup_new_group_blocks(struct super_block *sb, GFP_NOFS); if (err) goto exit_bh; + for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) + ext4_set_bit(bit, bh->b_data); ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, input->block_bitmap - start); @@ -247,6 +249,9 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto exit_bh; + for (i = 0, bit = input->inode_table - start; + i < sbi->s_itb_per_group; i++, bit++) + ext4_set_bit(bit, bh->b_data); if ((err = extend_or_restart_transaction(handle, 2, bh))) goto exit_bh; -- cgit v1.2.3