From 8fba54aebbdf1f999738121922e74bf796ad60ee Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 24 Aug 2016 18:17:04 +0200 Subject: fuse: direct-io: don't dirty ITER_BVEC pages When reading from a loop device backed by a fuse file it deadlocks on lock_page(). This is because the page is already locked by the read() operation done on the loop device. In this case we don't want to either lock the page or dirty it. So do what fs/direct-io.c does: only dirty the page for ITER_IOVEC vectors. Reported-by: Sheng Yang Fixes: aa4d86163e4e ("block: loop: switch to VFS ITER_BVEC") Signed-off-by: Miklos Szeredi Cc: # v4.1+ Reviewed-by: Sheng Yang Reviewed-by: Ashish Samant Tested-by: Sheng Yang Tested-by: Ashish Samant --- fs/fuse/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f394aff59c36..3988b43c2f5a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -530,13 +530,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, int write) +static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) { unsigned i; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; - if (write) + if (should_dirty) set_page_dirty_lock(page); put_page(page); } @@ -1320,6 +1320,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; + bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1363,7 +1364,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, !write); + fuse_release_user_pages(req, should_dirty); if (req->out.h.error) { err = req->out.h.error; break; -- cgit v1.2.3 From c49edecd513693ea7530ab18efbd7d6d5b7cbf90 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Sep 2016 12:05:31 -0400 Subject: NFS: Fix error reporting in nfs_file_write() When doing O_DSYNC writes, the actual write errors are reported through generic_write_sync(), so we must test the result. Reported-by: J. R. Okajima Fixes: 18290650b1c8 ("NFS: Move buffered I/O locking into nfs_file_write()") Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7d620970f2e1..ca699ddc11c1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -657,7 +657,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result <= 0) goto out; - written = generic_write_sync(iocb, result); + result = generic_write_sync(iocb, result); + if (result < 0) + goto out; + written = result; iocb->ki_pos += written; /* Return error values */ -- cgit v1.2.3 From bf0291dd2267a2b9a4cd74d65249553d11bb45d6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Sep 2016 10:39:51 -0400 Subject: pNFS: Ensure LAYOUTGET and LAYOUTRETURN are properly serialised According to RFC5661, the client is responsible for serialising LAYOUTGET and LAYOUTRETURN to avoid ambiguity. Consider the case where we send both in parallel. Client Server ====== ====== LAYOUTGET(seqid=X) LAYOUTRETURN(seqid=X) LAYOUTGET return seqid=X+1 LAYOUTRETURN return seqid=X+2 Process LAYOUTRETURN Forget layout stateid Process LAYOUTGET Set seqid=X+1 The client processes the layoutget/layoutreturn in the wrong order, and since the result of the layoutreturn was to clear the only existing layout segment, the client forgets the layout stateid. When the LAYOUTGET comes in, it is treated as having a completely new stateid, and so the client sets the wrong sequence id... Fix is to check if there are outstanding LAYOUTGET requests before we send the LAYOUTRETURN (note that LAYOUGET will already wait if it sees an outstanding LAYOUTRETURN). Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.5+ Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6daf034645c8..519ad320f5cd 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -899,6 +899,9 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, enum pnfs_iomode *iomode) { + /* Serialise LAYOUTGET/LAYOUTRETURN */ + if (atomic_read(&lo->plh_outstanding) != 0) + return false; if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; pnfs_get_layout_hdr(lo); -- cgit v1.2.3 From 2a59a0411671ef9daf17ba21da57809c696f4119 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Sep 2016 11:20:04 -0400 Subject: pNFS: Fix pnfs_set_layout_stateid() to clear NFS_LAYOUT_INVALID_STID If the layout was marked as invalid, we want to ensure to initialise the layout header fields correctly. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 519ad320f5cd..cd8b5fca33f6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -768,17 +768,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { u32 oldseq, newseq, new_barrier = 0; - bool invalid = !pnfs_layout_is_valid(lo); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { + + if (!pnfs_layout_is_valid(lo)) { + nfs4_stateid_copy(&lo->plh_stateid, new); + lo->plh_barrier = newseq; + pnfs_clear_layoutreturn_info(lo); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + return; + } + if (pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); /* * Because of wraparound, we want to keep the barrier @@ -790,7 +805,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, new_barrier = be32_to_cpu(new->seqid); else if (new_barrier == 0) return; - if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) lo->plh_barrier = new_barrier; } @@ -886,14 +901,6 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } -static void -pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) -{ - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); -} - static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, @@ -1801,16 +1808,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) */ pnfs_mark_layout_stateid_invalid(lo, &free_me); - nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); - lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + pnfs_set_layout_stateid(lo, &res->stateid, true); } pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg, &free_me); - if (!pnfs_layout_is_valid(lo)) { - pnfs_clear_layoutreturn_info(lo); - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - } if (res->return_on_close) -- cgit v1.2.3 From 52ec7be2e27392201adf77892ba883f68df88c99 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Sep 2016 11:05:28 -0400 Subject: pNFS: Clear out all layout segments if the server unsets lrp->res.lrs_present If the server fails to set lrp->res.lrs_present in the LAYOUTRETURN reply, then that means it believes the client holds no more layout state for that file, and that the layout stateid is now invalid. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f5aecaabcb7c..c380d2ee4137 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8190,10 +8190,13 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, - be32_to_cpu(lrp->args.stateid.seqid)); - if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) + if (lrp->res.lrs_present) { + pnfs_mark_matching_lsegs_invalid(lo, &freeme, + &lrp->args.range, + be32_to_cpu(lrp->args.stateid.seqid)); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + } else + pnfs_mark_layout_stateid_invalid(lo, &freeme); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); nfs4_sequence_free_slot(&lrp->res.seq_res); -- cgit v1.2.3 From 334a8f37115bf35e38617315a360a91ac4f2b2c6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 4 Sep 2016 12:46:35 -0400 Subject: pNFS: Don't forget the layout stateid if there are outstanding LAYOUTGETs If there are outstanding LAYOUTGET rpc calls, then we want to ensure that we keep the layout stateid around so we that don't inadvertently pick up an old/misordered sequence id. The race is as follows: Client Server ====== ====== LAYOUTGET(seqid) LAYOUTGET(seqid) return LAYOUTGET(seqid+1) return LAYOUTGET(seqid+2) process LAYOUTGET(seqid+2) forget layout process LAYOUTGET(seqid+1) If it forgets the layout stateid before processing seqid+1, then the client will not check the layout->plh_barrier, and so will set the stateid with seqid+1. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index cd8b5fca33f6..2c93a85eda51 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -365,7 +365,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); if (list_empty(&lo->plh_segs)) { - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + if (atomic_read(&lo->plh_outstanding) == 0) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); -- cgit v1.2.3 From e1ff3dd1ae52cef5b5373c8cc4ad949c2c25a71c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 5 Sep 2016 13:55:20 +0200 Subject: ovl: fix workdir creation Workdir creation fails in latest kernel. Fix by allowing EOPNOTSUPP as a valid return value from vfs_removexattr(XATTR_NAME_POSIX_ACL_*). Upper filesystem may not support ACL and still be perfectly able to support overlayfs. Reported-by: Martin Ziegler Signed-off-by: Miklos Szeredi Fixes: c11b9fdd6a61 ("ovl: remove posix_acl_default from workdir") Cc: --- fs/overlayfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a4585f961bf9..e2a94a26767b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -835,11 +835,11 @@ retry: goto out_dput; err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); - if (err && err != -ENODATA) + if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); - if (err && err != -ENODATA) + if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; /* Clear any inherited mode bits */ -- cgit v1.2.3 From 0f5aa88a7bb28b73253fb42b3df8202142769f39 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sun, 28 Aug 2016 18:47:12 +0200 Subject: ceph: do not modify fi->frag in need_reset_readdir() Commit f3c4ebe65ea1 ("ceph: using hash value to compose dentry offset") modified "if (fpos_frag(new_pos) != fi->frag)" to "if (fi->frag |= fpos_frag(new_pos))" in need_reset_readdir(), thus replacing a comparison operator with an assignment one. This looks like a typo which is reported by clang when building the kernel with some warning flags: fs/ceph/dir.c:600:22: error: using the result of an assignment as a condition without parentheses [-Werror,-Wparentheses] } else if (fi->frag |= fpos_frag(new_pos)) { ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~ fs/ceph/dir.c:600:22: note: place parentheses around the assignment to silence this warning } else if (fi->frag |= fpos_frag(new_pos)) { ^ ( ) fs/ceph/dir.c:600:22: note: use '!=' to turn this compound assignment into an inequality comparison } else if (fi->frag |= fpos_frag(new_pos)) { ^~ != Fixes: f3c4ebe65ea1 ("ceph: using hash value to compose dentry offset") Signed-off-by: Nicolas Iooss Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c64a0b794d49..df4b3e6fa563 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -597,7 +597,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) if (is_hash_order(new_pos)) { /* no need to reset last_name for a forward seek when * dentries are sotred in hash order */ - } else if (fi->frag |= fpos_frag(new_pos)) { + } else if (fi->frag != fpos_frag(new_pos)) { return true; } rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; -- cgit v1.2.3 From ed7a6948394305b810d0c6203268648715e5006f Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Fri, 26 Aug 2016 11:33:14 +0800 Subject: btrfs: do not decrease bytes_may_use when replaying extents When replaying extents, there is no need to update bytes_may_use in btrfs_alloc_logged_file_extent(), otherwise it'll trigger a WARN_ON about bytes_may_use. Fixes: ("btrfs: update btrfs_space_info's bytes_may_use timely") Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 64676a16d32a..4483487ef021 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8216,6 +8216,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; /* * Mixed block groups will exclude before processing the log so we only @@ -8231,9 +8232,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!block_group) return -EINVAL; - ret = btrfs_add_reserved_bytes(block_group, ins->offset, - ins->offset, 0); - BUG_ON(ret); /* logic error */ + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + space_info->bytes_reserved += ins->offset; + block_group->reserved += ins->offset; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); btrfs_put_block_group(block_group); -- cgit v1.2.3 From cbd60aa7cd17d81a434234268c55192862147439 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 6 Sep 2016 05:37:40 -0700 Subject: Btrfs: remove root_log_ctx from ctx list before btrfs_sync_log returns We use a btrfs_log_ctx structure to pass information into the tree log commit, and get error values out. It gets added to a per log-transaction list which we walk when things go bad. Commit d1433debe added an optimization to skip waiting for the log commit, but didn't take root_log_ctx out of the list. This patch makes sure we remove things before exiting. Signed-off-by: Chris Mason Fixes: d1433debe7f4346cf9fc0dafc71c3137d2a97bc4 cc: stable@vger.kernel.org # 3.15+ --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e935035ac034..ef9c55bc7907 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2867,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; -- cgit v1.2.3 From ce129655c9d9aaa7b3bcc46529db1b36693575ed Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Fri, 2 Sep 2016 10:58:46 +0800 Subject: btrfs: introduce tickets_id to determine whether asynchronous metadata reclaim work makes progress In btrfs_async_reclaim_metadata_space(), we use ticket's address to determine whether asynchronous metadata reclaim work is making progress. ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); if (last_ticket == ticket) { flush_state++; } else { last_ticket = ticket; flush_state = FLUSH_DELAYED_ITEMS_NR; if (commit_cycles) commit_cycles--; } But indeed it's wrong, we should not rely on local variable's address to do this check, because addresses may be same. In my test environment, I dd one 168MB file in a 256MB fs, found that for this file, every time wait_reserve_ticket() called, local variable ticket's address is same, For above codes, assume a previous ticket's address is addrA, last_ticket is addrA. Btrfs_async_reclaim_metadata_space() finished this ticket and wake up it, then another ticket is added, but with the same address addrA, now last_ticket will be same to current ticket, then current ticket's flush work will start from current flush_state, not initial FLUSH_DELAYED_ITEMS_NR, which may result in some enospc issues(I have seen this in my test machine). Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ec4154faab61..146d1c7078ed 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -427,6 +427,7 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + u64 tickets_id; struct rw_semaphore groups_sem; /* for block groups in our same type */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4483487ef021..d09cf7aa083b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4966,12 +4966,12 @@ static void wake_all_tickets(struct list_head *head) */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { - struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; int commit_cycles = 0; + u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -4984,8 +4984,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - last_ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); + last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; @@ -5005,10 +5004,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (last_ticket == ticket) { + if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { - last_ticket = ticket; + last_tickets_id = space_info->tickets_id; flush_state = FLUSH_DELAYED_ITEMS_NR; if (commit_cycles) commit_cycles--; @@ -5384,6 +5383,7 @@ again: list_del_init(&ticket->list); num_bytes -= ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { ticket->bytes -= num_bytes; @@ -5426,6 +5426,7 @@ again: num_bytes -= ticket->bytes; space_info->bytes_may_use += ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { trace_btrfs_space_reservation(fs_info, "space_info", -- cgit v1.2.3 From ca120cf688874f4423e579e7cc5ddf7244aeca45 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 3 Sep 2016 10:38:03 -0700 Subject: mm: fix show_smap() for zone_device-pmd ranges Attempting to dump /proc//smaps for a process with pmd dax mappings currently results in the following VM_BUG_ONs: kernel BUG at mm/huge_memory.c:1105! task: ffff88045f16b140 task.stack: ffff88045be14000 RIP: 0010:[] [] follow_trans_huge_pmd+0x2cb/0x340 [..] Call Trace: [] smaps_pte_range+0xa0/0x4b0 [] ? vsnprintf+0x255/0x4c0 [] __walk_page_range+0x1fe/0x4d0 [] walk_page_vma+0x62/0x80 [] show_smap+0xa6/0x2b0 kernel BUG at fs/proc/task_mmu.c:585! RIP: 0010:[] [] smaps_pte_range+0x499/0x4b0 Call Trace: [] ? vsnprintf+0x255/0x4c0 [] __walk_page_range+0x1fe/0x4d0 [] walk_page_vma+0x62/0x80 [] show_smap+0xa6/0x2b0 These locations are sanity checking page flags that must be set for an anonymous transparent huge page, but are not set for the zone_device pages associated with dax mappings. Cc: Ross Zwisler Cc: Kirill A. Shutemov Acked-by: Andrew Morton Signed-off-by: Dan Williams --- fs/proc/task_mmu.c | 2 ++ mm/huge_memory.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187d84ef9de9..f6fa99eca515 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, mss->anonymous_thp += HPAGE_PMD_SIZE; else if (PageSwapBacked(page)) mss->shmem_thp += HPAGE_PMD_SIZE; + else if (is_zone_device_page(page)) + /* pass */; else VM_BUG_ON_PAGE(1, page); smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2db2112aa31e..a6abd76baa72 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1078,7 +1078,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, goto out; page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd); if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { @@ -1116,7 +1116,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, } skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - VM_BUG_ON_PAGE(!PageCompound(page), page); + VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); if (flags & FOLL_GET) get_page(page); -- cgit v1.2.3 From 163ae1c6ad6299b19e22b4a35d5ab24a89791a98 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Sep 2016 10:57:08 -0700 Subject: fscrypto: add authorization check for setting encryption policy On an ext4 or f2fs filesystem with file encryption supported, a user could set an encryption policy on any empty directory(*) to which they had readonly access. This is obviously problematic, since such a directory might be owned by another user and the new encryption policy would prevent that other user from creating files in their own directory (for example). Fix this by requiring inode_owner_or_capable() permission to set an encryption policy. This means that either the caller must own the file, or the caller must have the capability CAP_FOWNER. (*) Or also on any regular file, for f2fs v4.6 and later and ext4 v4.8-rc1 and later; a separate bug fix is coming for that. Signed-off-by: Eric Biggers Cc: stable@vger.kernel.org # 4.1+; check fs/{ext4,f2fs} Signed-off-by: Theodore Ts'o --- fs/crypto/policy.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 0f9961eede1e..c9800b1a2e93 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -95,6 +95,9 @@ static int create_encryption_context_from_policy(struct inode *inode, int fscrypt_process_policy(struct inode *inode, const struct fscrypt_policy *policy) { + if (!inode_owner_or_capable(inode)) + return -EACCES; + if (policy->version != 0) return -EINVAL; -- cgit v1.2.3 From 002ced4be6429918800ce3e41d5cbc2d7c01822c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Sep 2016 11:36:39 -0700 Subject: fscrypto: only allow setting encryption policy on directories The FS_IOC_SET_ENCRYPTION_POLICY ioctl allowed setting an encryption policy on nondirectory files. This was unintentional, and in the case of nonempty regular files did not behave as expected because existing data was not actually encrypted by the ioctl. In the case of ext4, the user could also trigger filesystem errors in ->empty_dir(), e.g. due to mismatched "directory" checksums when the kernel incorrectly tried to interpret a regular file as a directory. This bug affected ext4 with kernels v4.8-rc1 or later and f2fs with kernels v4.6 and later. It appears that older kernels only permitted directories and that the check was accidentally lost during the refactoring to share the file encryption code between ext4 and f2fs. This patch restores the !S_ISDIR() check that was present in older kernels. Signed-off-by: Eric Biggers Cc: stable@vger.kernel.org Signed-off-by: Theodore Ts'o --- fs/crypto/policy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index c9800b1a2e93..f96547f83cab 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -102,6 +102,8 @@ int fscrypt_process_policy(struct inode *inode, return -EINVAL; if (!inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + return -EINVAL; if (!inode->i_sb->s_cop->empty_dir) return -EOPNOTSUPP; if (!inode->i_sb->s_cop->empty_dir(inode)) -- cgit v1.2.3 From 4214ebf4654798309364d0c678b799e402f38288 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Fri, 29 Jul 2016 22:38:19 +0100 Subject: Fix memory leaks in cifs_do_mount() Fix memory leaks introduced by the patch fs/cifs: make share unaccessible at root level mountable Also move allocation of cifs_sb->prepath to cifs_setup_cifs_sb(). Signed-off-by: Sachin Prabhu Tested-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 20 ++++++++------------ fs/cifs/cifsproto.h | 2 +- fs/cifs/connect.c | 10 +++++++++- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 6bbec5e784cd..cc9cdab52dab 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -686,26 +686,22 @@ cifs_do_mount(struct file_system_type *fs_type, cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); if (cifs_sb->mountdata == NULL) { root = ERR_PTR(-ENOMEM); - goto out_cifs_sb; + goto out_free; } - if (volume_info->prepath) { - cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL); - if (cifs_sb->prepath == NULL) { - root = ERR_PTR(-ENOMEM); - goto out_cifs_sb; - } + rc = cifs_setup_cifs_sb(volume_info, cifs_sb); + if (rc) { + root = ERR_PTR(rc); + goto out_free; } - cifs_setup_cifs_sb(volume_info, cifs_sb); - rc = cifs_mount(cifs_sb, volume_info); if (rc) { if (!(flags & MS_SILENT)) cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", rc); root = ERR_PTR(rc); - goto out_mountdata; + goto out_free; } mnt_data.vol = volume_info; @@ -752,9 +748,9 @@ out: cifs_cleanup_volume_info(volume_info); return root; -out_mountdata: +out_free: + kfree(cifs_sb->prepath); kfree(cifs_sb->mountdata); -out_cifs_sb: kfree(cifs_sb); out_nls: unload_nls(volume_info->local_nls); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1243bd326591..95dab43646f0 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -184,7 +184,7 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int to_read); -extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, +extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7ae03283bd61..4546926680d4 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3222,7 +3222,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, } } -void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, +int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb) { INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); @@ -3316,6 +3316,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n"); + + if (pvolume_info->prepath) { + cifs_sb->prepath = kstrdup(pvolume_info->prepath, GFP_KERNEL); + if (cifs_sb->prepath == NULL) + return -ENOMEM; + } + + return 0; } static void -- cgit v1.2.3 From c1d8b24d18192764fe82067ec6aa8d4c3bf094e0 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Fri, 29 Jul 2016 22:38:20 +0100 Subject: Compare prepaths when comparing superblocks The patch fs/cifs: make share unaccessible at root level mountable makes use of prepaths when any component of the underlying path is inaccessible. When mounting 2 separate shares having different prepaths but are other wise similar in other respects, we end up sharing superblocks when we shouldn't be doing so. Signed-off-by: Sachin Prabhu Tested-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/connect.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4546926680d4..2e4f4bad8b1e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2781,6 +2781,24 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) return 1; } +static int +match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) +{ + struct cifs_sb_info *old = CIFS_SB(sb); + struct cifs_sb_info *new = mnt_data->cifs_sb; + + if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { + if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) + return 0; + /* The prepath should be null terminated strings */ + if (strcmp(new->prepath, old->prepath)) + return 0; + + return 1; + } + return 0; +} + int cifs_match_super(struct super_block *sb, void *data) { @@ -2808,7 +2826,8 @@ cifs_match_super(struct super_block *sb, void *data) if (!match_server(tcp_srv, volume_info) || !match_session(ses, volume_info) || - !match_tcon(tcon, volume_info->UNC)) { + !match_tcon(tcon, volume_info->UNC) || + !match_prepath(sb, mnt_data)) { rc = 0; goto out; } -- cgit v1.2.3 From 348c1bfa84dfc47da1f1234b7f2bf09fa798edea Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Fri, 29 Jul 2016 22:38:21 +0100 Subject: Move check for prefix path to within cifs_get_root() Signed-off-by: Sachin Prabhu Tested-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cc9cdab52dab..14ae4b8e1a3c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -609,6 +609,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) char *s, *p; char sep; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + return dget(sb->s_root); + full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); if (full_path == NULL) @@ -731,11 +734,7 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) - root = dget(sb->s_root); - else - root = cifs_get_root(volume_info, sb); - + root = cifs_get_root(volume_info, sb); if (IS_ERR(root)) goto out_super; -- cgit v1.2.3 From ba63f23d69a3a10e7e527a02702023da68ef8a6d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Sep 2016 14:20:38 -0700 Subject: fscrypto: require write access to mount to set encryption policy Since setting an encryption policy requires writing metadata to the filesystem, it should be guarded by mnt_want_write/mnt_drop_write. Otherwise, a user could cause a write to a frozen or readonly filesystem. This was handled correctly by f2fs but not by ext4. Make fscrypt_process_policy() handle it rather than relying on the filesystem to get it right. Signed-off-by: Eric Biggers Cc: stable@vger.kernel.org # 4.1+; check fs/{ext4,f2fs} Signed-off-by: Theodore Ts'o Acked-by: Jaegeuk Kim --- fs/crypto/policy.c | 38 +++++++++++++++++++++++++------------- fs/ext4/ioctl.c | 2 +- fs/f2fs/file.c | 9 +-------- include/linux/fscrypto.h | 5 ++--- 4 files changed, 29 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index f96547f83cab..ed115acb5dee 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,6 +11,7 @@ #include #include #include +#include static int inode_has_encryption_context(struct inode *inode) { @@ -92,31 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct inode *inode, +int fscrypt_process_policy(struct file *filp, const struct fscrypt_policy *policy) { + struct inode *inode = file_inode(filp); + int ret; + if (!inode_owner_or_capable(inode)) return -EACCES; if (policy->version != 0) return -EINVAL; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) - return -EINVAL; - if (!inode->i_sb->s_cop->empty_dir) - return -EOPNOTSUPP; - if (!inode->i_sb->s_cop->empty_dir(inode)) - return -ENOTEMPTY; - return create_encryption_context_from_policy(inode, policy); + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; } - if (is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; + mnt_drop_write_file(filp); + return ret; } EXPORT_SYMBOL(fscrypt_process_policy); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 10686fd67fb4..1bb7df5e4536 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -776,7 +776,7 @@ resizefs_out: (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - return fscrypt_process_policy(inode, &policy); + return fscrypt_process_policy(filp, &policy); #else return -EOPNOTSUPP; #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 47abb96098e4..28f4f4cbb8d8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - int ret; if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; - ret = mnt_want_write_file(filp); - if (ret) - return ret; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - ret = fscrypt_process_policy(inode, &policy); - mnt_drop_write_file(filp); - return ret; + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index cfa6cde25f8e..76cff18bb032 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -274,8 +274,7 @@ extern void fscrypt_restore_control_page(struct page *); extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, unsigned int); /* policy.c */ -extern int fscrypt_process_policy(struct inode *, - const struct fscrypt_policy *); +extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); extern int fscrypt_has_permitted_context(struct inode *, struct inode *); extern int fscrypt_inherit_context(struct inode *, struct inode *, @@ -345,7 +344,7 @@ static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, } /* policy.c */ -static inline int fscrypt_notsupp_process_policy(struct inode *i, +static inline int fscrypt_notsupp_process_policy(struct file *f, const struct fscrypt_policy *p) { return -EOPNOTSUPP; -- cgit v1.2.3 From b519d408ea32040b1c7e10b155a3ee9a36660947 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 11 Sep 2016 14:50:01 -0400 Subject: NFSv4.1: Fix the CREATE_SESSION slot number accounting Ensure that we conform to the algorithm described in RFC5661, section 18.36.4 for when to bump the sequence id. In essence we do it for all cases except when the RPC call timed out, or in case of the server returning NFS4ERR_DELAY or NFS4ERR_STALE_CLIENTID. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c380d2ee4137..a9dec32ba9ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7570,12 +7570,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_create_session(clp, status); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EACCES: + case -EAGAIN: + goto out; + }; + + clp->cl_seqid++; if (!status) { /* Verify the session's negotiated channel_attrs values */ status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - if (clp->cl_seqid == res.seqid) - clp->cl_seqid++; if (status) goto out; nfs4_update_session(session, &res); -- cgit v1.2.3 From 5297e0f0fe13305a1fc7f01986be0dccd063d57a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 14 Sep 2016 20:20:00 -0700 Subject: vfs: fix return type of ioctl_file_dedupe_range All the VFS functions in the dedupe ioctl path return int status, so the ioctl handler ought to as well. Found by Coverity, CID 1350952. Signed-off-by: Darrick J. Wong Signed-off-by: Linus Torvalds --- fs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 0f56deb24ce6..26aba0942a98 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -568,7 +568,7 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static long ioctl_file_dedupe_range(struct file *file, void __user *arg) +static int ioctl_file_dedupe_range(struct file *file, void __user *arg) { struct file_dedupe_range __user *argp = arg; struct file_dedupe_range *same = NULL; -- cgit v1.2.3 From b71dbf1032f546bf3efd60fb5d9d0cefd200a508 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 14 Sep 2016 20:20:44 -0700 Subject: vfs: cap dedupe request structure size at PAGE_SIZE Kirill A Shutemov reports that the kernel doesn't try to cap dest_count in any way, and uses the number to allocate kernel memory. This causes high order allocation warnings in the kernel log if someone passes in a big enough value. We should clamp the allocation at PAGE_SIZE to avoid stressing the VM. The two existing users of the dedupe ioctl never send more than 120 requests, so we can safely clamp dest_range at PAGE_SIZE, because with 4k pages we can handle up to 127 dedupe candidates. Given the max extent length of 16MB, we can end up doing 2GB of IO which is plenty. [ Note: the "offsetof()" can't overflow, because 'count' is just a 16-bit integer. That's not obvious in the limited context of the patch, so I'm noting it here because it made me go look. - Linus ] Reported-by: "Kirill A. Shutemov" Signed-off-by: Darrick J. Wong Signed-off-by: Linus Torvalds --- fs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 26aba0942a98..c415668c86d4 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -582,6 +582,10 @@ static int ioctl_file_dedupe_range(struct file *file, void __user *arg) } size = offsetof(struct file_dedupe_range __user, info[count]); + if (size > PAGE_SIZE) { + ret = -ENOMEM; + goto out; + } same = memdup_user(argp, size); if (IS_ERR(same)) { -- cgit v1.2.3 From 22f6b4d34fcf039c63a94e7670e0da24f8575a5a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 16 Sep 2016 00:31:22 +0200 Subject: aio: mark AIO pseudo-fs noexec This ensures that do_mmap() won't implicitly make AIO memory mappings executable if the READ_IMPLIES_EXEC personality flag is set. Such behavior is problematic because the security_mmap_file LSM hook doesn't catch this case, potentially permitting an attacker to bypass a W^X policy enforced by SELinux. I have tested the patch on my machine. To test the behavior, compile and run this: #define _GNU_SOURCE #include #include #include #include #include #include #include int main(void) { personality(READ_IMPLIES_EXEC); aio_context_t ctx = 0; if (syscall(__NR_io_setup, 1, &ctx)) err(1, "io_setup"); char cmd[1000]; sprintf(cmd, "cat /proc/%d/maps | grep -F '/[aio]'", (int)getpid()); system(cmd); return 0; } In the output, "rw-s" is good, "rwxs" is bad. Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- fs/aio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index fb8e45b88cd4..4fe81d1c60f9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); + struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops, + AIO_RING_MAGIC); + + if (!IS_ERR(root)) + root->d_sb->s_iflags |= SB_I_NOEXEC; + return root; } /* aio_setup -- cgit v1.2.3 From e6f0c6e6170fec175fe676495f29029aecdf486c Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 19 Sep 2016 14:43:55 -0700 Subject: ocfs2/dlm: fix race between convert and migration Commit ac7cf246dfdb ("ocfs2/dlm: fix race between convert and recovery") checks if lockres master has changed to identify whether new master has finished recovery or not. This will introduce a race that right after old master does umount ( means master will change), a new convert request comes. In this case, it will reset lockres state to DLM_RECOVERING and then retry convert, and then fail with lockres->l_action being set to OCFS2_AST_INVALID, which will cause inconsistent lock level between ocfs2 and dlm, and then finally BUG. Since dlm recovery will clear lock->convert_pending in dlm_move_lockres_to_recovery_list, we can use it to correctly identify the race case between convert and recovery. So fix it. Fixes: ac7cf246dfdb ("ocfs2/dlm: fix race between convert and recovery") Link: http://lkml.kernel.org/r/57CE1569.8010704@huawei.com Signed-off-by: Joseph Qi Signed-off-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmconvert.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index cdeafb4e7ed6..0bb128659d4b 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags, int type) { enum dlm_status status; - u8 old_owner = res->owner; mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); @@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; - lock->convert_pending = 0; /* if it failed, move it back to granted queue. * if master returns DLM_NORMAL and then down before sending ast, * it may have already been moved to granted queue, reset to @@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, if (status != DLM_NOTQUEUED) dlm_error(status); dlm_revert_pending_convert(res, lock); - } else if ((res->state & DLM_LOCK_RES_RECOVERING) || - (old_owner != res->owner)) { - mlog(0, "res %.*s is in recovering or has been recovered.\n", - res->lockname.len, res->lockname.name); + } else if (!lock->convert_pending) { + mlog(0, "%s: res %.*s, owner died and lock has been moved back " + "to granted list, retry convert.\n", + dlm->name, res->lockname.len, res->lockname.name); status = DLM_RECOVERING; } + + lock->convert_pending = 0; bail: spin_unlock(&res->spinlock); -- cgit v1.2.3 From 7cbdb4a286a60c5d519cb9223fe2134d26870d39 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 19 Sep 2016 14:44:12 -0700 Subject: autofs: use dentry flags to block walks during expire Somewhere along the way the autofs expire operation has changed to hold a spin lock over expired dentry selection. The autofs indirect mount expired dentry selection is complicated and quite lengthy so it isn't appropriate to hold a spin lock over the operation. Commit 47be61845c77 ("fs/dcache.c: avoid soft-lockup in dput()") added a might_sleep() to dput() causing a WARN_ONCE() about this usage to be issued. But the spin lock doesn't need to be held over this check, the autofs dentry info. flags are enough to block walks into dentrys during the expire. I've left the direct mount expire as it is (for now) because it is much simpler and quicker than the indirect mount expire and adding spin lock release and re-aquires would do nothing more than add overhead. Fixes: 47be61845c77 ("fs/dcache.c: avoid soft-lockup in dput()") Link: http://lkml.kernel.org/r/20160912014017.1773.73060.stgit@pluto.themaw.net Signed-off-by: Ian Kent Reported-by: Takashi Iwai Tested-by: Takashi Iwai Cc: Takashi Iwai Cc: NeilBrown Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/expire.c | 55 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index b493909e7492..d8e6d421c27f 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry, } return NULL; } + /* * Find an eligible tree to time-out * A tree is eligible if :- @@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, struct dentry *root = sb->s_root; struct dentry *dentry; struct dentry *expired; + struct dentry *found; struct autofs_info *ino; if (!root) @@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { + int flags = how; + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); - if (ino->flags & AUTOFS_INF_WANT_EXPIRE) - expired = NULL; - else - expired = should_expire(dentry, mnt, timeout, how); - if (!expired) { + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; } + spin_unlock(&sbi->fs_lock); + + expired = should_expire(dentry, mnt, timeout, flags); + if (!expired) + continue; + + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); - if (should_expire(expired, mnt, timeout, how)) { - if (expired != dentry) - dput(dentry); - goto found; - } + /* Make sure a reference is not taken on found if + * things have changed. + */ + flags &= ~AUTOFS_EXP_LEAVES; + found = should_expire(expired, mnt, timeout, how); + if (!found || found != expired) + /* Something has changed, continue */ + goto next; + + if (expired != dentry) + dput(dentry); + + spin_lock(&sbi->fs_lock); + goto found; +next: + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); if (expired != dentry) dput(expired); - spin_unlock(&sbi->fs_lock); } return NULL; @@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; + int state; /* Block on any pending expire */ if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) @@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) if (rcu_walk) return -ECHILD; +retry: spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { + state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING); + if (state == AUTOFS_INF_WANT_EXPIRE) { + spin_unlock(&sbi->fs_lock); + /* + * Possibly being selected for expire, wait until + * it's selected or not. + */ + schedule_timeout_uninterruptible(HZ/10); + goto retry; + } + if (state & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); -- cgit v1.2.3 From 31b4beb473e3bdee1bf79db849502dcb24b5c202 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 19 Sep 2016 14:44:18 -0700 Subject: ipc/shm: fix crash if CONFIG_SHMEM is not set Commit c01d5b300774 ("shmem: get_unmapped_area align huge page") makes use of shm_get_unmapped_area() in shm_file_operations() unconditional to CONFIG_MMU. As Tony Battersby pointed this can lead NULL-pointer dereference on machine with CONFIG_MMU=y and CONFIG_SHMEM=n. In this case ipc/shm is backed by ramfs which doesn't provide f_op->get_unmapped_area for configurations with MMU. The solution is to provide dummy f_op->get_unmapped_area for ramfs when CONFIG_MMU=y, which just call current->mm->get_unmapped_area(). Fixes: c01d5b300774 ("shmem: get_unmapped_area align huge page") Link: http://lkml.kernel.org/r/20160912102704.140442-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: Tony Battersby Tested-by: Tony Battersby Cc: Hugh Dickins Cc: [4.7.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-mmu.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 183a212694bf..12af0490322f 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -27,9 +27,17 @@ #include #include #include +#include #include "internal.h" +static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + const struct file_operations ramfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, @@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, + .get_unmapped_area = ramfs_mmu_get_unmapped_area, }; const struct inode_operations ramfs_file_inode_operations = { -- cgit v1.2.3 From 2b0ad0085aa47ace4756aa501274a7de0325c09c Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 19 Sep 2016 14:44:21 -0700 Subject: ocfs2: fix trans extend while flush truncate log Every time, ocfs2_extend_trans() included a credit for truncate log inode, but as that inode had been managed by jbd2 running transaction first time, it will not consume that credit until jbd2_journal_restart(). Since total credits to extend always included the un-consumed ones, there will be more and more un-consumed credit, at last jbd2_journal_restart() will fail due to credit number over the half of max transction credit. The following error was caught when unlinking a large file with many extents: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 13626 at fs/jbd2/transaction.c:269 start_this_handle+0x4c3/0x510 [jbd2]() Modules linked in: ocfs2 nfsd lockd grace nfs_acl auth_rpcgss sunrpc autofs4 ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs sd_mod sg ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ppdev xen_kbdfront xen_netfront fb_sys_fops sysimgblt sysfillrect syscopyarea parport_pc parport pcspkr i2c_piix4 i2c_core acpi_cpufreq ext4 jbd2 mbcache xen_blkfront floppy pata_acpi ata_generic ata_piix dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 13626 Comm: unlink Tainted: G W 4.1.12-37.6.3.el6uek.x86_64 #2 Hardware name: Xen HVM domU, BIOS 4.4.4OVM 02/11/2016 Call Trace: dump_stack+0x48/0x5c warn_slowpath_common+0x95/0xe0 warn_slowpath_null+0x1a/0x20 start_this_handle+0x4c3/0x510 [jbd2] jbd2__journal_restart+0x161/0x1b0 [jbd2] jbd2_journal_restart+0x13/0x20 [jbd2] ocfs2_extend_trans+0x74/0x220 [ocfs2] ocfs2_replay_truncate_records+0x93/0x360 [ocfs2] __ocfs2_flush_truncate_log+0x13e/0x3a0 [ocfs2] ocfs2_remove_btree_range+0x458/0x7f0 [ocfs2] ocfs2_commit_truncate+0x1b3/0x6f0 [ocfs2] ocfs2_truncate_for_delete+0xbd/0x380 [ocfs2] ocfs2_wipe_inode+0x136/0x6a0 [ocfs2] ocfs2_delete_inode+0x2a2/0x3e0 [ocfs2] ocfs2_evict_inode+0x28/0x60 [ocfs2] evict+0xab/0x1a0 iput_final+0xf6/0x190 iput+0xc8/0xe0 do_unlinkat+0x1b7/0x310 SyS_unlink+0x16/0x20 system_call_fastpath+0x12/0x71 ---[ end trace 28aa7410e69369cf ]--- JBD2: unlink wants too many credits (251 > 128) Link: http://lkml.kernel.org/r/1473674623-11810-1-git-send-email-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 7dabbc31060e..51128789a661 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5922,7 +5922,6 @@ bail: } static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, - handle_t *handle, struct inode *data_alloc_inode, struct buffer_head *data_alloc_bh) { @@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, struct ocfs2_truncate_log *tl; struct inode *tl_inode = osb->osb_tl_inode; struct buffer_head *tl_bh = osb->osb_tl_bh; + handle_t *handle; di = (struct ocfs2_dinode *) tl_bh->b_data; tl = &di->id2.i_dealloc; i = le16_to_cpu(tl->tl_used) - 1; while (i >= 0) { + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail; + } + /* Caller has given us at least enough credits to * update the truncate log dinode */ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, @@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, } } - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_commit_trans(osb, handle); i--; } @@ -5994,7 +5996,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; unsigned int num_to_flush; - handle_t *handle; struct inode *tl_inode = osb->osb_tl_inode; struct inode *data_alloc_inode = NULL; struct buffer_head *tl_bh = osb->osb_tl_bh; @@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_unlock; - } - - status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + status = ocfs2_replay_truncate_records(osb, data_alloc_inode, data_alloc_bh); if (status < 0) mlog_errno(status); - ocfs2_commit_trans(osb, handle); - -out_unlock: brelse(data_alloc_bh); ocfs2_inode_unlock(data_alloc_inode, 1); -- cgit v1.2.3 From d5bf141893880f7283fe97e1812c58ff22c8f9a5 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 19 Sep 2016 14:44:24 -0700 Subject: ocfs2: fix trans extend while free cached blocks The root cause of this issue is the same with the one fixed by the last patch, but this time credits for allocator inode and group descriptor may not be consumed before trans extend. The following error was caught: WARNING: CPU: 0 PID: 2037 at fs/jbd2/transaction.c:269 start_this_handle+0x4c3/0x510 [jbd2]() Modules linked in: ocfs2 nfsd lockd grace nfs_acl auth_rpcgss sunrpc autofs4 ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs sd_mod sg ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ppdev xen_kbdfront fb_sys_fops sysimgblt sysfillrect syscopyarea xen_netfront parport_pc parport pcspkr i2c_piix4 i2c_core acpi_cpufreq ext4 jbd2 mbcache xen_blkfront floppy pata_acpi ata_generic ata_piix dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 2037 Comm: rm Tainted: G W 4.1.12-37.6.3.el6uek.bug24573128v2.x86_64 #2 Hardware name: Xen HVM domU, BIOS 4.4.4OVM 02/11/2016 Call Trace: dump_stack+0x48/0x5c warn_slowpath_common+0x95/0xe0 warn_slowpath_null+0x1a/0x20 start_this_handle+0x4c3/0x510 [jbd2] jbd2__journal_restart+0x161/0x1b0 [jbd2] jbd2_journal_restart+0x13/0x20 [jbd2] ocfs2_extend_trans+0x74/0x220 [ocfs2] ocfs2_free_cached_blocks+0x16b/0x4e0 [ocfs2] ocfs2_run_deallocs+0x70/0x270 [ocfs2] ocfs2_commit_truncate+0x474/0x6f0 [ocfs2] ocfs2_truncate_for_delete+0xbd/0x380 [ocfs2] ocfs2_wipe_inode+0x136/0x6a0 [ocfs2] ocfs2_delete_inode+0x2a2/0x3e0 [ocfs2] ocfs2_evict_inode+0x28/0x60 [ocfs2] evict+0xab/0x1a0 iput_final+0xf6/0x190 iput+0xc8/0xe0 do_unlinkat+0x1b7/0x310 SyS_unlinkat+0x22/0x40 system_call_fastpath+0x12/0x71 ---[ end trace a62437cb060baa71 ]--- JBD2: rm wants too many credits (149 > 128) Link: http://lkml.kernel.org/r/1473674623-11810-2-git-send-email-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 51128789a661..f165f867f332 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6404,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_unlock; - } - while (head) { if (head->free_bg) bg_blkno = head->free_bg; else bg_blkno = ocfs2_which_suballoc_group(head->free_blk, head->free_bit); + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + trace_ocfs2_free_cached_blocks( (unsigned long long)head->free_blk, head->free_bit); ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, head->free_bit, bg_blkno, 1); - if (ret) { + if (ret) mlog_errno(ret); - goto out_journal; - } - ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); - if (ret) { - mlog_errno(ret); - goto out_journal; - } + ocfs2_commit_trans(osb, handle); tmp = head; head = head->free_next; kfree(tmp); } -out_journal: - ocfs2_commit_trans(osb, handle); - out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); -- cgit v1.2.3 From 12703dbfeb15402260e7554d32a34ac40c233990 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 19 Sep 2016 14:44:27 -0700 Subject: fsnotify: add a way to stop queueing events on group shutdown Implement a function that can be called when a group is being shutdown to stop queueing new events to the group. Fanotify will use this. Fixes: 5838d4442bd5 ("fanotify: fix double free of pending permission events") Link: http://lkml.kernel.org/r/1473797711-14111-2-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Miklos Szeredi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/group.c | 19 +++++++++++++++++++ fs/notify/notification.c | 8 +++++++- include/linux/fsnotify_backend.h | 3 +++ 3 files changed, 29 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/group.c b/fs/notify/group.c index 3e2dd85be5dd..b47f7cfdcaa4 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -39,6 +39,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) kfree(group); } +/* + * Stop queueing new events for this group. Once this function returns + * fsnotify_add_event() will not add any new events to the group's queue. + */ +void fsnotify_group_stop_queueing(struct fsnotify_group *group) +{ + mutex_lock(&group->notification_mutex); + group->shutdown = true; + mutex_unlock(&group->notification_mutex); +} + /* * Trying to get rid of a group. Remove all marks, flush all events and release * the group reference. @@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { + /* + * Stop queueing new events. The code below is careful enough to not + * require this but fanotify needs to stop queuing events even before + * fsnotify_destroy_group() is called and this makes the other callers + * of fsnotify_destroy_group() to see the same behavior. + */ + fsnotify_group_stop_queueing(group); + /* clear all inode marks for this group, attach them to destroy_list */ fsnotify_detach_group_marks(group); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index a95d8e037aeb..3d76e65ff84f 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * Add an event to the group notification queue. The group can later pull this * event off the queue to deal with. The function returns 0 if the event was * added to the queue, 1 if the event was merged with some other queued event, - * 2 if the queue of events has overflown. + * 2 if the event was not queued - either the queue of events has overflown + * or the group is shutting down. */ int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, @@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group, mutex_lock(&group->notification_mutex); + if (group->shutdown) { + mutex_unlock(&group->notification_mutex); + return 2; + } + if (group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 58205f33af02..40a9e99de703 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -148,6 +148,7 @@ struct fsnotify_group { #define FS_PRIO_1 1 /* fanotify content based access control */ #define FS_PRIO_2 2 /* fanotify pre-content access */ unsigned int priority; + bool shutdown; /* group is being shut down, don't queue more events */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ @@ -292,6 +293,8 @@ extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *op extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); +/* group destruction begins, stop queuing new events */ +extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ -- cgit v1.2.3 From 96d41019e3ac55f6f0115b0ce97e4f24a3d636d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 19 Sep 2016 14:44:30 -0700 Subject: fanotify: fix list corruption in fanotify_get_response() fanotify_get_response() calls fsnotify_remove_event() when it finds that group is being released from fanotify_release() (bypass_perm is set). However the event it removes need not be only in the group's notification queue but it can have already moved to access_list (userspace read the event before closing the fanotify instance fd) which is protected by a different lock. Thus when fsnotify_remove_event() races with fanotify_release() operating on access_list, the list can get corrupted. Fix the problem by moving all the logic removing permission events from the lists to one place - fanotify_release(). Fixes: 5838d4442bd5 ("fanotify: fix double free of pending permission events") Link: http://lkml.kernel.org/r/1473797711-14111-3-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Miklos Szeredi Tested-by: Miklos Szeredi Reviewed-by: Miklos Szeredi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 13 +------------ fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++------------ fs/notify/notification.c | 15 --------------- include/linux/fsnotify_backend.h | 3 --- 4 files changed, 25 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d2f97ecca6a5..e0e5f7c3c99f 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response || - atomic_read(&group->fanotify_data.bypass_perm)); - - if (!event->response) { /* bypass_perm set */ - /* - * Event was canceled because group is being destroyed. Remove - * it from group's event list because we are responsible for - * freeing the permission event. - */ - fsnotify_remove_event(group, &event->fae.fse); - return 0; - } + wait_event(group->fanotify_data.access_waitq, event->response); /* userspace responded, convert to something usable */ switch (event->response) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8e8e6bcd1d43..a64313868d3a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + struct fsnotify_event *fsn_event; /* - * There may be still new events arriving in the notification queue - * but since userspace cannot use fanotify fd anymore, no event can - * enter or leave access_list by now. + * Stop new events from arriving in the notification queue. since + * userspace cannot use fanotify fd anymore, no event can enter or + * leave access_list by now either. */ - spin_lock(&group->fanotify_data.access_lock); - - atomic_inc(&group->fanotify_data.bypass_perm); + fsnotify_group_stop_queueing(group); + /* + * Process all permission events on access_list and notification queue + * and simulate reply from userspace. + */ + spin_lock(&group->fanotify_data.access_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file) spin_unlock(&group->fanotify_data.access_lock); /* - * Since bypass_perm is set, newly queued events will not wait for - * access response. Wake up the already sleeping ones now. - * synchronize_srcu() in fsnotify_destroy_group() will wait for all - * processes sleeping in fanotify_handle_event() waiting for access - * response and thus also for all permission events to be freed. + * Destroy all non-permission events. For permission events just + * dequeue them and set the response. They will be freed once the + * response is consumed and fanotify_get_response() returns. */ + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + fsn_event = fsnotify_remove_first_event(group); + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, fsn_event); + else + FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + } + mutex_unlock(&group->notification_mutex); + + /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); #endif @@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 3d76e65ff84f..e455e83ceeeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -131,21 +131,6 @@ queue: return ret; } -/* - * Remove @event from group's notification queue. It is the responsibility of - * the caller to destroy the event. - */ -void fsnotify_remove_event(struct fsnotify_group *group, - struct fsnotify_event *event) -{ - mutex_lock(&group->notification_mutex); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - group->q_len--; - } - mutex_unlock(&group->notification_mutex); -} - /* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 40a9e99de703..7268ed076be8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -180,7 +180,6 @@ struct fsnotify_group { spinlock_t access_lock; struct list_head access_list; wait_queue_head_t access_waitq; - atomic_t bypass_perm; #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ int f_flags; unsigned int max_marks; @@ -307,8 +306,6 @@ extern int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct list_head *, struct fsnotify_event *)); -/* Remove passed event from groups notification queue */ -extern void fsnotify_remove_event(struct fsnotify_group *group, struct fsnotify_event *event); /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ -- cgit v1.2.3 From 3bb8b653c86f6b1d2cc05aa1744fed4b18f99485 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 19 Sep 2016 14:44:33 -0700 Subject: ocfs2: fix double unlock in case retry after free truncate log If ocfs2_reserve_cluster_bitmap_bits() fails with ENOSPC, it will try to free truncate log and then retry. Since ocfs2_try_to_free_truncate_log will lock/unlock global bitmap inode, we have to unlock it before calling this function. But when retry reserve and it fails with no global bitmap inode lock taken, it will unlock again in error handling branch and BUG. This issue also exists if no need retry and then ocfs2_inode_lock fails. So fix it. Fixes: 2070ad1aebff ("ocfs2: retry on ENOSPC if sufficient space in truncate log") Link: http://lkml.kernel.org/r/57D91939.6030809@huawei.com Signed-off-by: Joseph Qi Signed-off-by: Jiufei Xue Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index ea47120a85ff..6ad3533940ba 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1199,14 +1199,24 @@ retry: inode_unlock((*ac)->ac_inode); ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); - if (ret == 1) + if (ret == 1) { + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; goto retry; + } if (ret < 0) mlog_errno(ret); inode_lock((*ac)->ac_inode); - ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + inode_unlock((*ac)->ac_inode); + iput((*ac)->ac_inode); + (*ac)->ac_inode = NULL; + goto bail; + } } if (status < 0) { if (status != -ENOSPC) -- cgit v1.2.3 From d21c353d5e99c56cdd5b5c1183ffbcaf23b8b960 Mon Sep 17 00:00:00 2001 From: Ashish Samant Date: Mon, 19 Sep 2016 14:44:42 -0700 Subject: ocfs2: fix start offset to ocfs2_zero_range_for_truncate() If we punch a hole on a reflink such that following conditions are met: 1. start offset is on a cluster boundary 2. end offset is not on a cluster boundary 3. (end offset is somewhere in another extent) or (hole range > MAX_CONTIG_BYTES(1MB)), we dont COW the first cluster starting at the start offset. But in this case, we were wrongly passing this cluster to ocfs2_zero_range_for_truncate() to zero out. This will modify the cluster in place and zero it in the source too. Fix this by skipping this cluster in such a scenario. To reproduce: 1. Create a random file of say 10 MB xfs_io -c 'pwrite -b 4k 0 10M' -f 10MBfile 2. Reflink it reflink -f 10MBfile reflnktest 3. Punch a hole at starting at cluster boundary with range greater that 1MB. You can also use a range that will put the end offset in another extent. fallocate -p -o 0 -l 1048615 reflnktest 4. sync 5. Check the first cluster in the source file. (It will be zeroed out). dd if=10MBfile iflag=direct bs= count=1 | hexdump -C Link: http://lkml.kernel.org/r/1470957147-14185-1-git-send-email-ashish.samant@oracle.com Signed-off-by: Ashish Samant Reported-by: Saar Maoz Reviewed-by: Srinivas Eeda Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Eric Ren Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4e7b0dc22450..0b055bfb8e86 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, u64 start, u64 len) { int ret = 0; - u64 tmpend, end = start + len; + u64 tmpend = 0; + u64 end = start + len; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int csize = osb->s_clustersize; handle_t *handle; @@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, } /* - * We want to get the byte offset of the end of the 1st cluster. + * If start is on a cluster boundary and end is somewhere in another + * cluster, we have not COWed the cluster starting at start, unless + * end is also within the same cluster. So, in this case, we skip this + * first call to ocfs2_zero_range_for_truncate() truncate and move on + * to the next one. */ - tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); - if (tmpend > end) - tmpend = end; + if ((start & (csize - 1)) != 0) { + /* + * We want to get the byte offset of the end of the 1st + * cluster. + */ + tmpend = (u64)osb->s_clustersize + + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; - trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, - (unsigned long long)tmpend); + trace_ocfs2_zero_partial_clusters_range1( + (unsigned long long)start, + (unsigned long long)tmpend); - ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); - if (ret) - mlog_errno(ret); + ret = ocfs2_zero_range_for_truncate(inode, handle, start, + tmpend); + if (ret) + mlog_errno(ret); + } if (tmpend < end) { /* -- cgit v1.2.3 From 63b52c4936a2e679639c38ef51a50aa8ca1c5c07 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 19 Sep 2016 14:44:44 -0700 Subject: Revert "ocfs2: bump up o2cb network protocol version" This reverts commit 38b52efd218b ("ocfs2: bump up o2cb network protocol version"). This commit made rolling upgrade fail. When one node is upgraded to new version with this commit, the remaining nodes will fail to establish connections to it, then the application like VMs on the remaining nodes can't be live migrated to the upgraded one. This will cause an outage. Since negotiate hb timeout behavior didn't change without this commit, so revert it. Fixes: 38b52efd218bf ("ocfs2: bump up o2cb network protocol version") Link: http://lkml.kernel.org/r/1471396924-10375-1-git-send-email-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp_internal.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 94b18369b1cc..b95e7df5b76a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -44,9 +44,6 @@ * version here in tcp_internal.h should not need to be bumped for * filesystem locking changes. * - * New in version 12 - * - Negotiate hb timeout when storage is down. - * * New in version 11 * - Negotiation of filesystem locking in the dlm join. * @@ -78,7 +75,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 12ULL +#define O2NET_PROTOCOL_VERSION 11ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; -- cgit v1.2.3