From 0d2783626a53d4c922f82d51fa675cb5d13f0d36 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 26 Feb 2015 11:45:47 +0100 Subject: fuse: notify: don't move pages fuse_try_move_page() is not prepared for replacing pages that have already been read. Reported-by: Al Viro Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ed19a7d622fa..b0d7e13fae3d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1797,6 +1797,9 @@ copy_finish: static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { + /* Don't try to move pages (yet) */ + cs->move_pages = 0; + switch (code) { case FUSE_NOTIFY_POLL: return fuse_notify_poll(fc, size, cs); -- cgit v1.2.3 From aa991b3b267e24f578bac7b09cc57579b660304b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 26 Feb 2015 11:45:47 +0100 Subject: fuse: set stolen page uptodate Regular pipe buffers' ->steal method (generic_pipe_buf_steal()) doesn't set PG_uptodate. Don't warn on this condition, just set the uptodate flag. Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b0d7e13fae3d..71c4619af333 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -890,8 +890,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newpage = buf->page; - if (WARN_ON(!PageUptodate(newpage))) - return -EIO; + if (!PageUptodate(newpage)) + SetPageUptodate(newpage); ClearPageMappedToDisk(newpage); -- cgit v1.2.3 From 18d585f0f2d4c9dc7dfe6e69dcae4933d5a428c9 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 12 Mar 2015 16:25:46 -0700 Subject: ocfs2: make append_dio an incompat feature It turns out that making this feature ro_compat isn't quite enough to prevent accidental corruption on mount from older kernels. Ocfs2 (like other file systems) will process orphaned inodes even when the user mounts in 'ro' mode. So for the case of a filesystem not knowing the append_dio feature, mounting the filesystem could result in orphaned-for-dio files being deleted, which we clearly don't want. So instead, turn this into an incompat flag. Btw, this is kind of my fault - initially I asked that we add a flag to cover the feature and even suggested that we use an ro flag. It wasn't until I was looking through our commits for v4.0-rc1 that I realized we actually want this to be incompat. Signed-off-by: Mark Fasheh Cc: Joseph Qi Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2.h | 2 +- fs/ocfs2/ocfs2_fs.h | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8490c64d34fe..460c6c37e683 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -502,7 +502,7 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) { - if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) return 1; return 0; } diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 20e37a3ed26f..db64ce2d4667 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -102,11 +102,11 @@ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ - | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ + | OCFS2_FEATURE_INCOMPAT_APPEND_DIO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) /* * Heartbeat-only devices are missing journals and other files. The @@ -178,6 +178,11 @@ */ #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 +/* + * Append Direct IO support + */ +#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -200,10 +205,6 @@ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 -/* - * Append Direct IO support - */ -#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008 /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. -- cgit v1.2.3 From 283ee1482f349d6c0c09dfb725db5880afc56813 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 12 Mar 2015 16:26:00 -0700 Subject: nilfs2: fix deadlock of segment constructor during recovery According to a report from Yuxuan Shui, nilfs2 in kernel 3.19 got stuck during recovery at mount time. The code path that caused the deadlock was as follows: nilfs_fill_super() load_nilfs() nilfs_salvage_orphan_logs() * Do roll-forwarding, attach segment constructor for recovery, and kick it. nilfs_segctor_thread() nilfs_segctor_thread_construct() * A lock is held with nilfs_transaction_lock() nilfs_segctor_do_construct() nilfs_segctor_drop_written_files() iput() iput_final() write_inode_now() writeback_single_inode() __writeback_single_inode() do_writepages() nilfs_writepage() nilfs_construct_dsync_segment() nilfs_transaction_lock() --> deadlock This can happen if commit 7ef3ff2fea8b ("nilfs2: fix deadlock of segment constructor over I_SYNC flag") is applied and roll-forward recovery was performed at mount time. The roll-forward recovery can happen if datasync write is done and the file system crashes immediately after that. For instance, we can reproduce the issue with the following steps: < nilfs2 is mounted on /nilfs (device: /dev/sdb1) > # dd if=/dev/zero of=/nilfs/test bs=4k count=1 && sync # dd if=/dev/zero of=/nilfs/test conv=notrunc oflag=dsync bs=4k count=1 && reboot -nfh < the system will immediately reboot > # mount -t nilfs2 /dev/sdb1 /nilfs The deadlock occurs because iput() can run segment constructor through writeback_single_inode() if MS_ACTIVE flag is not set on sb->s_flags. The above commit changed segment constructor so that it calls iput() asynchronously for inodes with i_nlink == 0, but that change was imperfect. This fixes the another deadlock by deferring iput() in segment constructor even for the case that mount is not finished, that is, for the case that MS_ACTIVE flag is not set. Signed-off-by: Ryusuke Konishi Reported-by: Yuxuan Shui Tested-by: Ryusuke Konishi Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 469086b9f99b..0c3f303baf32 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; + int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); @@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, brelse(ii->i_bh); ii->i_bh = NULL; list_del_init(&ii->i_dirty); - if (!ii->vfs_inode.i_nlink) { + if (!ii->vfs_inode.i_nlink || during_mount) { /* - * Defer calling iput() to avoid a deadlock - * over I_SYNC flag for inodes with i_nlink == 0 + * Defer calling iput() to avoid deadlocks if + * i_nlink == 0 or mount is not yet finished. */ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); defer_iput = true; -- cgit v1.2.3 From b3c1030d50bad39383fcfa6721bd3c35463b3f3f Mon Sep 17 00:00:00 2001 From: "Suzuki K. Poulose" Date: Thu, 12 Mar 2015 16:26:08 -0700 Subject: fanotify: fix event filtering with FAN_ONDIR set With FAN_ONDIR set, the user can end up getting events, which it hasn't marked. This was revealed with fanotify04 testcase failure on Linux-4.0-rc1, and is a regression from 3.19, revealed with 66ba93c0d7fe6 ("fanotify: don't set FAN_ONDIR implicitly on a marks ignored mask"). # /opt/ltp/testcases/bin/fanotify04 [ ... ] fanotify04 7 TPASS : event generated properly for type 100000 fanotify04 8 TFAIL : fanotify04.c:147: got unexpected event 30 fanotify04 9 TPASS : No event as expected The testcase sets the adds the following marks : FAN_OPEN | FAN_ONDIR for a fanotify on a dir. Then does an open(), followed by close() of the directory and expects to see an event FAN_OPEN(0x20). However, the fanotify returns (FAN_OPEN|FAN_CLOSE_NOWRITE(0x10)). This happens due to the flaw in the check for event_mask in fanotify_should_send_event() which does: if (event_mask & marks_mask & ~marks_ignored_mask) return true; where, event_mask == (FAN_ONDIR | FAN_CLOSE_NOWRITE), marks_mask == (FAN_ONDIR | FAN_OPEN), marks_ignored_mask == 0 Fix this by masking the outgoing events to the user, as we already take care of FAN_ONDIR and FAN_EVENT_ON_CHILD. Signed-off-by: Suzuki K. Poulose Tested-by: Lino Sanfilippo Reviewed-by: Jan Kara Cc: Eric Paris Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 9a66ff79ff27..d2f97ecca6a5 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -143,7 +143,8 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return false; - if (event_mask & marks_mask & ~marks_ignored_mask) + if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask & + ~marks_ignored_mask) return true; return false; -- cgit v1.2.3 From d22071293f17eedc96df25093d8e99d09cd76463 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sat, 14 Feb 2015 13:10:22 +0100 Subject: btrfs: fix sizeof format specifier in btrfs_check_super_valid() This patch fixes mips compilation warning: fs/btrfs/disk-io.c: In function 'btrfs_check_super_valid': fs/btrfs/disk-io.c:3927:21: warning: format '%lu' expects argument of type 'long unsigned int', but argument 3 has type 'unsigned int' [-Wformat] Signed-off-by: Fabian Frederick Acked-by: Geert Uytterhoeven Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41b320e235d7..1577b91940dd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3923,7 +3923,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)) { - printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n", + printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n", btrfs_super_sys_array_size(sb), sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)); -- cgit v1.2.3 From b4924a0fa18d7f69bde3a84521258e7a55828186 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 6 Mar 2015 20:23:44 +0800 Subject: Btrfs: catch transaction abortion after waiting for it This problem is uncovered by a test case: http://patchwork.ozlabs.org/patch/244297. Fsync() can report success when it actually doesn't. When we have several threads running fsync() at the same tiem and in one fsync() we get a transaction abortion due to some problems(in the test case it's disk failures), and other fsync()s may return successfully which makes userspace programs think that data is now safely flushed into disk. It's because that after fsyncs() fail btrfs_sync_log() due to disk failures, they get to try btrfs_commit_transaction() where it finds that there is already a transaction being committed, and they'll just call wait_for_commit() and return. Note that we actually check "trans->aborted" in btrfs_end_transaction, but it's likely that the error message is still not yet throwed out and only after wait_for_commit() we're sure whether the transaction is committed successfully. This add the necessary check and it now passes the test. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 323c6541d3dc..07b985f2a814 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1811,6 +1811,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_for_commit(root, cur_trans); + if (unlikely(cur_trans->aborted)) + ret = cur_trans->aborted; + btrfs_put_transaction(cur_trans); return ret; -- cgit v1.2.3 From 48da5f0a4cb2b6b44579f5737e8be888c0d02526 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 13 Mar 2015 14:24:38 +0800 Subject: Btrfs: fix comp_oper to get right order Case (oper1->seq > oper2->seq) should differ with case (oper1->seq < oper2->seq). Signed-off-by: Liu Bo Reviewed-by: David Sterba Reviewed-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 97159a8e91d4..058c79eecbfb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1259,7 +1259,7 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1, if (oper1->seq < oper2->seq) return -1; if (oper1->seq > oper2->seq) - return -1; + return 1; if (oper1->ref_root < oper2->ref_root) return -1; if (oper1->ref_root > oper2->ref_root) -- cgit v1.2.3 From 8461a3de770477a9a7b8eeaebcc4804dbc26ca38 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2015 15:12:08 -0400 Subject: Btrfs: fix merge delalloc logic My patch to properly count outstanding extents wrt MAX_EXTENT_SIZE introduced a regression when re-dirtying already dirty areas. We have logic in split to make sure we are taking the largest space into account but didn't have it for merge, so it was sometimes making us think we were turning a tiny extent into a huge extent, when in reality we already had a huge extent and needed to use the other side in our logic. This fixes the regression that was reported by a user on list. Thanks, Reported-by: Markus Trippelsdorf Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 91a87f53be3c..97b601bec326 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1592,7 +1592,12 @@ static void btrfs_merge_extent_hook(struct inode *inode, return; old_size = other->end - other->start + 1; - new_size = old_size + (new->end - new->start + 1); + if (old_size < (new->end - new->start + 1)) + old_size = (new->end - new->start + 1); + if (new->start > other->start) + new_size = new->end - other->start + 1; + else + new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ if (new_size <= BTRFS_MAX_EXTENT_SIZE) { -- cgit v1.2.3 From 6a41dd0922e3c63e677c2d8f7906ce6a3e097af1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2015 15:12:23 -0400 Subject: Btrfs: account for the correct number of extents for delalloc reservations Direct IO can easily pass in an buffer that is greater than BTRFS_MAX_EXTENT_SIZE, so take this into account when reserving extents in the delalloc reservation code. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 92146a5afdc1..96c613bfe157 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5110,7 +5110,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + nr_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + BTRFS_I(inode)->outstanding_extents += nr_extents; + nr_extents = 0; if (BTRFS_I(inode)->outstanding_extents > BTRFS_I(inode)->reserved_extents) -- cgit v1.2.3 From ea526d18990018f224e5734748975bea1824545f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2015 16:40:45 -0400 Subject: Btrfs: fix ASSERT(list_empty(&cur_trans->dirty_bgs_list) Dave could hit this assert consistently running btrfs/078. This is because when we update the block groups we could truncate the free space, which would try to delete the csums for that range and dirty the csum root. For this to happen we have to have already written out the csum root so it's kind of hard to hit this case. This patch fixes this by changing the logic to only write the dirty block groups if the dirty_cowonly_roots list is empty. This will get us the same effect as before since we add the extent root last, and will cover the case that we dirty some other root again but not the extent root. Thanks, Reported-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 07b985f2a814..2fe3ef5e9de3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1023,7 +1023,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; - bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID); old_root_used = btrfs_root_used(&root->root_item); btrfs_write_dirty_block_groups(trans, root); @@ -1031,9 +1030,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start && - old_root_used == btrfs_root_used(&root->root_item) && - (!extent_root || - list_empty(&trans->transaction->dirty_bgs))) + old_root_used == btrfs_root_used(&root->root_item)) break; btrfs_set_root_node(&root->root_item, root->node); @@ -1044,14 +1041,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; old_root_used = btrfs_root_used(&root->root_item); - if (extent_root) { - ret = btrfs_write_dirty_block_groups(trans, root); - if (ret) - return ret; - } - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; } return 0; @@ -1068,6 +1057,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *next; struct extent_buffer *eb; int ret; @@ -1099,7 +1089,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; - +again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); @@ -1112,8 +1102,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = update_cowonly_root(trans, root); if (ret) return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; } + while (!list_empty(dirty_bgs)) { + ret = btrfs_write_dirty_block_groups(trans, root); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; + } + + if (!list_empty(&fs_info->dirty_cowonly_roots)) + goto again; + list_add_tail(&fs_info->extent_root->dirty_list, &trans->transaction->switch_commits); btrfs_after_dev_replace_commit(fs_info); -- cgit v1.2.3 From a9b1b455c519ee2fd6a4f9c069511e67b5be1ac4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 14 Mar 2015 09:45:35 -0400 Subject: locks: fix generic_delete_lease tracepoint to use victim pointer It's possible that "fl" won't point at a valid lock at this point, so use "victim" instead which is either a valid lock or NULL. Signed-off-by: Jeff Layton --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index f1bad681fc1c..528fedfda15e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1728,7 +1728,7 @@ static int generic_delete_lease(struct file *filp, void *owner) break; } } - trace_generic_delete_lease(inode, fl); + trace_generic_delete_lease(inode, victim); if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); -- cgit v1.2.3 From 7cff4b1836a9d3f18aadd6e88fd43055e2ff4132 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 16 Mar 2015 10:44:52 +1100 Subject: kernfs: handle poll correctly on 'direct_read' files. Kernfs supports two styles of read: direct_read and seqfile_read. The latter supports 'poll' correctly thanks to the update of '->event' in kernfs_seq_show. The former does not as '->event' is never updated on a read. So add an appropriate update in kernfs_file_direct_read(). This was noticed because some 'md' sysfs attributes were recently changed to use direct reads. Reported-by: Prakash Punnoor Reported-by: Torsten Kaiser Fixes: 750f199ee8b578062341e6ddfe36c59ac8ff2dcb Signed-off-by: NeilBrown Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index b684e8a132e6..2bacb9988566 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -207,6 +207,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, goto out_free; } + of->event = atomic_read(&of->kn->attr.open->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, *ppos); -- cgit v1.2.3 From dcdf7f6ddba006f3482ebee73dfa6b75aec5f07b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 2 Mar 2015 16:37:31 -0500 Subject: Btrfs: prepare block group cache before writing Writing the block group cache will modify the extent tree quite a bit because it truncates the old space cache and pre-allocates new stuff. To try and cut down on the churn lets do the setup dance first, then later on hopefully we can avoid looping with newly dirtied roots. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/extent-tree.c | 26 ++++++++++++++++++++++++++ fs/btrfs/transaction.c | 5 ++++- 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b3dd55f52f71..a0c90a324f53 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3386,6 +3386,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 96c613bfe157..3ac3fefb1625 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3325,6 +3325,32 @@ out: return ret; } +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_path *path; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(root, SPACE_CACHE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + } + + btrfs_free_path(path); + return 0; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2fe3ef5e9de3..932709af5163 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1025,7 +1025,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = root->fs_info->tree_root; old_root_used = btrfs_root_used(&root->root_item); - btrfs_write_dirty_block_groups(trans, root); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); @@ -1085,6 +1084,10 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_setup_space_cache(trans, root); + if (ret) + return ret; + /* run_qgroups might have added some more refs */ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) -- cgit v1.2.3 From ab676b7d6fbf4b294bf198fb27ade5b0e865c7ce Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 9 Mar 2015 23:11:12 +0200 Subject: pagemap: do not leak physical addresses to non-privileged userspace As pointed by recent post[1] on exploiting DRAM physical imperfection, /proc/PID/pagemap exposes sensitive information which can be used to do attacks. This disallows anybody without CAP_SYS_ADMIN to read the pagemap. [1] http://googleprojectzero.blogspot.com/2015/03/exploiting-dram-rowhammer-bug-to-gain.html [ Eventually we might want to do anything more finegrained, but for now this is the simple model. - Linus ] Signed-off-by: Kirill A. Shutemov Acked-by: Konstantin Khlebnikov Acked-by: Andy Lutomirski Cc: Pavel Emelyanov Cc: Andrew Morton Cc: Mark Seaborn Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 956b75d61809..6dee68d013ff 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1325,6 +1325,9 @@ out: static int pagemap_open(struct inode *inode, struct file *file) { + /* do not disclose physical addresses: attack vector */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " "to stop being page-shift some time soon. See the " "linux/Documentation/vm/pagemap.txt for details.\n"); -- cgit v1.2.3 From ba117213554bc747561c5b7bf274d60ac93b8598 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Mar 2015 15:01:24 -0400 Subject: Btrfs: account merges/splits properly My fix Btrfs: fix merge delalloc logic only fixed half of the problems, it didn't fix the case where we have two large extents on either side and then join them together with a new small extent. We need to instead keep track of how many extents we have accounted for with each side of the new extent, and then see how many extents we need for the new large extent. If they match then we know we need to keep our reservation, otherwise we need to drop our reservation. This shows up with a case like this [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] Previously the logic would have said that the number extents required for the new size (3) is larger than the number of extents required for the largest side (2) therefore we need to keep our reservation. But this isn't the case, since both sides require a reservation of 2 which leads to 4 for the whole range currently reserved, but we only need 3, so we need to drop one of the reservations. The same problem existed for splits, we'd think we only need 3 extents when creating the hole but in reality we need 4. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 57 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97b601bec326..bb74a4181075 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1542,30 +1542,17 @@ static void btrfs_split_extent_hook(struct inode *inode, u64 new_size; /* - * We need the largest size of the remaining extent to see if we - * need to add a new outstanding extent. Think of the following - * case - * - * [MEAX_EXTENT_SIZEx2 - 4k][4k] - * - * The new_size would just be 4k and we'd think we had enough - * outstanding extents for this if we only took one side of the - * split, same goes for the other direction. We need to see if - * the larger size still is the same amount of extents as the - * original size, because if it is we need to add a new - * outstanding extent. But if we split up and the larger size - * is less than the original then we are good to go since we've - * already accounted for the extra extent in our original - * accounting. + * See the explanation in btrfs_merge_extent_hook, the same + * applies here, just in reverse. */ new_size = orig->end - split + 1; - if ((split - orig->start) > new_size) - new_size = split - orig->start; - - num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); - if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) < num_extents) + new_size = split - orig->start; + num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) >= num_extents) return; } @@ -1591,9 +1578,6 @@ static void btrfs_merge_extent_hook(struct inode *inode, if (!(other->state & EXTENT_DELALLOC)) return; - old_size = other->end - other->start + 1; - if (old_size < (new->end - new->start + 1)) - old_size = (new->end - new->start + 1); if (new->start > other->start) new_size = new->end - other->start + 1; else @@ -1608,13 +1592,32 @@ static void btrfs_merge_extent_hook(struct inode *inode, } /* - * If we grew by another max_extent, just return, we want to keep that - * reserved amount. + * We have to add up either side to figure out how many extents were + * accounted for before we merged into one big extent. If the number of + * extents we accounted for is <= the amount we need for the new range + * then we can return, otherwise drop. Think of it like this + * + * [ 4k][MAX_SIZE] + * + * So we've grown the extent by a MAX_SIZE extent, this would mean we + * need 2 outstanding extents, on one side we have 1 and the other side + * we have 1 so they are == and we can return. But in this case + * + * [MAX_SIZE+4k][MAX_SIZE+4k] + * + * Each range on their own accounts for 2 extents, but merged together + * they are only 3 extents worth of accounting, so we need to drop in + * this case. */ + old_size = other->end - other->start + 1; num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); + old_size = new->end - new->start + 1; + num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, - BTRFS_MAX_EXTENT_SIZE) > num_extents) + BTRFS_MAX_EXTENT_SIZE) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); -- cgit v1.2.3 From bcb7e449ec6350121ac7ca138c0b050ba7caca47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Mar 2015 17:38:02 -0400 Subject: Btrfs: just free dummy extent buffers If we fail during our sanity tests we could get NULL deref's because we unload the module before the dummy extent buffers are free'd via RCU. So check for this case and just free the things directly. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 29850d4a3827..d13ceadcbf18 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4968,6 +4968,12 @@ static int release_extent_buffer(struct extent_buffer *eb) /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_page(eb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { + __free_extent_buffer(eb); + return 1; + } +#endif call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return 1; } -- cgit v1.2.3 From 6a3891c551268dd4ff0969b883c4c8b8d974db8f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Mar 2015 17:38:52 -0400 Subject: Btrfs: add sanity test for outstanding_extents accounting I introduced a regression wrt outstanding_extents accounting. These are tricky areas that aren't easily covered by xfstests as we could change MAX_EXTENT_SIZE at any time. So add sanity tests to cover the various conditions that are tricky in order to make sure we don't introduce regressions in the future. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 + fs/btrfs/extent-tree.c | 3 + fs/btrfs/inode.c | 15 ++++ fs/btrfs/tests/inode-tests.c | 197 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 217 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a0c90a324f53..5bd721a1516e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3910,6 +3910,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, loff_t actual_len, u64 *alloc_hint); int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode); +#endif /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3ac3fefb1625..d1ca86ea9993 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5285,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + if (btrfs_test_is_dummy_root(root)) + return; + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); if (root->fs_info->quota_enabled) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bb74a4181075..3717b3d4c2ce 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, static int btrfs_dirty_inode(struct inode *inode); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode) +{ + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; +} +#endif + static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -1694,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } + /* For sanity tests */ + if (btrfs_test_is_dummy_root(root)) + return; + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, root->fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); @@ -1749,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode, root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); + /* For sanity tests. */ + if (btrfs_test_is_dummy_root(root)) + return; + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) btrfs_free_reserved_data_space(inode, len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index a116b55ce788..054fc0d97131 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -911,6 +911,197 @@ out: return ret; } +static int test_extent_accounting(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + BTRFS_I(inode)->root = root; + btrfs_test_inode_set_ops(inode); + + /* [BTRFS_MAX_EXTENT_SIZE] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 1) { + ret = -EINVAL; + test_msg("Miscount, wanted 1, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, + BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + EXTENT_DELALLOC | EXTENT_DIRTY | + EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * + * I'm artificially adding 2 to outstanding_extents because in the + * buffered IO case we'd add things up as we go, but I don't feel like + * doing that here, this isn't the interesting case we want to test. + */ + BTRFS_I(inode)->outstanding_extents += 2; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, + (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * Refill the hole again just for good measure, because I thought it + * might fail and I'd rather satisfy my paranoia at this point. + */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* Empty */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents) { + ret = -EINVAL; + test_msg("Miscount, wanted 0, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + ret = 0; +out: + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + int btrfs_test_inodes(void) { int ret; @@ -924,5 +1115,9 @@ int btrfs_test_inodes(void) if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - return test_hole_first(); + ret = test_hole_first(); + if (ret) + return ret; + test_msg("Running outstanding_extents tests\n"); + return test_extent_accounting(); } -- cgit v1.2.3 From e1cbbfa5f5aaf40a1fe70856fac4dfcc33e0e651 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 Mar 2015 10:52:28 -0400 Subject: Btrfs: fix outstanding_extents accounting in DIO We are keeping track of how many extents we need to reserve properly based on the amount we want to write, but we were still incrementing outstanding_extents if we wrote less than what we requested. This isn't quite right since we will be limited to our max extent size. So instead lets do something horrible! Keep track of how many outstanding_extents we reserved, and decrement each time we allocate an extent. If we use our entire reserve make sure to jack up outstanding_extents on the inode so the accounting works out properly. Thanks, Reported-by: Filipe Manana Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3717b3d4c2ce..aa1fb534f69f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7239,7 +7239,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - u64 orig_len = len; + u64 *outstanding_extents = NULL; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -7251,6 +7251,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, lockstart = start; lockend = start + len - 1; + if (current->journal_info) { + /* + * Need to pull our outstanding extents and set journal_info to NULL so + * that anything that needs to check if there's a transction doesn't get + * confused. + */ + outstanding_extents = current->journal_info; + current->journal_info = NULL; + } + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. @@ -7374,11 +7384,20 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - if (len < orig_len) { + /* + * If we have an outstanding_extents count still set then we're + * within our reservation, otherwise we need to adjust our inode + * counter appropriately. + */ + if (*outstanding_extents) { + (*outstanding_extents)--; + } else { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } + + current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); } @@ -7402,6 +7421,8 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); + if (outstanding_extents) + current->journal_info = outstanding_extents; return ret; } @@ -8101,6 +8122,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + u64 outstanding_extents = 0; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8138,6 +8160,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; + outstanding_extents = div64_u64(count + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + + /* + * We need to know how many extents we reserved so that we can + * do the accounting properly if we go over the number we + * originally calculated. Abuse current->journal_info for this. + */ + current->journal_info = &outstanding_extents; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_done(inode); @@ -8150,6 +8182,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, iter, offset, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { + current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); else if (ret >= 0 && (size_t)ret < count) -- cgit v1.2.3 From bead55ef775f6e25a8d286c0d47030580f577bec Mon Sep 17 00:00:00 2001 From: hujianyang Date: Thu, 15 Jan 2015 13:17:36 +0800 Subject: ovl: print error message for invalid mount options Overlayfs should print an error message if an incorrect mount option is caught like other filesystems. After this patch, improper option input could be clearly known. Reported-by: Fabian Sturm Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b90952f528b1..ab3c8cb8e52d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -615,6 +615,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) break; default: + pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; } } -- cgit v1.2.3 From 6be4506e34cf6075a1307b646e0a6c46c1c9010d Mon Sep 17 00:00:00 2001 From: hujianyang Date: Thu, 15 Jan 2015 13:19:21 +0800 Subject: ovl: check lowerdir amount for non-upper mount Recently multi-lower layer mount support allow upperdir and workdir to be omitted, then cause overlayfs can be mount with only one lowerdir directory. This action make no sense and have potential risk. This patch check the total number of lower directories to prevent mounting overlayfs with only one directory. Also, an error message is added to indicate lower directories exceed OVL_MAX_STACK limit. Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ab3c8cb8e52d..edbb3ebcdaad 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -870,8 +870,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); - if (stacklen > OVL_MAX_STACK) + if (stacklen > OVL_MAX_STACK) { + pr_err("overlayfs: too many lower directries, limit is %d\n", + OVL_MAX_STACK); goto out_free_lowertmp; + } else if (!ufs->config.upperdir && stacklen == 1) { + pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + goto out_free_lowertmp; + } stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); if (!stack) -- cgit v1.2.3 From 71cbad7e694ee81233b3be3a38b81c3d5872cc6f Mon Sep 17 00:00:00 2001 From: hujianyang Date: Thu, 15 Jan 2015 13:20:57 +0800 Subject: ovl: upper fs should not be R/O After importing multi-lower layer support, users could mount a r/o partition as the left most lowerdir instead of using it as upperdir. And a r/o upperdir may cause an error like overlayfs: failed to create directory ./workdir/work during mount. This patch check the *s_flags* of upper fs and return an error if it is a r/o partition. The checking of *upper_mnt->mnt_sb->s_flags* can be removed now. This patch also remove /* FIXME: workdir is not needed for a R/O mount */ from ovl_fill_super() because: 1) for upper fs r/o case Setting a r/o partition as upper is prevented, no need to care about workdir in this case. 2) for "mount overlay -o ro" with a r/w upper fs case Users could remount overlayfs to r/w in this case, so workdir should not be omitted. Signed-off-by: hujianyang Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index edbb3ebcdaad..5f0d1993e6e3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -529,8 +529,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ufs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && - (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY))) + if (!(*flags & MS_RDONLY) && !ufs->upper_mnt) return -EROFS; return 0; @@ -619,6 +618,15 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) return -EINVAL; } } + + /* Workdir is useless in non-upper mount */ + if (!config->upperdir && config->workdir) { + pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + return 0; } @@ -838,7 +846,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; if (ufs->config.upperdir) { - /* FIXME: workdir is not needed for a R/O mount */ if (!ufs->config.workdir) { pr_err("overlayfs: missing 'workdir'\n"); goto out_free_config; @@ -848,6 +855,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_free_config; + /* Upper fs should not be r/o */ + if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out_put_upperpath; + } + err = ovl_mount_dir(ufs->config.workdir, &workpath); if (err) goto out_put_upperpath; @@ -939,8 +953,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->numlower++; } - /* If the upper fs is r/o or nonexistent, we mark overlayfs r/o too */ - if (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)) + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; sb->s_d_op = &ovl_dentry_operations; -- cgit v1.2.3 From 94e4fe2cab3d43b3ba7c3f721743006a8c9d913a Mon Sep 17 00:00:00 2001 From: Tom Van Braeckel Date: Mon, 12 Jan 2015 05:22:16 +0100 Subject: fuse: explicitly set /dev/fuse file's private_data The misc subsystem (which is used for /dev/fuse) initializes private_data to point to the misc device when a driver has registered a custom open file operation, and initializes it to NULL when a custom open file operation has *not* been provided. This subtle quirk is confusing, to the point where kernel code registers *empty* file open operations to have private_data point to the misc device structure. And it leads to bugs, where the addition or removal of a custom open file operation surprisingly changes the initial contents of a file's private_data structure. So to simplify things in the misc subsystem, a patch [1] has been proposed to *always* set the private_data to point to the misc device, instead of only doing this when a custom open file operation has been registered. But before this patch can be applied we need to modify drivers that make the assumption that a misc device file's private_data is initialized to NULL because they didn't register a custom open file operation, so they don't rely on this assumption anymore. FUSE uses private_data to store the fuse_conn and errors out if this is not initialized to NULL at mount time. Hence, we now set a file's private_data to NULL explicitly, to be independent of whatever value the misc subsystem initializes it to by default. [1] https://lkml.org/lkml/2014/12/4/939 Reported-by: Giedrius Statkevicius Reported-by: Thierry Reding Signed-off-by: Tom Van Braeckel Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 71c4619af333..39706c57ad3c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1353,6 +1353,17 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, return err; } +static int fuse_dev_open(struct inode *inode, struct file *file) +{ + /* + * The fuse device's file's private_data is used to hold + * the fuse_conn(ection) when it is mounted, and is used to + * keep track of whether the file has been mounted already. + */ + file->private_data = NULL; + return 0; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -2220,6 +2231,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, + .open = fuse_dev_open, .llseek = no_llseek, .read = do_sync_read, .aio_read = fuse_dev_read, -- cgit v1.2.3 From 133d558216d9db3617a9fdeebd1bce9afff5f973 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Mar 2015 14:17:31 +0100 Subject: Subject: nfsd: don't recursively call nfsd4_cb_layout_fail Due to a merge error when creating c5c707f9 ("nfsd: implement pNFS layout recalls"), we recursively call nfsd4_cb_layout_fail from itself, leading to stack overflows. Signed-off-by: Christoph Hellwig Fixes: c5c707f9 ("nfsd: implement pNFS layout recalls") Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4layouts.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 3c1bfa1..1028a06 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -587,8 +587,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); - nfsd4_cb_layout_fail(ls); - printk(KERN_WARNING "nfsd: client %s failed to respond to layout recall. " " Fencing..\n", addr_str); -- 1.9.1 --- fs/nfsd/nfs4layouts.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 3c1bfa155571..1028a0629543 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -587,8 +587,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); - nfsd4_cb_layout_fail(ls); - printk(KERN_WARNING "nfsd: client %s failed to respond to layout recall. " " Fencing..\n", addr_str); -- cgit v1.2.3 From 3d5d472cf55d1be091e8a145f80602bf435ece85 Mon Sep 17 00:00:00 2001 From: Taesoo Kim Date: Wed, 25 Mar 2015 15:55:29 -0700 Subject: fs/affs/file.c: unlock/release page on error When affs_bread_ino() fails, correctly unlock the page and release the page cache with proper error value. All write_end() should unlock/release the page that was locked by write_beg(). Signed-off-by: Taesoo Kim Cc: Fabian Frederick Cc: Al Viro Cc: Geert Uytterhoeven Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/file.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/affs/file.c b/fs/affs/file.c index d2468bf95669..a91795e01a7f 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -699,8 +699,10 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, boff = tmp % bsize; if (boff) { bh = affs_bread_ino(inode, bidx, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } tmp = min(bsize - boff, to - from); BUG_ON(boff + tmp > bsize || tmp > bsize); memcpy(AFFS_DATA(bh) + boff, data + from, tmp); @@ -712,14 +714,16 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, bidx++; } else if (bidx) { bh = affs_bread_ino(inode, bidx - 1, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } } while (from + bsize <= to) { prev_bh = bh; bh = affs_getemptyblk_ino(inode, bidx); if (IS_ERR(bh)) - goto out; + goto err_bh; memcpy(AFFS_DATA(bh), data + from, bsize); if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); @@ -751,7 +755,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, prev_bh = bh; bh = affs_bread_ino(inode, bidx, 1); if (IS_ERR(bh)) - goto out; + goto err_bh; tmp = min(bsize, to - from); BUG_ON(tmp > bsize); memcpy(AFFS_DATA(bh), data + from, tmp); @@ -790,12 +794,13 @@ done: if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; +err_first_bh: unlock_page(page); page_cache_release(page); return written; -out: +err_bh: bh = prev_bh; if (!written) written = PTR_ERR(bh); -- cgit v1.2.3 From 98cf21c61a7f5419d82f847c4d77bf6e96a76f5f Mon Sep 17 00:00:00 2001 From: Sergei Antonov Date: Wed, 25 Mar 2015 15:55:34 -0700 Subject: hfsplus: fix B-tree corruption after insertion at position 0 Fix B-tree corruption when a new record is inserted at position 0 in the node in hfs_brec_insert(). In this case a hfs_brec_update_parent() is called to update the parent index node (if exists) and it is passed hfs_find_data with a search_key containing a newly inserted key instead of the key to be updated. This results in an inconsistent index node. The bug reproduces on my machine after an extents overflow record for the catalog file (CNID=4) is inserted into the extents overflow B-tree. Because of a low (reserved) value of CNID=4, it has to become the first record in the first leaf node. The resulting first leaf node is correct: ---------------------------------------------------- | key0.CNID=4 | key1.CNID=123 | key2.CNID=456, ... | ---------------------------------------------------- But the parent index key0 still contains the previous key CNID=123: ----------------------- | key0.CNID=123 | ... | ----------------------- A change in hfs_brec_insert() makes hfs_brec_update_parent() work correctly by preventing it from getting fd->record=-1 value from __hfs_brec_find(). Along the way, I removed duplicate code with unification of the if condition. The resulting code is equivalent to the original code because node is never 0. Also hfs_brec_update_parent() will now return an error after getting a negative fd->record value. However, the return value of hfs_brec_update_parent() is not checked anywhere in the file and I'm leaving it unchanged by this patch. brec.c lacks error checking after some other calls too, but this issue is of less importance than the one being fixed by this patch. Signed-off-by: Sergei Antonov Cc: Joe Perches Reviewed-by: Vyacheslav Dubeyko Acked-by: Hin-Tak Leung Cc: Anton Altaparmakov Cc: Al Viro Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/brec.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 6e560d56094b..754fdf8c6356 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -131,13 +131,16 @@ skip: hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -168,9 +171,6 @@ skip: goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -370,6 +370,8 @@ again: if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd, hfs_find_rec_by_key); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; -- cgit v1.2.3