From 2a9b8cba62c0741109c33a2be700ff3d7703a7c2 Mon Sep 17 00:00:00 2001
From: Roman Pen <roman.penyaev@profitbricks.com>
Date: Sun, 8 Jan 2017 20:59:35 -0500
Subject: ext4: Include forgotten start block on fallocate insert range

While doing 'insert range' start block should be also shifted right.
The bug can be easily reproduced by the following test:

    ptr = malloc(4096);
    assert(ptr);

    fd = open("./ext4.file", O_CREAT | O_TRUNC | O_RDWR, 0600);
    assert(fd >= 0);

    rc = fallocate(fd, 0, 0, 8192);
    assert(rc == 0);
    for (i = 0; i < 2048; i++)
            *((unsigned short *)ptr + i) = 0xbeef;
    rc = pwrite(fd, ptr, 4096, 0);
    assert(rc == 4096);
    rc = pwrite(fd, ptr, 4096, 4096);
    assert(rc == 4096);

    for (block = 2; block < 1000; block++) {
            rc = fallocate(fd, FALLOC_FL_INSERT_RANGE, 4096, 4096);
            assert(rc == 0);

            for (i = 0; i < 2048; i++)
                    *((unsigned short *)ptr + i) = block;

            rc = pwrite(fd, ptr, 4096, 4096);
            assert(rc == 4096);
    }

Because start block is not included in the range the hole appears at
the wrong offset (just after the desired offset) and the following
pwrite() overwrites already existent block, keeping hole untouched.

Simple way to verify wrong behaviour is to check zeroed blocks after
the test:

   $ hexdump ./ext4.file | grep '0000 0000'

The root cause of the bug is a wrong range (start, stop], where start
should be inclusive, i.e. [start, stop].

This patch fixes the problem by including start into the range.  But
not to break left shift (range collapse) stop points to the beginning
of the a block, not to the end.

The other not obvious change is an iterator check on validness in a
main loop.  Because iterator is unsigned the following corner case
should be considered with care: insert a block at 0 offset, when stop
variables overflows and never becomes less than start, which is 0.
To handle this special case iterator is set to NULL to indicate that
end of the loop is reached.

Fixes: 331573febb6a2
Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: Namjae Jeon <namjae.jeon@samsung.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: stable@vger.kernel.org
---
 fs/ext4/extents.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3e295d3350a9..4d3014b5a3f9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5343,8 +5343,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	if (!extent)
 		goto out;
 
-	stop = le32_to_cpu(extent->ee_block) +
-			ext4_ext_get_actual_len(extent);
+	stop = le32_to_cpu(extent->ee_block);
 
        /*
 	 * In case of left shift, Don't start shifting extents until we make
@@ -5383,8 +5382,12 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	else
 		iterator = &stop;
 
-	/* Its safe to start updating extents */
-	while (start < stop) {
+	/*
+	 * Its safe to start updating extents.  Start and stop are unsigned, so
+	 * in case of right shift if extent with 0 block is reached, iterator
+	 * becomes NULL to indicate the end of the loop.
+	 */
+	while (iterator && start <= stop) {
 		path = ext4_find_extent(inode, *iterator, &path, 0);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
@@ -5412,8 +5415,11 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 					ext4_ext_get_actual_len(extent);
 		} else {
 			extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
-			*iterator =  le32_to_cpu(extent->ee_block) > 0 ?
-				le32_to_cpu(extent->ee_block) - 1 : 0;
+			if (le32_to_cpu(extent->ee_block) > 0)
+				*iterator = le32_to_cpu(extent->ee_block) - 1;
+			else
+				/* Beginning is reached, end of the loop */
+				iterator = NULL;
 			/* Update path extent in case we need to stop */
 			while (le32_to_cpu(extent->ee_block) < start)
 				extent++;
-- 
cgit v1.2.3


From 03e916fa8b5577d85471452a3d0c5738aa658dae Mon Sep 17 00:00:00 2001
From: Roman Pen <roman.penyaev@profitbricks.com>
Date: Sun, 8 Jan 2017 21:00:35 -0500
Subject: ext4: do not polute the extents cache while shifting extents

Inside ext4_ext_shift_extents() function ext4_find_extent() is called
without EXT4_EX_NOCACHE flag, which should prevent cache population.

This leads to oudated offsets in the extents tree and wrong blocks
afterwards.

Patch fixes the problem providing EXT4_EX_NOCACHE flag for each
ext4_find_extents() call inside ext4_ext_shift_extents function.

Fixes: 331573febb6a2
Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: Namjae Jeon <namjae.jeon@samsung.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: stable@vger.kernel.org
---
 fs/ext4/extents.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4d3014b5a3f9..2a97dff87b96 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5334,7 +5334,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	ext4_lblk_t stop, *iterator, ex_start, ex_end;
 
 	/* Let path point to the last extent */
-	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+				EXT4_EX_NOCACHE);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 
@@ -5350,7 +5351,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	 * sure the hole is big enough to accommodate the shift.
 	*/
 	if (SHIFT == SHIFT_LEFT) {
-		path = ext4_find_extent(inode, start - 1, &path, 0);
+		path = ext4_find_extent(inode, start - 1, &path,
+					EXT4_EX_NOCACHE);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = path->p_depth;
@@ -5388,7 +5390,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	 * becomes NULL to indicate the end of the loop.
 	 */
 	while (iterator && start <= stop) {
-		path = ext4_find_extent(inode, *iterator, &path, 0);
+		path = ext4_find_extent(inode, *iterator, &path,
+					EXT4_EX_NOCACHE);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = path->p_depth;
-- 
cgit v1.2.3


From 670e9875eb14b112fa6a206a65c776a4fb347eb1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 11 Jan 2017 15:32:22 -0500
Subject: ext4: add debug_want_extra_isize mount option

In order to test the inode extra isize expansion code, it is useful to
be able to easily create file systems that have inodes with extra
isize values smaller than the current desired value.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9d15a6293124..829e4a7b59e4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1284,7 +1284,7 @@ enum {
 	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
 	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
-	Opt_lazytime, Opt_nolazytime,
+	Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
 	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
@@ -1352,6 +1352,7 @@ static const match_table_t tokens = {
 	{Opt_delalloc, "delalloc"},
 	{Opt_lazytime, "lazytime"},
 	{Opt_nolazytime, "nolazytime"},
+	{Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_removed, "mblk_io_submit"},
 	{Opt_removed, "nomblk_io_submit"},
@@ -1557,6 +1558,7 @@ static const struct mount_opts {
 #endif
 	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
 	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
+	{Opt_debug_want_extra_isize, 0, MOPT_GTE0},
 	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
 	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
 							MOPT_SET | MOPT_Q},
@@ -1670,6 +1672,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		if (arg == 0)
 			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
 		sbi->s_commit_interval = HZ * arg;
+	} else if (token == Opt_debug_want_extra_isize) {
+		sbi->s_want_extra_isize = arg;
 	} else if (token == Opt_max_batch_time) {
 		sbi->s_max_batch_time = arg;
 	} else if (token == Opt_min_batch_time) {
@@ -4081,7 +4085,8 @@ no_journal:
 		sb->s_flags |= MS_RDONLY;
 
 	/* determine the minimum size of new large inodes, if present */
-	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
+	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
+	    sbi->s_want_extra_isize == 0) {
 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
 						     EXT4_GOOD_OLD_INODE_SIZE;
 		if (ext4_has_feature_extra_isize(sb)) {
-- 
cgit v1.2.3


From c755e251357a0cee0679081f08c3f4ba797a8009 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 11 Jan 2017 21:50:46 -0500
Subject: ext4: fix deadlock between inline_data and
 ext4_expand_extra_isize_ea()

The xattr_sem deadlock problems fixed in commit 2e81a4eeedca: "ext4:
avoid deadlock when expanding inode size" didn't include the use of
xattr_sem in fs/ext4/inline.c.  With the addition of project quota
which added a new extra inode field, this exposed deadlocks in the
inline_data code similar to the ones fixed by 2e81a4eeedca.

The deadlock can be reproduced via:

   dmesg -n 7
   mke2fs -t ext4 -O inline_data -Fq -I 256 /dev/vdc 32768
   mount -t ext4 -o debug_want_extra_isize=24 /dev/vdc /vdc
   mkdir /vdc/a
   umount /vdc
   mount -t ext4 /dev/vdc /vdc
   echo foo > /vdc/a/foo

and looks like this:

[   11.158815]
[   11.160276] =============================================
[   11.161960] [ INFO: possible recursive locking detected ]
[   11.161960] 4.10.0-rc3-00015-g011b30a8a3cf #160 Tainted: G        W
[   11.161960] ---------------------------------------------
[   11.161960] bash/2519 is trying to acquire lock:
[   11.161960]  (&ei->xattr_sem){++++..}, at: [<c1225a4b>] ext4_expand_extra_isize_ea+0x3d/0x4cd
[   11.161960]
[   11.161960] but task is already holding lock:
[   11.161960]  (&ei->xattr_sem){++++..}, at: [<c1227941>] ext4_try_add_inline_entry+0x3a/0x152
[   11.161960]
[   11.161960] other info that might help us debug this:
[   11.161960]  Possible unsafe locking scenario:
[   11.161960]
[   11.161960]        CPU0
[   11.161960]        ----
[   11.161960]   lock(&ei->xattr_sem);
[   11.161960]   lock(&ei->xattr_sem);
[   11.161960]
[   11.161960]  *** DEADLOCK ***
[   11.161960]
[   11.161960]  May be due to missing lock nesting notation
[   11.161960]
[   11.161960] 4 locks held by bash/2519:
[   11.161960]  #0:  (sb_writers#3){.+.+.+}, at: [<c11a2414>] mnt_want_write+0x1e/0x3e
[   11.161960]  #1:  (&type->i_mutex_dir_key){++++++}, at: [<c119508b>] path_openat+0x338/0x67a
[   11.161960]  #2:  (jbd2_handle){++++..}, at: [<c123314a>] start_this_handle+0x582/0x622
[   11.161960]  #3:  (&ei->xattr_sem){++++..}, at: [<c1227941>] ext4_try_add_inline_entry+0x3a/0x152
[   11.161960]
[   11.161960] stack backtrace:
[   11.161960] CPU: 0 PID: 2519 Comm: bash Tainted: G        W       4.10.0-rc3-00015-g011b30a8a3cf #160
[   11.161960] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1 04/01/2014
[   11.161960] Call Trace:
[   11.161960]  dump_stack+0x72/0xa3
[   11.161960]  __lock_acquire+0xb7c/0xcb9
[   11.161960]  ? kvm_clock_read+0x1f/0x29
[   11.161960]  ? __lock_is_held+0x36/0x66
[   11.161960]  ? __lock_is_held+0x36/0x66
[   11.161960]  lock_acquire+0x106/0x18a
[   11.161960]  ? ext4_expand_extra_isize_ea+0x3d/0x4cd
[   11.161960]  down_write+0x39/0x72
[   11.161960]  ? ext4_expand_extra_isize_ea+0x3d/0x4cd
[   11.161960]  ext4_expand_extra_isize_ea+0x3d/0x4cd
[   11.161960]  ? _raw_read_unlock+0x22/0x2c
[   11.161960]  ? jbd2_journal_extend+0x1e2/0x262
[   11.161960]  ? __ext4_journal_get_write_access+0x3d/0x60
[   11.161960]  ext4_mark_inode_dirty+0x17d/0x26d
[   11.161960]  ? ext4_add_dirent_to_inline.isra.12+0xa5/0xb2
[   11.161960]  ext4_add_dirent_to_inline.isra.12+0xa5/0xb2
[   11.161960]  ext4_try_add_inline_entry+0x69/0x152
[   11.161960]  ext4_add_entry+0xa3/0x848
[   11.161960]  ? __brelse+0x14/0x2f
[   11.161960]  ? _raw_spin_unlock_irqrestore+0x44/0x4f
[   11.161960]  ext4_add_nondir+0x17/0x5b
[   11.161960]  ext4_create+0xcf/0x133
[   11.161960]  ? ext4_mknod+0x12f/0x12f
[   11.161960]  lookup_open+0x39e/0x3fb
[   11.161960]  ? __wake_up+0x1a/0x40
[   11.161960]  ? lock_acquire+0x11e/0x18a
[   11.161960]  path_openat+0x35c/0x67a
[   11.161960]  ? sched_clock_cpu+0xd7/0xf2
[   11.161960]  do_filp_open+0x36/0x7c
[   11.161960]  ? _raw_spin_unlock+0x22/0x2c
[   11.161960]  ? __alloc_fd+0x169/0x173
[   11.161960]  do_sys_open+0x59/0xcc
[   11.161960]  SyS_open+0x1d/0x1f
[   11.161960]  do_int80_syscall_32+0x4f/0x61
[   11.161960]  entry_INT80_32+0x2f/0x2f
[   11.161960] EIP: 0xb76ad469
[   11.161960] EFLAGS: 00000286 CPU: 0
[   11.161960] EAX: ffffffda EBX: 08168ac8 ECX: 00008241 EDX: 000001b6
[   11.161960] ESI: b75e46bc EDI: b7755000 EBP: bfbdb108 ESP: bfbdafc0
[   11.161960]  DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b

Cc: stable@vger.kernel.org # 3.10 (requires 2e81a4eeedca as a prereq)
Reported-by: George Spelvin <linux@sciencehorizons.net>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inline.c | 66 ++++++++++++++++++++++++++------------------------------
 fs/ext4/xattr.c  | 30 +++++++++++---------------
 fs/ext4/xattr.h  | 32 +++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 437df6a1a841..99a5312ced52 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -381,7 +381,7 @@ out:
 static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
 				    unsigned int len)
 {
-	int ret, size;
+	int ret, size, no_expand;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
 	if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
@@ -391,15 +391,14 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
 	if (size < len)
 		return -ENOSPC;
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 
 	if (ei->i_inline_off)
 		ret = ext4_update_inline_data(handle, inode, len);
 	else
 		ret = ext4_create_inline_data(handle, inode, len);
 
-	up_write(&EXT4_I(inode)->xattr_sem);
-
+	ext4_write_unlock_xattr(inode, &no_expand);
 	return ret;
 }
 
@@ -533,7 +532,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
 					      struct inode *inode,
 					      unsigned flags)
 {
-	int ret, needed_blocks;
+	int ret, needed_blocks, no_expand;
 	handle_t *handle = NULL;
 	int retries = 0, sem_held = 0;
 	struct page *page = NULL;
@@ -573,7 +572,7 @@ retry:
 		goto out;
 	}
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	sem_held = 1;
 	/* If some one has already done this for us, just exit. */
 	if (!ext4_has_inline_data(inode)) {
@@ -610,7 +609,7 @@ retry:
 		put_page(page);
 		page = NULL;
 		ext4_orphan_add(handle, inode);
-		up_write(&EXT4_I(inode)->xattr_sem);
+		ext4_write_unlock_xattr(inode, &no_expand);
 		sem_held = 0;
 		ext4_journal_stop(handle);
 		handle = NULL;
@@ -636,7 +635,7 @@ out:
 		put_page(page);
 	}
 	if (sem_held)
-		up_write(&EXT4_I(inode)->xattr_sem);
+		ext4_write_unlock_xattr(inode, &no_expand);
 	if (handle)
 		ext4_journal_stop(handle);
 	brelse(iloc.bh);
@@ -729,7 +728,7 @@ convert:
 int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
 			       unsigned copied, struct page *page)
 {
-	int ret;
+	int ret, no_expand;
 	void *kaddr;
 	struct ext4_iloc iloc;
 
@@ -747,7 +746,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
 		goto out;
 	}
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	BUG_ON(!ext4_has_inline_data(inode));
 
 	kaddr = kmap_atomic(page);
@@ -757,7 +756,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
 	/* clear page dirty so that writepages wouldn't work for us. */
 	ClearPageDirty(page);
 
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	brelse(iloc.bh);
 out:
 	return copied;
@@ -768,7 +767,7 @@ ext4_journalled_write_inline_data(struct inode *inode,
 				  unsigned len,
 				  struct page *page)
 {
-	int ret;
+	int ret, no_expand;
 	void *kaddr;
 	struct ext4_iloc iloc;
 
@@ -778,11 +777,11 @@ ext4_journalled_write_inline_data(struct inode *inode,
 		return NULL;
 	}
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	kaddr = kmap_atomic(page);
 	ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
 	kunmap_atomic(kaddr);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 
 	return iloc.bh;
 }
@@ -1259,7 +1258,7 @@ out:
 int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
 			      struct inode *dir, struct inode *inode)
 {
-	int ret, inline_size;
+	int ret, inline_size, no_expand;
 	void *inline_start;
 	struct ext4_iloc iloc;
 
@@ -1267,7 +1266,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
 	if (ret)
 		return ret;
 
-	down_write(&EXT4_I(dir)->xattr_sem);
+	ext4_write_lock_xattr(dir, &no_expand);
 	if (!ext4_has_inline_data(dir))
 		goto out;
 
@@ -1313,7 +1312,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
 
 out:
 	ext4_mark_inode_dirty(handle, dir);
-	up_write(&EXT4_I(dir)->xattr_sem);
+	ext4_write_unlock_xattr(dir, &no_expand);
 	brelse(iloc.bh);
 	return ret;
 }
@@ -1673,7 +1672,7 @@ int ext4_delete_inline_entry(handle_t *handle,
 			     struct buffer_head *bh,
 			     int *has_inline_data)
 {
-	int err, inline_size;
+	int err, inline_size, no_expand;
 	struct ext4_iloc iloc;
 	void *inline_start;
 
@@ -1681,7 +1680,7 @@ int ext4_delete_inline_entry(handle_t *handle,
 	if (err)
 		return err;
 
-	down_write(&EXT4_I(dir)->xattr_sem);
+	ext4_write_lock_xattr(dir, &no_expand);
 	if (!ext4_has_inline_data(dir)) {
 		*has_inline_data = 0;
 		goto out;
@@ -1715,7 +1714,7 @@ int ext4_delete_inline_entry(handle_t *handle,
 
 	ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
 out:
-	up_write(&EXT4_I(dir)->xattr_sem);
+	ext4_write_unlock_xattr(dir, &no_expand);
 	brelse(iloc.bh);
 	if (err != -ENOENT)
 		ext4_std_error(dir->i_sb, err);
@@ -1814,11 +1813,11 @@ out:
 
 int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
 {
-	int ret;
+	int ret, no_expand;
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	ret = ext4_destroy_inline_data_nolock(handle, inode);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 
 	return ret;
 }
@@ -1903,7 +1902,7 @@ out:
 void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 {
 	handle_t *handle;
-	int inline_size, value_len, needed_blocks;
+	int inline_size, value_len, needed_blocks, no_expand;
 	size_t i_size;
 	void *value = NULL;
 	struct ext4_xattr_ibody_find is = {
@@ -1920,7 +1919,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 	if (IS_ERR(handle))
 		return;
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	if (!ext4_has_inline_data(inode)) {
 		*has_inline = 0;
 		ext4_journal_stop(handle);
@@ -1978,7 +1977,7 @@ out_error:
 	up_write(&EXT4_I(inode)->i_data_sem);
 out:
 	brelse(is.iloc.bh);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	kfree(value);
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
@@ -1994,7 +1993,7 @@ out:
 
 int ext4_convert_inline_data(struct inode *inode)
 {
-	int error, needed_blocks;
+	int error, needed_blocks, no_expand;
 	handle_t *handle;
 	struct ext4_iloc iloc;
 
@@ -2016,15 +2015,10 @@ int ext4_convert_inline_data(struct inode *inode)
 		goto out_free;
 	}
 
-	down_write(&EXT4_I(inode)->xattr_sem);
-	if (!ext4_has_inline_data(inode)) {
-		up_write(&EXT4_I(inode)->xattr_sem);
-		goto out;
-	}
-
-	error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
-	up_write(&EXT4_I(inode)->xattr_sem);
-out:
+	ext4_write_lock_xattr(inode, &no_expand);
+	if (ext4_has_inline_data(inode))
+		error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	ext4_journal_stop(handle);
 out_free:
 	brelse(iloc.bh);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 5a94fa52b74f..c40bd55b6400 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1188,16 +1188,14 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	struct ext4_xattr_block_find bs = {
 		.s = { .not_found = -ENODATA, },
 	};
-	unsigned long no_expand;
+	int no_expand;
 	int error;
 
 	if (!name)
 		return -EINVAL;
 	if (strlen(name) > 255)
 		return -ERANGE;
-	down_write(&EXT4_I(inode)->xattr_sem);
-	no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
-	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_write_lock_xattr(inode, &no_expand);
 
 	error = ext4_reserve_inode_write(handle, inode, &is.iloc);
 	if (error)
@@ -1264,7 +1262,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		ext4_xattr_update_super_block(handle, inode->i_sb);
 		inode->i_ctime = current_time(inode);
 		if (!value)
-			ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+			no_expand = 0;
 		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
 		/*
 		 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1278,9 +1276,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 cleanup:
 	brelse(is.iloc.bh);
 	brelse(bs.bh);
-	if (no_expand == 0)
-		ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	return error;
 }
 
@@ -1497,12 +1493,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	int error = 0, tried_min_extra_isize = 0;
 	int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
 	int isize_diff;	/* How much do we need to grow i_extra_isize */
+	int no_expand;
+
+	if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
+		return 0;
 
-	down_write(&EXT4_I(inode)->xattr_sem);
-	/*
-	 * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty
-	 */
-	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 retry:
 	isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize;
 	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
@@ -1584,17 +1579,16 @@ shift:
 	EXT4_I(inode)->i_extra_isize = new_extra_isize;
 	brelse(bh);
 out:
-	ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	return 0;
 
 cleanup:
 	brelse(bh);
 	/*
-	 * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode
-	 * size expansion failed.
+	 * Inode size expansion failed; don't try again
 	 */
-	up_write(&EXT4_I(inode)->xattr_sem);
+	no_expand = 1;
+	ext4_write_unlock_xattr(inode, &no_expand);
 	return error;
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index a92e783fa057..099c8b670ef5 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -102,6 +102,38 @@ extern const struct xattr_handler ext4_xattr_security_handler;
 
 #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
 
+/*
+ * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes.
+ * The first is to signal that there the inline xattrs and data are
+ * taking up so much space that we might as well not keep trying to
+ * expand it.  The second is that xattr_sem is taken for writing, so
+ * we shouldn't try to recurse into the inode expansion.  For this
+ * second case, we need to make sure that we take save and restore the
+ * NO_EXPAND state flag appropriately.
+ */
+static inline void ext4_write_lock_xattr(struct inode *inode, int *save)
+{
+	down_write(&EXT4_I(inode)->xattr_sem);
+	*save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+}
+
+static inline int ext4_write_trylock_xattr(struct inode *inode, int *save)
+{
+	if (down_write_trylock(&EXT4_I(inode)->xattr_sem) == 0)
+		return 0;
+	*save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	return 1;
+}
+
+static inline void ext4_write_unlock_xattr(struct inode *inode, int *save)
+{
+	if (*save == 0)
+		ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	up_write(&EXT4_I(inode)->xattr_sem);
+}
+
 extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 
 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
-- 
cgit v1.2.3


From b907f2d5194c2636623415d89cfb91d692af0629 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 11 Jan 2017 22:14:49 -0500
Subject: ext4: avoid calling ext4_mark_inode_dirty() under unneeded semaphores

There is no need to call ext4_mark_inode_dirty while holding xattr_sem
or i_data_sem, so where it's easy to avoid it, move it out from the
critical region.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inline.c | 9 +++------
 fs/ext4/inode.c  | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 99a5312ced52..31f98dd04e51 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1042,7 +1042,6 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
 	dir->i_mtime = dir->i_ctime = current_time(dir);
 	ext4_update_dx_flag(dir);
 	dir->i_version++;
-	ext4_mark_inode_dirty(handle, dir);
 	return 1;
 }
 
@@ -1311,8 +1310,8 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
 	ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
 
 out:
-	ext4_mark_inode_dirty(handle, dir);
 	ext4_write_unlock_xattr(dir, &no_expand);
+	ext4_mark_inode_dirty(handle, dir);
 	brelse(iloc.bh);
 	return ret;
 }
@@ -1708,13 +1707,11 @@ int ext4_delete_inline_entry(handle_t *handle,
 	if (err)
 		goto out;
 
-	err = ext4_mark_inode_dirty(handle, dir);
-	if (unlikely(err))
-		goto out;
-
 	ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
 out:
 	ext4_write_unlock_xattr(dir, &no_expand);
+	if (likely(err == 0))
+		err = ext4_mark_inode_dirty(handle, dir);
 	brelse(iloc.bh);
 	if (err != -ENOENT)
 		ext4_std_error(dir->i_sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 88d57af1b516..86dde0667ccc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2464,8 +2464,8 @@ update_disksize:
 			disksize = i_size;
 		if (disksize > EXT4_I(inode)->i_disksize)
 			EXT4_I(inode)->i_disksize = disksize;
-		err2 = ext4_mark_inode_dirty(handle, inode);
 		up_write(&EXT4_I(inode)->i_data_sem);
+		err2 = ext4_mark_inode_dirty(handle, inode);
 		if (err2)
 			ext4_error(inode->i_sb,
 				   "Failed to mark inode %lu dirty",
-- 
cgit v1.2.3


From 01daf9452569fe2e69e27fe3e617b43d2ebb1e93 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 22 Jan 2017 19:35:49 -0500
Subject: ext4: propagate error values from ext4_inline_data_truncate()

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h   |  2 +-
 fs/ext4/inline.c | 40 +++++++++++++++++++++++-----------------
 fs/ext4/inode.c  |  4 +++-
 3 files changed, 27 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6bcb9622fdf9..1cd077e02517 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3023,7 +3023,7 @@ extern int ext4_inline_data_fiemap(struct inode *inode,
 extern int ext4_try_to_evict_inline_data(handle_t *handle,
 					 struct inode *inode,
 					 int needed);
-extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline);
 
 extern int ext4_convert_inline_data(struct inode *inode);
 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 31f98dd04e51..338cfd862adb 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1896,10 +1896,10 @@ out:
 	return error;
 }
 
-void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
+int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 {
 	handle_t *handle;
-	int inline_size, value_len, needed_blocks, no_expand;
+	int inline_size, value_len, needed_blocks, no_expand, err = 0;
 	size_t i_size;
 	void *value = NULL;
 	struct ext4_xattr_ibody_find is = {
@@ -1914,19 +1914,19 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 	needed_blocks = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
 	if (IS_ERR(handle))
-		return;
+		return PTR_ERR(handle);
 
 	ext4_write_lock_xattr(inode, &no_expand);
 	if (!ext4_has_inline_data(inode)) {
 		*has_inline = 0;
 		ext4_journal_stop(handle);
-		return;
+		return 0;
 	}
 
-	if (ext4_orphan_add(handle, inode))
+	if ((err = ext4_orphan_add(handle, inode)) != 0)
 		goto out;
 
-	if (ext4_get_inode_loc(inode, &is.iloc))
+	if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0)
 		goto out;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
@@ -1937,24 +1937,29 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 	if (i_size < inline_size) {
 		/* Clear the content in the xattr space. */
 		if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
-			if (ext4_xattr_ibody_find(inode, &i, &is))
+			if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
 				goto out_error;
 
 			BUG_ON(is.s.not_found);
 
 			value_len = le32_to_cpu(is.s.here->e_value_size);
 			value = kmalloc(value_len, GFP_NOFS);
-			if (!value)
+			if (!value) {
+				err = -ENOMEM;
 				goto out_error;
+			}
 
-			if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
-						value, value_len))
+			err = ext4_xattr_ibody_get(inode, i.name_index,
+						   i.name, value, value_len);
+			if (err <= 0)
 				goto out_error;
 
 			i.value = value;
 			i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
 					i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
-			if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
+			err = ext4_xattr_ibody_inline_set(handle, inode,
+							  &i, &is);
+			if (err)
 				goto out_error;
 		}
 
@@ -1979,13 +1984,14 @@ out:
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 
-	inode->i_mtime = inode->i_ctime = current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
-	if (IS_SYNC(inode))
-		ext4_handle_sync(handle);
-
+	if (err == 0) {
+		inode->i_mtime = inode->i_ctime = current_time(inode);
+		err = ext4_mark_inode_dirty(handle, inode);
+		if (IS_SYNC(inode))
+			ext4_handle_sync(handle);
+	}
 	ext4_journal_stop(handle);
-	return;
+	return err;
 }
 
 int ext4_convert_inline_data(struct inode *inode)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 86dde0667ccc..1e2c881f102d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4222,7 +4222,9 @@ int ext4_truncate(struct inode *inode)
 	if (ext4_has_inline_data(inode)) {
 		int has_inline = 1;
 
-		ext4_inline_data_truncate(inode, &has_inline);
+		err = ext4_inline_data_truncate(inode, &has_inline);
+		if (err)
+			return err;
 		if (has_inline)
 			return 0;
 	}
-- 
cgit v1.2.3


From 43c73221b3b1cdc9156c78287c5f4b29dc085d22 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 22 Jan 2017 19:35:52 -0500
Subject: ext4: replace BUG_ON with WARN_ON in mb_find_extent()

The last BUG_ON in mb_find_extent() is apparently triggering in some
rare cases.  Most of the time it indicates a bug in the buddy bitmap
algorithms, but there are some weird cases where it can trigger when
buddy bitmap is still in memory, but the block bitmap has to be read
from disk, and there is disk or memory corruption such that the block
bitmap and the buddy bitmap are out of sync.

Google-Bug-Id: #33702157

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7ae43c59bc79..ec2f64b0e696 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1556,7 +1556,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
 		ex->fe_len += 1 << order;
 	}
 
-	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
+	if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) {
+		/* Should never happen! (but apparently sometimes does?!?) */
+		WARN_ON(1);
+		ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
+			   "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
+			   block, order, needed, ex->fe_group, ex->fe_start,
+			   ex->fe_len, ex->fe_logical);
+		ex->fe_len = 0;
+		ex->fe_start = 0;
+		ex->fe_group = 0;
+	}
 	return ex->fe_len;
 }
 
-- 
cgit v1.2.3


From cd648b8a8fd5071d232242d5ee7ee3c0815776af Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 27 Jan 2017 14:34:30 -0500
Subject: ext4: trim allocation requests to group size

If filesystem groups are artifically small (using parameter -g to
mkfs.ext4), ext4_mb_normalize_request() can result in a request that is
larger than a block group. Trim the request size to not confuse
allocation code.

Reported-by: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/mballoc.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ec2f64b0e696..f513f273ff89 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3133,6 +3133,13 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	if (ar->pright && start + size - 1 >= ar->lright)
 		size -= start + size - ar->lright;
 
+	/*
+	 * Trim allocation request for filesystems with artificially small
+	 * groups.
+	 */
+	if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
+		size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
+
 	end = start + size;
 
 	/* check we don't cross already preallocated blocks */
-- 
cgit v1.2.3


From 3b136499e906460919f0d21a49db1aaccf0ae963 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 27 Jan 2017 14:35:38 -0500
Subject: ext4: fix data corruption in data=journal mode

ext4_journalled_write_end() did not propely handle all the cases when
generic_perform_write() did not copy all the data into the target page
and could mark buffers with uninitialized contents as uptodate and dirty
leading to possible data corruption (which would be quickly fixed by
generic_perform_write() retrying the write but still). Fix the problem
by carefully handling the case when the page that is written to is not
uptodate.

CC: stable@vger.kernel.org
Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1e2c881f102d..918d351d5b94 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1385,7 +1385,9 @@ errout:
  * set the buffer to be dirty, since in data=journalled mode we need
  * to call ext4_handle_dirty_metadata() instead.
  */
-static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
+static void ext4_journalled_zero_new_buffers(handle_t *handle,
+					    struct page *page,
+					    unsigned from, unsigned to)
 {
 	unsigned int block_start = 0, block_end;
 	struct buffer_head *head, *bh;
@@ -1402,7 +1404,7 @@ static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
 					size = min(to, block_end) - start;
 
 					zero_user(page, start, size);
-					set_buffer_uptodate(bh);
+					write_end_fn(handle, bh);
 				}
 				clear_buffer_new(bh);
 			}
@@ -1434,15 +1436,16 @@ static int ext4_journalled_write_end(struct file *file,
 	if (ext4_has_inline_data(inode))
 		copied = ext4_write_inline_data_end(inode, pos, len,
 						    copied, page);
-	else {
-		if (copied < len) {
-			if (!PageUptodate(page))
-				copied = 0;
-			zero_new_buffers(page, from+copied, to);
-		}
-
+	else if (unlikely(copied < len) && !PageUptodate(page)) {
+		copied = 0;
+		ext4_journalled_zero_new_buffers(handle, page, from, to);
+	} else {
+		if (unlikely(copied < len))
+			ext4_journalled_zero_new_buffers(handle, page,
+							 from + copied, to);
 		ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
-					     to, &partial, write_end_fn);
+					     from + copied, &partial,
+					     write_end_fn);
 		if (!partial)
 			SetPageUptodate(page);
 	}
-- 
cgit v1.2.3


From dbfcef6b0f4012c57bc0b6e0e660d5ed12a5eaed Mon Sep 17 00:00:00 2001
From: Sahitya Tummala <stummala@codeaurora.org>
Date: Wed, 1 Feb 2017 20:49:35 -0500
Subject: jbd2: fix use after free in kjournald2()

Below is the synchronization issue between unmount and kjournald2
contexts, which results into use after free issue in kjournald2().
Fix this issue by using journal->j_state_lock to synchronize the
wait_event() done in journal_kill_thread() and the wake_up() done
in kjournald2().

TASK 1:
umount cmd:
   |--jbd2_journal_destroy() {
       |--journal_kill_thread() {
            write_lock(&journal->j_state_lock);
	    journal->j_flags |= JBD2_UNMOUNT;
	    ...
	    write_unlock(&journal->j_state_lock);
	    wake_up(&journal->j_wait_commit);	   TASK 2 wakes up here:
	    					   kjournald2() {
						     ...
						     checks JBD2_UNMOUNT flag and calls goto end-loop;
						     ...
						     end_loop:
						       write_unlock(&journal->j_state_lock);
						       journal->j_task = NULL; --> If this thread gets
						       pre-empted here, then TASK 1 wait_event will
						       exit even before this thread is completely
						       done.
	    wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
	    ...
	    write_lock(&journal->j_state_lock);
	    write_unlock(&journal->j_state_lock);
	  }
       |--kfree(journal);
     }
}
						       wake_up(&journal->j_wait_done_commit); --> this step
						       now results into use after free issue.
						   }

Signed-off-by: Sahitya Tummala <stummala@codeaurora.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a097048ed1a3..85d1483939b2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -276,11 +276,11 @@ loop:
 	goto loop;
 
 end_loop:
-	write_unlock(&journal->j_state_lock);
 	del_timer_sync(&journal->j_commit_timer);
 	journal->j_task = NULL;
 	wake_up(&journal->j_wait_done_commit);
 	jbd_debug(1, "Journal thread exiting.\n");
+	write_unlock(&journal->j_state_lock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From dd01b690f8f4b1e414f89e5a9a5326bf720d6652 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 1 Feb 2017 21:07:11 -0500
Subject: ext4: fix use-after-iput when fscrypt contexts are inconsistent

In the case where the child's encryption context was inconsistent with
its parent directory, we were using inode->i_sb and inode->i_ino after
the inode had already been iput().  Fix this by doing the iput() in the
correct places.

Note: only ext4 had this bug, not f2fs and ubifs.

Fixes: d9cdc9033181 ("ext4 crypto: enforce context consistency")
Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/namei.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index bb880c326191..931da9d5d915 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1618,13 +1618,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 		    !fscrypt_has_permitted_context(dir, inode)) {
 			int nokey = ext4_encrypted_inode(inode) &&
 				!fscrypt_has_encryption_key(inode);
-			iput(inode);
-			if (nokey)
+			if (nokey) {
+				iput(inode);
 				return ERR_PTR(-ENOKEY);
+			}
 			ext4_warning(inode->i_sb,
 				     "Inconsistent encryption contexts: %lu/%lu",
 				     (unsigned long) dir->i_ino,
 				     (unsigned long) inode->i_ino);
+			iput(inode);
 			return ERR_PTR(-EPERM);
 		}
 	}
-- 
cgit v1.2.3


From 1c83a9aab807f7452c4957b2401e1cbf43941820 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 2 Feb 2017 11:52:14 -0500
Subject: ext4: move halfmd4 into hash.c directly

The "half md4" transform should not be used by any new code. And
fortunately, it's only used now by ext4. Since ext4 supports several
hashing methods, at some point it might be desirable to move to
something like SipHash. As an intermediate step, remove half md4 from
cryptohash.h and lib, and make it just a local function in ext4's
hash.c. There's precedent for doing this; the other function ext can use
for its hashes -- TEA -- is also implemented in the same place. Also, by
being a local function, this might allow gcc to perform some additional
optimizations.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/hash.c             | 71 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/cryptohash.h |  2 --
 lib/Makefile               |  2 +-
 lib/halfmd4.c              | 67 -------------------------------------------
 4 files changed, 71 insertions(+), 71 deletions(-)
 delete mode 100644 lib/halfmd4.c

(limited to 'fs')

diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index e026aa941fd5..38b8a96eb97c 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -10,7 +10,8 @@
  */
 
 #include <linux/fs.h>
-#include <linux/cryptohash.h>
+#include <linux/compiler.h>
+#include <linux/bitops.h>
 #include "ext4.h"
 
 #define DELTA 0x9E3779B9
@@ -32,6 +33,74 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 	buf[1] += b1;
 }
 
+/* F, G and H are basic MD4 functions: selection, majority, parity */
+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+
+/*
+ * The generic round function.  The application is so specific that
+ * we don't bother protecting all the arguments with parens, as is generally
+ * good macro practice, in favor of extra legibility.
+ * Rotation is separate from addition to prevent recomputation
+ */
+#define ROUND(f, a, b, c, d, x, s)	\
+	(a += f(b, c, d) + x, a = rol32(a, s))
+#define K1 0
+#define K2 013240474631UL
+#define K3 015666365641UL
+
+/*
+ * Basic cut-down MD4 transform.  Returns only 32 bits of result.
+ */
+static __u32 half_md4_transform(__u32 buf[4], __u32 const in[8])
+{
+	__u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
+
+	/* Round 1 */
+	ROUND(F, a, b, c, d, in[0] + K1,  3);
+	ROUND(F, d, a, b, c, in[1] + K1,  7);
+	ROUND(F, c, d, a, b, in[2] + K1, 11);
+	ROUND(F, b, c, d, a, in[3] + K1, 19);
+	ROUND(F, a, b, c, d, in[4] + K1,  3);
+	ROUND(F, d, a, b, c, in[5] + K1,  7);
+	ROUND(F, c, d, a, b, in[6] + K1, 11);
+	ROUND(F, b, c, d, a, in[7] + K1, 19);
+
+	/* Round 2 */
+	ROUND(G, a, b, c, d, in[1] + K2,  3);
+	ROUND(G, d, a, b, c, in[3] + K2,  5);
+	ROUND(G, c, d, a, b, in[5] + K2,  9);
+	ROUND(G, b, c, d, a, in[7] + K2, 13);
+	ROUND(G, a, b, c, d, in[0] + K2,  3);
+	ROUND(G, d, a, b, c, in[2] + K2,  5);
+	ROUND(G, c, d, a, b, in[4] + K2,  9);
+	ROUND(G, b, c, d, a, in[6] + K2, 13);
+
+	/* Round 3 */
+	ROUND(H, a, b, c, d, in[3] + K3,  3);
+	ROUND(H, d, a, b, c, in[7] + K3,  9);
+	ROUND(H, c, d, a, b, in[2] + K3, 11);
+	ROUND(H, b, c, d, a, in[6] + K3, 15);
+	ROUND(H, a, b, c, d, in[1] + K3,  3);
+	ROUND(H, d, a, b, c, in[5] + K3,  9);
+	ROUND(H, c, d, a, b, in[0] + K3, 11);
+	ROUND(H, b, c, d, a, in[4] + K3, 15);
+
+	buf[0] += a;
+	buf[1] += b;
+	buf[2] += c;
+	buf[3] += d;
+
+	return buf[1]; /* "most hashed" word */
+}
+#undef ROUND
+#undef K1
+#undef K2
+#undef K3
+#undef F
+#undef G
+#undef H
 
 /* The old legacy hash */
 static __u32 dx_hack_hash_unsigned(const char *name, int len)
diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index f4754282c9c2..3252799832cf 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -15,6 +15,4 @@ void sha_transform(__u32 *digest, const char *data, __u32 *W);
 
 void md5_transform(__u32 *hash, __u32 const *in);
 
-__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]);
-
 #endif
diff --git a/lib/Makefile b/lib/Makefile
index bc4073a8cd08..19ea76149a37 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -31,7 +31,7 @@ lib-$(CONFIG_HAS_DMA) += dma-noop.o
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
 
-obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
+obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
diff --git a/lib/halfmd4.c b/lib/halfmd4.c
deleted file mode 100644
index 137e861d9690..000000000000
--- a/lib/halfmd4.c
+++ /dev/null
@@ -1,67 +0,0 @@
-#include <linux/compiler.h>
-#include <linux/export.h>
-#include <linux/cryptohash.h>
-#include <linux/bitops.h>
-
-/* F, G and H are basic MD4 functions: selection, majority, parity */
-#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
-#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
-#define H(x, y, z) ((x) ^ (y) ^ (z))
-
-/*
- * The generic round function.  The application is so specific that
- * we don't bother protecting all the arguments with parens, as is generally
- * good macro practice, in favor of extra legibility.
- * Rotation is separate from addition to prevent recomputation
- */
-#define ROUND(f, a, b, c, d, x, s)	\
-	(a += f(b, c, d) + x, a = rol32(a, s))
-#define K1 0
-#define K2 013240474631UL
-#define K3 015666365641UL
-
-/*
- * Basic cut-down MD4 transform.  Returns only 32 bits of result.
- */
-__u32 half_md4_transform(__u32 buf[4], __u32 const in[8])
-{
-	__u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
-
-	/* Round 1 */
-	ROUND(F, a, b, c, d, in[0] + K1,  3);
-	ROUND(F, d, a, b, c, in[1] + K1,  7);
-	ROUND(F, c, d, a, b, in[2] + K1, 11);
-	ROUND(F, b, c, d, a, in[3] + K1, 19);
-	ROUND(F, a, b, c, d, in[4] + K1,  3);
-	ROUND(F, d, a, b, c, in[5] + K1,  7);
-	ROUND(F, c, d, a, b, in[6] + K1, 11);
-	ROUND(F, b, c, d, a, in[7] + K1, 19);
-
-	/* Round 2 */
-	ROUND(G, a, b, c, d, in[1] + K2,  3);
-	ROUND(G, d, a, b, c, in[3] + K2,  5);
-	ROUND(G, c, d, a, b, in[5] + K2,  9);
-	ROUND(G, b, c, d, a, in[7] + K2, 13);
-	ROUND(G, a, b, c, d, in[0] + K2,  3);
-	ROUND(G, d, a, b, c, in[2] + K2,  5);
-	ROUND(G, c, d, a, b, in[4] + K2,  9);
-	ROUND(G, b, c, d, a, in[6] + K2, 13);
-
-	/* Round 3 */
-	ROUND(H, a, b, c, d, in[3] + K3,  3);
-	ROUND(H, d, a, b, c, in[7] + K3,  9);
-	ROUND(H, c, d, a, b, in[2] + K3, 11);
-	ROUND(H, b, c, d, a, in[6] + K3, 15);
-	ROUND(H, a, b, c, d, in[1] + K3,  3);
-	ROUND(H, d, a, b, c, in[5] + K3,  9);
-	ROUND(H, c, d, a, b, in[0] + K3, 11);
-	ROUND(H, b, c, d, a, in[4] + K3, 15);
-
-	buf[0] += a;
-	buf[1] += b;
-	buf[2] += c;
-	buf[3] += d;
-
-	return buf[1]; /* "most hashed" word */
-}
-EXPORT_SYMBOL(half_md4_transform);
-- 
cgit v1.2.3


From eb5efbcb762aee4b454b04f7115f73ccbcf8f0ef Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 4 Feb 2017 23:04:00 -0500
Subject: ext4: fix inline data error paths

The write_end() function must always unlock the page and drop its ref
count, even on an error.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/inline.c |  9 ++++++++-
 fs/ext4/inode.c  | 20 +++++++++++++++-----
 2 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 338cfd862adb..b777b8aa14ae 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -943,8 +943,15 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
 				  struct page *page)
 {
 	int i_size_changed = 0;
+	int ret;
 
-	copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+	ret = ext4_write_inline_data_end(inode, pos, len, copied, page);
+	if (ret < 0) {
+		unlock_page(page);
+		put_page(page);
+		return ret;
+	}
+	copied = ret;
 
 	/*
 	 * No need to use i_size_read() here, the i_size
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 918d351d5b94..af97b9170358 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1330,8 +1330,11 @@ static int ext4_write_end(struct file *file,
 	if (ext4_has_inline_data(inode)) {
 		ret = ext4_write_inline_data_end(inode, pos, len,
 						 copied, page);
-		if (ret < 0)
+		if (ret < 0) {
+			unlock_page(page);
+			put_page(page);
 			goto errout;
+		}
 		copied = ret;
 	} else
 		copied = block_write_end(file, mapping, pos,
@@ -1433,10 +1436,16 @@ static int ext4_journalled_write_end(struct file *file,
 
 	BUG_ON(!ext4_handle_valid(handle));
 
-	if (ext4_has_inline_data(inode))
-		copied = ext4_write_inline_data_end(inode, pos, len,
-						    copied, page);
-	else if (unlikely(copied < len) && !PageUptodate(page)) {
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0) {
+			unlock_page(page);
+			put_page(page);
+			goto errout;
+		}
+		copied = ret;
+	} else if (unlikely(copied < len) && !PageUptodate(page)) {
 		copied = 0;
 		ext4_journalled_zero_new_buffers(handle, page, from, to);
 	} else {
@@ -1471,6 +1480,7 @@ static int ext4_journalled_write_end(struct file *file,
 		 */
 		ext4_orphan_add(handle, inode);
 
+errout:
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-- 
cgit v1.2.3


From e112666b4959b25a8552d63bc564e1059be703e8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 4 Feb 2017 23:14:19 -0500
Subject: jbd2: don't leak modified metadata buffers on an aborted journal

If the journal has been aborted, we shouldn't mark the underlying
buffer head as dirty, since that will cause the metadata block to get
modified.  And if the journal has been aborted, we shouldn't allow
this since it will almost certainly lead to a corrupted file system.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/jbd2/transaction.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e1652665bd93..5e659ee08d6a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1863,7 +1863,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 
 	__blist_del_buffer(list, jh);
 	jh->b_jlist = BJ_None;
-	if (test_clear_buffer_jbddirty(bh))
+	if (transaction && is_journal_aborted(transaction->t_journal))
+		clear_buffer_jbddirty(bh);
+	else if (test_clear_buffer_jbddirty(bh))
 		mark_buffer_dirty(bh);	/* Expose it to the VM */
 }
 
-- 
cgit v1.2.3


From 97abd7d4b5d9c48ec15c425485f054e1c15e591b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 4 Feb 2017 23:38:06 -0500
Subject: ext4: preserve the needs_recovery flag when the journal is aborted

If the journal is aborted, the needs_recovery feature flag should not
be removed.  Otherwise, it's the journal might not get replayed and
this could lead to more data getting lost.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 829e4a7b59e4..3fef82e79131 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -825,6 +825,7 @@ static void ext4_put_super(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
+	int aborted = 0;
 	int i, err;
 
 	ext4_unregister_li_request(sb);
@@ -834,9 +835,10 @@ static void ext4_put_super(struct super_block *sb)
 	destroy_workqueue(sbi->rsv_conversion_wq);
 
 	if (sbi->s_journal) {
+		aborted = is_journal_aborted(sbi->s_journal);
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
-		if (err < 0)
+		if ((err < 0) && !aborted)
 			ext4_abort(sb, "Couldn't clean up the journal");
 	}
 
@@ -847,7 +849,7 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 
-	if (!(sb->s_flags & MS_RDONLY)) {
+	if (!(sb->s_flags & MS_RDONLY) && !aborted) {
 		ext4_clear_feature_journal_needs_recovery(sb);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
 	}
-- 
cgit v1.2.3


From 4753d8a24d4588657bc0a4cd66d4e282dff15c8c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 5 Feb 2017 01:26:48 -0500
Subject: ext4: return EROFS if device is r/o and journal replay is needed

If the file system requires journal recovery, and the device is
read-ony, return EROFS to the mount system call.  This allows xfstests
generic/050 to pass.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3fef82e79131..514e5fc59893 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3925,7 +3925,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * root first: it may be modified in the journal!
 	 */
 	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
-		if (ext4_load_journal(sb, es, journal_devnum))
+		err = ext4_load_journal(sb, es, journal_devnum);
+		if (err)
 			goto failed_mount3a;
 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 		   ext4_has_feature_journal_needs_recovery(sb)) {
-- 
cgit v1.2.3


From 9549a168bd500db1a76914e50775f7cd1690acef Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 5 Feb 2017 01:27:48 -0500
Subject: ext4: rename s_resize_flags to s_ext4_flags

We are currently using one bit in s_resize_flags; rename it in order
to allow more of the bits in that unsigned long for other purposes.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h   | 10 +++++++---
 fs/ext4/resize.c |  5 +++--
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1cd077e02517..2e7e02f2f771 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1399,8 +1399,7 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
-	unsigned long s_resize_flags;		/* Flags indicating if there
-						   is a resizer */
+	unsigned long s_ext4_flags;		/* Ext4 superblock flags */
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
@@ -1833,6 +1832,12 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
 	return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
 }
 
+/*
+ * Superblock flags
+ */
+#define EXT4_FLAGS_RESIZING	0
+
+
 /*
  * Default values for user and/or group using reserved blocks
  */
@@ -3217,7 +3222,6 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
 					    EXT4_WQ_HASH_SZ])
 extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
 
-#define EXT4_RESIZING	0
 extern int ext4_resize_begin(struct super_block *sb);
 extern void ext4_resize_end(struct super_block *sb);
 
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index cf681004b196..c3ed9021b781 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -45,7 +45,8 @@ int ext4_resize_begin(struct super_block *sb)
 		return -EPERM;
 	}
 
-	if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags))
+	if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
+				  &EXT4_SB(sb)->s_ext4_flags))
 		ret = -EBUSY;
 
 	return ret;
@@ -53,7 +54,7 @@ int ext4_resize_begin(struct super_block *sb)
 
 void ext4_resize_end(struct super_block *sb)
 {
-	clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
+	clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags);
 	smp_mb__after_atomic();
 }
 
-- 
cgit v1.2.3


From 0db1ff222d40f1601c961f0edb86d10426992595 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 5 Feb 2017 01:28:48 -0500
Subject: ext4: add shutdown bit and check for it

Add a shutdown bit that will cause ext4 processing to fail immediately
with EIO.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h      |  6 ++++++
 fs/ext4/ext4_jbd2.c | 11 +++++++++++
 fs/ext4/file.c      | 12 ++++++++++++
 fs/ext4/fsync.c     |  3 +++
 fs/ext4/ialloc.c    |  3 +++
 fs/ext4/inline.c    |  3 +++
 fs/ext4/inode.c     | 30 ++++++++++++++++++++++++++++--
 fs/ext4/namei.c     | 12 ++++++++++++
 fs/ext4/page-io.c   |  2 +-
 fs/ext4/super.c     | 21 +++++++++++++++++++++
 fs/ext4/xattr.c     |  3 +++
 11 files changed, 103 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2e7e02f2f771..35d93ab7f3fb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1836,6 +1836,12 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
  * Superblock flags
  */
 #define EXT4_FLAGS_RESIZING	0
+#define EXT4_FLAGS_SHUTDOWN	1
+
+static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
+{
+	return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+}
 
 
 /*
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index e770c1ee4613..dd106b1d5d89 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,10 @@ static int ext4_journal_check_start(struct super_block *sb)
 	journal_t *journal;
 
 	might_sleep();
+
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return -EIO;
+
 	if (sb->s_flags & MS_RDONLY)
 		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
@@ -161,6 +165,13 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
 	might_sleep();
 
 	if (ext4_handle_valid(handle)) {
+		struct super_block *sb;
+
+		sb = handle->h_transaction->t_journal->j_private;
+		if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) {
+			jbd2_journal_abort_handle(handle);
+			return -EIO;
+		}
 		err = jbd2_journal_get_write_access(handle, bh);
 		if (err)
 			ext4_journal_abort_handle(where, line, __func__, bh,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d663d3d7c81c..ff3f6107b0ba 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -57,6 +57,9 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+		return -EIO;
+
 	if (!iov_iter_count(to))
 		return 0; /* skip atime */
 
@@ -213,6 +216,9 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	int overwrite = 0;
 	ssize_t ret;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 #ifdef CONFIG_FS_DAX
 	if (IS_DAX(inode))
 		return ext4_dax_write_iter(iocb, from);
@@ -348,6 +354,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_mapping->host;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (ext4_encrypted_inode(inode)) {
 		int err = fscrypt_get_encryption_info(inode);
 		if (err)
@@ -375,6 +384,9 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 	char buf[64], *cp;
 	int ret;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
 		     !(sb->s_flags & MS_RDONLY))) {
 		sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 88effb1053c7..9d549608fd30 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,6 +100,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
 	trace_ext4_sync_file_enter(file, datasync);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f372fc431b8e..b14bae2598bc 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -764,6 +764,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	if (!dir || !dir->i_nlink)
 		return ERR_PTR(-EPERM);
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+		return ERR_PTR(-EIO);
+
 	if ((ext4_encrypted_inode(dir) ||
 	     DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index b777b8aa14ae..30a9f210d1e3 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -215,6 +215,9 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
 	struct ext4_inode *raw_inode;
 	int cp_len = 0;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return;
+
 	BUG_ON(!EXT4_I(inode)->i_inline_off);
 	BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index af97b9170358..bc282f9d0969 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1189,6 +1189,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index;
 	unsigned from, to;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	trace_ext4_write_begin(inode, pos, len, flags);
 	/*
 	 * Reserve one block more for addition to orphan list in case
@@ -2047,6 +2050,12 @@ static int ext4_writepage(struct page *page,
 	struct ext4_io_submit io_submit;
 	bool keep_towrite = false;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
+		ext4_invalidatepage(page, 0, PAGE_SIZE);
+		unlock_page(page);
+		return -EIO;
+	}
+
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
 	if (page->index == size >> PAGE_SHIFT)
@@ -2422,7 +2431,8 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 		if (err < 0) {
 			struct super_block *sb = inode->i_sb;
 
-			if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			if (ext4_forced_shutdown(EXT4_SB(sb)) ||
+			    EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
 				goto invalidate_dirty_pages;
 			/*
 			 * Let the uper layers retry transient errors.
@@ -2644,6 +2654,9 @@ static int ext4_writepages(struct address_space *mapping,
 	struct blk_plug plug;
 	bool give_up_on_write = false;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	percpu_down_read(&sbi->s_journal_flag_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
@@ -2680,7 +2693,8 @@ static int ext4_writepages(struct address_space *mapping,
 	 * *never* be called, so if that ever happens, we would want
 	 * the stack trace.
 	 */
-	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
+		     sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
 		ret = -EROFS;
 		goto out_writepages;
 	}
@@ -2905,6 +2919,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	handle_t *handle;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	index = pos >> PAGE_SHIFT;
 
 	if (ext4_nonda_switch(inode->i_sb) ||
@@ -5212,6 +5229,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	int orphan = 0;
 	const unsigned int ia_valid = attr->ia_valid;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	error = setattr_prepare(dentry, attr);
 	if (error)
 		return error;
@@ -5498,6 +5518,9 @@ int ext4_mark_iloc_dirty(handle_t *handle,
 {
 	int err = 0;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (IS_I_VERSION(inode))
 		inode_inc_iversion(inode);
 
@@ -5521,6 +5544,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 {
 	int err;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	err = ext4_get_inode_loc(inode, iloc);
 	if (!err) {
 		BUFFER_TRACE(iloc->bh, "get_write_access");
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 931da9d5d915..6ad612c576fc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2939,6 +2939,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	struct ext4_dir_entry_2 *de;
 	handle_t *handle = NULL;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+		return -EIO;
+
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
 	retval = dquot_initialize(dir);
@@ -3012,6 +3015,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	struct ext4_dir_entry_2 *de;
 	handle_t *handle = NULL;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+		return -EIO;
+
 	trace_ext4_unlink_enter(dir, dentry);
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
@@ -3082,6 +3088,9 @@ static int ext4_symlink(struct inode *dir,
 	struct fscrypt_str disk_link;
 	struct fscrypt_symlink_data *sd = NULL;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+		return -EIO;
+
 	disk_link.len = len + 1;
 	disk_link.name = (char *) symname;
 
@@ -3874,6 +3883,9 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb))))
+		return -EIO;
+
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
 
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d83b0f3c5fe9..f8808835a28b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -158,7 +158,7 @@ static int ext4_end_io(ext4_io_end_t *io)
 
 	io->handle = NULL;	/* Following call will use up the handle */
 	ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
-	if (ret < 0) {
+	if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
 		ext4_msg(inode->i_sb, KERN_EMERG,
 			 "failed to convert unwritten extents to written "
 			 "extents -- potential data loss!  "
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 514e5fc59893..cfa4ce5a1f80 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -438,6 +438,9 @@ void __ext4_error(struct super_block *sb, const char *function,
 	struct va_format vaf;
 	va_list args;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return;
+
 	if (ext4_error_ratelimit(sb)) {
 		va_start(args, fmt);
 		vaf.fmt = fmt;
@@ -459,6 +462,9 @@ void __ext4_error_inode(struct inode *inode, const char *function,
 	struct va_format vaf;
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return;
+
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	es->s_last_error_block = cpu_to_le64(block);
 	if (ext4_error_ratelimit(inode->i_sb)) {
@@ -491,6 +497,9 @@ void __ext4_error_file(struct file *file, const char *function,
 	struct inode *inode = file_inode(file);
 	char pathname[80], *path;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return;
+
 	es = EXT4_SB(inode->i_sb)->s_es;
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	if (ext4_error_ratelimit(inode->i_sb)) {
@@ -567,6 +576,9 @@ void __ext4_std_error(struct super_block *sb, const char *function,
 	char nbuf[16];
 	const char *errstr;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return;
+
 	/* Special case: if the error is EROFS, and we're not already
 	 * inside a transaction, then there's really no point in logging
 	 * an error. */
@@ -600,6 +612,9 @@ void __ext4_abort(struct super_block *sb, const char *function,
 	struct va_format vaf;
 	va_list args;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return;
+
 	save_error_info(sb, function, line);
 	va_start(args, fmt);
 	vaf.fmt = fmt;
@@ -695,6 +710,9 @@ __acquires(bitlock)
 	va_list args;
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return;
+
 	es->s_last_error_ino = cpu_to_le32(ino);
 	es->s_last_error_block = cpu_to_le64(block);
 	__save_error_info(sb, function, line);
@@ -4717,6 +4735,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	bool needs_barrier = false;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return 0;
+
 	trace_ext4_sync_fs(sb, wait);
 	flush_workqueue(sbi->rsv_conversion_wq);
 	/*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c40bd55b6400..67636acf7624 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -411,6 +411,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
 {
 	int error;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (strlen(name) > 255)
 		return -ERANGE;
 
-- 
cgit v1.2.3


From 783d948544993f55bdacc78b127532e8b6e2fc9f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 5 Feb 2017 19:47:14 -0500
Subject: ext4: add EXT4_IOC_GOINGDOWN ioctl

This ioctl is modeled after the xfs's XFS_IOC_GOINGDOWN ioctl.  (In
fact, it uses the same code points.)

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 10 ++++++++++
 fs/ext4/ioctl.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c |  2 +-
 3 files changed, 61 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 35d93ab7f3fb..55b7a77a0444 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -679,6 +679,16 @@ struct fsxattr {
 #define EXT4_IOC_FSGETXATTR		FS_IOC_FSGETXATTR
 #define EXT4_IOC_FSSETXATTR		FS_IOC_FSSETXATTR
 
+#define EXT4_IOC_GOINGDOWN _IOR ('X', 125, __u32)
+
+/*
+ * Flags for going down operation
+ */
+#define EXT4_GOING_FLAGS_DEFAULT		0x0	/* going down */
+#define EXT4_GOING_FLAGS_LOGFLUSH		0x1	/* flush log but not data */
+#define EXT4_GOING_FLAGS_NOLOGFLUSH		0x2	/* don't flush log nor data */
+
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d534399cf607..b383ebf4020c 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -16,6 +16,7 @@
 #include <linux/quotaops.h>
 #include <linux/uuid.h>
 #include <linux/uaccess.h>
+#include <linux/delay.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 
@@ -442,6 +443,52 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
 	return iflags;
 }
 
+int ext4_goingdown(struct super_block *sb, unsigned long arg)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	__u32 flags;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (get_user(flags, (__u32 __user *)arg))
+		return -EFAULT;
+
+	if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH)
+		return -EINVAL;
+
+	if (ext4_forced_shutdown(sbi))
+		return 0;
+
+	ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags);
+
+	switch (flags) {
+	case EXT4_GOING_FLAGS_DEFAULT:
+		freeze_bdev(sb->s_bdev);
+		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+		thaw_bdev(sb->s_bdev, sb);
+		break;
+	case EXT4_GOING_FLAGS_LOGFLUSH:
+		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+		if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) {
+			(void) ext4_force_commit(sb);
+			jbd2_journal_abort(sbi->s_journal, 0);
+		}
+		break;
+	case EXT4_GOING_FLAGS_NOLOGFLUSH:
+		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+		if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) {
+			msleep(100);
+			jbd2_journal_abort(sbi->s_journal, 0);
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+	clear_opt(sb, DISCARD);
+	return 0;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -893,6 +940,8 @@ resizefs_out:
 
 		return 0;
 	}
+	case EXT4_IOC_GOINGDOWN:
+		return ext4_goingdown(sb, arg);
 	default:
 		return -ENOTTY;
 	}
@@ -959,6 +1008,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_SET_ENCRYPTION_POLICY:
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT:
 	case EXT4_IOC_GET_ENCRYPTION_POLICY:
+	case EXT4_IOC_GOINGDOWN:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cfa4ce5a1f80..3db5b6491513 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4821,7 +4821,7 @@ out:
  */
 static int ext4_unfreeze(struct super_block *sb)
 {
-	if (sb->s_flags & MS_RDONLY)
+	if ((sb->s_flags & MS_RDONLY) || ext4_forced_shutdown(EXT4_SB(sb)))
 		return 0;
 
 	if (EXT4_SB(sb)->s_journal) {
-- 
cgit v1.2.3


From ff5462e39ca1d27e530d088c4e38741fd9cddad4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Feb 2017 14:39:27 -0500
Subject: ext4: fix DAX write locking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unlike O_DIRECT DAX is not an optional opt-in feature selected by the
application, so we'll have to provide the traditional synchronіzation
of overlapping writes as we do for buffered writes.

This was broken historically for DAX, but got fixed for ext2 and XFS
as part of the iomap conversion.  Fix up ext4 as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/file.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ff3f6107b0ba..87e11dfe3cde 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -178,7 +178,6 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
-	bool overwrite = false;
 
 	inode_lock(inode);
 	ret = ext4_write_checks(iocb, from);
@@ -191,16 +190,9 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret)
 		goto out;
 
-	if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
-		overwrite = true;
-		downgrade_write(&inode->i_rwsem);
-	}
 	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
 out:
-	if (!overwrite)
-		inode_unlock(inode);
-	else
-		inode_unlock_shared(inode);
+	inode_unlock(inode);
 	if (ret > 0)
 		ret = generic_write_sync(iocb, ret);
 	return ret;
-- 
cgit v1.2.3


From 168316db3583253b9e5d46392e5b9d03d531406f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 8 Feb 2017 14:43:13 -0500
Subject: dax: assert that i_rwsem is held exclusive for writes

Make sure all callers follow the same locking protocol, given that DAX
transparantly replaced the normal buffered I/O path.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/dax.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index 5c74f60d0a50..04734daed1bd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1061,8 +1061,12 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
 	unsigned flags = 0;
 
-	if (iov_iter_rw(iter) == WRITE)
+	if (iov_iter_rw(iter) == WRITE) {
+		lockdep_assert_held_exclusive(&inode->i_rwsem);
 		flags |= IOMAP_WRITE;
+	} else {
+		lockdep_assert_held(&inode->i_rwsem);
+	}
 
 	while (iov_iter_count(iter)) {
 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
-- 
cgit v1.2.3


From d9b22cf9f5466a057f2a4f1e642b469fa9d73117 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 10 Feb 2017 00:50:56 -0500
Subject: ext4: fix stripe-unaligned allocations

When a filesystem is created using:

	mkfs.ext4 -b 4096 -E stride=512 <dev>

and we try to allocate 64MB extent, we will end up directly in
ext4_mb_complex_scan_group(). This is because the request is detected
as power-of-two allocation (so we start in ext4_mb_regular_allocator()
with ac_criteria == 0) however the check before
ext4_mb_simple_scan_group() refuses the direct buddy scan because the
allocation request is too large. Since cr == 0, the check whether we
should use ext4_mb_scan_aligned() fails as well and we fall back to
ext4_mb_complex_scan_group().

Fix the problem by checking for upper limit on power-of-two requests
directly when detecting them.

Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f513f273ff89..10c62de642c6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2146,8 +2146,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	 * We search using buddy data only if the order of the request
 	 * is greater than equal to the sbi_s_mb_order2_reqs
 	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
+	 * We also support searching for power-of-two requests only for
+	 * requests upto maximum buddy size we have constructed.
 	 */
-	if (i >= sbi->s_mb_order2_reqs) {
+	if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
 		/*
 		 * This should tell if fe_len is exactly power of 2
 		 */
@@ -2217,7 +2219,7 @@ repeat:
 			}
 
 			ac->ac_groups_scanned++;
-			if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
+			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 && sbi->s_stripe &&
 					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
-- 
cgit v1.2.3


From 5469d7c3087ecaf760f54b447f11af6061b7c897 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 10 Feb 2017 00:56:09 -0500
Subject: ext4: do not use stripe_width if it is not set

Avoid using stripe_width for sbi->s_stripe value if it is not actually
set. It prevents using the stride for sbi->s_stripe.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3db5b6491513..dde14a7ac6d7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2637,9 +2637,9 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 
 	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
 		ret = sbi->s_stripe;
-	else if (stripe_width <= sbi->s_blocks_per_group)
+	else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
 		ret = stripe_width;
-	else if (stride <= sbi->s_blocks_per_group)
+	else if (stride && stride <= sbi->s_blocks_per_group)
 		ret = stride;
 	else
 		ret = 0;
-- 
cgit v1.2.3


From 0d06863f903ac5f4f6efb0273079d27de3e53a28 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 14 Feb 2017 11:31:15 -0500
Subject: ext4: don't BUG when truncating encrypted inodes on the orphan list

Fix a BUG when the kernel tries to mount a file system constructed as
follows:

echo foo > foo.txt
mke2fs -Fq -t ext4 -O encrypt foo.img 100
debugfs -w foo.img << EOF
write foo.txt a
set_inode_field a i_flags 0x80800
set_super_value s_last_orphan 12
quit
EOF

root@kvm-xfstests:~# mount -o loop foo.img /mnt
[  160.238770] ------------[ cut here ]------------
[  160.240106] kernel BUG at /usr/projects/linux/ext4/fs/ext4/inode.c:3874!
[  160.240106] invalid opcode: 0000 [#1] SMP
[  160.240106] Modules linked in:
[  160.240106] CPU: 0 PID: 2547 Comm: mount Tainted: G        W       4.10.0-rc3-00034-gcdd33b941b67 #227
[  160.240106] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1 04/01/2014
[  160.240106] task: f4518000 task.stack: f47b6000
[  160.240106] EIP: ext4_block_zero_page_range+0x1a7/0x2b4
[  160.240106] EFLAGS: 00010246 CPU: 0
[  160.240106] EAX: 00000001 EBX: f7be4b50 ECX: f47b7dc0 EDX: 00000007
[  160.240106] ESI: f43b05a8 EDI: f43babec EBP: f47b7dd0 ESP: f47b7dac
[  160.240106]  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
[  160.240106] CR0: 80050033 CR2: bfd85b08 CR3: 34a00680 CR4: 000006f0
[  160.240106] Call Trace:
[  160.240106]  ext4_truncate+0x1e9/0x3e5
[  160.240106]  ext4_fill_super+0x286f/0x2b1e
[  160.240106]  ? set_blocksize+0x2e/0x7e
[  160.240106]  mount_bdev+0x114/0x15f
[  160.240106]  ext4_mount+0x15/0x17
[  160.240106]  ? ext4_calculate_overhead+0x39d/0x39d
[  160.240106]  mount_fs+0x58/0x115
[  160.240106]  vfs_kern_mount+0x4b/0xae
[  160.240106]  do_mount+0x671/0x8c3
[  160.240106]  ? _copy_from_user+0x70/0x83
[  160.240106]  ? strndup_user+0x31/0x46
[  160.240106]  SyS_mount+0x57/0x7b
[  160.240106]  do_int80_syscall_32+0x4f/0x61
[  160.240106]  entry_INT80_32+0x2f/0x2f
[  160.240106] EIP: 0xb76b919e
[  160.240106] EFLAGS: 00000246 CPU: 0
[  160.240106] EAX: ffffffda EBX: 08053838 ECX: 08052188 EDX: 080537e8
[  160.240106] ESI: c0ed0000 EDI: 00000000 EBP: 080537e8 ESP: bfa13660
[  160.240106]  DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b
[  160.240106] Code: 59 8b 00 a8 01 0f 84 09 01 00 00 8b 07 66 25 00 f0 66 3d 00 80 75 61 89 f8 e8 3e e2 ff ff 84 c0 74 56 83 bf 48 02 00 00 00 75 02 <0f> 0b 81 7d e8 00 10 00 00 74 02 0f 0b 8b 43 04 8b 53 08 31 c9
[  160.240106] EIP: ext4_block_zero_page_range+0x1a7/0x2b4 SS:ESP: 0068:f47b7dac
[  160.317241] ---[ end trace d6a773a375c810a5 ]---

The problem is that when the kernel tries to truncate an inode in
ext4_truncate(), it tries to clear any on-disk data beyond i_size.
Without the encryption key, it can't do that, and so it triggers a
BUG.

E2fsck does *not* provide this service, and in practice most file
systems have their orphan list processed by e2fsck, so to avoid
crashing, this patch skips this step if we don't have access to the
encryption key (which is the case when processing the orphan list; in
all other cases, we will have the encryption key, or the kernel
wouldn't have allowed the file to be opened).

An open question is whether the fact that e2fsck isn't clearing the
bytes beyond i_size causing problems --- and if we've lived with it
not doing it for so long, can we drop this from the kernel replay of
the orphan list in all cases (not just when we don't have the key for
encrypted inodes).

Addresses-Google-Bug: #35209576

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bc282f9d0969..f622d4a577e3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3944,6 +3944,10 @@ static int ext4_block_truncate_page(handle_t *handle,
 	unsigned blocksize;
 	struct inode *inode = mapping->host;
 
+	/* If we are processing an encrypted inode during orphan list handling */
+	if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode))
+		return 0;
+
 	blocksize = inode->i_sb->s_blocksize;
 	length = blocksize - (offset & (blocksize - 1));
 
-- 
cgit v1.2.3


From 2ba3e6e8afc9b6188b471f27cf2b5e3cf34e7af2 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 15 Feb 2017 01:26:39 -0500
Subject: ext4: fix fencepost in s_first_meta_bg validation

It is OK for s_first_meta_bg to be equal to the number of block group
descriptor blocks.  (It rarely happens, but it shouldn't cause any
problems.)

https://bugzilla.kernel.org/show_bug.cgi?id=194567

Fixes: 3a4b77cd47bb837b8557595ec7425f281f2ca1fe
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dde14a7ac6d7..a673558fe5f8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3860,7 +3860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
 	if (ext4_has_feature_meta_bg(sb)) {
-		if (le32_to_cpu(es->s_first_meta_bg) >= db_count) {
+		if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
 			ext4_msg(sb, KERN_WARNING,
 				 "first meta block group too large: %u "
 				 "(group descriptor block count %u)",
-- 
cgit v1.2.3


From e9be2ac7c09cabcbbbb12b0869e49b7a715d6fb5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 20 Feb 2017 15:34:59 -0500
Subject: ext4: rename EXT4_IOC_GOINGDOWN to EXT4_IOC_SHUTDOWN

It's very likely the file system independent ioctl name will be
FS_IOC_SHUTDOWN, so let's use the same name for the ext4 ioctl name.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 2 +-
 fs/ext4/ioctl.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 55b7a77a0444..3a87378b9563 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -679,7 +679,7 @@ struct fsxattr {
 #define EXT4_IOC_FSGETXATTR		FS_IOC_FSGETXATTR
 #define EXT4_IOC_FSSETXATTR		FS_IOC_FSSETXATTR
 
-#define EXT4_IOC_GOINGDOWN _IOR ('X', 125, __u32)
+#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
 
 /*
  * Flags for going down operation
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b383ebf4020c..a4273ddb9922 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -443,7 +443,7 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
 	return iflags;
 }
 
-int ext4_goingdown(struct super_block *sb, unsigned long arg)
+int ext4_shutdown(struct super_block *sb, unsigned long arg)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	__u32 flags;
@@ -940,8 +940,8 @@ resizefs_out:
 
 		return 0;
 	}
-	case EXT4_IOC_GOINGDOWN:
-		return ext4_goingdown(sb, arg);
+	case EXT4_IOC_SHUTDOWN:
+		return ext4_shutdown(sb, arg);
 	default:
 		return -ENOTTY;
 	}
@@ -1008,7 +1008,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_SET_ENCRYPTION_POLICY:
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT:
 	case EXT4_IOC_GET_ENCRYPTION_POLICY:
-	case EXT4_IOC_GOINGDOWN:
+	case EXT4_IOC_SHUTDOWN:
 		break;
 	default:
 		return -ENOIOCTLCMD;
-- 
cgit v1.2.3