8 files changed, 118 insertions, 54 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 97b970e7dd13..1c67139ad4b4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -547,7 +547,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  *
  * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
  * it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
  * return TRUE.
  *
  * if the total number of retries exceed three times, return FALSE.
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index e25e99bf7ee1..d0f53538a57f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
 
 #ifdef CONFIG_QUOTA
 /* Amount of blocks needed for quota update - we know that the structure was
- * allocated so we need to update only inode+data */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+ * allocated so we need to update only data block */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
  * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index dd2cb5076ff9..4890d6f3ad15 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1729,7 +1729,7 @@ repeat:
 		BUG_ON(npath->p_depth != path->p_depth);
 		eh = npath[depth].p_hdr;
 		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
-			ext_debug("next leaf isnt full(%d)\n",
+			ext_debug("next leaf isn't full(%d)\n",
 				  le16_to_cpu(eh->eh_entries));
 			path = npath;
 			goto repeat;
@@ -2533,7 +2533,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
- * extent into multiple extents (upto three - one initialized and two
+ * extent into multiple extents (up to three - one initialized and two
  * uninitialized).
  * There are three possibilities:
  *   a> There is no split required: Entire extent should be initialized
@@ -3174,7 +3174,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 						   path, flags);
 		/*
 		 * Flag the inode(non aio case) or end_io struct (aio case)
-		 * that this IO needs to convertion to written when IO is
+		 * that this IO needs to conversion to written when IO is
 		 * completed
 		 */
 		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
@@ -3460,10 +3460,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ext4_ext_mark_uninitialized(&newex);
 		/*
 		 * io_end structure was created for every IO write to an
-		 * uninitialized extent. To avoid unecessary conversion,
+		 * uninitialized extent. To avoid unnecessary conversion,
 		 * here we flag the IO that really needs the conversion.
 		 * For non asycn direct IO case, flag the inode state
-		 * that we need to perform convertion when IO is done.
+		 * that we need to perform conversion when IO is done.
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 7f74019d6d77..e9473cbe80df 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -101,7 +101,7 @@ extern int ext4_flush_completed_IO(struct inode *inode)
 		 * to the work-to-be schedule is freed.
 		 *
 		 * Thus we need to keep the io structure still valid here after
-		 * convertion finished. The io structure has a flag to
+		 * conversion finished. The io structure has a flag to
 		 * avoid double converting from both fsync and background work
 		 * queue work.
 		 */
@@ -125,9 +125,11 @@ extern int ext4_flush_completed_IO(struct inode *inode)
  * the parent directory's parent as well, and so on recursively, if
  * they are also freshly created.
  */
-static void ext4_sync_parent(struct inode *inode)
+static int ext4_sync_parent(struct inode *inode)
 {
+	struct writeback_control wbc;
 	struct dentry *dentry = NULL;
+	int ret = 0;
 
 	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
 		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -136,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
 		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
 			break;
 		inode = dentry->d_parent->d_inode;
-		sync_mapping_buffers(inode->i_mapping);
+		ret = sync_mapping_buffers(inode->i_mapping);
+		if (ret)
+			break;
+		memset(&wbc, 0, sizeof(wbc));
+		wbc.sync_mode = WB_SYNC_ALL;
+		wbc.nr_to_write = 0;         /* only write out the inode */
+		ret = sync_inode(inode, &wbc);
+		if (ret)
+			break;
 	}
+	return ret;
 }
 
 /*
@@ -176,7 +187,7 @@ int ext4_sync_file(struct file *file, int datasync)
 	if (!journal) {
 		ret = generic_file_fsync(file, datasync);
 		if (!ret && !list_empty(&inode->i_dentry))
-			ext4_sync_parent(inode);
+			ret = ext4_sync_parent(inode);
 		goto out;
 	}
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1a86282b9024..f2fa5e8a582c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2502,6 +2502,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		 * for partial write.
 		 */
 		set_buffer_new(bh);
+		set_buffer_mapped(bh);
 	}
 	return 0;
 }
@@ -2588,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
  * need to file the inode to the transaction's list in ordered mode because if
  * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
+ * we are writing back data modified via mmap(), no one guarantees in which
  * transaction the data will hit the disk. In case we are journaling data, we
  * cannot start transaction directly because transaction start ranks above page
  * lock so we have to do some magic.
@@ -2690,7 +2691,7 @@ static int ext4_writepage(struct page *page,
 
 /*
  * This is called via ext4_da_writepages() to
- * calulate the total number of credits to reserve to fit
+ * calculate the total number of credits to reserve to fit
  * a single extent allocation into a single transaction,
  * ext4_da_writpeages() will loop calling this before
  * the block allocation.
@@ -3304,7 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
 	 * the pages by calling redirty_page_for_writepage() but that
 	 * would be ugly in the extreme.  So instead we would need to
 	 * replicate parts of the code in the above functions,
-	 * simplifying them becuase we wouldn't actually intend to
+	 * simplifying them because we wouldn't actually intend to
 	 * write out the pages, but rather only collect contiguous
 	 * logical block extents, call the multi-block allocator, and
 	 * then update the buffer heads with the block allocations.
@@ -3694,7 +3695,7 @@ retry:
  *
  * The unwrritten extents will be converted to written when DIO is completed.
  * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the convertion
+ * set up an end_io call back function, which will do the conversion
  * when async direct IO completed.
  *
  * If the O_DIRECT write will extend the file then add this inode to the
@@ -3717,7 +3718,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
  		 * We could direct write to holes and fallocate.
 		 *
  		 * Allocated blocks to fill the hole are marked as uninitialized
- 		 * to prevent paralel buffered read to expose the stale data
+ 		 * to prevent parallel buffered read to expose the stale data
  		 * before DIO complete the data IO.
 		 *
  		 * As to previously fallocated extents, ext4 get_block
@@ -3778,7 +3779,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 			int err;
 			/*
 			 * for non AIO case, since the IO is already
-			 * completed, we could do the convertion right here
+			 * completed, we could do the conversion right here
 			 */
 			err = ext4_convert_unwritten_extents(inode,
 							     offset, ret);
@@ -4025,7 +4026,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
  *
  *	When we do truncate() we may have to clean the ends of several
  *	indirect blocks but leave the blocks themselves alive. Block is
- *	partially truncated if some data below the new i_size is refered
+ *	partially truncated if some data below the new i_size is referred
  *	from it (and it is on the path to the first completely truncated
  *	data block, indeed).  We have to free the top of that path along
  *	with everything to the right of the path. Since no allocation
@@ -4169,7 +4170,7 @@ out_err:
  * @first:	array of block numbers
  * @last:	points immediately past the end of array
  *
- * We are freeing all blocks refered from that array (numbers are stored as
+ * We are freeing all blocks referred from that array (numbers are stored as
  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
  *
  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
@@ -4261,7 +4262,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
  *	@last:	pointer immediately past the end of array
  *	@depth:	depth of the branches to free
  *
- *	We are freeing all blocks refered from these branches (numbers are
+ *	We are freeing all blocks referred from these branches (numbers are
  *	stored as little-endian 32-bit) and updating @inode->i_blocks
  *	appropriately.
  */
@@ -4429,8 +4430,8 @@ void ext4_truncate(struct inode *inode)
 	Indirect chain[4];
 	Indirect *partial;
 	__le32 nr = 0;
-	int n;
-	ext4_lblk_t last_block;
+	int n = 0;
+	ext4_lblk_t last_block, max_block;
 	unsigned blocksize = inode->i_sb->s_blocksize;
 
 	trace_ext4_truncate_enter(inode);
@@ -4455,14 +4456,18 @@ void ext4_truncate(struct inode *inode)
 
 	last_block = (inode->i_size + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
 	if (inode->i_size & (blocksize - 1))
 		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
 			goto out_stop;
 
-	n = ext4_block_to_path(inode, last_block, offsets, NULL);
-	if (n == 0)
-		goto out_stop;	/* error */
+	if (last_block != max_block) {
+		n = ext4_block_to_path(inode, last_block, offsets, NULL);
+		if (n == 0)
+			goto out_stop;	/* error */
+	}
 
 	/*
 	 * OK.  This truncate is going to happen.  We add the inode to the
@@ -4493,7 +4498,13 @@ void ext4_truncate(struct inode *inode)
 	 */
 	ei->i_disksize = inode->i_size;
 
-	if (n == 1) {		/* direct blocks */
+	if (last_block == max_block) {
+		/*
+		 * It is unnecessary to free any data blocks if last_block is
+		 * equal to the indirect block limit.
+		 */
+		goto out_unlock;
+	} else if (n == 1) {		/* direct blocks */
 		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
 			       i_data + EXT4_NDIR_BLOCKS);
 		goto do_indirects;
@@ -4553,6 +4564,7 @@ do_indirects:
 		;
 	}
 
+out_unlock:
 	up_write(&ei->i_data_sem);
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
@@ -5398,13 +5410,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 	/* if nrblocks are contiguous */
 	if (chunk) {
 		/*
-		 * With N contiguous data blocks, it need at most
-		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
-		 * 2 dindirect blocks
-		 * 1 tindirect block
+		 * With N contiguous data blocks, we need at most
+		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+		 * 2 dindirect blocks, and 1 tindirect block
 		 */
-		indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
-		return indirects + 3;
+		return DIV_ROUND_UP(nrblocks,
+				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 	}
 	/*
 	 * if nrblocks are not contiguous, worse case, each block touch
@@ -5478,7 +5489,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 }
 
 /*
- * Calulate the total number of credits to reserve to fit
+ * Calculate the total number of credits to reserve to fit
  * the modification of a single pages into a single transaction,
  * which may include multiple chunks of block allocations.
  *
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a5837a837a8b..d8a16eecf1d5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -92,7 +92,7 @@
  * between CPUs. It is possible to get scheduled at this point.
  *
  * The locality group prealloc space is used looking at whether we have
- * enough free space (pa_free) withing the prealloc space.
+ * enough free space (pa_free) within the prealloc space.
  *
  * If we can't allocate blocks via inode prealloc or/and locality group
  * prealloc then we look at the buddy cache. The buddy cache is represented
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index d1bafa57f483..92816b4e0f16 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -517,7 +517,7 @@ int ext4_ext_migrate(struct inode *inode)
 	 * start with one credit accounted for
 	 * superblock modification.
 	 *
-	 * For the tmp_inode we already have commited the
+	 * For the tmp_inode we already have committed the
 	 * trascation that created the inode. Later as and
 	 * when we add extents we extent the journal
 	 */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 22546ad7f0ae..8553dfb310af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -242,27 +242,44 @@ static void ext4_put_nojournal(handle_t *handle)
  * journal_end calls result in the superblock being marked dirty, so
  * that sync() will call the filesystem's write_super callback if
  * appropriate.
+ *
+ * To avoid j_barrier hold in userspace when a user calls freeze(),
+ * ext4 prevents a new handle from being started by s_frozen, which
+ * is in an upper layer.
  */
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 {
 	journal_t *journal;
+	handle_t  *handle;
 
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
 
-	vfs_check_frozen(sb, SB_FREEZE_TRANS);
-	/* Special case here: if the journal has aborted behind our
-	 * backs (eg. EIO in the commit thread), then we still need to
-	 * take the FS itself readonly cleanly. */
 	journal = EXT4_SB(sb)->s_journal;
-	if (journal) {
-		if (is_journal_aborted(journal)) {
-			ext4_abort(sb, "Detected aborted journal");
-			return ERR_PTR(-EROFS);
-		}
-		return jbd2_journal_start(journal, nblocks);
+	handle = ext4_journal_current_handle();
+
+	/*
+	 * If a handle has been started, it should be allowed to
+	 * finish, otherwise deadlock could happen between freeze
+	 * and others(e.g. truncate) due to the restart of the
+	 * journal handle if the filesystem is forzen and active
+	 * handles are not stopped.
+	 */
+	if (!handle)
+		vfs_check_frozen(sb, SB_FREEZE_TRANS);
+
+	if (!journal)
+		return ext4_get_nojournal();
+	/*
+	 * Special case here: if the journal has aborted behind our
+	 * backs (eg. EIO in the commit thread), then we still need to
+	 * take the FS itself readonly cleanly.
+	 */
+	if (is_journal_aborted(journal)) {
+		ext4_abort(sb, "Detected aborted journal");
+		return ERR_PTR(-EROFS);
 	}
-	return ext4_get_nojournal();
+	return jbd2_journal_start(journal, nblocks);
 }
 
 /*
@@ -617,7 +634,7 @@ __acquires(bitlock)
 	 * filesystem will have already been marked read/only and the
 	 * journal has been aborted.  We return 1 as a hint to callers
 	 * who might what to use the return value from
-	 * ext4_grp_locked_error() to distinguish beween the
+	 * ext4_grp_locked_error() to distinguish between the
 	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
 	 * aggressively from the ext4 function in question, with a
 	 * more appropriate error code.
@@ -2975,6 +2992,12 @@ static int ext4_register_li_request(struct super_block *sb,
 	mutex_unlock(&ext4_li_info->li_list_mtx);
 
 	sbi->s_li_request = elr;
+	/*
+	 * set elr to NULL here since it has been inserted to
+	 * the request_list and the removal and free of it is
+	 * handled by ext4_clear_request_list from now on.
+	 */
+	elr = NULL;
 
 	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
 		ret = ext4_run_lazyinit_thread();
@@ -3385,6 +3408,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
+	init_timer(&sbi->s_err_report);
+	sbi->s_err_report.function = print_daily_error_info;
+	sbi->s_err_report.data = (unsigned long) sb;
+
 	err = percpu_counter_init(&sbi->s_freeblocks_counter,
 			ext4_count_free_blocks(sb));
 	if (!err) {
@@ -3646,9 +3673,6 @@ no_journal:
 		 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
 		 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
 
-	init_timer(&sbi->s_err_report);
-	sbi->s_err_report.function = print_daily_error_info;
-	sbi->s_err_report.data = (unsigned long) sb;
 	if (es->s_error_count)
 		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
 
@@ -3672,6 +3696,7 @@ failed_mount_wq:
 		sbi->s_journal = NULL;
 	}
 failed_mount3:
+	del_timer(&sbi->s_err_report);
 	if (sbi->s_flex_groups) {
 		if (is_vmalloc_addr(sbi->s_flex_groups))
 			vfree(sbi->s_flex_groups);
@@ -4138,6 +4163,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 /*
  * LVM calls this function before a (read-only) snapshot is created.  This
  * gives us a chance to flush the journal completely and mark the fs clean.
+ *
+ * Note that only this function cannot bring a filesystem to be in a clean
+ * state independently, because ext4 prevents a new handle from being started
+ * by @sb->s_frozen, which stays in an upper layer.  It thus needs help from
+ * the upper layer.
  */
 static int ext4_freeze(struct super_block *sb)
 {
@@ -4614,17 +4644,30 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 
 static int ext4_quota_off(struct super_block *sb, int type)
 {
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	handle_t *handle;
+
 	/* Force all delayed allocation blocks to be allocated.
 	 * Caller already holds s_umount sem */
 	if (test_opt(sb, DELALLOC))
 		sync_filesystem(sb);
 
+	/* Update modification times of quota files when userspace can
+	 * start looking at them */
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle))
+		goto out;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
+
+out:
 	return dquot_quota_off(sb, type);
 }
 
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
  * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
+ * itself serializes the operations (and no one else should touch the files)
  * we don't have to be afraid of races */
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off)
@@ -4714,9 +4757,8 @@ out:
 	if (inode->i_size < off + len) {
 		i_size_write(inode, off + len);
 		EXT4_I(inode)->i_disksize = inode->i_size;
+		ext4_mark_inode_dirty(handle, inode);
 	}
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	ext4_mark_inode_dirty(handle, inode);
 	mutex_unlock(&inode->i_mutex);
 	return len;
 }