From 0a6cf9137ded4856b41910a4336677ee0ffa6736 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 26 Feb 2016 15:19:46 -0800 Subject: ext2, ext4: only set S_DAX for regular inodes When S_DAX is set on an inode we assume that if there are pages attached to the mapping (mapping->nrpages != 0), those pages are clean zero pages that were used to service reads from holes. Any dirty data associated with the inode should be in the form of DAX exceptional entries (mapping->nrexceptional) that is written back via dax_writeback_mapping_range(). With the current code, though, this isn't always true. For example, ext2 and ext4 directory inodes can have S_DAX set, but have their dirty data stored as dirty page cache entries. For these types of inodes, having S_DAX set doesn't really make sense since their I/O doesn't actually happen through the DAX code path. Instead, only allow S_DAX to be set for regular inodes for ext2 and ext4. This allows us to have strict DAX vs non-DAX paths in the writeback code. Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Theodore Ts'o Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Jens Axboe Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext2') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 338eefda70c6..27e2cdd4999b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1296,7 +1296,7 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) inode->i_flags |= S_DAX; } -- cgit v1.2.3 From 20a90f58997245749c2bdfaea9e51f785ec90d0b Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 26 Feb 2016 15:19:52 -0800 Subject: dax: give DAX clearing code correct bdev dax_clear_blocks() needs a valid struct block_device and previously it was using inode->i_sb->s_bdev in all cases. This is correct for normal inodes on mounted ext2, ext4 and XFS filesystems, but is incorrect for DAX raw block devices and for XFS real-time devices. Instead, rename dax_clear_blocks() to dax_clear_sectors(), and change its arguments to take a bdev and a sector instead of an inode and a block. This better reflects what the function does, and it allows the filesystem and raw block device code to pass in an appropriate struct block_device. Signed-off-by: Ross Zwisler Suggested-by: Dan Williams Reviewed-by: Jan Kara Cc: Theodore Ts'o Cc: Al Viro Cc: Dave Chinner Cc: Jens Axboe Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 9 ++++----- fs/ext2/inode.c | 6 ++++-- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_aops.h | 1 + fs/xfs/xfs_bmap_util.c | 3 ++- include/linux/dax.h | 2 +- 6 files changed, 13 insertions(+), 10 deletions(-) (limited to 'fs/ext2') diff --git a/fs/dax.c b/fs/dax.c index fc2e3141138b..9a173dd8c4a3 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -79,15 +79,14 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) } /* - * dax_clear_blocks() is called from within transaction context from XFS, + * dax_clear_sectors() is called from within transaction context from XFS, * and hence this means the stack from this point must follow GFP_NOFS * semantics for all operations. */ -int dax_clear_blocks(struct inode *inode, sector_t block, long _size) +int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) { - struct block_device *bdev = inode->i_sb->s_bdev; struct blk_dax_ctl dax = { - .sector = block << (inode->i_blkbits - 9), + .sector = _sector, .size = _size, }; @@ -109,7 +108,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long _size) wmb_pmem(); return 0; } -EXPORT_SYMBOL_GPL(dax_clear_blocks); +EXPORT_SYMBOL_GPL(dax_clear_sectors); /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 27e2cdd4999b..4467cbd75f24 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode, * so that it's not found by another thread before it's * initialised */ - err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), - 1 << inode->i_blkbits); + err = dax_clear_sectors(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key) << + (inode->i_blkbits - 9), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..fc20518e2398 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -55,7 +55,7 @@ xfs_count_page_state( } while ((bh = bh->b_this_page) != head); } -STATIC struct block_device * +struct block_device * xfs_find_bdev_for_inode( struct inode *inode) { diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index f6ffc9ae5ceb..a4343c63fb38 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -62,5 +62,6 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); extern void xfs_count_page_state(struct page *, int *, int *); +extern struct block_device *xfs_find_bdev_for_inode(struct inode *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 45ec9e40150c..6c876012b2e5 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -75,7 +75,8 @@ xfs_zero_extent( ssize_t size = XFS_FSB_TO_B(mp, count_fsb); if (IS_DAX(VFS_I(ip))) - return dax_clear_blocks(VFS_I(ip), block, size); + return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), + sector, size); /* * let the block layer decide on the fastest method of diff --git a/include/linux/dax.h b/include/linux/dax.h index 818e45078929..7b6bcedb980f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -7,7 +7,7 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, get_block_t, dio_iodone_t, int flags); -int dax_clear_blocks(struct inode *, sector_t block, long size); +int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, -- cgit v1.2.3 From 7f6d5b529b7dfe2fca30cbf4bc81e16575090025 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 26 Feb 2016 15:19:55 -0800 Subject: dax: move writeback calls into the filesystems Previously calls to dax_writeback_mapping_range() for all DAX filesystems (ext2, ext4 & xfs) were centralized in filemap_write_and_wait_range(). dax_writeback_mapping_range() needs a struct block_device, and it used to get that from inode->i_sb->s_bdev. This is correct for normal inodes mounted on ext2, ext4 and XFS filesystems, but is incorrect for DAX raw block devices and for XFS real-time files. Instead, call dax_writeback_mapping_range() directly from the filesystem ->writepages function so that it can supply us with a valid block device. This also fixes DAX code to properly flush caches in response to sync(2). Signed-off-by: Ross Zwisler Signed-off-by: Jan Kara Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Jens Axboe Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 13 ++++++++++++- fs/dax.c | 12 +++++++----- fs/ext2/inode.c | 8 ++++++++ fs/ext4/inode.c | 4 ++++ fs/xfs/xfs_aops.c | 4 ++++ include/linux/dax.h | 6 ++++-- mm/filemap.c | 12 ++++-------- 7 files changed, 43 insertions(+), 16 deletions(-) (limited to 'fs/ext2') diff --git a/fs/block_dev.c b/fs/block_dev.c index 31c6d1090f11..826b164a4b5b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1697,13 +1697,24 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + if (dax_mapping(mapping)) { + struct block_device *bdev = I_BDEV(mapping->host); + + return dax_writeback_mapping_range(mapping, bdev, wbc); + } + return generic_writepages(mapping, wbc); +} + static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .readpages = blkdev_readpages, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .writepages = generic_writepages, + .writepages = blkdev_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, .is_dirty_writeback = buffer_check_dirty_writeback, diff --git a/fs/dax.c b/fs/dax.c index 9a173dd8c4a3..711172450da6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -484,11 +484,10 @@ static int dax_writeback_one(struct block_device *bdev, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ -int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, - loff_t end) +int dax_writeback_mapping_range(struct address_space *mapping, + struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct block_device *bdev = inode->i_sb->s_bdev; pgoff_t start_index, end_index, pmd_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; @@ -499,8 +498,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; - start_index = start >> PAGE_CACHE_SHIFT; - end_index = end >> PAGE_CACHE_SHIFT; + if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + return 0; + + start_index = wbc->range_start >> PAGE_CACHE_SHIFT; + end_index = wbc->range_end >> PAGE_CACHE_SHIFT; pmd_index = DAX_PMD_INDEX(start_index); rcu_read_lock(); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 4467cbd75f24..6bd58e6ff038 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -876,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) static int ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { +#ifdef CONFIG_FS_DAX + if (dax_mapping(mapping)) { + return dax_writeback_mapping_range(mapping, + mapping->host->i_sb->s_bdev, + wbc); + } +#endif + return mpage_writepages(mapping, wbc, ext2_get_block); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5708e689e63d..aee960b1af34 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2478,6 +2478,10 @@ static int ext4_writepages(struct address_space *mapping, trace_ext4_writepages(inode, wbc); + if (dax_mapping(mapping)) + return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index fc20518e2398..a9ebabfe7587 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1208,6 +1208,10 @@ xfs_vm_writepages( struct writeback_control *wbc) { xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + if (dax_mapping(mapping)) + return dax_writeback_mapping_range(mapping, + xfs_find_bdev_for_inode(mapping->host), wbc); + return generic_writepages(mapping, wbc); } diff --git a/include/linux/dax.h b/include/linux/dax.h index 7b6bcedb980f..636dd59ab505 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -52,6 +52,8 @@ static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); } -int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, - loff_t end); + +struct writeback_control; +int dax_writeback_mapping_range(struct address_space *mapping, + struct block_device *bdev, struct writeback_control *wbc); #endif diff --git a/mm/filemap.c b/mm/filemap.c index 23edccecadb0..3461d97ecb30 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -446,7 +446,8 @@ int filemap_write_and_wait(struct address_space *mapping) { int err = 0; - if (mapping->nrpages) { + if ((!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional)) { err = filemap_fdatawrite(mapping); /* * Even if the above returned error, the pages may be @@ -482,13 +483,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; - if (dax_mapping(mapping) && mapping->nrexceptional) { - err = dax_writeback_mapping_range(mapping, lstart, lend); - if (err) - return err; - } - - if (mapping->nrpages) { + if ((!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ -- cgit v1.2.3