summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-05-26 19:34:26 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-05-26 19:34:26 -0700
commit315227f6da389f3a560f27f7777080857278e1b4 (patch)
tree11306e1e8d8b66044ab48901b90141b5362c12e3
parenta10c38a4f385f5d7c173a263ff6bb2d36021b3bb (diff)
parent40543f62cbdce42633e3fe10923099feee272e1f (diff)
downloadlinux-315227f6da389f3a560f27f7777080857278e1b4.tar.bz2
Merge tag 'dax-misc-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull misc DAX updates from Vishal Verma: "DAX error handling for 4.7 - Until now, dax has been disabled if media errors were found on any device. This enables the use of DAX in the presence of these errors by making all sector-aligned zeroing go through the driver. - The driver (already) has the ability to clear errors on writes that are sent through the block layer using 'DSMs' defined in ACPI 6.1. Other misc changes: - When mounting DAX filesystems, check to make sure the partition is page aligned. This is a requirement for DAX, and previously, we allowed such unaligned mounts to succeed, but subsequent reads/writes would fail. - Misc/cleanup fixes from Jan that remove unused code from DAX related to zeroing, writeback, and some size checks" * tag 'dax-misc-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: dax: fix a comment in dax_zero_page_range and dax_truncate_page dax: for truncate/hole-punch, do zeroing through the driver if possible dax: export a low-level __dax_zero_page_range helper dax: use sb_issue_zerout instead of calling dax_clear_sectors dax: enable dax in the presence of known media errors (badblocks) dax: fallback from pmd to pte on error block: Update blkdev_dax_capable() for consistency xfs: Add alignment check for DAX mount ext2: Add alignment check for DAX mount ext4: Add alignment check for DAX mount block: Add bdev_dax_supported() for dax mount checks block: Add vfs_msg() interface dax: Remove redundant inode size checks dax: Remove pointless writeback from dax_do_io() dax: Remove zeroing from dax_io() dax: Remove dead zeroing code from fault handlers ext2: Avoid DAX zeroing to corrupt data ext2: Fix block zeroing in ext2_get_blocks() for DAX dax: Remove complete_unwritten argument DAX: move RADIX_DAX_ definitions to dax.c
-rw-r--r--Documentation/filesystems/dax.txt32
-rw-r--r--arch/powerpc/sysdev/axonram.c2
-rw-r--r--block/ioctl.c1
-rw-r--r--drivers/block/brd.c2
-rw-r--r--drivers/nvdimm/pmem.c10
-rw-r--r--drivers/s390/block/dcssblk.c4
-rw-r--r--fs/block_dev.c114
-rw-r--r--fs/dax.c257
-rw-r--r--fs/ext2/file.c4
-rw-r--r--fs/ext2/inode.c12
-rw-r--r--fs/ext2/super.c11
-rw-r--r--fs/ext4/file.c4
-rw-r--r--fs/ext4/super.c11
-rw-r--r--fs/xfs/xfs_bmap_util.c15
-rw-r--r--fs/xfs/xfs_file.c7
-rw-r--r--fs/xfs/xfs_super.c12
-rw-r--r--include/linux/blkdev.h15
-rw-r--r--include/linux/dax.h25
-rw-r--r--include/linux/fs.h1
19 files changed, 246 insertions, 293 deletions
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 7bde64014a89..ce4587d257d2 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -79,6 +79,38 @@ These filesystems may be used for inspiration:
- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
+Handling Media Errors
+---------------------
+
+The libnvdimm subsystem stores a record of known media error locations for
+each pmem block device (in gendisk->badblocks). If we fault at such location,
+or one with a latent error not yet discovered, the application can expect
+to receive a SIGBUS. Libnvdimm also allows clearing of these errors by simply
+writing the affected sectors (through the pmem driver, and if the underlying
+NVDIMM supports the clear_poison DSM defined by ACPI).
+
+Since DAX IO normally doesn't go through the driver/bio path, applications or
+sysadmins have an option to restore the lost data from a prior backup/inbuilt
+redundancy in the following ways:
+
+1. Delete the affected file, and restore from a backup (sysadmin route):
+ This will free the file system blocks that were being used by the file,
+ and the next time they're allocated, they will be zeroed first, which
+ happens through the driver, and will clear bad sectors.
+
+2. Truncate or hole-punch the part of the file that has a bad-block (at least
+ an entire aligned sector has to be hole-punched, but not necessarily an
+ entire filesystem block).
+
+These are the two basic paths that allow DAX filesystems to continue operating
+in the presence of media errors. More robust error recovery mechanisms can be
+built on top of this in the future, for example, involving redundancy/mirroring
+provided at the block layer through DM, or additionally, at the filesystem
+level. These would have to rely on the above two tenets, that error clearing
+can happen either by sending an IO through the driver, or zeroing (also through
+the driver).
+
+
Shortcomings
------------
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 0d112b94d91d..ff75d70f7285 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -143,7 +143,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
*/
static long
axon_ram_direct_access(struct block_device *device, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn)
+ void __pmem **kaddr, pfn_t *pfn, long size)
{
struct axon_ram_bank *bank = device->bd_disk->private_data;
loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
diff --git a/block/ioctl.c b/block/ioctl.c
index 698c7933d582..ed2397f8de9d 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -4,7 +4,6 @@
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
-#include <linux/badblocks.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 51a071e32221..c04bd9bc39fd 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
#ifdef CONFIG_BLK_DEV_RAM_DAX
static long brd_direct_access(struct block_device *bdev, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn)
+ void __pmem **kaddr, pfn_t *pfn, long size)
{
struct brd_device *brd = bdev->bd_disk->private_data;
struct page *page;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 042baec56931..608fc4464574 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -164,14 +164,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
}
static long pmem_direct_access(struct block_device *bdev, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn)
+ void __pmem **kaddr, pfn_t *pfn, long size)
{
struct pmem_device *pmem = bdev->bd_queue->queuedata;
resource_size_t offset = sector * 512 + pmem->data_offset;
+ if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
+ return -EIO;
*kaddr = pmem->virt_addr + offset;
*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+ /*
+ * If badblocks are present, limit known good range to the
+ * requested range.
+ */
+ if (unlikely(pmem->bb.count))
+ return size;
return pmem->size - pmem->pfn_pad - offset;
}
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index b83908670a9a..bed53c46dd90 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -31,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode);
static blk_qc_t dcssblk_make_request(struct request_queue *q,
struct bio *bio);
static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
- void __pmem **kaddr, pfn_t *pfn);
+ void __pmem **kaddr, pfn_t *pfn, long size);
static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
@@ -884,7 +884,7 @@ fail:
static long
dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
- void __pmem **kaddr, pfn_t *pfn)
+ void __pmem **kaddr, pfn_t *pfn, long size)
{
struct dcssblk_dev_info *dev_info;
unsigned long offset, dev_sz;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1089dbf25925..71ccab1d22c6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -51,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
+void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
+ va_end(args);
+}
+
static void bdev_write_inode(struct block_device *bdev)
{
struct inode *inode = bdev->bd_inode;
@@ -489,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
+ avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
if (!avail)
return -ERANGE;
if (avail > 0 && avail & ~PAGE_MASK)
@@ -498,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
}
EXPORT_SYMBOL_GPL(bdev_direct_access);
+/**
+ * bdev_dax_supported() - Check if the device supports dax for filesystem
+ * @sb: The superblock of the device
+ * @blocksize: The block size of the device
+ *
+ * This is a library function for filesystems to check if the block device
+ * can be mounted with dax option.
+ *
+ * Return: negative errno if unsupported, 0 if supported.
+ */
+int bdev_dax_supported(struct super_block *sb, int blocksize)
+{
+ struct blk_dax_ctl dax = {
+ .sector = 0,
+ .size = PAGE_SIZE,
+ };
+ int err;
+
+ if (blocksize != PAGE_SIZE) {
+ vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
+ return -EINVAL;
+ }
+
+ err = bdev_direct_access(sb->s_bdev, &dax);
+ if (err < 0) {
+ switch (err) {
+ case -EOPNOTSUPP:
+ vfs_msg(sb, KERN_ERR,
+ "error: device does not support dax");
+ break;
+ case -EINVAL:
+ vfs_msg(sb, KERN_ERR,
+ "error: unaligned partition for dax");
+ break;
+ default:
+ vfs_msg(sb, KERN_ERR,
+ "error: dax access failed (%d)", err);
+ }
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
+
+/**
+ * bdev_dax_capable() - Return if the raw device is capable for dax
+ * @bdev: The device for raw block device access
+ */
+bool bdev_dax_capable(struct block_device *bdev)
+{
+ struct blk_dax_ctl dax = {
+ .size = PAGE_SIZE,
+ };
+
+ if (!IS_ENABLED(CONFIG_FS_DAX))
+ return false;
+
+ dax.sector = 0;
+ if (bdev_direct_access(bdev, &dax) < 0)
+ return false;
+
+ dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
+ if (bdev_direct_access(bdev, &dax) < 0)
+ return false;
+
+ return true;
+}
+
/*
* pseudo-fs
*/
@@ -1160,33 +1241,6 @@ void bd_set_size(struct block_device *bdev, loff_t size)
}
EXPORT_SYMBOL(bd_set_size);
-static bool blkdev_dax_capable(struct block_device *bdev)
-{
- struct gendisk *disk = bdev->bd_disk;
-
- if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))
- return false;
-
- /*
- * If the partition is not aligned on a page boundary, we can't
- * do dax I/O to it.
- */
- if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
- || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
- return false;
-
- /*
- * If the device has known bad blocks, force all I/O through the
- * driver / page cache.
- *
- * TODO: support finer grained dax error handling
- */
- if (disk->bb && disk->bb->count)
- return false;
-
- return true;
-}
-
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
/*
@@ -1266,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
- if (!blkdev_dax_capable(bdev))
+ if (!bdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
@@ -1303,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
- if (!blkdev_dax_capable(bdev))
+ if (!bdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
diff --git a/fs/dax.c b/fs/dax.c
index 7d9df93b3a14..5a282260d27e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -87,50 +87,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
return page;
}
-/*
- * dax_clear_sectors() is called from within transaction context from XFS,
- * and hence this means the stack from this point must follow GFP_NOFS
- * semantics for all operations.
- */
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
-{
- struct blk_dax_ctl dax = {
- .sector = _sector,
- .size = _size,
- };
-
- might_sleep();
- do {
- long count, sz;
-
- count = dax_map_atomic(bdev, &dax);
- if (count < 0)
- return count;
- sz = min_t(long, count, SZ_128K);
- clear_pmem(dax.addr, sz);
- dax.size -= sz;
- dax.sector += sz / 512;
- dax_unmap_atomic(bdev, &dax);
- cond_resched();
- } while (dax.size);
-
- wmb_pmem();
- return 0;
-}
-EXPORT_SYMBOL_GPL(dax_clear_sectors);
-
-/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
- loff_t pos, loff_t end)
-{
- loff_t final = end - pos + first; /* The final byte of the buffer */
-
- if (first > 0)
- clear_pmem(addr, first);
- if (final < size)
- clear_pmem(addr + final, size - final);
-}
-
static bool buffer_written(struct buffer_head *bh)
{
return buffer_mapped(bh) && !buffer_unwritten(bh);
@@ -169,6 +125,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
struct blk_dax_ctl dax = {
.addr = (void __pmem *) ERR_PTR(-EIO),
};
+ unsigned blkbits = inode->i_blkbits;
+ sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
+ >> blkbits;
if (rw == READ)
end = min(end, i_size_read(inode));
@@ -176,7 +135,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
while (pos < end) {
size_t len;
if (pos == max) {
- unsigned blkbits = inode->i_blkbits;
long page = pos >> PAGE_SHIFT;
sector_t block = page << (PAGE_SHIFT - blkbits);
unsigned first = pos - (block << blkbits);
@@ -192,6 +150,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
bh->b_size = 1 << blkbits;
bh_max = pos - first + bh->b_size;
bdev = bh->b_bdev;
+ /*
+ * We allow uninitialized buffers for writes
+ * beyond EOF as those cannot race with faults
+ */
+ WARN_ON_ONCE(
+ (buffer_new(bh) && block < file_blks) ||
+ (rw == WRITE && buffer_unwritten(bh)));
} else {
unsigned done = bh->b_size -
(bh_max - (pos - first));
@@ -211,11 +176,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
rc = map_len;
break;
}
- if (buffer_unwritten(bh) || buffer_new(bh)) {
- dax_new_buf(dax.addr, map_len, first,
- pos, end);
- need_wmb = true;
- }
dax.addr += first;
size = map_len - first;
}
@@ -276,15 +236,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
memset(&bh, 0, sizeof(bh));
bh.b_bdev = inode->i_sb->s_bdev;
- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
- struct address_space *mapping = inode->i_mapping;
+ if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
inode_lock(inode);
- retval = filemap_write_and_wait_range(mapping, pos, end - 1);
- if (retval) {
- inode_unlock(inode);
- goto out;
- }
- }
/* Protects against truncate */
if (!(flags & DIO_SKIP_DIO_COUNT))
@@ -305,7 +258,6 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
if (!(flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(inode);
- out:
return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
@@ -321,20 +273,11 @@ EXPORT_SYMBOL_GPL(dax_do_io);
static int dax_load_hole(struct address_space *mapping, struct page *page,
struct vm_fault *vmf)
{
- unsigned long size;
- struct inode *inode = mapping->host;
if (!page)
page = find_or_create_page(mapping, vmf->pgoff,
GFP_KERNEL | __GFP_ZERO);
if (!page)
return VM_FAULT_OOM;
- /* Recheck i_size under page lock to avoid truncate race */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size) {
- unlock_page(page);
- put_page(page);
- return VM_FAULT_SIGBUS;
- }
vmf->page = page;
return VM_FAULT_LOCKED;
@@ -565,33 +508,14 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
.sector = to_sector(bh, inode),
.size = bh->b_size,
};
- pgoff_t size;
int error;
i_mmap_lock_read(mapping);
- /*
- * Check truncate didn't happen while we were allocating a block.
- * If it did, this block may or may not be still allocated to the
- * file. We can't tell the filesystem to free it because we can't
- * take i_mutex here. In the worst case, the file still has blocks
- * allocated past the end of the file.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- error = -EIO;
- goto out;
- }
-
if (dax_map_atomic(bdev, &dax) < 0) {
error = PTR_ERR(dax.addr);
goto out;
}
-
- if (buffer_unwritten(bh) || buffer_new(bh)) {
- clear_pmem(dax.addr, PAGE_SIZE);
- wmb_pmem();
- }
dax_unmap_atomic(bdev, &dax);
error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
@@ -612,19 +536,13 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
* @get_block: The filesystem method used to translate file offsets to blocks
- * @complete_unwritten: The filesystem method used to convert unwritten blocks
- * to written so the data written to them is exposed. This is required for
- * required by write faults for filesystems that will return unwritten
- * extent mappings from @get_block, but it is optional for reads as
- * dax_insert_mapping() will always zero unwritten blocks. If the fs does
- * not support unwritten extents, the it should pass NULL.
*
* When a page fault occurs, filesystems may call this helper in their
* fault handler for DAX files. __dax_fault() assumes the caller has done all
* the necessary locking for the page fault to proceed successfully.
*/
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block, dax_iodone_t complete_unwritten)
+ get_block_t get_block)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
@@ -659,15 +577,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
put_page(page);
goto repeat;
}
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- /*
- * We have a struct page covering a hole in the file
- * from a read fault and we've raced with a truncate
- */
- error = -EIO;
- goto unlock_page;
- }
}
error = get_block(inode, block, &bh, 0);
@@ -700,17 +609,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (error)
goto unlock_page;
vmf->page = page;
- if (!page) {
+ if (!page)
i_mmap_lock_read(mapping);
- /* Check we didn't race with truncate */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- if (vmf->pgoff >= size) {
- i_mmap_unlock_read(mapping);
- error = -EIO;
- goto out;
- }
- }
return VM_FAULT_LOCKED;
}
@@ -727,23 +627,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
page = NULL;
}
- /*
- * If we successfully insert the new mapping over an unwritten extent,
- * we need to ensure we convert the unwritten extent. If there is an
- * error inserting the mapping, the filesystem needs to leave it as
- * unwritten to prevent exposure of the stale underlying data to
- * userspace, but we still need to call the completion function so
- * the private resources on the mapping buffer can be released. We
- * indicate what the callback should do via the uptodate variable, same
- * as for normal BH based IO completions.
- */
+ /* Filesystem should not return unwritten buffers to us! */
+ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
error = dax_insert_mapping(inode, &bh, vma, vmf);
- if (buffer_unwritten(&bh)) {
- if (complete_unwritten)
- complete_unwritten(&bh, !error);
- else
- WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
- }
out:
if (error == -ENOMEM)
@@ -772,7 +658,7 @@ EXPORT_SYMBOL(__dax_fault);
* fault handler for DAX files.
*/
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block, dax_iodone_t complete_unwritten)
+ get_block_t get_block)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -781,7 +667,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = __dax_fault(vma, vmf, get_block, complete_unwritten);
+ result = __dax_fault(vma, vmf, get_block);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
@@ -815,8 +701,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block,
- dax_iodone_t complete_unwritten)
+ pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
@@ -875,6 +760,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if (get_block(inode, block, &bh, 1) != 0)
return VM_FAULT_SIGBUS;
alloc = true;
+ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
}
bdev = bh.b_bdev;
@@ -902,23 +788,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
i_mmap_lock_read(mapping);
- /*
- * If a truncate happened while we were allocating blocks, we may
- * leave blocks allocated to the file that are beyond EOF. We can't
- * take i_mutex here, so just leave them hanging; they'll be freed
- * when the file is deleted.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (pgoff >= size) {
- result = VM_FAULT_SIGBUS;
- goto out;
- }
- if ((pgoff | PG_PMD_COLOUR) >= size) {
- dax_pmd_dbg(&bh, address,
- "offset + huge page size > file size");
- goto fallback;
- }
-
if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
spinlock_t *ptl;
pmd_t entry;
@@ -954,8 +823,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
long length = dax_map_atomic(bdev, &dax);
if (length < 0) {
- result = VM_FAULT_SIGBUS;
- goto out;
+ dax_pmd_dbg(&bh, address, "dax-error fallback");
+ goto fallback;
}
if (length < PMD_SIZE) {
dax_pmd_dbg(&bh, address, "dax-length too small");
@@ -973,14 +842,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
dax_pmd_dbg(&bh, address, "pfn not in memmap");
goto fallback;
}
-
- if (buffer_unwritten(&bh) || buffer_new(&bh)) {
- clear_pmem(dax.addr, PMD_SIZE);
- wmb_pmem();
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- result |= VM_FAULT_MAJOR;
- }
dax_unmap_atomic(bdev, &dax);
/*
@@ -1020,9 +881,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
out:
i_mmap_unlock_read(mapping);
- if (buffer_unwritten(&bh))
- complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
-
return result;
fallback:
@@ -1042,8 +900,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault);
* pmd_fault handler for DAX files.
*/
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block,
- dax_iodone_t complete_unwritten)
+ pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -1052,8 +909,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
- complete_unwritten);
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
if (flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
@@ -1091,6 +947,43 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
+static bool dax_range_is_aligned(struct block_device *bdev,
+ unsigned int offset, unsigned int length)
+{
+ unsigned short sector_size = bdev_logical_block_size(bdev);
+
+ if (!IS_ALIGNED(offset, sector_size))
+ return false;
+ if (!IS_ALIGNED(length, sector_size))
+ return false;
+
+ return true;
+}
+
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+ unsigned int offset, unsigned int length)
+{
+ struct blk_dax_ctl dax = {
+ .sector = sector,
+ .size = PAGE_SIZE,
+ };
+
+ if (dax_range_is_aligned(bdev, offset, length)) {
+ sector_t start_sector = dax.sector + (offset >> 9);
+
+ return blkdev_issue_zeroout(bdev, start_sector,
+ length >> 9, GFP_NOFS, true);
+ } else {
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
+ clear_pmem(dax.addr + offset, length);
+ wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__dax_zero_page_range);
+
/**
* dax_zero_page_range - zero a range within a page of a DAX file
* @inode: The file being truncated
@@ -1102,12 +995,6 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
* page in a DAX file. This is intended for hole-punch operations. If
* you are truncating a file, the helper function dax_truncate_page() may be
* more convenient.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks. Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
*/
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
get_block_t get_block)
@@ -1126,23 +1013,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
bh.b_bdev = inode->i_sb->s_bdev;
bh.b_size = PAGE_SIZE;
err = get_block(inode, index, &bh, 0);
- if (err < 0)
+ if (err < 0 || !buffer_written(&bh))
return err;
- if (buffer_written(&bh)) {
- struct block_device *bdev = bh.b_bdev;
- struct blk_dax_ctl dax = {
- .sector = to_sector(&bh, inode),
- .size = PAGE_SIZE,
- };
-
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- clear_pmem(dax.addr + offset, length);
- wmb_pmem();
- dax_unmap_atomic(bdev, &dax);
- }
- return 0;
+ return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
+ offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);
@@ -1154,12 +1029,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
*
* Similar to block_truncate_page(), this function can be called by a
* filesystem when it is truncating a DAX file to handle the partial page.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks. Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
*/
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c1400b109805..868c02317b05 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+ ret = __dax_fault(vma, vmf, ext2_get_block);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
}
down_read(&ei->dax_sem);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
up_read(&ei->dax_sem);
if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b675610391b8..fcbe58641e40 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,6 +26,7 @@
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
+#include <linux/blkdev.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
@@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode,
* so that it's not found by another thread before it's
* initialised
*/
- err = dax_clear_sectors(inode->i_sb->s_bdev,
- le32_to_cpu(chain[depth-1].key) <<
- (inode->i_blkbits - 9),
- 1 << inode->i_blkbits);
+ err = sb_issue_zeroout(inode->i_sb,
+ le32_to_cpu(chain[depth-1].key), count,
+ GFP_NOFS);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
}
- }
+ } else
+ set_buffer_new(bh_result);
ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
mutex_unlock(&ei->truncate_mutex);
- set_buffer_new(bh_result);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b78caf25f746..1d9379568aa8 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
- if (blocksize != PAGE_SIZE) {
- ext2_msg(sb, KERN_ERR,
- "error: unsupported blocksize for dax");
+ err = bdev_dax_supported(sb, blocksize);
+ if (err)
goto failed_mount;
- }
- if (!sb->s_bdev->bd_disk->fops->direct_access) {
- ext2_msg(sb, KERN_ERR,
- "error: device does not support dax");
- goto failed_mount;
- }
}
/* If the blocksize doesn't match, re-read the thing.. */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d478110c32a6..df44c877892a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
- result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL);
+ result = __dax_fault(vma, vmf, ext4_dax_get_block);
if (write) {
if (!IS_ERR(handle))
@@ -238,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
result = VM_FAULT_SIGBUS;
else
result = __dax_pmd_fault(vma, addr, pmd, flags,
- ext4_dax_get_block, NULL);
+ ext4_dax_get_block);
if (write) {
if (!IS_ERR(handle))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 20c5d52253b4..3822a5aedc61 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3417,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
- if (blocksize != PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR,
- "error: unsupported blocksize for dax");
- goto failed_mount;
- }
- if (!sb->s_bdev->bd_disk->fops->direct_access) {
- ext4_msg(sb, KERN_ERR,
- "error: device does not support dax");
+ err = bdev_dax_supported(sb, blocksize);
+ if (err)
goto failed_mount;
- }
}
if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 613ea2d7ac19..586bb64e674b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -72,18 +72,11 @@ xfs_zero_extent(
struct xfs_mount *mp = ip->i_mount;
xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
sector_t block = XFS_BB_TO_FSBT(mp, sector);
- ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
-
- if (IS_DAX(VFS_I(ip)))
- return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
- sector, size);
-
- /*
- * let the block layer decide on the fastest method of
- * implementing the zeroing.
- */
- return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+ return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
+ block << (mp->m_super->s_blocksize_bits - 9),
+ count_fsb << (mp->m_super->s_blocksize_bits - 9),
+ GFP_NOFS, true);
}
/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 44af22897c8b..47fc63295422 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1551,7 +1551,7 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
} else {
ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = block_page_mkwrite_return(ret);
@@ -1585,7 +1585,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1622,8 +1622,7 @@ xfs_filemap_pmd_fault(
}
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
- NULL);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 416421d7ff10..11ea5d51db56 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1555,14 +1555,12 @@ xfs_fs_fill_super(
if (mp->m_flags & XFS_MOUNT_DAX) {
xfs_warn(mp,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- if (sb->s_blocksize != PAGE_SIZE) {
- xfs_alert(mp,
- "Filesystem block size invalid for DAX Turning DAX off.");
- mp->m_flags &= ~XFS_MOUNT_DAX;
- } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+
+ error = bdev_dax_supported(sb, sb->s_blocksize);
+ if (error) {
xfs_alert(mp,
- "Block device does not support DAX Turning DAX off.");
+ "DAX unsupported by block device. Turning off DAX.");
mp->m_flags &= ~XFS_MOUNT_DAX;
}
}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1fd8fdff2f81..3d9cf326574f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -768,6 +768,17 @@ static inline void rq_flush_dcache_pages(struct request *rq)
}
#endif
+#ifdef CONFIG_PRINTK
+#define vfs_msg(sb, level, fmt, ...) \
+ __vfs_msg(sb, level, fmt, ##__VA_ARGS__)
+#else
+#define vfs_msg(sb, level, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __vfs_msg(sb, "", " "); \
+} while (0)
+#endif
+
extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
extern blk_qc_t generic_make_request(struct bio *bio);
@@ -1660,7 +1671,7 @@ struct block_device_operations {
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
long (*direct_access)(struct block_device *, sector_t, void __pmem **,
- pfn_t *);
+ pfn_t *, long);
unsigned int (*check_events) (struct gendisk *disk,
unsigned int clearing);
/* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1680,6 +1691,8 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
extern int bdev_write_page(struct block_device *, sector_t, struct page *,
struct writeback_control *);
extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
+extern int bdev_dax_supported(struct super_block *, int);
+extern bool bdev_dax_capable(struct block_device *);
#else /* CONFIG_BLOCK */
struct block_device;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 982a6c4a62f3..7743e51f826c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -7,41 +7,44 @@
ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
get_block_t, dio_iodone_t, int flags);
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
#ifdef CONFIG_FS_DAX
struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+ unsigned int offset, unsigned int length);
#else
static inline struct page *read_dax_sector(struct block_device *bdev,
sector_t n)
{
return ERR_PTR(-ENXIO);
}
+static inline int __dax_zero_page_range(struct block_device *bdev,
+ sector_t sector, unsigned int offset, unsigned int length)
+{
+ return -ENXIO;
+}
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
- unsigned int flags, get_block_t, dax_iodone_t);
+ unsigned int flags, get_block_t);
int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
- unsigned int flags, get_block_t, dax_iodone_t);
+ unsigned int flags, get_block_t);
#else
static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags, get_block_t gb,
- dax_iodone_t di)
+ pmd_t *pmd, unsigned int flags, get_block_t gb)
{
return VM_FAULT_FALLBACK;
}
#define __dax_pmd_fault dax_pmd_fault
#endif
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
+#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
+#define __dax_mkwrite(vma, vmf, gb) __dax_fault(vma, vmf, gb)
static inline bool vma_is_dax(struct vm_area_struct *vma)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5f61431d8673..9ace7f745bcd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -74,7 +74,6 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
ssize_t bytes, void *private);
-typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
#define MAY_EXEC 0x00000001
#define MAY_WRITE 0x00000002