summary refs log tree commit diff stats
path: root/fs
diff options
context:
space:
mode:
author Darrick J. Wong <darrick.wong@oracle.com> 2018-07-11 22:24:40 -0700
committer Darrick J. Wong <darrick.wong@oracle.com> 2018-07-11 22:24:40 -0700
commit c2efdfc100af42cc04525ef0db74b176da59e1a4 (patch)
tree 18dd5e3e42762b00d66c2c86b91ad08c91063b96 /fs
parent 1e4b044d22517cae7047c99038abb444423243ca (diff)
parent 806a1477b10a153cd01ee7ccba8ca2492df3e0b2 (diff)
download linux-c2efdfc100af42cc04525ef0db74b176da59e1a4.tar.bz2
Merge branch 'iomap-4.19-merge' into xfs-4.19-merge
Diffstat (limited to 'fs')
-rw-r--r-- fs/buffer.c 76
-rw-r--r-- fs/internal.h 2
-rw-r--r-- fs/iomap.c 532
-rw-r--r-- fs/xfs/xfs_iomap.c 6
4 files changed, 520 insertions, 96 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index cabc045f483d..c8c2b7d8b8d6 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1900,15 +1900,16 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
break;
case IOMAP_UNWRITTEN:
/*
- * For unwritten regions, we always need to ensure that
- * sub-block writes cause the regions in the block we are not
- * writing to are zeroed. Set the buffer as new to ensure this.
+ * For unwritten regions, we always need to ensure that regions
+ * in the block we are not writing to are zeroed. Mark the
+ * buffer as new to ensure this.
*/
set_buffer_new(bh);
set_buffer_unwritten(bh);
/* FALLTHRU */
case IOMAP_MAPPED:
- if (offset >= i_size_read(inode))
+ if ((iomap->flags & IOMAP_F_NEW) ||
+ offset >= i_size_read(inode))
set_buffer_new(bh);
bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
inode->i_blkbits;
@@ -2076,6 +2077,40 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(block_write_begin);
+int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
+ struct page *page)
+{
+ loff_t old_size = inode->i_size;
+ bool i_size_changed = false;
+
+ /*
+ * No need to use i_size_read() here, the i_size cannot change under us
+ * because we hold i_rwsem.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos + copied > inode->i_size) {
+ i_size_write(inode, pos + copied);
+ i_size_changed = true;
+ }
+
+ unlock_page(page);
+ put_page(page);
+
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+ return copied;
+}
+
int block_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
@@ -2116,39 +2151,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- struct inode *inode = mapping->host;
- loff_t old_size = inode->i_size;
- int i_size_changed = 0;
-
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
- *
- * But it's important to update i_size while still holding page lock:
- * page writeout could otherwise come in and zero beyond i_size.
- */
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- i_size_changed = 1;
- }
-
- unlock_page(page);
- put_page(page);
-
- if (old_size < pos)
- pagecache_isize_extended(inode, old_size, pos);
- /*
- * Don't mark the inode dirty under page lock. First, it unnecessarily
- * makes the holding time of page lock longer. Second, it forces lock
- * ordering of page lock and transaction start for journaling
- * filesystems.
- */
- if (i_size_changed)
- mark_inode_dirty(inode);
-
- return copied;
+ return __generic_write_end(mapping->host, pos, copied, page);
}
EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/internal.h b/fs/internal.h
index 980d005b21b4..4a18bdbd2214 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,6 +43,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
extern void guard_bio_eod(int rw, struct bio *bio);
extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block, struct iomap *iomap);
+int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
+ struct page *page);
/*
* char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
index 77397b5a96ef..13cdcf33e6c0 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016 Christoph Hellwig.
+ * Copyright (c) 2016-2018 Christoph Hellwig.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -18,6 +18,7 @@
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
@@ -104,6 +105,243 @@ iomap_sector(struct iomap *iomap, loff_t pos)
}
static void
+iomap_read_inline_data(struct inode *inode, struct page *page,
+ struct iomap *iomap)
+{
+ size_t size = i_size_read(inode);
+ void *addr;
+
+ if (PageUptodate(page))
+ return;
+
+ BUG_ON(page->index);
+ BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+ addr = kmap_atomic(page);
+ memcpy(addr, iomap->inline_data, size);
+ memset(addr + size, 0, PAGE_SIZE - size);
+ kunmap_atomic(addr);
+ SetPageUptodate(page);
+}
+
+static void
+iomap_read_end_io(struct bio *bio)
+{
+ int error = blk_status_to_errno(bio->bi_status);
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i)
+ page_endio(bvec->bv_page, false, error);
+ bio_put(bio);
+}
+
+struct iomap_readpage_ctx {
+ struct page *cur_page;
+ bool cur_page_in_bio;
+ bool is_readahead;
+ struct bio *bio;
+ struct list_head *pages;
+};
+
+static loff_t
+iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap)
+{
+ struct iomap_readpage_ctx *ctx = data;
+ struct page *page = ctx->cur_page;
+ unsigned poff = pos & (PAGE_SIZE - 1);
+ unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
+ bool is_contig = false;
+ sector_t sector;
+
+ if (iomap->type == IOMAP_INLINE) {
+ WARN_ON_ONCE(poff);
+ iomap_read_inline_data(inode, page, iomap);
+ return PAGE_SIZE;
+ }
+
+ /* we don't support blocksize < PAGE_SIZE quite yet. */
+ WARN_ON_ONCE(pos != page_offset(page));
+ WARN_ON_ONCE(plen != PAGE_SIZE);
+
+ if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
+ zero_user(page, poff, plen);
+ SetPageUptodate(page);
+ goto done;
+ }
+
+ ctx->cur_page_in_bio = true;
+
+ /*
+ * Try to merge into a previous segment if we can.
+ */
+ sector = iomap_sector(iomap, pos);
+ if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
+ if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+ goto done;
+ is_contig = true;
+ }
+
+ if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
+ gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ if (ctx->bio)
+ submit_bio(ctx->bio);
+
+ if (ctx->is_readahead) /* same as readahead_gfp_mask */
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+ ctx->bio->bi_opf = REQ_OP_READ;
+ if (ctx->is_readahead)
+ ctx->bio->bi_opf |= REQ_RAHEAD;
+ ctx->bio->bi_iter.bi_sector = sector;
+ bio_set_dev(ctx->bio, iomap->bdev);
+ ctx->bio->bi_end_io = iomap_read_end_io;
+ }
+
+ __bio_add_page(ctx->bio, page, plen, poff);
+done:
+ return plen;
+}
+
+int
+iomap_readpage(struct page *page, const struct iomap_ops *ops)
+{
+ struct iomap_readpage_ctx ctx = { .cur_page = page };
+ struct inode *inode = page->mapping->host;
+ unsigned poff;
+ loff_t ret;
+
+ WARN_ON_ONCE(page_has_buffers(page));
+
+ for (poff = 0; poff < PAGE_SIZE; poff += ret) {
+ ret = iomap_apply(inode, page_offset(page) + poff,
+ PAGE_SIZE - poff, 0, ops, &ctx,
+ iomap_readpage_actor);
+ if (ret <= 0) {
+ WARN_ON_ONCE(ret == 0);
+ SetPageError(page);
+ break;
+ }
+ }
+
+ if (ctx.bio) {
+ submit_bio(ctx.bio);
+ WARN_ON_ONCE(!ctx.cur_page_in_bio);
+ } else {
+ WARN_ON_ONCE(ctx.cur_page_in_bio);
+ unlock_page(page);
+ }
+
+ /*
+ * Just like mpage_readpages and block_read_full_page we always
+ * return 0 and just mark the page as PageError on errors. This
+ * should be cleaned up all through the stack eventually.
+ */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_readpage);
+
+static struct page *
+iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
+ loff_t length, loff_t *done)
+{
+ while (!list_empty(pages)) {
+ struct page *page = lru_to_page(pages);
+
+ if (page_offset(page) >= (u64)pos + length)
+ break;
+
+ list_del(&page->lru);
+ if (!add_to_page_cache_lru(page, inode->i_mapping, page->index,
+ GFP_NOFS))
+ return page;
+
+ /*
+ * If we already have a page in the page cache at index we are
+ * done. Upper layers don't care if it is uptodate after the
+ * readpages call itself as every page gets checked again once
+ * actually needed.
+ */
+ *done += PAGE_SIZE;
+ put_page(page);
+ }
+
+ return NULL;
+}
+
+static loff_t
+iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ struct iomap_readpage_ctx *ctx = data;
+ loff_t done, ret;
+
+ for (done = 0; done < length; done += ret) {
+ if (ctx->cur_page && ((pos + done) & (PAGE_SIZE - 1)) == 0) {
+ if (!ctx->cur_page_in_bio)
+ unlock_page(ctx->cur_page);
+ put_page(ctx->cur_page);
+ ctx->cur_page = NULL;
+ }
+ if (!ctx->cur_page) {
+ ctx->cur_page = iomap_next_page(inode, ctx->pages,
+ pos, length, &done);
+ if (!ctx->cur_page)
+ break;
+ ctx->cur_page_in_bio = false;
+ }
+ ret = iomap_readpage_actor(inode, pos + done, length - done,
+ ctx, iomap);
+ }
+
+ return done;
+}
+
+int
+iomap_readpages(struct address_space *mapping, struct list_head *pages,
+ unsigned nr_pages, const struct iomap_ops *ops)
+{
+ struct iomap_readpage_ctx ctx = {
+ .pages = pages,
+ .is_readahead = true,
+ };
+ loff_t pos = page_offset(list_entry(pages->prev, struct page, lru));
+ loff_t last = page_offset(list_entry(pages->next, struct page, lru));
+ loff_t length = last - pos + PAGE_SIZE, ret = 0;
+
+ while (length > 0) {
+ ret = iomap_apply(mapping->host, pos, length, 0, ops,
+ &ctx, iomap_readpages_actor);
+ if (ret <= 0) {
+ WARN_ON_ONCE(ret == 0);
+ goto done;
+ }
+ pos += ret;
+ length -= ret;
+ }
+ ret = 0;
+done:
+ if (ctx.bio)
+ submit_bio(ctx.bio);
+ if (ctx.cur_page) {
+ if (!ctx.cur_page_in_bio)
+ unlock_page(ctx.cur_page);
+ put_page(ctx.cur_page);
+ }
+
+ /*
+ * Check that we didn't lose a page due to the arcane calling
+ * conventions..
+ */
+ WARN_ON_ONCE(!ret && !list_empty(ctx.pages));
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_readpages);
+
+static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
loff_t i_size = i_size_read(inode);
@@ -117,6 +355,48 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
}
static int
+iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
+ unsigned poff, unsigned plen, unsigned from, unsigned to,
+ struct iomap *iomap)
+{
+ struct bio_vec bvec;
+ struct bio bio;
+
+ if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
+ zero_user_segments(page, poff, from, to, poff + plen);
+ return 0;
+ }
+
+ bio_init(&bio, &bvec, 1);
+ bio.bi_opf = REQ_OP_READ;
+ bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
+ bio_set_dev(&bio, iomap->bdev);
+ __bio_add_page(&bio, page, plen, poff);
+ return submit_bio_wait(&bio);
+}
+
+static int
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
+ struct page *page, struct iomap *iomap)
+{
+ loff_t block_size = i_blocksize(inode);
+ loff_t block_start = pos & ~(block_size - 1);
+ loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
+ unsigned poff = block_start & (PAGE_SIZE - 1);
+ unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - block_start);
+ unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
+
+ WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+
+ if (PageUptodate(page))
+ return 0;
+ if (from <= poff && to >= poff + plen)
+ return 0;
+ return iomap_read_page_sync(inode, block_start, page,
+ poff, plen, from, to, iomap);
+}
+
+static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
struct page **pagep, struct iomap *iomap)
{
@@ -133,7 +413,12 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
if (!page)
return -ENOMEM;
- status = __block_write_begin_int(page, pos, len, NULL, iomap);
+ if (iomap->type == IOMAP_INLINE)
+ iomap_read_inline_data(inode, page, iomap);
+ else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
+ status = __block_write_begin_int(page, pos, len, NULL, iomap);
+ else
+ status = __iomap_write_begin(inode, pos, len, page, iomap);
if (unlikely(status)) {
unlock_page(page);
put_page(page);
@@ -146,14 +431,93 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
return status;
}
+int
+iomap_set_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ int newly_dirty;
+
+ if (unlikely(!mapping))
+ return !TestSetPageDirty(page);
+
+ /*
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
+ */
+ lock_page_memcg(page);
+ newly_dirty = !TestSetPageDirty(page);
+ if (newly_dirty)
+ __set_page_dirty(page, mapping, 0);
+ unlock_page_memcg(page);
+
+ if (newly_dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ return newly_dirty;
+}
+EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
+
+static int
+__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct page *page, struct iomap *iomap)
+{
+ flush_dcache_page(page);
+
+ /*
+ * The blocks that were entirely written will now be uptodate, so we
+ * don't have to worry about a readpage reading them and overwriting a
+ * partial write. However if we have encountered a short write and only
+ * partially written into a block, it will not be marked uptodate, so a
+ * readpage might come in and destroy our partial write.
+ *
+ * Do the simplest thing, and just treat any short write to a non
+ * uptodate page as a zero-length write, and force the caller to redo
+ * the whole thing.
+ */
+ if (unlikely(copied < len && !PageUptodate(page))) {
+ copied = 0;
+ } else {
+ SetPageUptodate(page);
+ iomap_set_page_dirty(page);
+ }
+ return __generic_write_end(inode, pos, copied, page);
+}
+
+static int
+iomap_write_end_inline(struct inode *inode, struct page *page,
+ struct iomap *iomap, loff_t pos, unsigned copied)
+{
+ void *addr;
+
+ WARN_ON_ONCE(!PageUptodate(page));
+ BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+ addr = kmap_atomic(page);
+ memcpy(iomap->inline_data + pos, addr + pos, copied);
+ kunmap_atomic(addr);
+
+ mark_inode_dirty(inode);
+ __generic_write_end(inode, pos, copied, page);
+ return copied;
+}
+
static int
iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page)
+ unsigned copied, struct page *page, struct iomap *iomap)
{
int ret;
- ret = generic_write_end(NULL, inode->i_mapping, pos, len,
- copied, page, NULL);
+ if (iomap->type == IOMAP_INLINE) {
+ ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
+ } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
+ ret = generic_write_end(NULL, inode->i_mapping, pos, len,
+ copied, page, NULL);
+ } else {
+ ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
+ }
+
+ if (iomap->page_done)
+ iomap->page_done(inode, pos, copied, page, iomap);
+
if (ret < len)
iomap_write_failed(inode, pos, len);
return ret;
@@ -208,7 +572,8 @@ again:
flush_dcache_page(page);
- status = iomap_write_end(inode, pos, bytes, copied, page);
+ status = iomap_write_end(inode, pos, bytes, copied, page,
+ iomap);
if (unlikely(status < 0))
break;
copied = status;
@@ -302,7 +667,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
WARN_ON_ONCE(!PageUptodate(page));
- status = iomap_write_end(inode, pos, bytes, bytes, page);
+ status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
if (unlikely(status <= 0)) {
if (WARN_ON_ONCE(status == 0))
return -EIO;
@@ -354,7 +719,7 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
zero_user(page, offset, bytes);
mark_page_accessed(page);
- return iomap_write_end(inode, pos, bytes, bytes, page);
+ return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
}
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
@@ -440,11 +805,16 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
struct page *page = data;
int ret;
- ret = __block_write_begin_int(page, pos, length, NULL, iomap);
- if (ret)
- return ret;
+ if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
+ ret = __block_write_begin_int(page, pos, length, NULL, iomap);
+ if (ret)
+ return ret;
+ block_commit_write(page, 0, length);
+ } else {
+ WARN_ON_ONCE(!PageUptodate(page));
+ WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+ }
- block_commit_write(page, 0, length);
return length;
}
@@ -811,6 +1181,7 @@ struct iomap_dio {
atomic_t ref;
unsigned flags;
int error;
+ bool wait_for_completion;
union {
/* used during submission and for synchronous completion: */
@@ -914,9 +1285,8 @@ static void iomap_dio_bio_end_io(struct bio *bio)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
if (atomic_dec_and_test(&dio->ref)) {
- if (is_sync_kiocb(dio->iocb)) {
+ if (dio->wait_for_completion) {
struct task_struct *waiter = dio->submit.waiter;
-
WRITE_ONCE(dio->submit.waiter, NULL);
wake_up_process(waiter);
} else if (dio->flags & IOMAP_DIO_WRITE) {
@@ -963,10 +1333,9 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
}
static loff_t
-iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap_dio *dio, struct iomap *iomap)
{
- struct iomap_dio *dio = data;
unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
unsigned int align = iov_iter_alignment(dio->submit.iter);
@@ -980,41 +1349,27 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
if ((pos | length | align) & ((1 << blkbits) - 1))
return -EINVAL;
- switch (iomap->type) {
- case IOMAP_HOLE:
- if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
- return -EIO;
- /*FALLTHRU*/
- case IOMAP_UNWRITTEN:
- if (!(dio->flags & IOMAP_DIO_WRITE)) {
- length = iov_iter_zero(length, dio->submit.iter);
- dio->size += length;
- return length;
- }
+ if (iomap->type == IOMAP_UNWRITTEN) {
dio->flags |= IOMAP_DIO_UNWRITTEN;
need_zeroout = true;
- break;
- case IOMAP_MAPPED:
- if (iomap->flags & IOMAP_F_SHARED)
- dio->flags |= IOMAP_DIO_COW;
- if (iomap->flags & IOMAP_F_NEW) {
- need_zeroout = true;
- } else {
- /*
- * Use a FUA write if we need datasync semantics, this
- * is a pure data IO that doesn't require any metadata
- * updates and the underlying device supports FUA. This
- * allows us to avoid cache flushes on IO completion.
- */
- if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_FUA) &&
- blk_queue_fua(bdev_get_queue(iomap->bdev)))
- use_fua = true;
- }
- break;
- default:
- WARN_ON_ONCE(1);
- return -EIO;
+ }
+
+ if (iomap->flags & IOMAP_F_SHARED)
+ dio->flags |= IOMAP_DIO_COW;
+
+ if (iomap->flags & IOMAP_F_NEW) {
+ need_zeroout = true;
+ } else {
+ /*
+ * Use a FUA write if we need datasync semantics, this
+ * is a pure data IO that doesn't require any metadata
+ * updates and the underlying device supports FUA. This
+ * allows us to avoid cache flushes on IO completion.
+ */
+ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+ (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+ blk_queue_fua(bdev_get_queue(iomap->bdev)))
+ use_fua = true;
}
/*
@@ -1093,6 +1448,66 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
return copied;
}
+static loff_t
+iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
+{
+ length = iov_iter_zero(length, dio->submit.iter);
+ dio->size += length;
+ return length;
+}
+
+static loff_t
+iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap_dio *dio, struct iomap *iomap)
+{
+ struct iov_iter *iter = dio->submit.iter;
+ size_t copied;
+
+ BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+
+ if (dio->flags & IOMAP_DIO_WRITE) {
+ loff_t size = inode->i_size;
+
+ if (pos > size)
+ memset(iomap->inline_data + size, 0, pos - size);
+ copied = copy_from_iter(iomap->inline_data + pos, length, iter);
+ if (copied) {
+ if (pos + copied > size)
+ i_size_write(inode, pos + copied);
+ mark_inode_dirty(inode);
+ }
+ } else {
+ copied = copy_to_iter(iomap->inline_data + pos, length, iter);
+ }
+ dio->size += copied;
+ return copied;
+}
+
+static loff_t
+iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ struct iomap_dio *dio = data;
+
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
+ return -EIO;
+ return iomap_dio_hole_actor(length, dio);
+ case IOMAP_UNWRITTEN:
+ if (!(dio->flags & IOMAP_DIO_WRITE))
+ return iomap_dio_hole_actor(length, dio);
+ return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+ case IOMAP_MAPPED:
+ return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
+ case IOMAP_INLINE:
+ return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+}
+
/*
* iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
* is being issued as AIO or not. This allows us to optimise pure data writes
@@ -1131,13 +1546,12 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->end_io = end_io;
dio->error = 0;
dio->flags = 0;
+ dio->wait_for_completion = is_sync_kiocb(iocb);
dio->submit.iter = iter;
- if (is_sync_kiocb(iocb)) {
- dio->submit.waiter = current;
- dio->submit.cookie = BLK_QC_T_NONE;
- dio->submit.last_queue = NULL;
- }
+ dio->submit.waiter = current;
+ dio->submit.cookie = BLK_QC_T_NONE;
+ dio->submit.last_queue = NULL;
if (iov_iter_rw(iter) == READ) {
if (pos >= dio->i_size)
@@ -1187,7 +1601,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio_warn_stale_pagecache(iocb->ki_filp);
ret = 0;
- if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
+ if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion &&
!inode->i_sb->s_dio_done_wq) {
ret = sb_init_dio_done_wq(inode->i_sb);
if (ret < 0)
@@ -1202,8 +1616,10 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomap_dio_actor);
if (ret <= 0) {
/* magic error code to fall back to buffered I/O */
- if (ret == -ENOTBLK)
+ if (ret == -ENOTBLK) {
+ dio->wait_for_completion = true;
ret = 0;
+ }
break;
}
pos += ret;
@@ -1224,7 +1640,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
if (!atomic_dec_and_test(&dio->ref)) {
- if (!is_sync_kiocb(iocb))
+ if (!dio->wait_for_completion)
return -EIOCBQUEUED;
for (;;) {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 55876dd02f0c..e08a84d9ee72 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -626,7 +626,7 @@ retry:
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
*/
- iomap->flags = IOMAP_F_NEW;
+ iomap->flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
done:
if (isnullstartblock(got.br_startblock))
@@ -1032,6 +1032,8 @@ xfs_file_iomap_begin(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
+ iomap->flags |= IOMAP_F_BUFFER_HEAD;
+
if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
/* Reserve delalloc blocks for regular writeback. */
@@ -1132,7 +1134,7 @@ xfs_file_iomap_begin(
if (error)
return error;
- iomap->flags = IOMAP_F_NEW;
+ iomap->flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
out_finish: