diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-05-02 14:13:46 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-05-02 14:13:46 -0700 |
commit | 9ccce092fc64d19504fa54de4fd659e279cc92e7 (patch) | |
tree | 050c91cf4d884d9feb27c1074c3f73cab4f5bbf8 | |
parent | 27787ba3fa4904422b3928b898d1bd3d74d98bea (diff) | |
parent | 211f9f2e0503efa4023a46920e7ad07377b4ec58 (diff) | |
download | linux-9ccce092fc64d19504fa54de4fd659e279cc92e7.tar.bz2 |
Merge tag 'for-linus-5.13-ofs-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux
Pull orangefs updates from Mike Marshall:
"orangefs: implement orangefs_readahead
mm/readahead.c/read_pages was quite a bit different back when I put my
open-coded readahead logic into orangefs_readpage. That logic seemed
to work as designed back then; it is a trainwreck now.
This implements orangefs_readahead using the new xarray and
readahead_expand features and removes all my open-coded readahead
logic.
This results in an extreme read performance improvement; these sample
numbers are from my test VM:
Here's an example of what's upstream in
5.11.8-200.fc33.x86_64:
30+0 records in
30+0 records out
125829120 bytes (126 MB, 120 MiB) copied, 5.77943 s, 21.8 MB/s
And here's this version of orangefs_readahead on top of 5.12.0-rc4:
30+0 records in
30+0 records out
125829120 bytes (126 MB, 120 MiB) copied, 0.325919 s, 386 MB/s
There are four xfstest regressions with this patch. David Howells and
Matthew Wilcox have been helping me work with this code"
* tag 'for-linus-5.13-ofs-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux:
orangefs: leave files in the page cache for a few microseconds at least
orangefs: implement orangefs_readahead.
-rw-r--r-- | fs/orangefs/file.c | 34 | ||||
-rw-r--r-- | fs/orangefs/inode.c | 122 | ||||
-rw-r--r-- | fs/orangefs/orangefs-mod.c | 2 |
3 files changed, 54 insertions, 104 deletions
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index ccef8c9dd516..86810e5d7914 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -248,21 +248,7 @@ populate_shared_memory: * or it can pointers to struct page's */ - /* - * When reading, readahead_size will only be zero when - * we're doing O_DIRECT, otherwise we got here from - * orangefs_readpage. - * - * If we got here from orangefs_readpage we want to - * copy either a page or the whole file into the io - * vector, whichever is smaller. - */ - if (readahead_size) - copy_amount = - min(new_op->downcall.resp.io.amt_complete, - (__s64)PAGE_SIZE); - else - copy_amount = new_op->downcall.resp.io.amt_complete; + copy_amount = new_op->downcall.resp.io.amt_complete; ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, copy_amount); @@ -283,19 +269,11 @@ populate_shared_memory: out: if (buffer_index >= 0) { - if ((readahead_size) && (type == ORANGEFS_IO_READ)) { - /* readpage */ - *index_return = buffer_index; - gossip_debug(GOSSIP_FILE_DEBUG, - "%s: hold on to buffer_index :%d:\n", - __func__, buffer_index); - } else { - /* O_DIRECT */ - orangefs_bufmap_put(buffer_index); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): PUT buffer_index %d\n", - __func__, handle, buffer_index); - } + orangefs_bufmap_put(buffer_index); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): PUT buffer_index %d\n", + __func__, handle, buffer_index); + buffer_index = -1; } op_release(new_op); return ret; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 85b3dd2d769d..6bf35a0d61f3 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -245,6 +245,50 @@ static int orangefs_writepages(struct address_space *mapping, static int orangefs_launder_page(struct page *); +static void orangefs_readahead(struct readahead_control *rac) +{ + loff_t offset; + struct iov_iter iter; + struct file *file = rac->file; + struct inode *inode = file->f_mapping->host; + struct xarray *i_pages; + struct page *page; + loff_t new_start = 
readahead_pos(rac); + int ret; + size_t new_len = 0; + + loff_t bytes_remaining = inode->i_size - readahead_pos(rac); + loff_t pages_remaining = bytes_remaining / PAGE_SIZE; + + if (pages_remaining >= 1024) + new_len = 4194304; + else if (pages_remaining > readahead_count(rac)) + new_len = bytes_remaining; + + if (new_len) + readahead_expand(rac, new_start, new_len); + + offset = readahead_pos(rac); + i_pages = &file->f_mapping->i_pages; + + iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac)); + + /* read in the pages. */ + if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, + &offset, &iter, readahead_length(rac), + inode->i_size, NULL, NULL, file)) < 0) + gossip_debug(GOSSIP_FILE_DEBUG, + "%s: wait_for_direct_io failed. \n", __func__); + else + ret = 0; + + /* clean up. */ + while ((page = readahead_page(rac))) { + page_endio(page, false, ret); + put_page(page); + } +} + static int orangefs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; @@ -252,44 +296,24 @@ static int orangefs_readpage(struct file *file, struct page *page) struct bio_vec bv; ssize_t ret; loff_t off; /* offset into this page */ - pgoff_t index; /* which page */ - struct page *next_page; - char *kaddr; - loff_t read_size; - int buffer_index = -1; /* orangefs shared memory slot */ - int slot_index; /* index into slot */ - int remaining; - - /* - * Get up to this many bytes from Orangefs at a time and try - * to fill them into the page cache at once. Tests with dd made - * this seem like a reasonable static number, if there was - * interest perhaps this number could be made setable through - * sysfs... 
- */ - read_size = 524288; if (PageDirty(page)) orangefs_launder_page(page); off = page_offset(page); - index = off >> PAGE_SHIFT; bv.bv_page = page; bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, - read_size, inode->i_size, NULL, &buffer_index, file); - remaining = ret; + PAGE_SIZE, inode->i_size, NULL, NULL, file); /* this will only zero remaining unread portions of the page data */ iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ flush_dcache_page(page); if (ret < 0) { SetPageError(page); - unlock_page(page); - goto out; } else { SetPageUptodate(page); if (PageError(page)) @@ -298,60 +322,7 @@ static int orangefs_readpage(struct file *file, struct page *page) } /* unlock the page after the ->readpage() routine completes */ unlock_page(page); - - if (remaining > PAGE_SIZE) { - slot_index = 0; - while ((remaining - PAGE_SIZE) >= PAGE_SIZE) { - remaining -= PAGE_SIZE; - /* - * It is an optimization to try and fill more than one - * page... by now we've already gotten the single - * page we were after, if stuff doesn't seem to - * be going our way at this point just return - * and hope for the best. - * - * If we look for pages and they're already there is - * one reason to give up, and if they're not there - * and we can't create them is another reason. - */ - - index++; - slot_index++; - next_page = find_get_page(inode->i_mapping, index); - if (next_page) { - gossip_debug(GOSSIP_FILE_DEBUG, - "%s: found next page, quitting\n", - __func__); - put_page(next_page); - goto out; - } - next_page = find_or_create_page(inode->i_mapping, - index, - GFP_KERNEL); - /* - * I've never hit this, leave it as a printk for - * now so it will be obvious. 
- */ - if (!next_page) { - printk("%s: can't create next page, quitting\n", - __func__); - goto out; - } - kaddr = kmap_atomic(next_page); - orangefs_bufmap_page_fill(kaddr, - buffer_index, - slot_index); - kunmap_atomic(kaddr); - SetPageUptodate(next_page); - unlock_page(next_page); - put_page(next_page); - } - } - -out: - if (buffer_index != -1) - orangefs_bufmap_put(buffer_index); - return ret; + return ret; } static int orangefs_write_begin(struct file *file, @@ -660,6 +631,7 @@ out: /** ORANGEFS2 implementation of address space operations */ static const struct address_space_operations orangefs_address_operations = { .writepage = orangefs_writepage, + .readahead = orangefs_readahead, .readpage = orangefs_readpage, .writepages = orangefs_writepages, .set_page_dirty = __set_page_dirty_nobuffers, diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index 74a3d6337ef4..cd7297815f91 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -31,7 +31,7 @@ static ulong module_parm_debug_mask; __u64 orangefs_gossip_debug_mask; int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; -int orangefs_cache_timeout_msecs = 50; +int orangefs_cache_timeout_msecs = 500; int orangefs_dcache_timeout_msecs = 50; int orangefs_getattr_timeout_msecs = 50; |