From 38512aa98a3feb6acd7da8f0ed5dade5b592b426 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 7 Jun 2016 21:44:08 -0400 Subject: NFS: Don't flush caches for a getattr that races with writeback If there were outstanding writes then chalk up the unexpected change attribute on the server to them. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 52e7d6869e3b..60051e62d3f1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1729,12 +1729,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); + /* Could it be a race with writeback? */ + if (nfsi->nrequests == 0) { + invalid |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + } inode->i_version = fattr->change_attr; } } else { -- cgit v1.2.3 From 57b691819ee2b095da505b34abdcd3193d0af75c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Jun 2016 17:07:19 -0400 Subject: NFS: Cache access checks more aggressively If an attribute revalidation fails, then we already know that we'll zap the access cache. If, OTOH, the inode isn't changing, there should be no need to eject access calls just because they are old. 
Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index aaf7bd0cbae2..210b33636fe4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2228,21 +2228,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st return NULL; } -static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; - int err = -ENOENT; + bool retry = true; + int err; spin_lock(&inode->i_lock); - if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) - goto out_zap; - cache = nfs_access_search_rbtree(inode, cred); - if (cache == NULL) - goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) - goto out_stale; + for(;;) { + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + err = -ENOENT; + if (cache == NULL) + goto out; + /* Found an entry, is our attribute cache valid? 
*/ + if (!nfs_attribute_cache_expired(inode) && + !(nfsi->cache_validity & NFS_INO_INVALID_ATTR)) + break; + err = -ECHILD; + if (!may_block) + goto out; + if (!retry) + goto out_zap; + spin_unlock(&inode->i_lock); + err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (err) + return err; + spin_lock(&inode->i_lock); + retry = false; + } res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; @@ -2251,12 +2267,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str out: spin_unlock(&inode->i_lock); return err; -out_stale: - rb_erase(&cache->rb_node, &nfsi->access_cache); - list_del(&cache->lru); - spin_unlock(&inode->i_lock); - nfs_access_free_entry(cache); - return -ENOENT; out_zap: spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); @@ -2283,13 +2293,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, cache = NULL; if (cache == NULL) goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode); + if (err) goto out; res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; - err = 0; out: rcu_read_unlock(); return err; @@ -2378,18 +2387,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { struct nfs_access_entry cache; + bool may_block = (mask & MAY_NOT_BLOCK) == 0; int status; trace_nfs_access_enter(inode); status = nfs_access_get_cached_rcu(inode, cred, &cache); if (status != 0) - status = nfs_access_get_cached(inode, cred, &cache); + status = nfs_access_get_cached(inode, cred, &cache, may_block); if (status == 0) goto out_cached; status = -ECHILD; - if (mask & MAY_NOT_BLOCK) + if (!may_block) goto out; /* Be clever: ask server to check for all possible rights */ -- cgit v1.2.3 From ca0daa277acac1029f74d9fea838c9e507398226 Mon Sep 
17 00:00:00 2001 From: Trond Myklebust Date: Wed, 8 Jun 2016 17:08:28 -0400 Subject: NFS: Cache aggressively when file is open for writing Unless the user is using file locking, we must assume close-to-open cache consistency when the file is open for writing. Adjust the caching algorithm so that it does not clear the cache on out-of-order writes and/or attribute revalidations. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 13 ++---------- fs/nfs/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 717a8d6af52d..2d39d9f9da7d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -779,11 +779,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) return status; } -static int -is_time_granular(struct timespec *ts) { - return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); -} - static int do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { @@ -817,12 +812,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * This makes locking act as a cache coherency point. 
*/ nfs_sync_mapping(filp->f_mapping); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { - if (is_time_granular(&NFS_SERVER(inode)->time_delta)) - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - else - nfs_zap_caches(inode); - } + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + nfs_zap_mapping(inode, filp->f_mapping); out: return status; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 60051e62d3f1..4e65a5a8a01b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -878,7 +878,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - list_add(&ctx->list, &nfsi->open_files); + if (ctx->mode & FMODE_WRITE) + list_add(&ctx->list, &nfsi->open_files); + else + list_add_tail(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); @@ -1215,6 +1218,25 @@ int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space * return __nfs_revalidate_mapping(inode, mapping, true); } +static bool nfs_file_has_writers(struct nfs_inode *nfsi) +{ + struct inode *inode = &nfsi->vfs_inode; + + assert_spin_locked(&inode->i_lock); + + if (!S_ISREG(inode->i_mode)) + return false; + if (list_empty(&nfsi->open_files)) + return false; + /* Note: This relies on nfsi->open_files being ordered with writers + * being placed at the head of the list. 
+ * See nfs_inode_attach_open_context() + */ + return (list_first_entry(&nfsi->open_files, + struct nfs_open_context, + list)->mode & FMODE_WRITE) == FMODE_WRITE; +} + static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1279,22 +1301,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - inode->i_version != fattr->change_attr) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (!nfs_file_has_writers(nfsi)) { + /* Verify a few of the more important attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; - /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR; + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR; - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) + invalid |= NFS_INO_INVALID_ATTR; + + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } } - if (nfsi->nrequests != 0) - invalid &= ~NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? 
*/ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) @@ -1526,7 +1550,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { - unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + unsigned long invalid = NFS_INO_INVALID_ATTR; /* * Don't revalidate the pagecache if we hold a delegation, but do @@ -1675,6 +1699,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; + bool have_writers = nfs_file_has_writers(nfsi); bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", @@ -1730,7 +1755,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); /* Could it be a race with writeback? */ - if (nfsi->nrequests == 0) { + if (!have_writers) { invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS @@ -1770,9 +1795,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? 
*/ - if ((nfsi->nrequests == 0) || new_isize > cur_isize) { + if (nfsi->nrequests == 0 || new_isize > cur_isize) { i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (!have_writers) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", -- cgit v1.2.3 From 6b56a89833fa7903595c8d138bb4927187315cba Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Jun 2016 18:23:01 -0400 Subject: NFS: Kill NFS_INO_FLUSHING: it is a performance killer filemap_fdatawrite() and friends already deal just fine with livelock. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 8 -------- fs/nfs/nfstrace.h | 1 - fs/nfs/write.c | 11 ----------- include/linux/nfs_fs.h | 1 - 4 files changed, 21 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2d39d9f9da7d..29d7477a62e8 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -359,14 +359,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; /* * Wait for O_DIRECT to complete */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 0b9e5cc9a747..fe80a1c26340 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -37,7 +37,6 @@ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_STALE, "STALE" }, \ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ - { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e1c74d3db64d..980d44f3a84c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@
-657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; - /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (err) - goto out_err; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, @@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_atomic(); - wake_up_bit(bitlock, NFS_INO_FLUSHING); - if (err < 0) goto out_err; err = pgio.pg_error; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d71278c3c5bd..120dd04b553c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -205,7 +205,6 @@ struct nfs_inode { #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ -#define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ -- cgit v1.2.3 From 811ed92ecc9f47eee90beabcf5c2133f2a6d2440 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Jun 2016 18:25:56 -0400 Subject: NFS: writepage of a single page should not be synchronous It is almost always better to wait for more so that we can issue a bulk commit. 
Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 980d44f3a84c..b13d48881d3a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page, int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), + nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio, launder); nfs_pageio_complete(&pgio); -- cgit v1.2.3 From 93761d9863c332d1099d80629f89cf48eb745e48 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 2 Jun 2016 11:03:00 -0400 Subject: NFS: Don't hold the inode lock across fsync() Commits are no longer required to be serialised. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 29d7477a62e8..249262b6bcbe 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -277,11 +277,9 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by -- cgit v1.2.3 From 4f52b6bb8c57b9accafad526a429d6c0851cc62f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 2 Jun 2016 18:10:33 -0400 Subject: NFS: Don't call COMMIT in ->releasepage() While COMMIT has the potential to free up a lot of memory that is being taken by unstable writes, it isn't guaranteed to free up this particular page. 
Also, calling fsync() on the server is expensive and so we want to do it in a more controlled fashion, rather than have it triggered at random by the VM. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 249262b6bcbe..df4dd8e7e62e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -460,31 +460,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset, */ static int nfs_release_page(struct page *page, gfp_t gfp) { - struct address_space *mapping = page->mapping; - dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Always try to initiate a 'commit' if relevant, but only - * wait for it if the caller allows blocking. Even then, - * only wait 1 second and only if the 'bdi' is not congested. - * Waiting indefinitely can cause deadlocks when the NFS - * server is on this machine, when a new TCP connection is - * needed and in other rare cases. There is no particular - * need to wait extensively here. A short wait has the - * benefit that someone else can worry about the freezer. 
- */ - if (mapping) { - struct nfs_server *nfss = NFS_SERVER(mapping->host); - nfs_commit_inode(mapping->host, 0); - if (gfpflags_allow_blocking(gfp) && - !bdi_write_congested(&nfss->backing_dev_info)) { - wait_on_page_bit_killable_timeout(page, PG_private, - HZ); - if (PagePrivate(page)) - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } - } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; -- cgit v1.2.3 From 5c6e5b60aae4347223f176966455010a5715b863 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 Jun 2016 14:13:12 -0400 Subject: NFS: Fix an Oops in the pNFS files and flexfiles connection setup to the DS Chris Worley reports: RIP: 0010:[] [] rpc_new_client+0x2a0/0x2e0 [sunrpc] RSP: 0018:ffff880158f6f548 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880234f8bc00 RCX: 000000000000ea60 RDX: 0000000000074cc0 RSI: 000000000000ea60 RDI: ffff880234f8bcf0 RBP: ffff880158f6f588 R08: 000000000001ac80 R09: ffff880237003300 R10: ffff880201171000 R11: ffffea0000d75200 R12: ffffffffa03afc60 R13: ffff880230c18800 R14: 0000000000000000 R15: ffff880158f6f680 FS: 00007f0e32673740(0000) GS:ffff88023fc40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 0000000234886000 CR4: 00000000001406e0 Stack: ffffffffa047a680 0000000000000000 ffff880158f6f598 ffff880158f6f680 ffff880158f6f680 ffff880234d11d00 ffff88023357f800 ffff880158f6f7d0 ffff880158f6f5b8 ffffffffa024660a ffff880158f6f5b8 ffffffffa02492ec Call Trace: [] rpc_create_xprt+0x1a/0xb0 [sunrpc] [] ? xprt_create_transport+0x13c/0x240 [sunrpc] [] rpc_create+0xc6/0x1a0 [sunrpc] [] nfs_create_rpc_client+0xf5/0x140 [nfs] [] nfs_init_client+0x3a/0xd0 [nfs] [] nfs_get_client+0x25f/0x310 [nfs] [] ? 
rpc_ntop+0xe8/0x100 [sunrpc] [] nfs3_set_ds_client+0xcc/0x100 [nfsv3] [] nfs4_pnfs_ds_connect+0x120/0x400 [nfsv4] [] nfs4_ff_layout_prepare_ds+0xe7/0x330 [nfs_layout_flexfiles] [] ff_layout_pg_init_write+0xcb/0x280 [nfs_layout_flexfiles] [] __nfs_pageio_add_request+0x12c/0x490 [nfs] [] nfs_pageio_add_request+0xc2/0x2a0 [nfs] [] ? nfs_pageio_init+0x75/0x120 [nfs] [] nfs_do_writepage+0x120/0x270 [nfs] [] nfs_writepage_locked+0x61/0xc0 [nfs] [] ? __percpu_counter_add+0x55/0x70 [] nfs_wb_single_page+0xef/0x1c0 [nfs] [] ? __dec_zone_page_state+0x33/0x40 [] nfs_launder_page+0x41/0x90 [nfs] [] invalidate_inode_pages2_range+0x340/0x3a0 [] invalidate_inode_pages2+0x17/0x20 [] nfs_release+0x9e/0xb0 [nfs] [] ? nfs_open+0x60/0x60 [nfs] [] nfs_file_release+0x3d/0x60 [nfs] [] __fput+0xdc/0x1e0 [] ____fput+0xe/0x10 [] task_work_run+0xc4/0xe0 [] do_exit+0x2e8/0xb30 [] ? do_audit_syscall_entry+0x6c/0x70 [] ? __audit_syscall_exit+0x1e6/0x280 [] do_group_exit+0x3f/0xa0 [] SyS_exit_group+0x14/0x20 [] system_call_fastpath+0x12/0x71 Which seems to be due to a call to utsname() when in a task exit context in order to determine the hostname to set in rpc_new_client(). In reality, what we want here is not the hostname of the current task, but the hostname that was used to set up the metadata server. 
Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 22 ++++++++++------------ fs/nfs/internal.h | 16 ++++++++-------- fs/nfs/nfs3client.c | 8 +++++--- fs/nfs/nfs4client.c | 20 ++++++++++++-------- include/linux/nfs_xdr.h | 5 ++--- 5 files changed, 37 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0c96528db94a..4849d0f778dc 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, */ struct nfs_client * nfs_get_client(const struct nfs_client_initdata *cl_init, - const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour) { struct nfs_client *clp, *new = NULL; @@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new->cl_flags = cl_init->init_flags; - return rpc_ops->init_client(new, timeparms, ip_addr); + return rpc_ops->init_client(new, cl_init); } spin_unlock(&nn->nfs_client_lock); @@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values); * Create an RPC client handle */ int nfs_create_rpc_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, + const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { struct rpc_clnt *clnt = NULL; @@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp, .protocol = clp->cl_proto, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, - .timeout = timeparms, + .timeout = cl_init->timeparms, .servername = clp->cl_hostname, + .nodename = cl_init->nodename, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, @@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); * nfs_init_client - Initialise an NFS2 or NFS3 client * * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: IP presentation address (not used) + * @cl_init: Initialisation 
parameters * * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { int error; @@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_parsed_mount_data *data, struct nfs_subversion *nfs_mod) { + struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = data->nfs_server.hostname, .addr = (const struct sockaddr *)&data->nfs_server.address, @@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server, .nfs_mod = nfs_mod, .proto = data->nfs_server.protocol, .net = data->net, + .timeparms = &timeparms, }; - struct rpc_timeout timeparms; struct nfs_client *clp; int error; @@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server, set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); + clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5154fa65a2f2..fa88609f85e3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -66,13 +66,16 @@ struct nfs_clone_mount { struct nfs_client_initdata { unsigned long init_flags; - const char *hostname; - const struct sockaddr *addr; + const char *hostname; /* Hostname of the server */ + const struct sockaddr *addr; /* Address of the server */ + const char 
*nodename; /* Hostname of the client */ + const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; struct net *net; + const struct rpc_timeout *timeparms; }; /* @@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info); extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); -int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); +int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, - const struct rpc_timeout *, const char *, rpc_authflavor_t); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); @@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); /* dir.c */ extern void nfs_force_use_readdirplus(struct inode *dir); @@ -521,8 +522,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 9e9fa347a948..0457b4129421 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -81,14 +81,17 @@ struct nfs_client 
*nfs3_set_ds_client(struct nfs_client *mds_clp, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v3, .proto = ds_proto, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -99,8 +102,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, /* Use the MDS nfs_client cl_ipaddr. */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); return clp; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 10410e8b5853..5fc7fbbfdcef 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) * Returns pointer to an NFS client, or an ERR_PTR value. 
*/ struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { char buf[INET6_ADDRSTRLEN + 1]; + const char *ip_addr = cl_init->ip_addr; struct nfs_client *old; int error; @@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; @@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server, .hostname = hostname, .addr = addr, .addrlen = addrlen, + .ip_addr = ip_addr, .nfs_mod = &nfs_v4, .proto = proto, .minorversion = minorversion, .net = net, + .timeparms = timeparms, }; struct nfs_client *clp; int error; @@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); + clp = nfs_get_client(&cl_init, authflavour); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; @@ -847,15 +849,18 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v4, .proto = ds_proto, .minorversion = minor_version, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char 
buf[INET6_ADDRSTRLEN + 1]; @@ -869,8 +874,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, * (section 13.1 RFC 5661). */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); dprintk("<-- %s %p\n", __func__, clp); return clp; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c304a11b5b1a..82b81a1c2438 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1596,9 +1596,8 @@ struct nfs_rpc_ops { int (*have_delegation)(struct inode *, fmode_t); int (*return_delegation)(struct inode *); struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); - struct nfs_client * - (*init_client) (struct nfs_client *, const struct rpc_timeout *, - const char *); + struct nfs_client *(*init_client) (struct nfs_client *, + const struct nfs_client_initdata *); void (*free_client) (struct nfs_client *); struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *); struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, -- cgit v1.2.3 From ca857cc1d4cf17aba4bbb3b95d35454ad96924b3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 28 Jun 2016 13:54:09 -0400 Subject: NFS/pnfs: Do not clobber existing pgio_done_cb in nfs4_proc_read_setup If a pNFS client sets hdr->pgio_done_cb, then we should not overwrite that in nfs4_proc_read_setup() Fixes: 75bf47ebf6b5 ("pNFS/flexfile: Fix erroneous fall back to...") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff416d0e24bc..6191b7e46913 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4392,7 +4392,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg) { hdr->timestamp = jiffies; - hdr->pgio_done_cb = 
nfs4_read_done_cb; + if (!hdr->pgio_done_cb) + hdr->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); } -- cgit v1.2.3 From 8487c479e2668dd1231e9c3c77a203d744aec081 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Jun 2016 08:44:35 -0400 Subject: NFSv4: Allow retry of operations that used a returned delegation stateid Fix up nfs4_do_handle_exception() so that it can check if the operation that received the NFS4ERR_BAD_STATEID was using a defunct delegation. Apply that to the case of SETATTR, which will currently return EIO in some cases where this happens. Reported-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 79 ++++++++++++++++++++++++++++++++----------------------- 2 files changed, 47 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 768456fa1b17..4be567a54958 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -185,6 +185,7 @@ struct nfs4_state { struct nfs4_exception { struct nfs4_state *state; struct inode *inode; + nfs4_stateid *stateid; long timeout; unsigned char delay : 1, recovering : 1, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6191b7e46913..519368b98762 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; + const nfs4_stateid *stateid = exception->stateid; struct inode *inode = exception->inode; int ret = errorcode; @@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (inode && nfs_async_inode_return_delegation(inode, - NULL) == 0) - goto wait_on_recovery; + if (inode) { + int err; + + err = 
nfs_async_inode_return_delegation(inode, + stateid); + if (err == 0) + goto wait_on_recovery; + if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) { + exception->retry = 1; + break; + } + } if (state == NULL) break; ret = nfs4_schedule_stateid_recovery(server, state); @@ -2669,28 +2679,17 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, return res; } -static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, - struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, - struct nfs4_label *olabel) +static int _nfs4_do_setattr(struct inode *inode, + struct nfs_setattrargs *arg, + struct nfs_setattrres *res, + struct rpc_cred *cred, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_setattrargs arg = { - .fh = NFS_FH(inode), - .iap = sattr, - .server = server, - .bitmask = server->attr_bitmask, - .label = ilabel, - }; - struct nfs_setattrres res = { - .fattr = fattr, - .label = olabel, - .server = server, - }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_argp = arg, + .rpc_resp = res, .rpc_cred = cred, }; struct rpc_cred *delegation_cred = NULL; @@ -2699,17 +2698,13 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, bool truncate; int status; - arg.bitmask = nfs4_bitmask(server, ilabel); - if (ilabel) - arg.bitmask = nfs4_bitmask(server, olabel); - - nfs_fattr_init(fattr); + nfs_fattr_init(res->fattr); /* Servers should only apply open mode checks for file size changes */ - truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; + truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; fmode = truncate ? 
FMODE_WRITE : FMODE_READ; - if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) { + if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { /* Use that stateid */ } else if (truncate && state != NULL) { struct nfs_lockowner lockowner = { @@ -2719,19 +2714,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, if (!nfs4_valid_open_stateid(state)) return -EBADF; if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, - &arg.stateid, &delegation_cred) == -EIO) + &arg->stateid, &delegation_cred) == -EIO) return -EBADF; } else - nfs4_stateid_copy(&arg.stateid, &zero_stateid); + nfs4_stateid_copy(&arg->stateid, &zero_stateid); if (delegation_cred) msg.rpc_cred = delegation_cred; - status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); put_rpccred(delegation_cred); if (status == 0 && state != NULL) renew_lease(server, timestamp); - trace_nfs4_setattr(inode, &arg.stateid, status); + trace_nfs4_setattr(inode, &arg->stateid, status); return status; } @@ -2741,13 +2736,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = sattr, + .server = server, + .bitmask = server->attr_bitmask, + .label = ilabel, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .label = olabel, + .server = server, + }; struct nfs4_exception exception = { .state = state, .inode = inode, + .stateid = &arg.stateid, }; int err; + + arg.bitmask = nfs4_bitmask(server, ilabel); + if (ilabel) + arg.bitmask = nfs4_bitmask(server, olabel); + do { - err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); + err = _nfs4_do_setattr(inode, &arg, &res, cred, state); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & 
ATTR_SIZE)) { -- cgit v1.2.3 From 73e6c5d854d3f7f75e8b46d3e54aeb5d83fe6b1f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Jun 2016 12:27:25 -0400 Subject: pNFS/files: Fix layoutcommit after a commit to DS According to the errata https://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751 we should always send layout commit after a commit to DS. Fixes: bc7d4b8fd091 ("nfs/filelayout: set layoutcommit...") Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index aa59757389dc..b4c1407e8fe4 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -375,8 +375,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } -- cgit v1.2.3 From c001c87a63aa2f35358e33eb05e45e4cbcb34f54 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Jun 2016 12:39:49 -0400 Subject: pNFS/flexfiles: Fix layoutcommit after a commit to DS We should always do a layoutcommit after commit to DS, except if the layout segment we're using has set FF_FLAGS_NO_LAYOUTCOMMIT. 
Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0e8018bc9880..2689c9e9dc3c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1530,8 +1530,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE - && ff_layout_need_layoutcommit(data->lseg)) + if (ff_layout_need_layoutcommit(data->lseg)) pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; -- cgit v1.2.3 From bc28e1c2e3c8a4c5198ebfd8bbae0afd73dfafd5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Jun 2016 16:14:40 -0400 Subject: pNFS/flexfiles: Clean up calls to pnfs_set_layoutcommit() Let's just have one place where we check ff_layout_need_layoutcommit(). Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 2689c9e9dc3c..14f2ed3f1a5b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) * we always send layoutcommit after DS writes. 
*/ static void -ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) +ff_layout_set_layoutcommit(struct inode *inode, + struct pnfs_layout_segment *lseg, + loff_t end_offset) { - if (!ff_layout_need_layoutcommit(hdr->lseg)) + if (!ff_layout_need_layoutcommit(lseg)) return; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); - dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, - (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); + pnfs_set_layoutcommit(inode, lseg, end_offset); + dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino, + (unsigned long long) NFS_I(inode)->layout->plh_lwb); } static bool @@ -1494,7 +1495,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task, if (hdr->res.verf->committed == NFS_FILE_SYNC || hdr->res.verf->committed == NFS_DATA_SYNC) - ff_layout_set_layoutcommit(hdr); + ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + (loff_t)hdr->res.count); /* zero out fattr since we don't care DS attr at all */ hdr->fattr.valid = 0; @@ -1530,8 +1532,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (ff_layout_need_layoutcommit(data->lseg)) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } -- cgit v1.2.3 From 2e18d4d822ea9cc811ea26a880cf2ed47cbf8889 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Jun 2016 18:54:58 -0400 Subject: pNFS: Files and flexfiles always need to commit before layoutcommit So ensure that we mark the layout for commit once the write is done, and then ensure that the commit to ds is finished before sending layoutcommit. Note that by doing this, we're able to optimise away the commit for the case of servers that don't need layoutcommit in order to return updated attributes. 
Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 9 ++++++--- fs/nfs/flexfilelayout/flexfilelayout.c | 7 +++++-- fs/nfs/nfs4xdr.c | 11 ++++++++--- fs/nfs/pnfs.c | 5 ++++- fs/nfs/pnfs_nfs.c | 7 +++++++ 5 files changed, 30 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index b4c1407e8fe4..25bd91a6e088 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task, static void filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || - hdr->res.verf->committed != NFS_DATA_SYNC) + hdr->res.verf->committed == NFS_FILE_SYNC) return; + if (hdr->res.verf->committed == NFS_DATA_SYNC) + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); + /* Note: if the write is unstable, don't set end_offs until commit */ + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 14f2ed3f1a5b..e6206eaf2bdf 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1470,6 +1470,7 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; int err; trace_nfs4_pnfs_write(hdr, task->tk_status); @@ -1495,8 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task, if (hdr->res.verf->committed == NFS_FILE_SYNC || hdr->res.verf->committed == NFS_DATA_SYNC) - ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + (loff_t)hdr->res.count); + end_offs = 
hdr->mds_offset + (loff_t)hdr->res.count; + + /* Note: if the write is unstable, don't set end_offs until commit */ + ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); /* zero out fattr since we don't care DS attr at all */ hdr->fattr.valid = 0; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 661e753fe1c9..7bd3a5c09d31 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr, p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ *p = cpu_to_be32(0); /* reclaim */ encode_nfs4_stateid(xdr, &args->stateid); - p = reserve_space(xdr, 20); - *p++ = cpu_to_be32(1); /* newoffset = TRUE */ - p = xdr_encode_hyper(p, args->lastbytewritten); + if (args->lastbytewritten != U64_MAX) { + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + } else { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(0); /* newoffset = FALSE */ + } *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0c7e0d45a4de..62553182514e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2378,7 +2378,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; - data->args.lastbytewritten = end_pos - 1; + if (end_pos != 0) + data->args.lastbytewritten = end_pos - 1; + else + data->args.lastbytewritten = U64_MAX; data->res.server = NFS_SERVER(inode); if (ld->prepare_layoutcommit) { diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 0dfc476da3e1..0d10cc280a23 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -932,6 +932,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); int pnfs_nfs_generic_sync(struct inode *inode, bool datasync) { + int ret; + + if 
(!pnfs_layoutcommit_outstanding(inode)) + return 0; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + return ret; if (datasync) return 0; return pnfs_layoutcommit_inode(inode, true); -- cgit v1.2.3 From ac46bd374c9a838874c450c528e2e922ee748ff9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jul 2016 13:46:53 -0400 Subject: pNFS: Ensure we layoutcommit before revalidating attributes If we need to update the cached attributes, then we'd better make sure that we also layoutcommit first. Otherwise, the server may have stale attributes. Prior to this patch, the revalidation code tried to "fix" this problem by simply disabling attributes that would be affected by the layoutcommit. That approach breaks nfs_writeback_check_extend(), leading to a file size corruption. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4e65a5a8a01b..6c0618eb5d57 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -974,6 +974,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; + /* pNFS: Attributes aren't updated until we layoutcommit */ + if (S_ISREG(inode->i_mode)) { + status = pnfs_sync_inode(inode, false); + if (status) + goto out; + } + status = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -1493,28 +1500,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } -/* - * Don't trust the change_attribute, mtime, ctime or size if - * a pnfs LAYOUTCOMMIT is outstanding - */ -static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, - struct nfs_fattr *fattr) -{ - if (pnfs_layoutcommit_outstanding(inode)) - fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | - NFS_ATTR_FATTR_MTIME | - NFS_ATTR_FATTR_CTIME | - NFS_ATTR_FATTR_SIZE); -} - static int 
nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); - nfs_inode_attrs_handle_layoutcommit(inode, fattr); - if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else -- cgit v1.2.3 From 6712007734cbd64ff924af16fc236751d47ff80b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jul 2016 19:08:58 -0400 Subject: pNFS: pnfs_layoutcommit_outstanding() is no longer used when !CONFIG_NFS_V4_1 Cleanup... Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index b21bd0bee784..d6be5299a55a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -716,13 +716,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, return false; } -static inline bool -pnfs_layoutcommit_outstanding(struct inode *inode) -{ - return false; -} - - static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { return NULL; -- cgit v1.2.3 From 8fc3c3862728373e0d0f5abccc6afc56c69e0c63 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Jun 2016 21:32:24 -0400 Subject: NFS: Fix O_DIRECT verifier problems We should not be interested in looking at the value of the stable field, since that could take any value. 
Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 10 ++++++++-- fs/nfs/internal.h | 7 +++++++ fs/nfs/write.c | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 979b3c4dee6a..d6d43b5eafb3 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, WARN_ON_ONCE(verfp->committed < 0); } +static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, + const struct nfs_writeverf *v2) +{ + return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); +} + /* * nfs_direct_cmp_hdr_verf - compare verifier for pgio header * @dreq - direct request possibly spanning multiple servers @@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, nfs_direct_set_hdr_verf(dreq, hdr); return 0; } - return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &hdr->verf); } /* @@ -238,7 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, if (verfp->committed < 0) return 1; - return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &data->verf); } /** diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5154fa65a2f2..150a8eb0f323 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -506,6 +506,13 @@ extern int nfs_migrate_page(struct address_space *, #define nfs_migrate_page NULL #endif +static inline int +nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, + const struct nfs_write_verifier *v2) +{ + return memcmp(v1->data, v2->data, sizeof(v1->data)); +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b13d48881d3a..3087fb6f1983 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1789,7 +1789,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* 
Okay, COMMIT succeeded, apparently. Check the verifier * returned by the server against all stored verfs. */ - if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { + if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { /* We have a match */ nfs_inode_remove_request(req); dprintk(" OK\n"); -- cgit v1.2.3 From a5314a74928fa6dbc4503a8c64f43bb5c1c12ac1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Jun 2016 21:42:32 -0400 Subject: NFS: Ensure we reset the write verifier 'committed' value on resend. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 ++ fs/nfs/internal.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d6d43b5eafb3..fb659bb50678 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -661,6 +661,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); dreq->count = 0; + dreq->verf.committed = NFS_INVALID_STABLE_HOW; + nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); for (i = 0; i < dreq->mirror_count; i++) dreq->mirrors[i].count = 0; get_dreq(dreq); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 150a8eb0f323..0eb5c924886d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -499,6 +499,23 @@ int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); +#ifdef CONFIG_NFS_V4_1 +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ + int i; + + for (i = 0; i < cinfo->nbuckets; i++) + cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +} +#else +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ +} +#endif + + #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct 
page *, enum migrate_mode); -- cgit v1.2.3 From 2f3c7d87a347b12f725f6128b3097727b91b230e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 Jun 2016 14:38:06 -0400 Subject: NFS: Remove racy size manipulations in O_DIRECT On success, the RPC callbacks will ensure that we make the appropriate calls to nfs_writeback_update_inode() Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index fb659bb50678..826d4dace0e5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -376,15 +376,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) { struct inode *inode = dreq->inode; - if (dreq->iocb && write) { - loff_t pos = dreq->iocb->ki_pos + dreq->count; - - spin_lock(&inode->i_lock); - if (i_size_read(inode) < pos) - i_size_write(inode, pos); - spin_unlock(&inode->i_lock); - } - if (write) nfs_zap_mapping(inode, inode->i_mapping); @@ -1058,14 +1049,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (!result) { result = nfs_direct_wait(dreq); if (result > 0) { - struct inode *inode = mapping->host; - iocb->ki_pos = pos + result; - spin_lock(&inode->i_lock); - if (i_size_read(inode) < iocb->ki_pos) - i_size_write(inode, iocb->ki_pos); - spin_unlock(&inode->i_lock); - /* XXX: should check the generic_write_sync retval */ generic_write_sync(iocb, result); } -- cgit v1.2.3 From 89698b24d24f9c8b470a73351b0b7199c17e0153 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 10:35:48 -0400 Subject: NFS Cleanup: move call to generic_write_checks() into fs/nfs/direct.c Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 12 ++++++++---- fs/nfs/file.c | 6 +----- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 826d4dace0e5..0169eca8eb42 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -988,6 +988,7 @@ static ssize_t 
nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { ssize_t result = -EINVAL; + size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; @@ -998,8 +999,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, - iov_iter_count(iter)); + result = generic_write_checks(iocb, iter); + if (result <= 0) + return result; + count = result; + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; @@ -1017,7 +1021,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; } - task_io_account_write(iov_iter_count(iter)); + task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); @@ -1025,7 +1029,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = iov_iter_count(iter); + dreq->bytes_left = dreq->max_count = count; dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index df4dd8e7e62e..c26847c84d00 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -629,12 +629,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result) return result; - if (iocb->ki_flags & IOCB_DIRECT) { - result = generic_write_checks(iocb, from); - if (result <= 0) - return result; + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_write(iocb, from); - } dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, count, (long long) iocb->ki_pos); -- cgit v1.2.3 From 
18290650b1c8655cfe6e0d63dd34942a037a130b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 15:00:42 -0400 Subject: NFS: Move buffered I/O locking into nfs_file_write() Preparation for the patch that de-serialises O_DIRECT reads and writes. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c26847c84d00..46cf0afe3c0f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -623,7 +623,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); unsigned long written = 0; ssize_t result; - size_t count = iov_iter_count(from); result = nfs_key_timeout_notify(file, inode); if (result) @@ -633,9 +632,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return nfs_file_direct_write(iocb, from); dprintk("NFS: write(%pD2, %zu@%Ld)\n", - file, count, (long long) iocb->ki_pos); + file, iov_iter_count(from), (long long) iocb->ki_pos); - result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; /* @@ -647,28 +645,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - result = count; - if (!count) + inode_lock(inode); + result = generic_write_checks(iocb, from); + if (result > 0) { + current->backing_dev_info = inode_to_bdi(inode); + result = generic_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + } + inode_unlock(inode); + if (result <= 0) goto out; - result = generic_file_write_iter(iocb, from); - if (result > 0) - written = result; + written = generic_write_sync(iocb, result); + iocb->ki_pos += written; /* Return error values */ - if (result >= 0 && nfs_need_check_write(file, inode)) { + if (nfs_need_check_write(file, inode)) { int err = vfs_fsync(file, 0); if (err < 0) result = err; } - if (result > 0) - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); + nfs_add_stats(inode, 
NFSIOS_NORMALWRITTENBYTES, written); out: return result; out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); - goto out; + return -EBUSY; } EXPORT_SYMBOL_GPL(nfs_file_write); -- cgit v1.2.3 From a5864c999de6703f7ce908f72337568520c6cad3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Jun 2016 17:07:19 -0400 Subject: NFS: Do not serialise O_DIRECT reads and writes Allow dio requests to be scheduled in parallel, but ensuring that they do not conflict with buffered I/O. Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 2 +- fs/nfs/direct.c | 41 +++----------- fs/nfs/file.c | 12 ++-- fs/nfs/internal.h | 8 +++ fs/nfs/io.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs_fs.h | 1 + 6 files changed, 174 insertions(+), 37 deletions(-) create mode 100644 fs/nfs/io.c (limited to 'fs') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8664417955a2..6abdda209642 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ - direct.o pagelist.o read.o symlink.o unlink.o \ + io.o direct.o pagelist.o read.o symlink.o unlink.o \ write.o namespace.o mount_clnt.o nfstrace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0169eca8eb42..6d0e88096440 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -578,17 +578,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!count) goto out; - inode_lock(inode); - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - task_io_account_read(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out_unlock; + goto out; dreq->inode = inode; dreq->bytes_left = dreq->max_count = count; @@ -603,10 +598,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) 
dreq->iocb = iocb; + nfs_start_io_direct(inode); + NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -614,13 +611,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += result; } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); out: return result; } @@ -1008,25 +1000,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; - inode_lock(inode); - - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - - if (mapping->nrpages) { - result = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - if (result) - goto out_unlock; - } - task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (!dreq) - goto out_unlock; + goto out; dreq->inode = inode; dreq->bytes_left = dreq->max_count = count; @@ -1041,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { @@ -1048,7 +1029,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_SHIFT, end); } - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -1058,13 +1039,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) generic_write_sync(iocb, result); } } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); +out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 46cf0afe3c0f..9f8da9e1b23f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c 
@@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); + nfs_start_io_read(inode); + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } + nfs_end_io_read(inode); return result; } EXPORT_SYMBOL_GPL(nfs_file_read); @@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); + nfs_start_io_read(inode); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); } + nfs_end_io_read(inode); return res; } EXPORT_SYMBOL_GPL(nfs_file_splice_read); @@ -645,14 +649,14 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - inode_lock(inode); + nfs_start_io_write(inode); result = generic_write_checks(iocb, from); if (result > 0) { current->backing_dev_info = inode_to_bdi(inode); result = generic_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; } - inode_unlock(inode); + nfs_end_io_write(inode); if (result <= 0) goto out; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 0eb5c924886d..159b64ede82a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -411,6 +411,14 @@ extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); +/* io.c */ +extern void nfs_start_io_read(struct inode *inode); +extern void nfs_end_io_read(struct inode *inode); +extern void nfs_start_io_write(struct inode *inode); +extern void 
nfs_end_io_write(struct inode *inode); +extern void nfs_start_io_direct(struct inode *inode); +extern void nfs_end_io_direct(struct inode *inode); + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, diff --git a/fs/nfs/io.c b/fs/nfs/io.c new file mode 100644 index 000000000000..1fc5d1ce327e --- /dev/null +++ b/fs/nfs/io.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 Trond Myklebust + * + * I/O and data path helper functionality. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/rwsem.h> +#include <linux/fs.h> +#include <linux/nfs_fs.h> + +#include "internal.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(inode); + } +} + +/** + * nfs_start_io_read - declare the file is being used for buffered reads + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +nfs_start_io_read(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) + return; + up_read(&inode->i_rwsem); + /* Slow path....
*/ + down_write(&inode->i_rwsem); + nfs_block_o_direct(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_read - declare that the buffered read operation is done + * @inode - file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * nfs_start_io_write - declare the file is being used for buffered writes + * @inode - file inode + * + * Declare that a buffered write operation is about to start, and ensure + * that we block all direct I/O. + */ +void +nfs_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + nfs_block_o_direct(NFS_I(inode), inode); +} + +/** + * nfs_end_io_write - declare that the buffered write operation is done + * @inode - file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +nfs_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) +{ + if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + set_bit(NFS_INO_ODIRECT, &nfsi->flags); + nfs_wb_all(inode); + } +} + +/** + * nfs_start_io_direct - declare the file is being used for direct i/o + * @inode - file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +nfs_start_io_direct(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_buffered(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_direct - declare that the direct i/o operation is done + * @inode - file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 120dd04b553c..225d17d35277 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -210,6 +210,7 @@ struct nfs_inode { #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ +#define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3 From f7b5c340aca87d736a6b15aa40bf135f1baab011 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 09:29:47 -0400 Subject: NFS: Cleanup nfs_direct_complete() There is only one caller that sets the "write" argument to true, so just move the call to nfs_zap_mapping() and get rid of the now redundant argument. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6d0e88096440..c16d33eb1ddf 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -372,13 +372,10 @@ out: * Synchronous I/O uses a stack-allocated iocb. 
Thus we can't trust * the iocb is still valid here if this is a synchronous request. */ -static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) +static void nfs_direct_complete(struct nfs_direct_req *dreq) { struct inode *inode = dreq->inode; - if (write) - nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_end(inode); if (dreq->iocb) { @@ -431,7 +428,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) } out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); hdr->release(hdr); } @@ -537,7 +534,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); return 0; } @@ -764,7 +761,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: - nfs_direct_complete(dreq, true); + nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); + nfs_direct_complete(dreq); } } -- cgit v1.2.3 From f508d46ae41a796036aef566637685dbf83b554f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 09:55:48 -0400 Subject: NFS: Remove redundant waits for O_DIRECT in fsync() and write_begin() We're now waiting immediately after taking the locks, so waiting in fsync() and write_begin() is either redundant or potentially subject to livelock (if not holding the lock). 
Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 9f8da9e1b23f..0e9b4a068f13 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -276,7 +276,6 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); - inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -361,11 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - /* - * Wait for O_DIRECT to complete - */ - inode_dio_wait(mapping->host); - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; -- cgit v1.2.3 From be527494e02b89e03485955b30de6c1e976a07eb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 Jun 2016 08:19:36 -0400 Subject: NFS: Remove unused function nfs_revalidate_mapping_protected() Clean up... Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 38 ++++---------------------------------- include/linux/nfs_fs.h | 1 - 2 files changed, 4 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6c0618eb5d57..0e0500f2bb6b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1131,14 +1131,12 @@ out: } /** - * __nfs_revalidate_mapping - Revalidate the pagecache + * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping - * @may_lock - take inode->i_mutex? 
*/ -static int __nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping, - bool may_lock) +int nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1187,12 +1185,7 @@ static int __nfs_revalidate_mapping(struct inode *inode, nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - if (may_lock) { - inode_lock(inode); - ret = nfs_invalidate_mapping(inode, mapping); - inode_unlock(inode); - } else - ret = nfs_invalidate_mapping(inode, mapping); + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1202,29 +1195,6 @@ out: return ret; } -/** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) -{ - return __nfs_revalidate_mapping(inode, mapping, false); -} - -/** - * nfs_revalidate_mapping_protected - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - * - * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex - * while invalidating the mapping. 
- */ -int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) -{ - return __nfs_revalidate_mapping(inode, mapping, true); -} - static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 225d17d35277..810124b33327 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -351,7 +351,6 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); -extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping); extern int nfs_setattr(struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, -- cgit v1.2.3 From 651b0e702981304f77091b82870a01480705f4fe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 17:24:46 -0400 Subject: NFS: Do not aggressively cache file attributes in the case of O_DIRECT A file that is open for O_DIRECT is by definition not obeying close-to-open cache consistency semantics, so let's not cache the attributes too aggressively either. 
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 9 +++++++-- fs/nfs/internal.h | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0e0500f2bb6b..7688436b19ba 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1214,6 +1214,11 @@ static bool nfs_file_has_writers(struct nfs_inode *nfsi) list)->mode & FMODE_WRITE) == FMODE_WRITE; } +static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) +{ + return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); +} + static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1278,7 +1283,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - if (!nfs_file_has_writers(nfsi)) { + if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; @@ -1660,7 +1665,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; - bool have_writers = nfs_file_has_writers(nfsi); + bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 159b64ede82a..01dccf18da0a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -419,6 +419,11 @@ extern void nfs_end_io_write(struct inode *inode); extern void nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); +static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) +{ + return test_bit(NFS_INO_ODIRECT, 
&nfsi->flags) == 0; +} + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, -- cgit v1.2.3 From 79566ef018f53a181f067afdf7bef9cc53f9d34b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 17:45:40 -0400 Subject: NFS: Getattr doesn't require data sync semantics When retrieving stat() information, NFS unfortunately does require us to sync writes to disk in order to ensure that mtime and ctime are up to date. However we shouldn't have to ensure that those writes are persisted. Relaxing that requirement does mean that we may see an mtime/ctime change if the server reboots and forces us to replay all writes. The exception to this rule are pNFS clients that are required to send layoutcommit, however that is dealt with by the call to pnfs_sync_inode() in _nfs_revalidate_inode(). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7688436b19ba..35fda08dc4f6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -661,9 +661,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - inode_lock(inode); - err = nfs_sync_inode(inode); - inode_unlock(inode); + err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; } -- cgit v1.2.3 From 1e564d3dbd684a105582471cb9ff2aada64a9052 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 17:50:53 -0400 Subject: NFSv4.2: Fix a race in nfs42_proc_deallocate() When punching holes in a file, we want to ensure the operation is serialised w.r.t. other writes, meaning that we want to call nfs_sync_inode() while holding the inode lock. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index aa03ed09ba06..0f9f536e647b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; - nfs_wb_all(inode); inode_lock(inode); + err = nfs_sync_inode(inode); + if (err) + goto out_unlock; err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - +out_unlock: inode_unlock(inode); return err; } -- cgit v1.2.3 From 837bb1d752d92ea4d870877ffbd6ec5cf76624b3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 18:12:03 -0400 Subject: NFSv4.2: Fix writeback races in nfs4_copy_file_range We need to ensure that any writes to the destination file are serialised with the copy, meaning that the writeback has to occur under the inode lock. Also relax the writeback requirement on the source, and rely on the stateid checking to tell us if the source rebooted. Add the helper nfs_filemap_write_and_wait_range() to call pnfs_sync_inode() as is appropriate for pNFS servers that may need a layoutcommit. 
Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 3 +++ fs/nfs/nfs42proc.c | 9 +++++++++ fs/nfs/nfs4file.c | 14 +------------- fs/nfs/write.c | 18 ++++++++++++++++++ 4 files changed, 31 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 01dccf18da0a..3b01c9146e15 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -512,6 +512,9 @@ int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); +int nfs_filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend); + #ifdef CONFIG_NFS_V4_1 static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 0f9f536e647b..b7d457cea03f 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -156,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, if (status) return status; + status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, + pos_src, pos_src + (loff_t)count - 1); + if (status) + return status; + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); if (status) return status; + status = nfs_sync_inode(dst_inode); + if (status) + return status; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == -ENOTSUPP) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 014b0e41ace5..7cdc0ab9e6f5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - struct inode *in_inode = file_inode(file_in); - struct inode *out_inode = file_inode(file_out); - int ret; - - if (in_inode == out_inode) + if (file_inode(file_in) == 
file_inode(file_out)) return -EINVAL; - /* flush any pending writes */ - ret = nfs_sync_inode(in_inode); - if (ret) - return ret; - ret = nfs_sync_inode(out_inode); - if (ret) - return ret; - return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3087fb6f1983..538a473b324b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1912,6 +1912,24 @@ out_mark_dirty: } EXPORT_SYMBOL_GPL(nfs_write_inode); +/* + * Wrapper for filemap_write_and_wait_range() + * + * Needed for pNFS in order to ensure data becomes visible to the + * client. + */ +int nfs_filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + int ret; + + ret = filemap_write_and_wait_range(mapping, lstart, lend); + if (ret == 0) + ret = pnfs_sync_inode(mapping->host, true); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range); + /* * flush the inode to disk. */ -- cgit v1.2.3 From e95fc4a06983c14273a39d26aad9cc5a8a09ff64 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 17:57:39 -0400 Subject: NFSv4.2: llseek(SEEK_HOLE) and llseek(SEEK_DATA) don't require data sync We want to ensure that we write the cached data to the server, but don't require it be synced to disk. If the server reboots, we will get a stateid error, which will cause us to retry anyway. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index b7d457cea03f..616dc254b38b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -269,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep, if (status) return status; - nfs_wb_all(inode); + status = nfs_filemap_write_and_wait_range(inode->i_mapping, + offset, LLONG_MAX); + if (status) + return status; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == -ENOTSUPP) -- cgit v1.2.3 From 9a773e7c8de2a34ae682624624e95a96b121b6d1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 11:09:04 -0400 Subject: NFS nfs_vm_page_mkwrite: Don't freeze me, Bro... Prevent filesystem freezes while handling the write page fault. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0e9b4a068f13..039d58790629 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -569,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); + sb_start_pagefault(inode->i_sb); + /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); @@ -595,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out_unlock: unlock_page(page); out: + sb_end_pagefault(inode->i_sb); return ret; } -- cgit v1.2.3 From a4e187d83d88eeaba6252aac0a2ffe5eaa73a818 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 29 Jun 2016 13:55:22 -0400 Subject: NFS: Don't drop CB requests with invalid principals Before commit 778be232a207 ("NFS do not find client in NFSv4 pg_authenticate"), the Linux callback server replied with RPC_AUTH_ERROR / RPC_AUTH_BADCRED, instead of dropping 
the CB request. Let's restore that behavior so the server has a chance to do something useful about it, and provide a warning that helps admins correct the problem. Fixes: 778be232a207 ("NFS do not find client in NFSv4 ...") Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 6 +++++- net/sunrpc/svc.c | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d81f96aacd51..656f68f7fe53 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (hdr_arg.minorversion == 0) { cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) - return rpc_drop_reply; + goto out_invalidcred; } cps.minorversion = hdr_arg.minorversion; @@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; + +out_invalidcred: + pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); + return rpc_autherr_badcred; } /* diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 87290a5a9ac7..c5b0cb4f4056 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1194,6 +1194,11 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) procp->pc_release(rqstp, NULL, rqstp->rq_resp); goto dropit; } + if (*statp == rpc_autherr_badcred) { + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + goto err_bad_auth; + } if (*statp == rpc_success && (xdr = procp->pc_encode) && !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { -- cgit v1.2.3 From 8b7d9d09b24f4ef16f7ae34b6d9e59857fda0870 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2016 12:42:40 -0400 Subject: 
NFSv4: Revert "Truncating file opens should also sync O_DIRECT writes" We're not holding any locks, so both nfs_wb_all() and inode_dio_wait() are unenforcible and have livelock potential. Just limit ourselves to flushing out the data. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 7cdc0ab9e6f5..d085ad794884 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_sync_inode(inode); + filemap_write_and_wait(inode->i_mapping); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); -- cgit v1.2.3 From 0173ca0544b682b7b313269dc0600d4774098a14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 8 Jul 2016 18:41:28 +0900 Subject: nfs/blocklayout: use proper fmode for opening block devices This was fixed for the original block layout code a while ago, but also needs to be fixed for the SCSI layout path. 
Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index e5b89675263e..7fb9c07c078c 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -316,7 +316,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return -EINVAL; } - d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL); + d->bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); if (IS_ERR(d->bdev)) { pr_warn("pNFS: failed to open device %s (%ld)\n", devname, PTR_ERR(d->bdev)); @@ -352,7 +352,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - blkdev_put(d->bdev, FMODE_READ); + blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); return error; } -- cgit v1.2.3 From d702d41ed41328487bd3b270467721222f8036e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 8 Jul 2016 18:41:29 +0900 Subject: nfs/blocklayout: refactor open-by-wwn The current code works with the standard udev/systemd names, but we'll have to add another method in the next patch. Refactor it into a separate helper to make room for the new variant. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 53 ++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 7fb9c07c078c..ea70883a174a 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -287,44 +287,45 @@ bl_validate_designator(struct pnfs_block_volume *v) } } +/* + * Try to open the udev path for the WWN. At least on Debian the udev + * by-id path will always point to the dm-multipath device if one exists. 
+ */ +static struct block_device * +bl_open_udev_path(struct pnfs_block_volume *v) +{ + struct block_device *bdev; + const char *devname; + + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", + v->scsi.designator_len, v->scsi.designator); + if (!devname) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(bdev)) { + pr_warn("pNFS: failed to open device %s (%ld)\n", + devname, PTR_ERR(bdev)); + } + + kfree(devname); + return bdev; +} + static int bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; const struct pr_ops *ops; - const char *devname; int error; if (!bl_validate_designator(v)) return -EINVAL; - switch (v->scsi.designator_len) { - case 8: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN", - v->scsi.designator); - break; - case 12: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN", - v->scsi.designator); - break; - case 16: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN", - v->scsi.designator); - break; - default: - return -EINVAL; - } - - d->bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); - if (IS_ERR(d->bdev)) { - pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(d->bdev)); - kfree(devname); + d->bdev = bl_open_udev_path(v); + if (IS_ERR(d->bdev)) return PTR_ERR(d->bdev); - } - - kfree(devname); d->len = i_size_read(d->bdev->bd_inode); d->map = bl_map_simple; -- cgit v1.2.3 From 11487ddbdb12c36e094bedcc5c906ff219905a03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 8 Jul 2016 18:41:30 +0900 Subject: nfs/blocklayout: support RH/Fedora dm-mpath device nodes Instead of reusing the wwn-* names for multipath device nodes, RHEL and Fedora introduce new dm-uuid-mpath-* nodes with a slightly different naming scheme.
Try these names first to ensure we always get a multipath-capable device if it exists. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index ea70883a174a..436bb303d856 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -312,6 +312,28 @@ bl_open_udev_path(struct pnfs_block_volume *v) return bdev; } +/* + * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the + * wwn- links will only point to the first discovered SCSI device there. + */ +static struct block_device * +bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) +{ + struct block_device *bdev; + const char *devname; + + devname = kasprintf(GFP_KERNEL, + "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", + v->scsi.designator_type, + v->scsi.designator_len, v->scsi.designator); + if (!devname) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + kfree(devname); + return bdev; +} + static int bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) @@ -323,7 +345,9 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, if (!bl_validate_designator(v)) return -EINVAL; - d->bdev = bl_open_udev_path(v); + d->bdev = bl_open_dm_mpath_udev_path(v); + if (IS_ERR(d->bdev)) + d->bdev = bl_open_udev_path(v); if (IS_ERR(d->bdev)) return PTR_ERR(d->bdev); -- cgit v1.2.3 From ecc2b88c4a48cbabb3ed7617f55e92c123fb634b Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Thu, 14 Jul 2016 12:01:28 +0800 Subject: nfs/blocklayout: Make sure calculate signature length aligned Guard against a bad NFS server returning an unaligned signature length.
Signed-off-by: Kinglong Mee Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 436bb303d856..7d9f570e1b52 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) memcpy(&b->simple.sigs[i].sig, p, b->simple.sigs[i].sig_len); - b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; + b->simple.len += 8 + 4 + \ + (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2); } break; case PNFS_BLOCK_VOLUME_SLICE: -- cgit v1.2.3 From c77efc1e7884c818ba67ec36b08e220202d9428c Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Thu, 14 Jul 2016 12:02:01 +0800 Subject: nfs/blocklayout: Check max uuids and devices before decoding Avoid nfs return uuids/devices larger than maximum. Signed-off-by: Kinglong Mee Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 7d9f570e1b52..118252fd1d64 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) if (!p) return -EIO; b->simple.nr_sigs = be32_to_cpup(p++); - if (!b->simple.nr_sigs) { - dprintk("no signature\n"); + if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) { + dprintk("Bad signature count: %d\n", b->simple.nr_sigs); return -EIO; } @@ -105,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_inline_decode(xdr, 4); if (!p) return -EIO; + b->concat.volumes_count = be32_to_cpup(p++); + if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("Too many volumes: %d\n", b->concat.volumes_count); + return -EIO; + } p = xdr_inline_decode(xdr, 
b->concat.volumes_count * 4); if (!p) @@ -117,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_inline_decode(xdr, 8 + 4); if (!p) return -EIO; + p = xdr_decode_hyper(p, &b->stripe.chunk_size); b->stripe.volumes_count = be32_to_cpup(p++); + if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("Too many volumes: %d\n", b->stripe.volumes_count); + return -EIO; + } p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); if (!p) -- cgit v1.2.3 From 10b7e9ad44881fcd46ac24eb7374377c6e8962ed Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 Jul 2016 00:51:01 -0400 Subject: pNFS: Don't mark the inode as revalidated if a LAYOUTCOMMIT is outstanding We know that the attributes will need updating if there is still a LAYOUTCOMMIT outstanding. Reported-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 ++++- fs/nfs/pnfs.h | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 35fda08dc4f6..9df45832e28b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1664,7 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long now = jiffies; unsigned long save_cache_validity; bool have_writers = nfs_file_has_buffered_writers(nfsi); - bool cache_revalidated = true; + bool cache_revalidated; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1713,6 +1713,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Do atomic weak cache consistency updates */ invalid |= nfs_wcc_update_inode(inode, fattr); + + cache_revalidated = !pnfs_layoutcommit_outstanding(inode); + /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (inode->i_version != fattr->change_attr) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d6be5299a55a..181283c4ebc3 100644 --- a/fs/nfs/pnfs.h +++ 
b/fs/nfs/pnfs.h @@ -628,6 +628,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync) return 0; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + static inline bool pnfs_roc(struct inode *ino) { -- cgit v1.2.3 From 56b38a1f7c781519eef09c1668a3c97ea911f86b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2016 18:34:12 -0400 Subject: pNFS: Fix post-layoutget error handling in pnfs_update_layout() The non-retry error path is currently broken and ends up releasing the reference to the layout twice. It also can end up clearing the NFS_LAYOUT_FIRST_LAYOUTGET flag twice, causing a race. In addition, the retry path will fail to decrement the plh_outstanding counter. Fixes: 183d9e7b112aa ("pnfs: rework LAYOUTGET retry handling") Cc: stable@vger.kernel.org # 4.7 Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton --- fs/nfs/pnfs.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0fbe734cc38c..563f131c9abe 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1645,6 +1645,7 @@ lookup_again: lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); + atomic_dec(&lo->plh_outstanding); if (IS_ERR(lseg)) { switch(PTR_ERR(lseg)) { case -ERECALLCONFLICT: @@ -1652,26 +1653,26 @@ lookup_again: lseg = NULL; /* Fallthrough */ case -EAGAIN: - pnfs_put_layout_hdr(lo); - if (first) - pnfs_clear_first_layoutget(lo); - if (lseg) { - trace_pnfs_update_layout(ino, pos, count, - iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); - goto lookup_again; - } - /* Fallthrough */ + break; default: if (!nfs_error_is_fatal(PTR_ERR(lseg))) { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); lseg = NULL; } + goto out_put_layout_hdr; + } + if (lseg) { + if (first) + pnfs_clear_first_layoutget(lo); + trace_pnfs_update_layout(ino, 
pos, count, + iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); + pnfs_put_layout_hdr(lo); + goto lookup_again; } } else { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); } - atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); -- cgit v1.2.3 From e85d7ee42003314652ab3ae2c60e3b8cd793b65f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2016 18:46:24 -0400 Subject: pNFS: Separate handling of NFS4ERR_LAYOUTTRYLATER and RECALLCONFLICT They are not the same error, and need to be handled differently. Fixes: 183d9e7b112aa ("pnfs: rework LAYOUTGET retry handling") Cc: stable@vger.kernel.org # 4.7 Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton --- fs/nfs/nfs4proc.c | 23 ++++++++++++++--------- fs/nfs/pnfs.c | 1 + 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 519368b98762..ee8efe0a5202 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -437,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: exception->delay = 1; return 0; @@ -7883,11 +7884,12 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; - int status = task->tk_status; + int nfs4err = task->tk_status; + int err, status = 0; dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); - switch (status) { + switch (nfs4err) { case 0: goto out; @@ -7919,12 +7921,11 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, status = -EOVERFLOW; goto out; } - /* Fallthrough */ + status = -EBUSY; + break; case -NFS4ERR_RECALLCONFLICT: - nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT, - exception); status = -ERECALLCONFLICT; - goto out; + break; case 
-NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: exception->timeout = 0; @@ -7955,9 +7956,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, spin_unlock(&inode->i_lock); } - status = nfs4_handle_exception(server, status, exception); - if (exception->retry) - status = -EAGAIN; + err = nfs4_handle_exception(server, nfs4err, exception); + if (!status) { + if (exception->retry) + status = -EAGAIN; + else + status = err; + } out: dprintk("<-- %s\n", __func__); return status; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 563f131c9abe..c50d4ebab5c5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1648,6 +1648,7 @@ lookup_again: atomic_dec(&lo->plh_outstanding); if (IS_ERR(lseg)) { switch(PTR_ERR(lseg)) { + case -EBUSY: case -ERECALLCONFLICT: if (time_after(jiffies, giveup)) lseg = NULL; -- cgit v1.2.3 From 66b53f325876703b7ab815c482cd104609f8772c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2016 14:28:31 -0400 Subject: pNFS: Handle NFS4ERR_RECALLCONFLICT correctly in LAYOUTGET Instead of giving up altogether and falling back to doing I/O through the MDS, which may make the situation worse, wait for 2 lease periods for the callback to resolve itself, and then try destroying the existing layout. Only if this was an attempt at getting a first layout, do we give up altogether, as the server is clearly crazy. 
Fixes: 183d9e7b112aa ("pnfs: rework LAYOUTGET retry handling") Cc: stable@vger.kernel.org # 4.7 Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton --- fs/nfs/pnfs.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c50d4ebab5c5..7d992362ff04 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1505,7 +1505,7 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; nfs4_stateid stateid; long timeout = 0; - unsigned long giveup = jiffies + rpc_get_timeout(server->client); + unsigned long giveup = jiffies + (clp->cl_lease_time << 1); bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) { @@ -1649,9 +1649,18 @@ lookup_again: if (IS_ERR(lseg)) { switch(PTR_ERR(lseg)) { case -EBUSY: - case -ERECALLCONFLICT: if (time_after(jiffies, giveup)) lseg = NULL; + break; + case -ERECALLCONFLICT: + /* Huh? We hold no layouts, how is there a recall? */ + if (first) { + lseg = NULL; + break; + } + /* Destroy the existing layout and start over */ + if (time_after(jiffies, giveup)) + pnfs_destroy_layout(NFS_I(ino)); /* Fallthrough */ case -EAGAIN: break; -- cgit v1.2.3 From f7db0b283868411dc6bc8a223fd032b211d2d91f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2016 15:14:02 -0400 Subject: pNFS: Fix LAYOUTGET handling of NFS4ERR_BAD_STATEID and NFS4ERR_EXPIRED We want to recover the open stateid if there is no layout stateid and/or the stateid argument matches an open stateid. Otherwise throw out the existing layout and recover from scratch, as the layout stateid is bad. 
Fixes: 183d9e7b112aa ("pnfs: rework LAYOUTGET retry handling") Cc: stable@vger.kernel.org # 4.7 Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton --- fs/nfs/nfs4proc.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ee8efe0a5202..a1a3b4c9a563 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7886,6 +7886,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, struct pnfs_layout_hdr *lo; int nfs4err = task->tk_status; int err, status = 0; + LIST_HEAD(head); dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); @@ -7930,30 +7931,25 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, case -NFS4ERR_BAD_STATEID: exception->timeout = 0; spin_lock(&inode->i_lock); - if (nfs4_stateid_match(&lgp->args.stateid, + lo = NFS_I(inode)->layout; + /* If the open stateid was bad, then recover it. */ + if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || + nfs4_stateid_match_other(&lgp->args.stateid, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); - /* If the open stateid was bad, then recover it. */ exception->state = lgp->args.ctx->state; break; } - lo = NFS_I(inode)->layout; - if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) && - nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { - LIST_HEAD(head); - - /* - * Mark the bad layout state as invalid, then retry - * with the current stateid. 
- */ - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); - spin_unlock(&inode->i_lock); - pnfs_free_lseg_list(&head); - status = -EAGAIN; - goto out; - } else - spin_unlock(&inode->i_lock); + + /* + * Mark the bad layout state as invalid, then retry + */ + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + status = -EAGAIN; + goto out; } err = nfs4_handle_exception(server, nfs4err, exception); -- cgit v1.2.3 From e68fd7c8071d541d3f2f7eed5814b63e865dd277 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Wed, 25 May 2016 10:36:50 -0400 Subject: mount: use sec= that was specified on the command line When older servers return RPC_AUTH_NULL, it means the rpc creds will be ignored. In that case use the sec= that was specified instead of setting sec=null Fixes Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1112983 Signed-off-by: Steve Dickson Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2137e0202f25..18d446e1a82b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, { rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; unsigned int i; + int use_auth_null = false; /* * If the sec= mount option is used, the specified flavor or AUTH_NULL @@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, * * AUTH_NULL has a special meaning when it's in the server list - it * means that the server will ignore the rpc creds, so any flavor - * can be used. + * can be used but still use the sec= that was specified. 
*/ for (i = 0; i < count; i++) { flavor = server_authlist[i]; - if (nfs_auth_info_match(&args->auth_info, flavor) || - flavor == RPC_AUTH_NULL) + if (nfs_auth_info_match(&args->auth_info, flavor)) goto out; + + if (flavor == RPC_AUTH_NULL) + use_auth_null = true; + } + + if (use_auth_null) { + flavor = RPC_AUTH_NULL; + goto out; } dfprintk(MOUNT, -- cgit v1.2.3 From ce52914eb76efd62aa48d738cf845b37852bf920 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 7 Jun 2016 15:14:48 -0400 Subject: sunrpc: move NO_CRKEY_TIMEOUT to the auth->au_flags A generic_cred can be used to look up a unx_cred or a gss_cred, so it's not really safe to use the generic_cred->acred->ac_flags to store the NO_CRKEY_TIMEOUT flag. A lookup for a unx_cred triggered while the KEY_EXPIRE_SOON flag is already set will cause both NO_CRKEY_TIMEOUT and KEY_EXPIRE_SOON to be set in the ac_flags, leaving the user associated with the auth_cred to be in a state where they're perpetually doing 4K NFS_FILE_SYNC writes. This can be reproduced as follows: 1. Mount two NFS filesystems, one with sec=krb5 and one with sec=sys. They do not need to be the same export, nor do they even need to be from the same NFS server. Also, v3 is fine. $ sudo mount -o v3,sec=krb5 server1:/export /mnt/krb5 $ sudo mount -o v3,sec=sys server2:/export /mnt/sys 2. As the normal user, before accessing the kerberized mount, kinit with a short lifetime (but not so short that renewing the ticket would leave you within the 4-minute window again by the time the original ticket expires), e.g. $ kinit -l 10m -r 60m 3. Do some I/O to the kerberized mount and verify that the writes are wsize, UNSTABLE: $ dd if=/dev/zero of=/mnt/krb5/file bs=1M count=1 4. Wait until you're within 4 minutes of key expiry, then do some more I/O to the kerberized mount to ensure that RPC_CRED_KEY_EXPIRE_SOON gets set. Verify that the writes are 4K, FILE_SYNC: $ dd if=/dev/zero of=/mnt/krb5/file bs=1M count=1 5. Now do some I/O to the sec=sys mount.
This will cause RPC_CRED_NO_CRKEY_TIMEOUT to be set: $ dd if=/dev/zero of=/mnt/sys/file bs=1M count=1 6. Writes for that user will now be permanently 4K, FILE_SYNC for that user, regardless of which mount is being written to, until you reboot the client. Renewing the kerberos ticket (assuming it hasn't already expired) will have no effect. Grabbing a new kerberos ticket at this point will have no effect either. Move the flag to the auth->au_flags field (which is currently unused) and rename it slightly to reflect that it's no longer associated with the auth_cred->ac_flags. Add the rpc_auth to the arg list of rpcauth_cred_key_to_expire and check the au_flags there too. Finally, add the inode to the arg list of nfs_ctx_key_to_expire so we can determine the rpc_auth to pass to rpcauth_cred_key_to_expire. Signed-off-by: Scott Mayhew Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ++-- fs/nfs/internal.h | 2 +- fs/nfs/write.c | 6 ++++-- include/linux/sunrpc/auth.h | 6 ++++-- net/sunrpc/auth.c | 4 +++- net/sunrpc/auth_generic.c | 9 +-------- net/sunrpc/auth_gss/auth_gss.c | 1 + net/sunrpc/auth_null.c | 1 + net/sunrpc/auth_unix.c | 1 + 9 files changed, 18 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 717a8d6af52d..6bcd8913e8a9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -432,7 +432,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return status; NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx)) { + if (nfs_ctx_key_to_expire(ctx, mapping->host)) { status = nfs_wb_all(mapping->host); if (status < 0) return status; @@ -645,7 +645,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode) ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || - nfs_ctx_key_to_expire(ctx)) + nfs_ctx_key_to_expire(ctx, inode)) return 1; return 0; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 
fa88609f85e3..d2260e67334f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -497,7 +497,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); -bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); #ifdef CONFIG_MIGRATION diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e1c74d3db64d..0b949a06b297 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1195,9 +1195,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode) /* * Test if the open context credential key is marked to expire soon. */ -bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) { - return rpcauth_cred_key_to_expire(ctx->cred); + struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; + + return rpcauth_cred_key_to_expire(auth, ctx->cred); } /* diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 899791573a40..f890a295a7ff 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -37,7 +37,6 @@ struct rpcsec_gss_info; /* auth_cred ac_flags bits */ enum { - RPC_CRED_NO_CRKEY_TIMEOUT = 0, /* underlying cred has no key timeout */ RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */ RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying key will expire soon */ @@ -82,6 +81,9 @@ struct rpc_cred { #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 +/* rpc_auth au_flags */ +#define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001 /* underlying cred has no key timeout */ + /* * Client authentication handle */ @@ -196,7 +198,7 @@ void rpcauth_destroy_credcache(struct rpc_auth *); void rpcauth_clear_credcache(struct rpc_cred_cache *); int rpcauth_key_timeout_notify(struct rpc_auth *, 
struct rpc_cred *); -bool rpcauth_cred_key_to_expire(struct rpc_cred *); +bool rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *); char * rpcauth_stringify_acceptor(struct rpc_cred *); static inline diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 040ff627c18a..696eb39fc1cb 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -359,8 +359,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred) EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); bool -rpcauth_cred_key_to_expire(struct rpc_cred *cred) +rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) { + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) + return false; if (!cred->cr_ops->crkey_to_expire) return false; return cred->cr_ops->crkey_to_expire(cred); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 54dd3fdead54..168219535a34 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) /* Fast track for non crkey_timeout (no key) underlying credentials */ - if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) return 0; /* Fast track for the normal case */ @@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) if (IS_ERR(tcred)) return -EACCES; - if (!tcred->cr_ops->crkey_timeout) { - set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags); - ret = 0; - goto out_put; - } - /* Test for the almost error case */ ret = tcred->cr_ops->crkey_timeout(tcred); if (ret != 0) { @@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); } -out_put: put_rpccred(tcred); return ret; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index e64ae93d5b4f..813a3cdfb573 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c 
@@ -1015,6 +1015,7 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_flags = 0; auth->au_ops = &authgss_ops; auth->au_flavor = flavor; atomic_set(&auth->au_count, 1); diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 8d9eb4d5ddd8..4d17376b2acb 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -115,6 +115,7 @@ static struct rpc_auth null_auth = { .au_cslack = NUL_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 9f65452b7cbc..a99278c984e8 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -228,6 +228,7 @@ static struct rpc_auth unix_auth = { .au_cslack = UNX_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = ATOMIC_INIT(0), -- cgit v1.2.3 From d9c0ce0e45723a4924d22ac2e5e13c2dede76cad Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 10 Jun 2016 16:37:35 -0400 Subject: pnfs/blocklayout: put deviceid node after releasing bl_ext_lock The last put of deviceid nodes for SCSI layouts may sleep, so we shouldn't hold any spinlocks. Make sure we put them outside the bl_ext_lock. 
Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/extent_tree.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 720b3ff55fa9..992bcb19c11e 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) return be; } +static void __ext_put_deviceids(struct list_head *head) +{ + struct pnfs_block_extent *be, *tmp; + + list_for_each_entry_safe(be, tmp, head, be_list) { + nfs4_put_deviceid_node(be->be_device); + kfree(be); + } +} + static void __ext_tree_insert(struct rb_root *root, struct pnfs_block_extent *new, bool merge_ok) @@ -163,7 +173,8 @@ free_new: } static int -__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) +__ext_tree_remove(struct rb_root *root, + sector_t start, sector_t end, struct list_head *tmp) { struct pnfs_block_extent *be; sector_t len1 = 0, len2 = 0; @@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) struct pnfs_block_extent *next = ext_tree_next(be); rb_erase(&be->be_node, root); - nfs4_put_deviceid_node(be->be_device); - kfree(be); + list_add_tail(&be->be_list, tmp); be = next; } @@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, sector_t end) { int err, err2; + LIST_HEAD(tmp); spin_lock(&bl->bl_ext_lock); - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); if (rw) { - err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); + err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp); if (!err) err = err2; } spin_unlock(&bl->bl_ext_lock); + __ext_put_deviceids(&tmp); return err; } @@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, sector_t end = start 
+ len; struct pnfs_block_extent *be; int err = 0; + LIST_HEAD(tmp); spin_lock(&bl->bl_ext_lock); /* * First remove all COW extents or holes from written to range. */ - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); if (err) goto out; @@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, } out: spin_unlock(&bl->bl_ext_lock); + + __ext_put_deviceids(&tmp); return err; } -- cgit v1.2.3 From 3fc75f12089eab6bf3f5350a5f760f241060bd5d Mon Sep 17 00:00:00 2001 From: Tigran Mkrtchyan Date: Mon, 13 Jun 2016 19:57:35 +0200 Subject: nfs4: clnt: respect noresvport when establishing connections to DSes result: $ mount -o vers=4.1 dcache-lab007:/ /pnfs $ cp /etc/profile /pnfs tcp 0 0 131.169.185.68:1005 131.169.191.141:32049 ESTABLISHED tcp 0 0 131.169.185.68:751 131.169.191.144:2049 ESTABLISHED $ $ mount -o vers=4.1,noresvport dcache-lab007:/ /pnfs $ cp /etc/profile /pnfs tcp 0 0 131.169.185.68:34894 131.169.191.141:32049 ESTABLISHED tcp 0 0 131.169.185.68:35722 131.169.191.144:2049 ESTABLISHED $ Signed-off-by: Tigran Mkrtchyan Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 +- fs/nfs/nfs4client.c | 6 +++++- fs/nfs/pnfs_nfs.c | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d2260e67334f..b257a2eaf92c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -186,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, rpc_authflavor_t); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); -extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 
5fc7fbbfdcef..8d7d08d4f95f 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -844,12 +844,13 @@ error: * low timeout interval so that if a connection is lost, we retry through * the MDS. */ -struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version, rpc_authflavor_t au_flavor) { struct rpc_timeout ds_timeout; + struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, @@ -868,6 +869,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index b38e3c0dc790..5856b2c66234 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - clp = nfs4_set_ds_client(mds_srv->nfs_client, + clp = nfs4_set_ds_client(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, timeo, retrans, minor_version, -- cgit v1.2.3 From b224f7cb635f0a1a0a80c1dae93699a2a1161604 Mon Sep 17 00:00:00 2001 From: Tigran Mkrtchyan Date: Mon, 13 Jun 2016 20:52:00 +0200 Subject: nfs4: flexfiles: respect noresvport when establishing connections to DSes Signed-off-by: Tigran Mkrtchyan Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 +- fs/nfs/nfs3client.c | 6 +++++- fs/nfs/pnfs_nfs.c | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h 
b/fs/nfs/internal.h index b257a2eaf92c..f7e33a5984d8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -195,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, rpc_authflavor_t au_flavor); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); -extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, +extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor); diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 0457b4129421..ee753547fb0a 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -76,12 +76,13 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, * low timeout interval so that if a connection is lost, we retry through * the MDS. */ -struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, +struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor) { struct rpc_timeout ds_timeout; + struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, @@ -100,6 +101,9 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + /* Use the MDS nfs_client cl_ipaddr. 
*/ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); clp = nfs_get_client(&cl_init, au_flavor); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 5856b2c66234..fe183fbc4b90 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) } static struct nfs_client *(*get_v3_ds_connect)( - struct nfs_client *mds_clp, + struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, @@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_test_and_add_xprt, NULL); } else - clp = get_v3_ds_connect(mds_srv->nfs_client, + clp = get_v3_ds_connect(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, timeo, retrans, au_flavor); -- cgit v1.2.3 From e033fb51ebb2983ee17b4a1b96ccbaedb137d9e9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 21 Jul 2016 09:43:43 -0400 Subject: pNFS/files: filelayout_write_done_cb must call nfs_writeback_update_inode() All write callbacks are required to call nfs_writeback_update_inode() upon success to ensure that file size changes are recorded, and the attribute cache is invalidated. 
Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 25bd91a6e088..a3fc48ba4931 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -357,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task, } filelayout_set_layoutcommit(hdr); + + /* zero out the fattr */ + hdr->fattr.valid = 0; + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; } -- cgit v1.2.3 From 297fae4d0bee5d683533f3324baf1b363e7b48bf Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Thu, 21 Jul 2016 13:32:04 +0200 Subject: Fix NULL pointer dereference in bl_free_device(). When bl_parse_deviceid() fails in bl_alloc_deviceid_node() on blkdev_get_by_*() step we get a pnfs_block_dev struct that is uninitialized except for bdev field which is set to whatever error blkdev_get_by_*() returns. bl_free_device() then tries to call blkdev_put() if bdev is not 0 resulting in a wrong pointer dereference. Fixing this by setting bdev in struct pnfs_block_dev only if we didn't get an error from blkdev_get_by_*().
Signed-off-by: Artem Savkov Reviewed-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 118252fd1d64..a69ef4e9c24c 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -235,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; + struct block_device *bdev; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); - if (IS_ERR(d->bdev)) { + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); - return PTR_ERR(d->bdev); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); + return PTR_ERR(bdev); } + d->bdev = bdev; d->len = i_size_read(d->bdev->bd_inode); @@ -350,17 +352,19 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; + struct block_device *bdev; const struct pr_ops *ops; int error; if (!bl_validate_designator(v)) return -EINVAL; - d->bdev = bl_open_dm_mpath_udev_path(v); - if (IS_ERR(d->bdev)) - d->bdev = bl_open_udev_path(v); - if (IS_ERR(d->bdev)) - return PTR_ERR(d->bdev); + bdev = bl_open_dm_mpath_udev_path(v); + if (IS_ERR(bdev)) + bdev = bl_open_udev_path(v); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + d->bdev = bdev; d->len = i_size_read(d->bdev->bd_inode); d->map = bl_map_simple; -- cgit v1.2.3 From 149a4fddd0a72d526abbeac0c8deaab03559836a Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 18 Jul 2016 10:41:57 -0400 Subject: nfs: don't create 
zero-length requests NFS doesn't expect requests with wb_bytes set to zero and may make unexpected decisions about how to handle that request at the page IO layer. Skip request creation if we won't have any wb_bytes in the request. Signed-off-by: Benjamin Coddington Signed-off-by: Alexey Dobriyan Reviewed-by: Weston Andros Adamson Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0b949a06b297..b5f3da346f1d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1291,6 +1291,9 @@ int nfs_updatepage(struct file *file, struct page *page, dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", file, count, (long long)(page_file_offset(page) + offset)); + if (!count) + goto out; + if (nfs_can_extend_write(file, page, inode)) { count = max(count + offset, nfs_page_length(page)); offset = 0; @@ -1301,7 +1304,7 @@ int nfs_updatepage(struct file *file, struct page *page, nfs_set_pageerror(page); else __set_page_dirty_nobuffers(page); - +out: dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; -- cgit v1.2.3 From 45fcc7bca7004687e9ba28e08b3dfb6787a0b466 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Jul 2016 12:26:34 -0400 Subject: pNFS: LAYOUTRETURN should only update the stateid if the layout is valid If the layout was completely returned, then ignore the returned layout stateid. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pnfs.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff416d0e24bc..dc50ba4d84ad 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8130,7 +8130,7 @@ static void nfs4_layoutreturn_release(void *calldata) pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, be32_to_cpu(lrp->args.stateid.seqid)); pnfs_mark_layout_returned_if_empty(lo); - if (lrp->res.lrs_present) + if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index b21bd0bee784..2f4f26905c03 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -375,6 +375,11 @@ static inline bool nfs_have_layout(struct inode *inode) return NFS_I(inode)->layout != NULL; } +static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0; +} + static inline struct nfs4_deviceid_node * nfs4_get_deviceid(struct nfs4_deviceid_node *d) { -- cgit v1.2.3 From 8e0acf9046b868a02e9afdddd25c8a132d50e99c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 21 Jul 2016 11:53:29 -0400 Subject: pNFS: Clear the layout return tracking on layout reinitialisation Ensure that we don't carry over layoutreturn info from a previous incarnation of this layout. 
Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0fbe734cc38c..dcccded1aeed 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -873,15 +873,21 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) { if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; pnfs_get_layout_hdr(lo); - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + pnfs_clear_layoutreturn_info(lo); return true; } @@ -1764,10 +1770,13 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) lo->plh_barrier = be32_to_cpu(res->stateid.seqid); } - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg, &free_me); + if (!pnfs_layout_is_valid(lo)) { + pnfs_clear_layoutreturn_info(lo); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + } + if (res->return_on_close) set_bit(NFS_LSEG_ROC, &lseg->pls_flags); -- cgit v1.2.3 From 13bede18de41e2cfe8f67c1cd8b6d10be42ef473 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Jul 2016 11:39:03 -0400 Subject: pNFS: Always update the layout stateid if NFS_LAYOUT_INVALID_STID is set If the layout stateid is invalid, then pnfs_set_layout_stateid() must always initialise it. 
Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index dcccded1aeed..eef844785bc6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -762,7 +762,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { u32 oldseq, newseq, new_barrier; - int empty = list_empty(&lo->plh_segs); + bool empty = !pnfs_layout_is_valid(lo); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); -- cgit v1.2.3 From ecebb80bf3ee8c5f3172f00bb17ba55f9e3ae24f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Jul 2016 11:46:06 -0400 Subject: pNFS: Always update the layout barrier seqid on LAYOUTGET Currently, pnfs_set_layout_stateid() will update the layout sequence id barrier only if the stateid itself is newer than the current layout stateid. However in a situation where multiple LAYOUTGET calls and a LAYOUTRETURN raced, it is entirely possible for one of the LAYOUTGET to set the current stateid to something newer than the LAYOUTRETURN that needs to set the barrier. The fix is to allow the "update_barrier" flag to force a check as to whether or not the barrier needs to be updated. 
Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index eef844785bc6..85c3e7b47ddb 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -761,24 +761,25 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { - u32 oldseq, newseq, new_barrier; - bool empty = !pnfs_layout_is_valid(lo); + u32 oldseq, newseq, new_barrier = 0; + bool invalid = !pnfs_layout_is_valid(lo); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { + if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); - if (update_barrier) { - new_barrier = be32_to_cpu(new->seqid); - } else { - /* Because of wraparound, we want to keep the barrier - * "close" to the current seqids. - */ - new_barrier = newseq - atomic_read(&lo->plh_outstanding); - } - if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) - lo->plh_barrier = new_barrier; + /* + * Because of wraparound, we want to keep the barrier + * "close" to the current seqids. + */ + new_barrier = newseq - atomic_read(&lo->plh_outstanding); } + if (update_barrier) + new_barrier = be32_to_cpu(new->seqid); + else if (new_barrier == 0) + return; + if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + lo->plh_barrier = new_barrier; } static bool -- cgit v1.2.3 From 793b7fe55858dca1f5bd3e42185b541a9eddc144 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 23 Jul 2016 21:11:43 -0400 Subject: pNFS: Fix CB_LAYOUTRECALL stateid verification We want to evaluate in this order: If the client holds no layout for this inode, then return NFS4ERR_NOMATCHING_LAYOUT; it probably forgot the layout. 
If the client finds the inode among the list of layouts, but the corresponding stateid has not yet been initialised, then return NFS4ERR_DELAY to ask the server to retry once the outstanding LAYOUTGET is complete. If the current layout stateid's "other" field does not match the recalled stateid, return NFS4ERR_BAD_STATEID. If already processing a layout recall with a newer stateid, return NFS4ERR_OLD_STATEID. This can only happen for servers that are non-compliant with the NFSv4.1 protocol. If already processing a layout recall with an older stateid, return NFS4ERR_DELAY to ask the server to retry once the outstanding LAYOUTRETURN is complete. Again, this is technically non-compliant with the NFSv4.1 protocol. If the current layout sequence id is newer than the recalled stateid's sequence id, return NFS4ERR_OLD_STATEID. This too implies protocol non-compliance. If the current layout sequence id is older than the recalled stateid's sequence id+1, return NFS4ERR_DELAY. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 63 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index aaa2e8d3df6f..837da8a02d35 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -119,27 +119,30 @@ out: * hashed by filehandle.
*/ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, - struct nfs_fh *fh, nfs4_stateid *stateid) + struct nfs_fh *fh) { struct nfs_server *server; + struct nfs_inode *nfsi; struct inode *ino; struct pnfs_layout_hdr *lo; +restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { - if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) + nfsi = NFS_I(lo->plh_inode); + if (nfs_compare_fh(fh, &nfsi->fh)) continue; - if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) + if (nfsi->layout != lo) continue; ino = igrab(lo->plh_inode); if (!ino) break; spin_lock(&ino->i_lock); /* Is this layout in the process of being freed? */ - if (NFS_I(ino)->layout != lo) { + if (nfsi->layout != lo) { spin_unlock(&ino->i_lock); iput(ino); - break; + goto restart; } pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, } static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, - struct nfs_fh *fh, nfs4_stateid *stateid) + struct nfs_fh *fh) { struct pnfs_layout_hdr *lo; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh, stateid); + lo = get_layout_by_fh_locked(clp, fh); rcu_read_unlock(); spin_unlock(&clp->cl_lock); @@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, /* * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) */ -static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, +static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new) { u32 oldseq, newseq; - oldseq = be32_to_cpu(lo->plh_stateid.seqid); + /* Is the stateid still not initialised? */ + if (!pnfs_layout_is_valid(lo)) + return NFS4ERR_DELAY; + + /* Mismatched stateid? 
*/ + if (!nfs4_stateid_match_other(&lo->plh_stateid, new)) + return NFS4ERR_BAD_STATEID; + newseq = be32_to_cpu(new->seqid); + /* Are we already in a layout recall situation? */ + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && + lo->plh_return_seq != 0) { + if (newseq < lo->plh_return_seq) + return NFS4ERR_OLD_STATEID; + if (newseq > lo->plh_return_seq) + return NFS4ERR_DELAY; + goto out; + } + /* Check that the stateid matches what we think it should be. */ + oldseq = be32_to_cpu(lo->plh_stateid.seqid); if (newseq > oldseq + 1) - return false; - return true; + return NFS4ERR_DELAY; + /* Crazy server! */ + if (newseq <= oldseq) + return NFS4ERR_OLD_STATEID; +out: + return NFS_OK; } static u32 initiate_file_draining(struct nfs_client *clp, @@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); + lo = get_layout_by_fh(clp, &args->cbl_fh); if (!lo) { trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, &args->cbl_stateid, -rv); @@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, } ino = lo->plh_inode; + pnfs_layoutcommit_inode(ino, false); + spin_lock(&ino->i_lock); - if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { - rv = NFS4ERR_DELAY; + rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); + if (rv != NFS_OK) goto unlock; - } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); - spin_unlock(&ino->i_lock); - - pnfs_layoutcommit_inode(ino, false); - spin_lock(&ino->i_lock); /* * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) */ @@ -223,6 +245,9 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto unlock; } + /* Embrace your forgetfulness! 
*/ + rv = NFS4ERR_NOMATCHING_LAYOUT; + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); -- cgit v1.2.3 From e5fd1904b8422615a2a286777e2b7c881ad53e73 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 21 Jul 2016 12:44:15 -0400 Subject: pNFS: Ensure layoutreturn acts as a completion for layout callbacks When we return NFS_OK to the CB_LAYOUTRECALL, we are required to send a layoutreturn that "completes" that layout recall request, using the correct stateid. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 85c3e7b47ddb..878dc4b7085a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -883,12 +883,28 @@ pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) } static bool -pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, + nfs4_stateid *stateid, + enum pnfs_iomode *iomode) { if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; pnfs_get_layout_hdr(lo); - pnfs_clear_layoutreturn_info(lo); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { + if (stateid != NULL) { + nfs4_stateid_copy(stateid, &lo->plh_stateid); + if (lo->plh_return_seq != 0) + stateid->seqid = cpu_to_be32(lo->plh_return_seq); + } + if (iomode != NULL) + *iomode = lo->plh_return_iomode; + pnfs_clear_layoutreturn_info(lo); + return true; + } + if (stateid != NULL) + nfs4_stateid_copy(stateid, &lo->plh_stateid); + if (iomode != NULL) + *iomode = IOMODE_ANY; return true; } @@ -956,10 +972,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) enum pnfs_iomode iomode; bool send; - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - stateid.seqid = cpu_to_be32(lo->plh_return_seq); - iomode = lo->plh_return_iomode; - send = pnfs_prepare_layoutreturn(lo); + send = 
pnfs_prepare_layoutreturn(lo, &stateid, &iomode); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ @@ -996,7 +1009,6 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout to return\n", __func__); goto out; } - nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); @@ -1020,7 +1032,7 @@ _pnfs_return_layout(struct inode *ino) } set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - send = pnfs_prepare_layoutreturn(lo); + send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); if (send) @@ -1087,11 +1099,10 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ - if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, - &lo->plh_flags)) - layoutreturn = pnfs_prepare_layoutreturn(lo); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo, + &stateid, NULL); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) /* If we are sending layoutreturn, invalidate all valid lsegs */ @@ -1874,10 +1885,9 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, lseg->pls_seq)) { nfs4_stateid stateid; - enum pnfs_iomode iomode = lo->plh_return_iomode; + enum pnfs_iomode iomode; - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - return_now = pnfs_prepare_layoutreturn(lo); + return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); spin_unlock(&inode->i_lock); if (return_now) pnfs_send_layoutreturn(lo, &stateid, iomode, false); -- cgit v1.2.3 From 2d6cf5ab0b5d13d06c4b7920d6a12dbedf003190 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 21 Jul 2016 13:06:18 -0400 Subject: pNFS: Do not set plh_return_seq for non-callback related 
layoutreturns In cases where we need to send a layoutreturn in order to propagate an error, we should not tie that to a specific layout stateid. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 878dc4b7085a..c57cbddca760 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1808,14 +1808,14 @@ static void pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, u32 seq) { - if (lo->plh_return_iomode == iomode) - return; - if (lo->plh_return_iomode != 0) + if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) + if (seq != 0) { + WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); lo->plh_return_seq = seq; + } } /** @@ -1876,14 +1876,13 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); - pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); + pnfs_set_plh_return_info(lo, range.iomode, 0); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, - &range, lseg->pls_seq)) { + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) { nfs4_stateid stateid; enum pnfs_iomode iomode; -- cgit v1.2.3 From e036f46453f252539cb62bf91d82c3d08e37e73c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Jul 2016 11:13:22 -0400 Subject: NFS: pnfs_mark_matching_lsegs_return() should match the layout sequence id When determining which layout segments to return, we do want pnfs_mark_matching_lsegs_return to check that they match the layout sequence id. 
This ensures that we don't waste time if the server is replaying a layout recall that has already been satisfied. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c57cbddca760..52b2a4dfdcb0 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -486,15 +486,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, (end2 == NFS4_MAX_UINT64 || end2 > start1); } -static bool -should_free_lseg(const struct pnfs_layout_range *lseg_range, - const struct pnfs_layout_range *recall_range) -{ - return (recall_range->iomode == IOMODE_ANY || - lseg_range->iomode == recall_range->iomode) && - pnfs_lseg_range_intersecting(lseg_range, recall_range); -} - static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { @@ -533,6 +524,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) return (s32)(s1 - s2) > 0; } +static bool +pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *recall_range) +{ + return (recall_range->iomode == IOMODE_ANY || + lseg_range->iomode == recall_range->iomode) && + pnfs_lseg_range_intersecting(lseg_range, recall_range); +} + +static bool +pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg, + const struct pnfs_layout_range *recall_range, + u32 seq) +{ + if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq)) + return false; + if (recall_range == NULL) + return true; + return pnfs_should_free_range(&lseg->pls_range, recall_range); +} + /** * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later * @lo: layout header containing the lsegs @@ -562,10 +574,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, if (list_empty(&lo->plh_segs)) return 0; list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) - if (!recall_range || - 
should_free_lseg(&lseg->pls_range, recall_range)) { - if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq)) - continue; + if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { dprintk("%s: freeing lseg %p iomode %d seq %u" "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, @@ -1845,7 +1854,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) - if (should_free_lseg(&lseg->pls_range, return_range)) { + if (pnfs_match_lseg_recall(lseg, return_range, seq)) { dprintk("%s: marking lseg %p iomode %d " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, -- cgit v1.2.3 From 5f46be049b0dfdd36188058f044c165e1d4b8f56 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Jul 2016 11:25:27 -0400 Subject: pNFS: Cleanup - don't open code pnfs_mark_layout_stateid_invalid() Ensure nfs42_layoutstat_done() layoutget don't open code layout stateid invalidation. Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 3 +-- fs/nfs/nfs4proc.c | 3 +-- fs/nfs/pnfs.c | 2 +- fs/nfs/pnfs.h | 2 ++ 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index aa03ed09ba06..6ea5ad6f0d44 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -336,8 +336,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) * Mark the bad layout state as invalid, then retry * with the current stateid. 
*/ - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); + pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); } else diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dc50ba4d84ad..8c453515d98c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7931,8 +7931,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, * Mark the bad layout state as invalid, then retry * with the current stateid. */ - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); + pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); status = -EAGAIN; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 52b2a4dfdcb0..1374fcd40484 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) * is required. * Note that caller must hold inode->i_lock. */ -static int +int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct list_head *lseg_list) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 2f4f26905c03..d71c9493693a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, const struct pnfs_layout_range *recall_range, u32 seq); +int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); -- cgit v1.2.3 From d9b61708fe6dcab7b50abfb87cb47690d00269a1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Jul 2016 15:04:07 -0400 Subject: pNFS: Clear the layout metadata if the server changed the layout stateid If the server changed the layout stateid's "other" field, then we should treat the old layout as being completely gone. 
In that case, we want to clear the metadata such as scheduled layoutreturns. Do this by calling pnfs_mark_layout_stateid_invalid(). Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 1374fcd40484..e16997da157f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1785,7 +1785,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) * inode invalid, and don't bother validating the stateid * sequence number. */ - pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); + pnfs_mark_layout_stateid_invalid(lo, &free_me); nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); lo->plh_barrier = be32_to_cpu(res->stateid.seqid); -- cgit v1.2.3 From f71dfe8fc947e04a9e3d10723d4ec4c5c44ce0f5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Jul 2016 12:45:47 -0400 Subject: pNFS: Remove redundant pnfs_mark_layout_returned_if_empty() That's already being taken care of in pnfs_layout_remove_lseg(). 
Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 1 - fs/nfs/nfs4proc.c | 1 - fs/nfs/pnfs.c | 1 - fs/nfs/pnfs.h | 13 ------------- 4 files changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 837da8a02d35..c92a75e066a6 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -252,7 +252,6 @@ static u32 initiate_file_draining(struct nfs_client *clp, NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); } - pnfs_mark_layout_returned_if_empty(lo); unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8c453515d98c..d38cc5f0282f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8128,7 +8128,6 @@ static void nfs4_layoutreturn_release(void *calldata) spin_lock(&lo->plh_inode->i_lock); pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, be32_to_cpu(lrp->args.stateid.seqid)); - pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); pnfs_clear_layoutreturn_waitbit(lo); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e16997da157f..49e952968ede 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1159,7 +1159,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - pnfs_mark_layout_returned_if_empty(lo); if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d71c9493693a..595648a3ad7a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -552,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end) return 1 + end - offset; } -/** - * pnfs_mark_layout_returned_if_empty - marks the layout as returned - * @lo: layout header - * - * Note: Caller must hold inode->i_lock - */ -static inline void -pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) 
-{ - if (list_empty(&lo->plh_segs)) - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); -} - static inline void pnfs_copy_range(struct pnfs_layout_range *dst, const struct pnfs_layout_range *src) -- cgit v1.2.3 From 28c1acffead59a461e552708cc9daa9c1cb5a085 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 21 Jul 2016 14:45:19 -0400 Subject: pNFS: Remove redundant stateid invalidation The layout stateid will be invalidated once it holds no more layout segments anyway. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 49e952968ede..c9a684c128b8 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1040,7 +1040,6 @@ _pnfs_return_layout(struct inode *ino) goto out_put_layout_hdr; } - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); -- cgit v1.2.3 From 119cef97a46e2a4b4fbebcf6655358a0b277732d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Jul 2016 15:10:12 -0400 Subject: pNFS: Cleanup - do layout segment initialisation in one place ...instead of splitting the initialisation over init_lseg() and pnfs_layout_process(). 
Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c9a684c128b8..09b77a68422f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -334,7 +334,9 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) } static void -init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid) { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); @@ -342,6 +344,8 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) smp_mb(); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; + lseg->pls_range = *range; + lseg->pls_seq = be32_to_cpu(stateid->seqid); } static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) @@ -1760,9 +1764,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) return lseg; } - init_lseg(lo, lseg); - lseg->pls_range = res->range; - lseg->pls_seq = be32_to_cpu(res->stateid.seqid); + pnfs_init_lseg(lo, lseg, &res->range, &res->stateid); spin_lock(&ino->i_lock); if (pnfs_layoutgets_blocked(lo)) { -- cgit v1.2.3 From 01d7b29f0edc1a19ff2b960dae1c7dade3bb1753 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Jul 2016 15:14:44 -0400 Subject: pNFS: Remove redundant smp_mb() from pnfs_init_lseg() It's not visible yet, and won't be until after we grab the inode->i_lock. 
Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 09b77a68422f..1d47a845d77a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -341,7 +341,6 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); atomic_set(&lseg->pls_refcount, 1); - smp_mb(); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; lseg->pls_range = *range; -- cgit v1.2.3 From 139978239b53a981b45c53b835c020015c6c819e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Jul 2016 17:10:52 -0400 Subject: NFSv4: Fix warning "no previous prototype for ‘nfs4_listxattr’" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make it static Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d30f88c667b7..4b1491da16eb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8848,7 +8848,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #endif }; -ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) +static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { ssize_t error, error2; -- cgit v1.2.3 From 6fdf339b0ca73abd879394ad03a9e4695d644e13 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Jul 2016 17:17:16 -0400 Subject: NFSv4.2: Fix warning "variable ‘stateids’ set but not used" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace it with a test for whether or not the server sent a stateid in violation of what we asked for. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6dc6f2aea0d6..8b2605882a20 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr, struct nfs42_write_res *res) { __be32 *p; - int stateids; p = xdr_inline_decode(xdr, 4 + 8 + 4); if (unlikely(!p)) goto out_overflow; - stateids = be32_to_cpup(p++); + /* + * We never use asynchronous mode, so warn if a server returns + * a stateid. + */ + if (unlikely(*p != 0)) { + pr_err_once("%s: server has set unrequested " + "asynchronous mode\n", __func__); + return -EREMOTEIO; + } + p++; p = xdr_decode_hyper(p, &res->count); res->verifier.committed = be32_to_cpup(p); return decode_verifier(xdr, &res->verifier.verifier); -- cgit v1.2.3 From 698c937b0d17dd55227622b919482fc720cc1095 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Jul 2016 13:31:14 -0400 Subject: NFSv4: Clean up lookup of SECINFO_NO_NAME Use the minor version ops cached in struct nfs_client instead of looking them up again. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4b1491da16eb..da5c9e58e907 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3281,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs4_do_find_root_sec(struct nfs_server *server, - struct nfs_fh *fhandle, struct nfs_fsinfo *info) -{ - int mv = server->nfs_client->cl_minorversion; - return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); -} - /** * nfs4_proc_get_rootfh - get file handle for server's pseudoroot * @server: initialized nfs_server handle @@ -3307,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, status = nfs4_lookup_root(server, fhandle, info); if (auth_probe || status == NFS4ERR_WRONGSEC) - status = nfs4_do_find_root_sec(server, fhandle, info); + status = server->nfs_client->cl_mvops->find_root_sec(server, + fhandle, info); if (status == 0) status = nfs4_server_capabilities(server, fhandle); -- cgit v1.2.3 From 944171cbf499d3445c749f7c13c46de0a564a905 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 28 Jul 2016 14:41:10 -0400 Subject: pNFS: Actively set attributes as invalid if LAYOUTCOMMIT is outstanding A LAYOUTCOMMIT then subsequent GETATTR may both return the same attributes, and in that case NFS_INO_INVALID_ATTR is never set on the second pass through nfs_update_inode(). The existing check to skip the clearing of NFS_INO_INVALID_ATTR if a LAYOUTCOMMIT is outstanding does not help in this case (see commit 10b7e9ad4488: "pNFS: Don't mark the inode as revalidated if a LAYOUTCOMMIT is outstanding"). We know that if a LAYOUTCOMMIT is outstanding then attributes will need updating, so always set NFS_INO_INVALID_ATTR. 
Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f108d58101f8..bf4ec5ecc97e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1665,7 +1665,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long now = jiffies; unsigned long save_cache_validity; bool have_writers = nfs_file_has_buffered_writers(nfsi); - bool cache_revalidated; + bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1714,8 +1714,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Do atomic weak cache consistency updates */ invalid |= nfs_wcc_update_inode(inode, fattr); - - cache_revalidated = !pnfs_layoutcommit_outstanding(inode); + if (pnfs_layoutcommit_outstanding(inode)) { + nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; + cache_revalidated = false; + } /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { -- cgit v1.2.3