From d5e66348bbe39dc78509e7561f7252aa443df8c0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Sep 2008 17:28:35 -0400 Subject: NFS: Fix nfs_file_llseek() After the BKL removal patches were applied to the rest of the NFS code, the BKL protection in nfs_file_llseek() is no longer sufficient to ensure that inode->i_size is read safely in generic_file_llseek_unlocked(). In order to fix the situation, we either have to replace the naked read of inode->i_size in generic_file_llseek_unlocked() with i_size_read(), or the whole thing needs to be executed under the inode->i_lock; In order to avoid disrupting other filesystems, avoid touching generic_file_llseek_unlocked() for now... Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 78460657f5cb..3ddb00433f4f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; + int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; - } - lock_kernel(); /* BKL needed? */ - loff = generic_file_llseek_unlocked(filp, offset, origin); - unlock_kernel(); + + spin_lock(&inode->i_lock); + loff = generic_file_llseek_unlocked(filp, offset, origin); + spin_unlock(&inode->i_lock); + } else + loff = generic_file_llseek_unlocked(filp, offset, origin); return loff; } -- cgit v1.2.3 From 1daef0a868370c5a96d031b9202e3354bea060e6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 27 Jul 2008 18:19:01 -0400 Subject: NFS: Clean up nfs_sb_active/nfs_sb_deactive Instead of causing umount requests to block on server->active_wq while the asynchronous sillyrename deletes are executing, we can use the sb->s_active counter to obtain a reference to the super_block, and then release that reference in nfs_async_unlink_release(). Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 1 - fs/nfs/internal.h | 4 ++-- fs/nfs/super.c | 24 ++++++++---------------- fs/nfs/unlink.c | 5 +++-- include/linux/nfs_fs_sb.h | 1 - 5 files changed, 13 insertions(+), 22 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5ee23e7058b3..2accb67427c6 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); - init_waitqueue_head(&server->active_wq); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 24241fcbb98d..7bcf6ec2d458 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -163,8 +163,8 @@ extern struct rpc_stat nfs_rpcstat; extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); -extern void nfs_sb_active(struct nfs_server *server); -extern void nfs_sb_deactive(struct nfs_server *server); +extern void nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ extern char *nfs_path(const char *base, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e9b20173fef3..e527fab40419 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -209,7 +209,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs_kill_super(struct super_block *); -static void nfs_put_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); static struct file_system_type nfs_fs_type = { @@ -232,7 +231,6 @@ static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, - .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -337,26 +335,20 @@ void __exit unregister_nfs_fs(void) unregister_filesystem(&nfs_fs_type); } -void nfs_sb_active(struct nfs_server *server) +void nfs_sb_active(struct super_block *sb) { - atomic_inc(&server->active); -} + struct nfs_server *server = NFS_SB(sb); -void nfs_sb_deactive(struct nfs_server *server) -{ - if (atomic_dec_and_test(&server->active)) - wake_up(&server->active_wq); + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); } -static void nfs_put_super(struct super_block *sb) +void nfs_sb_deactive(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); - /* - * Make sure there are no outstanding ops to this server. - * If so, wait for them to finish before allowing the - * unmount to continue. - */ - wait_event(server->active_wq, atomic_read(&server->active) == 0); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); } /* diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index f089e5839d7d..ecc295347775 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive(NFS_SB(sb)); + nfs_sb_deactive(sb); } static const struct rpc_call_ops nfs_unlink_ops = { @@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n .rpc_message = &msg, .callback_ops = &nfs_unlink_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; struct rpc_task *task; @@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n nfs_dec_sillycount(dir); return 0; } - nfs_sb_active(NFS_SERVER(dir)); + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(&data->res.dir_attr); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index c9beacd16c00..4e477ae58699 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -119,7 +119,6 @@ struct nfs_server { void (*destroy)(struct nfs_server *); atomic_t active; /* Keep trace of any activity to this server */ - wait_queue_head_t active_wq; /* Wait for any activity to stop */ /* mountd-related mount options */ struct sockaddr_storage mountd_address; -- cgit v1.2.3 From 4eec952e42314b53e48fef1f54dd89cbf9789734 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Jul 2008 17:58:13 -0400 Subject: NFS: Add options for finer control of the lookup cache Add the flag NFS_MOUNT_LOOKUP_CACHE_NONEG to turn off the caching of negative dentries. In reality what we do is to force nfs_lookup_revalidate() to always discard negative dentries. Add the flag NFS_MOUNT_LOOKUP_CACHE_NONE for enforcing stricter revalidation of dentries. It forces the revalidate code to always do a lookup instead of just checking the cached mtime of the parent directory. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ++++ include/linux/nfs_mount.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 74f92b717f78..49d565412827 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -667,6 +667,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) { if (IS_ROOT(dentry)) return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ @@ -750,6 +752,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, /* Don't revalidate a negative dentry if we're creating a new file */ if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; return !nfs_check_verifier(dir, dentry); } diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h index df7c6b7a7ebb..6549a06ac16e 100644 --- a/include/linux/nfs_mount.h +++ b/include/linux/nfs_mount.h @@ -65,4 +65,8 @@ struct nfs_mount_data { #define NFS_MOUNT_UNSHARED 0x8000 /* 5 */ #define NFS_MOUNT_FLAGMASK 0xFFFF +/* The following are for internal use only */ +#define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 +#define NFS_MOUNT_LOOKUP_CACHE_NONE 0x20000 + #endif -- cgit v1.2.3 From ff3525a539f5cc81970d08304bdedb4ffba984da Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Aug 2008 16:59:14 -0400 Subject: NFS: Don't apply NFS_MOUNT_FLAGMASK to text-based mounts The point of introducing text-based mounts was to allow us to add functionality without having to worry about legacy binary mount formats. The mask should be there in order to ensure that binary formats don't start enabling features that they cannot support. There is no justification for applying it to the text mount path. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 4 ++-- fs/nfs/super.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2accb67427c6..7547600b6174 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server, server->nfs_client = clp; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -1072,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server, goto error; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; server->caps |= NFS_CAP_ATOMIC_OPEN; if (data->rsize) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e527fab40419..81686aeb1b5d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1550,7 +1550,7 @@ static int nfs_validate_mount_data(void *options, * Translate to nfs_parsed_mount_data, which nfs_fill_super * can deal with. */ - args->flags = data->flags; + args->flags = data->flags & NFS_MOUNT_FLAGMASK; args->rsize = data->rsize; args->wsize = data->wsize; args->timeo = data->timeo; -- cgit v1.2.3 From 7973c1f15a0687f47ed70e591e4642d6fc4334d0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Jul 2008 17:58:14 -0400 Subject: NFS: Add mount options for controlling the lookup cache Add the following NFS-specific mount options to the parser. -o lookupcache=all /* Default: cache positive & negative dentries */ -o lookupcache=pos[itive] /* Don't cache negative dentries */ -o lookupcache=none /* Strict revalidation of all dentries */ Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 81686aeb1b5d..1e3558697219 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -91,6 +91,7 @@ enum { /* Mount options that take string arguments */ Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_lookupcache, /* Special mount options */ Opt_userspace, Opt_deprecated, Opt_sloppy, @@ -154,6 +155,8 @@ static match_table_t nfs_mount_option_tokens = { { Opt_mounthost, "mounthost=%s" }, { Opt_mountaddr, "mountaddr=%s" }, + { Opt_lookupcache, "lookupcache=%s" }, + { Opt_err, NULL } }; @@ -200,6 +203,22 @@ static match_table_t nfs_secflavor_tokens = { { Opt_sec_err, NULL } }; +enum { + Opt_lookupcache_all, Opt_lookupcache_positive, + Opt_lookupcache_none, + + Opt_lookupcache_err +}; + +static match_table_t nfs_lookupcache_tokens = { + { Opt_lookupcache_all, "all" }, + { Opt_lookupcache_positive, "pos" }, + { Opt_lookupcache_positive, "positive" }, + { Opt_lookupcache_none, "none" }, + + { Opt_lookupcache_err, NULL } +}; + static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); @@ -1250,6 +1269,30 @@ static int nfs_parse_mount_options(char *raw, &mnt->mount_server.addrlen); kfree(string); break; + case Opt_lookupcache: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_lookupcache_tokens, args); + kfree(string); + switch (token) { + case Opt_lookupcache_all: + mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + errors++; + dfprintk(MOUNT, "NFS: invalid " + "lookupcache argument\n"); + }; + break; /* * Special options -- cgit v1.2.3 From 870a5be8b92151332da65021b7b21104e9c1de07 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 5 Oct 2008 12:07:23 -0400 Subject: NFS: Clean up nfs_refresh_inode() and nfs_post_op_update_inode() Try to avoid taking and dropping the inode->i_lock more than once. Do so by moving the code in nfs_refresh_inode() that needs to be done under the spinlock into a function nfs_refresh_inode_locked(), and then having both nfs_refresh_inode() and nfs_post_op_update_inode() call it directly. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 52daefa2f521..f189169348b1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -948,6 +948,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return 0; } +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (time_after(fattr->time_start, nfsi->last_updated)) + return nfs_update_inode(inode, fattr); + return nfs_check_inode_attributes(inode, fattr); +} + /** * nfs_refresh_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -960,17 +969,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); int status; if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); - if (time_after(fattr->time_start, nfsi->last_updated)) - status = nfs_update_inode(inode, fattr); - else - status = nfs_check_inode_attributes(inode, fattr); - + status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } @@ -992,13 +996,16 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); + int status = 0; spin_lock(&inode->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; + if ((fattr->valid & NFS_ATTR_FATTR) != 0) + status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); - return nfs_refresh_inode(inode, fattr); + return status; } /** -- cgit v1.2.3 From a10ad17630024bf7aae8e7f18352f816ee483091 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Sep 2008 17:28:41 -0400 Subject: NFS: Fix the NFS attribute update Currently nfs_refresh_inode() will only update the inode metadata if it sees that the RPC call that returned the nfs_fattr was started after the last update of the inode. This means that if we have parallel RPC calls to the same inode (when sending WRITE calls, for instance), we may often miss updates. This patch attempts to recover those missed updates by also accepting them if the ctime in the nfs_fattr is more recent than the inode's cached ctime. It also recovers the case where the file size has increased, but the ctime has not been updated due to limited ctime resolution. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f189169348b1..8c514a1353c0 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -948,11 +948,49 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return 0; } -static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); +} - if (time_after(fattr->time_start, nfsi->last_updated)) +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for jiffy wraparound if the last_updated timestamp + * is later than 'jiffies'. + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. + * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + time_after(fattr->time_start, nfsi->last_updated) || + time_after(nfsi->last_updated, jiffies); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + if (nfs_inode_attrs_need_update(inode, fattr)) return nfs_update_inode(inode, fattr); return nfs_check_inode_attributes(inode, fattr); } -- cgit v1.2.3 From d65f557f39448c2d9e58cd564037b81e646aed2c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 5 Oct 2008 12:27:55 -0400 Subject: NFS: Fix nfs_post_op_update_inode_force_wcc() If we believe that the attributes are old (see nfs_refresh_inode()), then we shouldn't force an update. Also ensure that we hold the inode->i_lock across attribute checks and the call to nfs_refresh_inode_locked() to ensure that we don't race with other attribute updates. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8c514a1353c0..610d022fc7a5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1017,6 +1017,18 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) return status; } +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} + /** * nfs_post_op_update_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -1033,15 +1045,10 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); - int status = 0; + int status; spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; - if ((fattr->valid & NFS_ATTR_FATTR) != 0) - status = nfs_refresh_inode_locked(inode, fattr); + status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } @@ -1059,6 +1066,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) { + int status; + + spin_lock(&inode->i_lock); + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); + goto out_noforce; + } if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && (fattr->valid & NFS_ATTR_WCC_V4) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; @@ -1071,7 +1087,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_WCC; } - return nfs_post_op_update_inode(inode, fattr); +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; } /* -- cgit v1.2.3 From 4dc05efb86239321d43a9d74fd2ecd5c21bfc2ad Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Sep 2008 17:28:42 -0400 Subject: NFS: Convert __nfs_revalidate_inode() to use nfs_refresh_inode() In the case where there are parallel RPC calls to the same inode, we may receive stale metadata due to the lack of ordering, hence the sanity checking of metadata in nfs_refresh_inode(). Currently, __nfs_revalidate_inode() is calling nfs_update_inode() directly, without any further sanity checks, and hence may end up setting the inode up with stale metadata. Fix is to use nfs_refresh_inode() instead of nfs_update_inode(). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 610d022fc7a5..697157c1fdd1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -724,16 +724,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; } - spin_lock(&inode->i_lock); - status = nfs_update_inode(inode, &fattr); + status = nfs_refresh_inode(inode, &fattr); if (status) { - spin_unlock(&inode->i_lock); dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); goto out; } - spin_unlock(&inode->i_lock); if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); -- cgit v1.2.3 From 076f1fc94c44be2664172c63b4a2b51ae2d265ea Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 5 Oct 2008 13:31:21 -0400 Subject: NFS: Don't clear nfsi->cache_validity in nfs_check_inode_attributes() If we're merely checking the inode attributes because we suspect that the 'updated' attributes returned by the RPC call are stale, then we shouldn't be doing weak cache consistency updates or clearing the cache_validity flags. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 697157c1fdd1..a2f54154d825 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -905,9 +905,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; } - /* Do atomic weak cache consistency updates */ - nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; @@ -936,10 +933,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (invalid != 0) nfsi->cache_validity |= invalid; - else - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); nfsi->read_cache_jiffies = fattr->time_start; return 0; -- cgit v1.2.3 From 2f28ea614ff497202d5a52af82da523ae4a20718 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 5 Oct 2008 14:26:11 -0400 Subject: NFS: Fix up nfs_setattr_update_inode() Ensure that it sets the inode metadata under the correct spinlock. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a2f54154d825..f3b8ed904df7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -453,6 +453,7 @@ out_big: void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) { if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -462,7 +463,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } -- cgit v1.2.3 From 691beb13cdc88358334ef0ba867c080a247a760f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 5 Oct 2008 14:48:22 -0400 Subject: NFS: Allow concurrent inode revalidation Currently, if two processes are both trying to revalidate metadata for the same inode, they will find themselves being serialised. There is no good justification for this now that we have improved our ability to detect stale attribute data, so we should remove that serialisation. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 43 ++----------------------------------------- include/linux/nfs_fs.h | 9 ++++----- 2 files changed, 6 insertions(+), 46 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f3b8ed904df7..e25009f35cc2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -472,37 +472,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } } -static int nfs_wait_schedule(void *word) -{ - if (signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - -/* - * Wait for the inode to get unlocked. - */ -static int nfs_wait_on_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int error; - - error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING, - nfs_wait_schedule, TASK_KILLABLE); - - return error; -} - -static void nfs_wake_up_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - clear_bit(NFS_INO_REVALIDATING, &nfsi->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING); -} - int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -697,20 +666,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); - nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); if (is_bad_inode(inode)) - goto out_nowait; + goto out; if (NFS_STALE(inode)) - goto out_nowait; - - status = nfs_wait_on_inode(inode); - if (status < 0) goto out; - status = -ESTALE; if (NFS_STALE(inode)) goto out; + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", @@ -740,9 +704,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) (long long)NFS_FILEID(inode)); out: - nfs_wake_up_inode(inode); - - out_nowait: return status; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 78a5922a2f11..ca563ee13e32 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -200,11 +200,10 @@ struct nfs_inode { /* * Bit offsets in flags field */ -#define NFS_INO_REVALIDATING (0) /* revalidating attrs */ -#define NFS_INO_ADVISE_RDPLUS (1) /* advise readdirplus */ -#define NFS_INO_STALE (2) /* possible stale inode */ -#define NFS_INO_ACL_LRU_SET (3) /* Inode is on the LRU list */ -#define NFS_INO_MOUNTPOINT (4) /* inode is remote mountpoint */ +#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */ +#define NFS_INO_STALE (1) /* possible stale inode */ +#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ +#define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3 From bb8a3b53c20f2c07164a23ff6c320794fee8b95f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 02:55:49 +0300 Subject: fix fs/nfs/nfsroot.c compilation This patch fixes the following compile error caused by commit f9247273cb69ba101877e946d2d83044409cc8c5 (UFS: add const to parser token tabl): <-- snip --> ... CC fs/nfs/nfsroot.o /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/nfs/nfsroot.c:130: error: tokens causes a section type conflict make[3]: *** [fs/nfs/nfsroot.o] Error 1 <-- snip --> Signed-off-by: Adrian Bunk Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 46763d1cd397..8478fc25daee 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -127,7 +127,7 @@ enum { Opt_err }; -static match_table_t __initdata tokens = { +static match_table_t __initconst tokens = { {Opt_port, "port=%u"}, {Opt_rsize, "rsize=%u"}, {Opt_wsize, "wsize=%u"}, -- cgit v1.2.3 From fd08d7e9d196ca49afcce0181f1f0ca68f241aa2 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Thu, 31 Jul 2008 09:38:55 +0400 Subject: nfs: ERR_PTR is expected on failure from nfs_do_clone_mount Replace NULL with ERR_PTR(-EINVAL). Signed-off-by: Denis V. Lunev Signed-off-by: Trond Myklebust --- fs/nfs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 66df08dd1caf..d398775a3af5 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -189,7 +189,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, struct nfs_clone_mount *mountdata) { #ifdef CONFIG_NFS_V4 - struct vfsmount *mnt = NULL; + struct vfsmount *mnt = ERR_PTR(-EINVAL); switch (server->nfs_client->rpc_ops->version) { case 2: case 3: -- cgit v1.2.3 From c9f6cde6e26ef98ee9c4b6288b126ac9c580d88b Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Thu, 31 Jul 2008 09:53:56 +0400 Subject: sunrpc: do not pin sunrpc module in the memory Basically, try_module_get here are pretty useless. Any other module using this API will pin sunrpc in memory due using exported symbols. Signed-off-by: Denis V. Lunev Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 99a52aabe332..29e401bb612e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -108,13 +108,10 @@ int xprt_register_transport(struct xprt_class *transport) goto out; } - result = -EINVAL; - if (try_module_get(THIS_MODULE)) { - list_add_tail(&transport->list, &xprt_list); - printk(KERN_INFO "RPC: Registered %s transport module.\n", - transport->name); - result = 0; - } + list_add_tail(&transport->list, &xprt_list); + printk(KERN_INFO "RPC: Registered %s transport module.\n", + transport->name); + result = 0; out: spin_unlock(&xprt_list_lock); @@ -143,7 +140,6 @@ int xprt_unregister_transport(struct xprt_class *transport) "RPC: Unregistered %s transport module.\n", transport->name); list_del_init(&transport->list); - module_put(THIS_MODULE); goto out; } } -- cgit v1.2.3 From 44d5759d3fdad660f000ef319f0ec33a6ac6ae28 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Mon, 11 Aug 2008 12:02:34 +0400 Subject: nfs: BUG_ON in nfs_follow_mountpoint Unfortunately, BUG_ON(IS_ROOT(dentry)) can happen inside nfs_follow_mountpoint with NFS running Fedora 8 using a specific setup. https://bugzilla.redhat.com/show_bug.cgi?id=458622 So, the situation should be handled on NFS client gracefully. Signed-off-by: Denis V. Lunev CC: Trond Myklebust CC: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/namespace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index d398775a3af5..64a288ee046d 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) dprintk("--> nfs_follow_mountpoint()\n"); - BUG_ON(IS_ROOT(dentry)); + err = -ESTALE; + if (IS_ROOT(dentry)) + goto out_err; + dprintk("%s: enter\n", __func__); dput(nd->path.dentry); nd->path.dentry = dget(dentry); -- cgit v1.2.3 From f200c11c257b8db5c49dfc0b7f84bceae3109779 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 14 Aug 2008 18:32:55 -0400 Subject: nfs: remove an obsolete nfs_flock comment We *do* now allow bsd flocks over nfs. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3ddb00433f4f..d319b49f8f06 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -702,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags); - /* - * No BSD flocks over NFS allowed. - * Note: we could try to fake a POSIX lock request here by - * using ((u32) filp | 0x80000000) or some such as the pid. - * Not sure whether that would be unique, though, or whether - * that would break in other places. - */ if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; -- cgit v1.2.3 From f25b874d39461935b1b5bbffaa622e735e79d49e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 18 Aug 2008 09:17:58 -0400 Subject: NFS: missing nfs_fattr_init in nfs3_proc_getacl and nfs3_proc_setacls (resend #2) The fattrs used in the NFSv3 getacl/setacl calls are not being properly initialized. This occasionally causes nfs_update_inode to fall into NFSv4 specific codepaths when handling post-op attrs from these calls. Thanks to Cai Qian for noticing the spurious NFSv4 messages in debug output from a v3 mount... Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 423842f51ac9..cef62557c87d 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) dprintk("NFS call getacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); @@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS call setacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); -- cgit v1.2.3 From 37ca8f5c6041516aac603a5abb89b05675493802 Mon Sep 17 00:00:00 2001 From: EG Keizer Date: Tue, 19 Aug 2008 16:34:36 -0400 Subject: nfs: authenticated deep mounting Allow mount to do authenticated mounts below the root of the exported tree. The wording in RFC 2623, sec 2.3.2. allows fsinfo with UNIX authentication on the root of the export. Mounts are not always done on the root of the exported tree. Especially autoumounts often mount below the root of the exported tree. Some server implementations (justly) require full authentication for the so-called deep mounts. The old code used AUTH_SYS only. This caused deep mounts to fail on systems requiring stronger authentication.. The client should try both authentication types and use the first one that succeeds. This method was already partially implemented. This patch completes the implementation for NFS2 and NFS3. This patch was developed to allow Debian systems to automount home directories on Solaris servers with krb5 authentication. Tested on kernel 2.6.24-etchnhalf.1 Signed-off-by: E.G. Keizer Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 20 ++++++++++++++++++-- fs/nfs/proc.c | 10 ++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1e750e4574a9..c55be7a7679e 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, } static int -nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, +do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { struct rpc_message msg = { @@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("NFS call fsinfo\n"); nfs_fattr_init(info->fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(client, &msg, 0); dprintk("NFS reply fsinfo: %d\n", status); return status; } +/* + * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via + * nfs_create_server + */ +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_fsinfo(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + static int nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_pathconf *info) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 4dbb84df1b68..193465210d7c 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("%s: call getattr\n", __func__); nfs_fattr_init(fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply getattr: %d\n", __func__, status); if (status) return status; dprintk("%s: call statfs\n", __func__); msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; msg.rpc_resp = &fsinfo; - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply statfs: %d\n", __func__, status); if (status) return status; -- cgit v1.2.3 From 4ada29d5c4dd2d3ba89510bdbc64be22961fd1cb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Aug 2008 16:10:20 -0400 Subject: nfs: break up nfs_follow_referral This function is a little longer and more deeply nested than necessary. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 84 +++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index b112857301f7..956cbbc2ae9f 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -110,6 +110,48 @@ static inline int valid_ipaddr4(const char *buf) return 0; } +static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + unsigned int s = 0; + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) + return mnt; + mountdata->mnt_path = mnt_path; + + while (s < location->nservers) { + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_port = htons(NFS_PORT), + }; + + if (location->servers[s].len <= 0 || + valid_ipaddr4(location->servers[s].data) < 0) { + s++; + continue; + } + + mountdata->hostname = location->servers[s].data; + addr.sin_addr.s_addr = in_aton(mountdata->hostname), + mountdata->addr = (struct sockaddr *)&addr; + mountdata->addrlen = sizeof(addr); + + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata->hostname, + mountdata->mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + if (!IS_ERR(mnt)) + break; + s++; + } + return mnt; +} + /** * nfs_follow_referral - set up mountpoint when hitting a referral on moved error * @mnt_parent - mountpoint of parent directory @@ -128,7 +170,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, }; char *page = NULL, *page2 = NULL; - unsigned int s; int loc, error; if (locations == NULL || locations->nlocations <= 0) @@ -153,9 +194,8 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, } loc = 0; - while (loc < locations->nlocations && IS_ERR(mnt)) { + while (loc < locations->nlocations) { const struct nfs4_fs_location *location = &locations->locations[loc]; - char *mnt_path; if (location == NULL || location->nservers <= 0 || location->rootpath.ncomponents == 0) { @@ -163,41 +203,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, continue; } - mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); - if (IS_ERR(mnt_path)) { - loc++; - continue; - } - mountdata.mnt_path = mnt_path; - - s = 0; - while (s < location->nservers) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_port = htons(NFS_PORT), - }; - - if (location->servers[s].len <= 0 || - valid_ipaddr4(location->servers[s].data) < 0) { - s++; - continue; - } - - mountdata.hostname = location->servers[s].data; - addr.sin_addr.s_addr = in_aton(mountdata.hostname), - mountdata.addr = (struct sockaddr *)&addr; - mountdata.addrlen = sizeof(addr); - - snprintf(page, PAGE_SIZE, "%s:%s", - mountdata.hostname, - mountdata.mnt_path); - - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata); - if (!IS_ERR(mnt)) { - break; - } - s++; - } + mnt = try_location(&mountdata, page, page2, location); + if (!IS_ERR(mnt)) + break; loc++; } -- cgit v1.2.3 From 460cdbc83268dd9641b57d893b03ef52fcc3f96d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Aug 2008 16:10:21 -0400 Subject: nfs: replace while loop by for loops in nfs_follow_referral Whoever wrote this had a bizarre allergy to for loops. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 956cbbc2ae9f..6bcc5696f911 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -116,24 +116,22 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, { struct vfsmount *mnt = ERR_PTR(-ENOENT); char *mnt_path; - unsigned int s = 0; + unsigned int s; mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) return mnt; mountdata->mnt_path = mnt_path; - while (s < location->nservers) { + for (s = 0; s < location->nservers; s++) { struct sockaddr_in addr = { .sin_family = AF_INET, .sin_port = htons(NFS_PORT), }; if (location->servers[s].len <= 0 || - valid_ipaddr4(location->servers[s].data) < 0) { - s++; + valid_ipaddr4(location->servers[s].data) < 0) continue; - } mountdata->hostname = location->servers[s].data; addr.sin_addr.s_addr = in_aton(mountdata->hostname), @@ -147,7 +145,6 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); if (!IS_ERR(mnt)) break; - s++; } return mnt; } @@ -193,20 +190,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, goto out; } - loc = 0; - while (loc < locations->nlocations) { + for (loc = 0; loc < locations->nlocations; loc++) { const struct nfs4_fs_location *location = &locations->locations[loc]; if (location == NULL || location->nservers <= 0 || - location->rootpath.ncomponents == 0) { - loc++; + location->rootpath.ncomponents == 0) continue; - } mnt = try_location(&mountdata, page, page2, location); if (!IS_ERR(mnt)) break; - loc++; } out: -- cgit v1.2.3 From f0c929251e01a7a86b6254c775cb6b65c6457f10 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Aug 2008 16:10:22 -0400 Subject: nfs: prepare to share nfs_set_port We plan to use this function elsewhere. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 20 ++++++++++++++++++++ fs/nfs/super.c | 19 ------------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7bcf6ec2d458..8d91bd88e310 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -276,3 +276,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } + +/* + * Set the port number in an address. Be agnostic about the address + * family. + */ +static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) +{ + switch (sap->sa_family) { + case AF_INET: { + struct sockaddr_in *ap = (struct sockaddr_in *)sap; + ap->sin_port = htons(port); + break; + } + case AF_INET6: { + struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; + ap->sin6_port = htons(port); + break; + } + } +} diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 1e3558697219..b99096b8e827 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -674,25 +674,6 @@ static void nfs_umount_begin(struct super_block *sb) rpc_killall_tasks(rpc); } -/* - * Set the port number in an address. Be agnostic about the address family. - */ -static void nfs_set_port(struct sockaddr *sap, unsigned short port) -{ - switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - ap->sin_port = htons(port); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - ap->sin6_port = htons(port); - break; - } - } -} - /* * Sanity-check a server address provided by the mount command. * -- cgit v1.2.3 From ea31a4437c59219bf3ea946d58984b01a45a289c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Aug 2008 16:10:23 -0400 Subject: nfs: Fix misparsing of nfsv4 fs_locations attribute The code incorrectly assumes here that the server name (or ip address) is null-terminated. This can cause referrals to fail in some cases. Also support ipv6 addresses. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 ++ fs/nfs/nfs4namespace.c | 44 ++++++++++++++++++-------------------------- fs/nfs/super.c | 4 +--- 3 files changed, 21 insertions(+), 29 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8d91bd88e310..5d2a5d3c4241 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); /* super.c */ +void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 extern struct file_system_type nfs4_xdev_fs_type; @@ -276,6 +277,7 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } +#define IPV6_SCOPE_DELIMITER '%' /* * Set the port number in an address. Be agnostic about the address diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 6bcc5696f911..30befc39b3c6 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -93,50 +93,42 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, return 0; } -/* - * Check if the string represents a "valid" IPv4 address - */ -static inline int valid_ipaddr4(const char *buf) -{ - int rc, count, in[4]; - - rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); - if (rc != 4) - return -EINVAL; - for (count = 0; count < 4; count++) { - if (in[count] > 255) - return -EINVAL; - } - return 0; -} - static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, char *page, char *page2, const struct nfs4_fs_location *location) { struct vfsmount *mnt = ERR_PTR(-ENOENT); char *mnt_path; + int page2len; unsigned int s; mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) return mnt; mountdata->mnt_path = mnt_path; + page2 += strlen(mnt_path) + 1; + page2len = PAGE_SIZE - strlen(mnt_path) - 1; for (s = 0; s < location->nservers; s++) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_port = htons(NFS_PORT), - }; + const struct nfs4_string *buf = &location->servers[s]; + struct sockaddr_storage addr; - if (location->servers[s].len <= 0 || - valid_ipaddr4(location->servers[s].data) < 0) + if (buf->len <= 0 || buf->len >= PAGE_SIZE) continue; - mountdata->hostname = location->servers[s].data; - addr.sin_addr.s_addr = in_aton(mountdata->hostname), mountdata->addr = (struct sockaddr *)&addr; - mountdata->addrlen = sizeof(addr); + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) + continue; + nfs_parse_ip_address(buf->data, buf->len, + mountdata->addr, &mountdata->addrlen); + if (mountdata->addr->sa_family == AF_UNSPEC) + continue; + nfs_set_port(mountdata->addr, NFS_PORT); + + strncpy(page2, buf->data, page2len); + page2[page2len] = '\0'; + mountdata->hostname = page2; snprintf(page, PAGE_SIZE, "%s:%s", mountdata->hostname, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b99096b8e827..20dc4ccdff56 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -716,8 +716,6 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len, *addr_len = 0; } -#define IPV6_SCOPE_DELIMITER '%' - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, const char *delim, @@ -790,7 +788,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, * If there is a problem constructing the new sockaddr, set the address * family to AF_UNSPEC. */ -static void nfs_parse_ip_address(char *string, size_t str_len, +void nfs_parse_ip_address(char *string, size_t str_len, struct sockaddr *sap, size_t *addr_len) { unsigned int i, colons; -- cgit v1.2.3 From 9fa8d66f1e55bf197568c8c689043c2aad1ffc97 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Tue, 26 Aug 2008 16:23:20 +0100 Subject: NFS: remove 8 bytes of padding from struct nfs_fattr on 64 bit builds remove 8 bytes of padding from struct nfs_fattr on 64 bit builds This also removes padding from several nfs structures, including 16 bytes from nfs4_opendata, nfs4_createdata,nfs3_createdata & 8 bytes from nfs_read_data,nfs_write_data,nfs_removeres,nfs4_closedata This also reduces the reported stack usage of many nfs functions (30+). Signed-off-by: Richard Kennedy ---- This patch is against the latest git 2.6.27-rc4. I've built & run this on my AMD64 desktop, & successfully run _simple_ tests with a 64 bit client => 32 bit server & 32 bit client to 64 bit server. On fedora with gcc (GCC) 4.3.0 20080428 (Red Hat 4.3.0-8) checkpatch reports 33 functions with reduced stack usage. e.g. __nfs_revalidate_inode [nfs] 216 => 200 _nfs4_proc_access [nfs] 304 => 288 _nfs4_proc_link [nfs] 536 => 504 _nfs4_proc_remove [nfs] 304 => 288 _nfs4_proc_rename [nfs] 584 => 552 nfs3_proc_access [nfs] 272 => 256 nfs3_proc_getacl [nfs] 384 => 368 nfs3_proc_link [nfs] 496 => 464 etc I can supply the complete list if anyone is interested. regards Richard Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 8c77c11224d1..9cabbb3a9e6d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -36,6 +36,7 @@ struct nfs_fattr { __u32 nlink; __u32 uid; __u32 gid; + dev_t rdev; __u64 size; union { struct { @@ -46,7 +47,6 @@ struct nfs_fattr { __u64 used; } nfs3; } du; - dev_t rdev; struct nfs_fsid fsid; __u64 fileid; struct timespec atime; -- cgit v1.2.3 From d1ce02e1689dff9d413138f60a79b4e3affb4708 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Sep 2008 11:57:12 -0400 Subject: NFS: SETCLIENTID truncates client ID and netid The sc_name field is currently 56 bytes long. This is not large enough to hold a pair of IPv6 addresses, the authentication type, the protocol name, and a uniquifier number. The maximum possible size of the name string using IPv6 addresses is just under 110 bytes, so I increased the size of the sc_name field to accomodate this maximum. In addition, the strings in the nfs4_setclientid structure are constructed with scnprintf(), which wants to terminate its output with '\0'. The sc_netid field was large enough only for a three byte netid string and a '\0' so inet6 netids were being truncated. Perhaps we don't need the overhead of scnprintf() to do a simple string copy, but I fixed this by increasing the size of the buffer by one byte. Since all three of the string buffers in nfs4_setclientid are constructed with scnprintf(), I increased the size of all three by one byte to document the requirement, although I don't think either the universal address field or the name field will be so small that these strings get truncated in this way. The size of the Linux client's client ID on the wire will be larger than before. RFC 3530 suggests the size limit for client IDs is 1024, and we are still well below that. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9cabbb3a9e6d..f6e95bfad5de 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -672,16 +672,16 @@ struct nfs4_rename_res { struct nfs_fattr * new_fattr; }; -#define NFS4_SETCLIENTID_NAMELEN (56) +#define NFS4_SETCLIENTID_NAMELEN (128) struct nfs4_setclientid { const nfs4_verifier * sc_verifier; unsigned int sc_name_len; - char sc_name[NFS4_SETCLIENTID_NAMELEN]; + char sc_name[NFS4_SETCLIENTID_NAMELEN + 1]; u32 sc_prog; unsigned int sc_netid_len; - char sc_netid[RPCBIND_MAXNETIDLEN]; + char sc_netid[RPCBIND_MAXNETIDLEN + 1]; unsigned int sc_uaddr_len; - char sc_uaddr[RPCBIND_MAXUADDRLEN]; + char sc_uaddr[RPCBIND_MAXUADDRLEN + 1]; u32 sc_cb_ident; }; -- cgit v1.2.3 From 9a4bd29fe8f6d3f015fe1c8e5450eb62cfebfcc9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Oct 2008 16:48:34 -0400 Subject: SUNRPC: Fix autobind on cloned rpc clients Despite the fact that cloned rpc clients won't have the cl_autobind flag set, they may still find themselves calling rpcb_getport_async(). For this to happen, it suffices for a _parent_ rpc_clnt to use autobinding, in which case any clone may find itself triggering the !xprt_bound() case in call_bind(). The correct fix for this is to walk back up the tree of cloned rpc clients, in order to find the parent that 'owns' the transport, either because it has clnt->cl_autobind set, or because it originally created the transport... Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 24db2b4d12d3..172935b046de 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -469,6 +469,28 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi return rpc_run_task(&task_setup_data); } +/* + * In the case where rpc clients have been cloned, we want to make + * sure that we use the program number/version etc of the actual + * owner of the xprt. To do so, we walk back up the tree of parents + * to find whoever created the transport and/or whoever has the + * autobind flag set. + */ +static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt) +{ + struct rpc_clnt *parent = clnt->cl_parent; + + while (parent != clnt) { + if (parent->cl_xprt != clnt->cl_xprt) + break; + if (clnt->cl_autobind) + break; + clnt = parent; + parent = parent->cl_parent; + } + return clnt; +} + /** * rpcb_getport_async - obtain the port for a given RPC service on a given host * @task: task that is waiting for portmapper request @@ -478,10 +500,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi */ void rpcb_getport_async(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; + struct rpc_clnt *clnt; struct rpc_procinfo *proc; u32 bind_version; - struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_xprt *xprt; struct rpc_clnt *rpcb_clnt; static struct rpcbind_args *map; struct rpc_task *child; @@ -490,13 +512,13 @@ void rpcb_getport_async(struct rpc_task *task) size_t salen; int status; + clnt = rpcb_find_transport_owner(task->tk_client); + xprt = clnt->cl_xprt; + dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", task->tk_pid, __func__, clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot); - /* Autobind on cloned rpc clients is discouraged */ - BUG_ON(clnt->cl_parent != clnt); - /* Put self on the wait queue to ensure we get notified if * some other task is already attempting to bind the port */ rpc_sleep_on(&xprt->binding, task, NULL); @@ -578,9 +600,9 @@ void rpcb_getport_async(struct rpc_task *task) task->tk_pid, __func__); return; } - rpc_put_task(child); - task->tk_xprt->stat.bind_count++; + xprt->stat.bind_count++; + rpc_put_task(child); return; bailout_nofree: -- cgit v1.2.3 From 96165e2b7c4e2c82a0b60c766d4a2036444c21a0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Oct 2008 16:48:40 -0400 Subject: SUNRPC: Fix a memory leak in rpcb_getport_async Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 172935b046de..0a22f00734a4 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -580,7 +580,7 @@ void rpcb_getport_async(struct rpc_task *task) status = -ENOMEM; dprintk("RPC: %5u %s: no memory available\n", task->tk_pid, __func__); - goto bailout_nofree; + goto bailout_release_client; } map->r_prog = clnt->cl_prog; map->r_vers = clnt->cl_vers; @@ -605,6 +605,8 @@ void rpcb_getport_async(struct rpc_task *task) rpc_put_task(child); return; +bailout_release_client: + rpc_release_client(rpcb_clnt); bailout_nofree: rpcb_wake_rpcbind_waiters(xprt, status); task->tk_status = status; -- cgit v1.2.3 From 8491945f11c227400ef294d560f6d7aace76bc33 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Fri, 11 Apr 2008 20:03:06 -0400 Subject: NFS: Client mounts hang when exported directory do not exist This patch fixes a regression that was introduced by the string based mounts. nfs_mount() statically returns -EACCES for every error returned by the remote mounted. This is incorrect because -EACCES is an non-fatal error to the mount.nfs command. This error causes mount.nfs to retry the mount even in the case when the exported directory does not exist. This patch maps the errors returned by the remote mountd into valid errno values, exactly how it was done pre-string based mounts. By returning the correct errno enables mount.nfs to do the right thing. Signed-off-by: Steve Dickson [Trond.Myklebust@netapp.com: nfs_stat_to_errno() now correctly returns negative errors, so remove the sign change.] Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 779d2eb649c5..086a6830d785 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -14,6 +14,7 @@ #include #include #include +#include "internal.h" #ifdef RPC_DEBUG # define NFSDBG_FACILITY NFSDBG_MOUNT @@ -98,7 +99,7 @@ out_call_err: out_mnt_err: dprintk("NFS: MNT server returned result %d\n", result.status); - status = -EACCES; + status = nfs_stat_to_errno(result.status); goto out; } -- cgit v1.2.3 From d7fb120774f062ce7db439863ab5d4190d6f989c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 6 Oct 2008 20:08:56 -0400 Subject: NFS: Don't use range_cyclic for data integrity syncs It is more efficient to write linearly starting from the beginning of the file. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3229e217c773..9f9845859fc1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how) .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, .for_writepages = 1, - .range_cyclic = 1, }; int ret; -- cgit v1.2.3 From 63ffc23d307c9534c732edd87895e37b223004a3 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Fri, 3 Oct 2008 23:41:51 -0400 Subject: sunrpc: fix oops in rpc_create when the mount namespace is unshared On a system with nfs mounts, if a task unshares its mount namespace, a oops can occur when the system is rebooted if the task is the last to unreference the nfs mount. It will try to create a rpc request using utsname() which has been invalidated by free_nsproxy(). The patch fixes the issue by using the global init_utsname() which is always valid. the capability of identifying rpc clients per uts namespace stills needs some extra work so this should not be a problem. BUG: unable to handle kernel NULL pointer dereference at 00000004 IP: [] rpc_create+0x332/0x42f Oops: 0000 [#1] DEBUG_PAGEALLOC Pid: 1857, comm: uts-oops Not tainted (2.6.27-rc5-00319-g7686ad5 #4) EIP: 0060:[] EFLAGS: 00210287 CPU: 0 EIP is at rpc_create+0x332/0x42f EAX: 00000000 EBX: df26adf0 ECX: c0251887 EDX: 00000001 ESI: df26ae58 EDI: c02f293c EBP: dda0fc9c ESP: dda0fc2c DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 Process uts-oops (pid: 1857, ti=dda0e000 task=dd9a0778 task.ti=dda0e000) Stack: c0104532 dda0fffc dda0fcac dda0e000 dda0e000 dd93b7f0 00000009 c02f2880 df26aefc dda0fc68 c01096b7 00000000 c0266ee0 c039a070 c039a070 dda0fc74 c012ca67 c039a064 dda0fc8c c012cb20 c03daf74 00000011 00000000 c0275c90 Call Trace: [] ? dump_trace+0xc2/0xe2 [] ? save_stack_trace+0x1c/0x3a [] ? save_trace+0x37/0x8c [] ? add_lock_to_list+0x64/0x96 [] ? rpcb_register_call+0x62/0xbb [] ? rpcb_register+0xab/0xb3 [] ? svc_register+0xb4/0x128 [] ? svc_destroy+0xec/0x103 [] ? svc_exit_thread+0x87/0x8d [] ? lockd_down+0x61/0x81 [] ? nlmclnt_done+0xd/0xf [] ? nfs_destroy_server+0x14/0x16 [] ? nfs_free_server+0x4c/0xaa [] ? nfs_kill_super+0x23/0x27 [] ? deactivate_super+0x3f/0x51 [] ? mntput_no_expire+0x95/0xb4 [] ? release_mounts+0x6b/0x7a [] ? __put_mnt_ns+0x62/0x70 [] ? free_nsproxy+0x25/0x80 [] ? switch_task_namespaces+0x3e/0x43 [] ? exit_task_namespaces+0xa/0xc [] ? do_exit+0x4fd/0x666 [] ? do_group_exit+0x5d/0x83 [] ? get_signal_to_deliver+0x2c8/0x2e0 [] ? do_notify_resume+0x69/0x700 [] ? do_sigaction+0x134/0x145 [] ? hrtimer_nanosleep+0x8f/0xce [] ? hrtimer_wakeup+0x0/0x1c [] ? work_notifysig+0x13/0x1b ======================= Code: 70 20 68 cb c1 2c c0 e8 75 4e 01 00 8b 83 ac 00 00 00 59 3d 00 f0 ff ff 5f 77 63 eb 57 a1 00 80 2d c0 8b 80 a8 02 00 00 8d 73 68 <8b> 40 04 83 c0 45 e8 41 46 f7 ff ba 20 00 00 00 83 f8 21 0f 4c EIP: [] rpc_create+0x332/0x42f SS:ESP 0068:dda0fc2c Signed-off-by: Cedric Le Goater Cc: Chuck Lever Cc: Trond Myklebust Cc: "Eric W. Biederman" Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 76739e928d0d..a59cdf423c1c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -213,10 +213,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru } /* save the nodename */ - clnt->cl_nodelen = strlen(utsname()->nodename); + clnt->cl_nodelen = strlen(init_utsname()->nodename); if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); + memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen); rpc_register_client(clnt); return clnt; -- cgit v1.2.3 From 19d771f3caccaf66ce2fb539319222139e5b4e88 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 8 Oct 2008 13:54:52 -0400 Subject: NFS: Save padding bytes in struct nfs4_setclientid Peter Staubach suggested reducing NFS4_SETCLIENTID_NAMELEN by one byte so as to avoid 7 bytes of unnecessary padding. Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index f6e95bfad5de..6ee6ae3f095c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -672,7 +672,7 @@ struct nfs4_rename_res { struct nfs_fattr * new_fattr; }; -#define NFS4_SETCLIENTID_NAMELEN (128) +#define NFS4_SETCLIENTID_NAMELEN (127) struct nfs4_setclientid { const nfs4_verifier * sc_verifier; unsigned int sc_name_len; -- cgit v1.2.3 From 03254e65a60d3113164672dbbadc023c4a56ecd1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 9 Oct 2008 13:27:55 -0400 Subject: NFS: Fix attribute updates This fixes a regression seen when running the Connectathon testsuite against an ext3 filesystem. The reason was that the inode was constantly being marked as 'just updated' by the jiffy wraparound test. This again meant that newer GETATTR calls were failing to pass the nfs_inode_attrs_need_update() test unless the changes caused a ctime update on the server, since they were perceived as having been started before the latest inode update. Given that nfs_inode_attrs_need_update() already checks for wraparound of nfsi->last_updated, we can drop the buggy "protection" in nfs_update_inode(). Also make a slight micro-optimisation of nfs_inode_attrs_need_update(): we are more often going to see time_after(fattr->time_start, nfsi->last_updated) be true, rather than seeing an update of ctime/size, so put that test first to ensure that we optimise away the ctime/size tests. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e25009f35cc2..6554281e24a2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -933,10 +933,10 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n { const struct nfs_inode *nfsi = NFS_I(inode); - return nfs_ctime_need_update(inode, fattr) || - nfs_size_need_update(inode, fattr) || - time_after(fattr->time_start, nfsi->last_updated) || - time_after(nfsi->last_updated, jiffies); + return time_after(fattr->time_start, nfsi->last_updated) || + nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + time_after(nfsi->last_updated, jiffies); } static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) @@ -1167,11 +1167,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } - /* - * Avoid jiffy wraparound issues with nfsi->last_updated - */ - if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now)) - nfsi->last_updated = nfsi->read_cache_jiffies; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ -- cgit v1.2.3 From 456018d791ff4ef03d610f72486c637056bcd749 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 8 Oct 2008 15:31:14 -0400 Subject: NFS: Cleanup nfs_set_port Signed-off-by: "J. Bruce Fields" Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5d2a5d3c4241..d212ee41caf2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -285,16 +285,15 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) */ static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) { + struct sockaddr_in *ap = (struct sockaddr_in *)sap; + struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; + switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - ap->sin_port = htons(port); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - ap->sin6_port = htons(port); - break; - } + case AF_INET: + ap->sin_port = htons(port); + break; + case AF_INET6: + ap6->sin6_port = htons(port); + break; } } -- cgit v1.2.3 From 5e2e7721f04c11e6dc4a74b33f05a0e1c0381e2e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 8 Oct 2008 15:38:10 -0400 Subject: NFS: fix nfs_parse_ip_address() corner case Bruce observed that nfs_parse_ip_address() will successfully parse an IPv6 address that looks like this: "::1%" A scope delimiter is present, but there is no scope ID following it. This is harmless, as it would simply set the scope ID to zero. However, in some cases we would like to flag this as an improperly formed address. We are now also careful to reject addresses where garbage follows the address (up to the length of the string), instead of ignoring the non-address characters; and where the scope ID is nonsense (not a valid device name, but also not numeric). Before, both of these cases would result in a harmless zero scope ID. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 20dc4ccdff56..d496e4016224 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -717,17 +717,21 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len, } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, - const char *delim, - struct sockaddr_in6 *sin6) +static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, + const char *delim, + struct sockaddr_in6 *sin6) { char *p; size_t len; - if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) - return ; + if ((string + str_len) == delim) + return 1; + if (*delim != IPV6_SCOPE_DELIMITER) - return; + return 0; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; len = (string + str_len) - delim - 1; p = kstrndup(delim + 1, len, GFP_KERNEL); @@ -740,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, scope_id = dev->ifindex; dev_put(dev); } else { - /* scope_id is set to zero on error */ - strict_strtoul(p, 10, &scope_id); + if (strict_strtoul(p, 10, &scope_id) == 0) { + kfree(p); + return 0; + } } kfree(p); + sin6->sin6_scope_id = scope_id; dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); + return 1; } + + return 0; } static void nfs_parse_ipv6_address(char *string, size_t str_len, @@ -763,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, sin6->sin6_family = AF_INET6; *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { - nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); - return; + if (in6_pton(string, str_len, addr, + IPV6_SCOPE_DELIMITER, &delim) != 0) { + if (nfs_parse_ipv6_scope_id(string, str_len, + delim, sin6) != 0) + return; } } -- cgit v1.2.3 From 8d4ba0347ccfea4f12e56e2484954b891411b74d Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 14:59:49 -0400 Subject: RPC/RDMA: refactor the inline memory registration code. Refactor the memory registration and deregistration routines. This saves stack space, makes the code more readable and prepares to add the new FRMR registration methods. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 365 +++++++++++++++++++++++++------------------- 1 file changed, 207 insertions(+), 158 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8ea283ecc522..d04208a02f67 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -863,6 +863,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, char *p; size_t len; int i, rc; + struct rpcrdma_mw *r; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); @@ -873,7 +874,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's, if any + * 5. mw's or fmr's, if any * Send/recv buffers in req/rep need to be registered */ @@ -927,15 +928,13 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * and also reduce unbind-to-bind collision. */ INIT_LIST_HEAD(&buf->rb_mws); + r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { case RPCRDMA_MTHCAFMR: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; - struct ib_fmr_attr fa = { - RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT - }; /* TBD we are perhaps overallocating here */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { + static struct ib_fmr_attr fa = + { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; r->r.fmr = ib_alloc_fmr(ia->ri_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, &fa); @@ -948,12 +947,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; /* Allocate one extra request's worth, for full cycling */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { r->r.mw = ib_alloc_mw(ia->ri_pd); @@ -966,7 +962,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, list_add(&r->mw_list, &buf->rb_mws); ++r; } - } break; default: break; @@ -1046,6 +1041,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mw *r; /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) @@ -1065,7 +1061,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { while (!list_empty(&buf->rb_mws)) { - struct rpcrdma_mw *r; r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); @@ -1115,6 +1110,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { struct rpcrdma_req *req; unsigned long flags; + int i; + struct rpcrdma_mw *r; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_send_index == buffers->rb_max_requests) { @@ -1135,9 +1132,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; if (!list_empty(&buffers->rb_mws)) { - int i = RPCRDMA_MAX_SEGS - 1; + i = RPCRDMA_MAX_SEGS - 1; do { - struct rpcrdma_mw *r; r = list_entry(buffers->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); @@ -1329,15 +1325,202 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) seg->mr_dma, seg->mr_dmalen, seg->mr_dir); } +static int +rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, + physaddrs, i, seg1->mr_dma); + if (rc) { + dprintk("RPC: %s: failed ib_map_phys_fmr " + "%u@0x%llx+%i (%d)... status %i\n", __func__, + len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + LIST_HEAD(l); + int rc; + + list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_unmap_fmr," + " status %i\n", __func__, rc); + return rc; +} + +static int +rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct ib_mw_bind param; + int rc; + + *nsegs = 1; + rpcrdma_map_one(ia, seg, writing); + param.mr = ia->ri_bind_mem; + param.wr_id = 0ULL; /* no send cookie */ + param.addr = seg->mr_dma; + param.length = seg->mr_len; + param.send_flags = 0; + param.mw_access_flags = mem_priv; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + if (rc) { + dprintk("RPC: %s: failed ib_bind_mw " + "%u@0x%llx status %i\n", + __func__, seg->mr_len, + (unsigned long long)seg->mr_dma, rc); + rpcrdma_unmap_one(ia, seg); + } else { + seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; + seg->mr_base = param.addr; + seg->mr_nsegs = 1; + } + return rc; +} + +static int +rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt, void **r) +{ + struct ib_mw_bind param; + LIST_HEAD(l); + int rc; + + BUG_ON(seg->mr_nsegs != 1); + param.mr = ia->ri_bind_mem; + param.addr = 0ULL; /* unbind */ + param.length = 0; + param.mw_access_flags = 0; + if (*r) { + param.wr_id = (u64) (unsigned long) *r; + param.send_flags = IB_SEND_SIGNALED; + INIT_CQCOUNT(&r_xprt->rx_ep); + } else { + param.wr_id = 0ULL; + param.send_flags = 0; + DECR_CQCOUNT(&r_xprt->rx_ep); + } + rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); + rpcrdma_unmap_one(ia, seg); + if (rc) + dprintk("RPC: %s: failed ib_(un)bind_mw," + " status %i\n", __func__, rc); + else + *r = NULL; /* will upcall on completion */ + return rc; +} + +static int +rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) +{ + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; + int len, i, rc = 0; + + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (len = 0, i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + ipb[i].addr = seg->mr_dma; + ipb[i].size = seg->mr_len; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) + break; + } + seg1->mr_base = seg1->mr_dma; + seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, + ipb, i, mem_priv, &seg1->mr_base); + if (IS_ERR(seg1->mr_chunk.rl_mr)) { + rc = PTR_ERR(seg1->mr_chunk.rl_mr); + dprintk("RPC: %s: failed ib_reg_phys_mr " + "%u@0x%llx (%d)... status %i\n", + __func__, len, + (unsigned long long)seg1->mr_dma, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia) +{ + struct rpcrdma_mr_seg *seg1 = seg; + int rc; + + rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); + seg1->mr_chunk.rl_mr = NULL; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_dereg_mr," + " status %i\n", __func__, rc); + return rc; +} + int rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int nsegs, int writing, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct rpcrdma_mr_seg *seg1 = seg; - int i; int rc = 0; switch (ia->ri_memreg_strategy) { @@ -1352,114 +1535,20 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, break; #endif - /* Registration using fast memory registration */ + /* Registration using fmr memory registration */ case RPCRDMA_MTHCAFMR: - { - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff = offset_in_page(seg->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, - physaddrs, nsegs, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); break; /* Registration using memory windows */ case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - rpcrdma_map_one(ia, seg, writing); - param.mr = ia->ri_bind_mem; - param.wr_id = 0ULL; /* no send cookie */ - param.addr = seg->mr_dma; - param.length = seg->mr_len; - param.send_flags = 0; - param.mw_access_flags = mem_priv; - - DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - if (rc) { - dprintk("RPC: %s: failed ib_bind_mw " - "%u@0x%llx status %i\n", - __func__, seg->mr_len, - (unsigned long long)seg->mr_dma, rc); - rpcrdma_unmap_one(ia, seg); - } else { - seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; - seg->mr_base = param.addr; - seg->mr_nsegs = 1; - nsegs = 1; - } - } + rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); break; /* Default registration each time */ default: - { - struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; - int len = 0; - if (nsegs > RPCRDMA_MAX_DATA_SEGS) - nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); - ipb[i].addr = seg->mr_dma; - ipb[i].size = seg->mr_len; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - nsegs = i; - seg1->mr_base = seg1->mr_dma; - seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, - ipb, nsegs, mem_priv, &seg1->mr_base); - if (IS_ERR(seg1->mr_chunk.rl_mr)) { - rc = PTR_ERR(seg1->mr_chunk.rl_mr); - dprintk("RPC: %s: failed ib_reg_phys_mr " - "%u@0x%llx (%d)... status %i\n", - __func__, len, - (unsigned long long)seg1->mr_dma, nsegs, rc); - while (nsegs--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; - seg1->mr_nsegs = nsegs; - seg1->mr_len = len; - } - } + rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); break; } if (rc) @@ -1473,7 +1562,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt, void *r) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg1 = seg; int nsegs = seg->mr_nsegs, rc; switch (ia->ri_memreg_strategy) { @@ -1487,55 +1575,16 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, #endif case RPCRDMA_MTHCAFMR: - { - LIST_HEAD(l); - list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - } - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_fmr_external(seg, ia); break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: - { - struct ib_mw_bind param; - BUG_ON(nsegs != 1); - param.mr = ia->ri_bind_mem; - param.addr = 0ULL; /* unbind */ - param.length = 0; - param.mw_access_flags = 0; - if (r) { - param.wr_id = (u64) (unsigned long) r; - param.send_flags = IB_SEND_SIGNALED; - INIT_CQCOUNT(&r_xprt->rx_ep); - } else { - param.wr_id = 0ULL; - param.send_flags = 0; - DECR_CQCOUNT(&r_xprt->rx_ep); - } - rc = ib_bind_mw(ia->ri_id->qp, - seg->mr_chunk.rl_mw->r.mw, ¶m); - rpcrdma_unmap_one(ia, seg); - } - if (rc) - dprintk("RPC: %s: failed ib_(un)bind_mw," - " status %i\n", __func__, rc); - else - r = NULL; /* will upcall on completion */ + rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); break; default: - rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); - seg1->mr_chunk.rl_mr = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - if (rc) - dprintk("RPC: %s: failed ib_dereg_mr," - " status %i\n", __func__, rc); + rc = rpcrdma_deregister_default_external(seg, ia); break; } if (r) { -- cgit v1.2.3 From fe9053b30bb48b99f7b45541249f5cfe96bdf7f7 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 14:59:59 -0400 Subject: RPC/RDMA: add data types and new FRMR memory registration enum. Internal RPC/RDMA structure updates in preparation for FRMR support. Signed-off-by: Tom Talpey Acked-by: Tom Tucker Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtrdma.h | 1 + net/sunrpc/xprtrdma/xprt_rdma.h | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 4de56b1d372b..55a5d92ca1e2 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -78,6 +78,7 @@ enum rpcrdma_memreg { RPCRDMA_MEMWINDOWS, RPCRDMA_MEMWINDOWS_ASYNC, RPCRDMA_MTHCAFMR, + RPCRDMA_FRMR, RPCRDMA_ALLPHYSICAL, RPCRDMA_LAST }; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2427822f8bd4..05b7898e1f4b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -58,6 +58,8 @@ struct rpcrdma_ia { struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; + u32 ri_dma_lkey; + int ri_have_dma_lkey; struct completion ri_done; int ri_async_rc; enum rpcrdma_memreg ri_memreg_strategy; @@ -156,6 +158,10 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ union { struct ib_mw *mw; struct ib_fmr *fmr; + struct { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + } frmr; } r; struct list_head mw_list; } *rl_mw; @@ -198,7 +204,7 @@ struct rpcrdma_buffer { atomic_t rb_credits; /* most recent server credits */ unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ int rb_max_requests;/* client max requests */ - struct list_head rb_mws; /* optional memory windows/fmrs */ + struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ int rb_send_index; struct rpcrdma_req **rb_send_bufs; int rb_recv_index; -- cgit v1.2.3 From bd7ed1d13304d914648dacec4dbb9145aaae614e Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:00:09 -0400 Subject: RPC/RDMA: check selected memory registration mode at runtime. At transport creation, check for, and use, any local dma lkey. Then, check that the selected memory registration mode is in fact supported by the RDMA adapter selected for the mount. Fall back to best alternative if not. Signed-off-by: Tom Talpey Acked-by: Tom Tucker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 95 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 15 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d04208a02f67..0f3b43148b7f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -423,7 +423,8 @@ rpcrdma_clean_cq(struct ib_cq *cq) int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { - int rc; + int rc, mem_priv; + struct ib_device_attr devattr; struct rpcrdma_ia *ia = &xprt->rx_ia; init_completion(&ia->ri_done); @@ -442,6 +443,53 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out2; } + /* + * Query the device to determine if the requested memory + * registration strategy is supported. If it isn't, set the + * strategy to a globally supported model. + */ + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + goto out2; + } + + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { + ia->ri_have_dma_lkey = 1; + ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; + } + + switch (memreg) { + case RPCRDMA_MEMWINDOWS: + case RPCRDMA_MEMWINDOWS_ASYNC: + if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { + dprintk("RPC: %s: MEMWINDOWS registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; + } + break; + case RPCRDMA_MTHCAFMR: + if (!ia->ri_id->device->alloc_fmr) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; +#endif + } + break; + } + /* * Optionally obtain an underlying physical identity mapping in * order to do a memory window-based bind. This base registration @@ -450,22 +498,27 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) * revoked after the corresponding completion similar to a storage * adapter. */ - if (memreg > RPCRDMA_REGISTER) { - int mem_priv = IB_ACCESS_LOCAL_WRITE; - switch (memreg) { + switch (memreg) { + case RPCRDMA_BOUNCEBUFFERS: + case RPCRDMA_REGISTER: + break; #if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: - mem_priv |= IB_ACCESS_REMOTE_WRITE; - mem_priv |= IB_ACCESS_REMOTE_READ; - break; + case RPCRDMA_ALLPHYSICAL: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + goto register_setup; #endif - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - mem_priv |= IB_ACCESS_MW_BIND; - break; - default: + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + mem_priv = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_MW_BIND; + goto register_setup; + case RPCRDMA_MTHCAFMR: + if (ia->ri_have_dma_lkey) break; - } + mem_priv = IB_ACCESS_LOCAL_WRITE; + register_setup: ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -475,7 +528,15 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) memreg = RPCRDMA_REGISTER; ia->ri_bind_mem = NULL; } + break; + default: + printk(KERN_ERR "%s: invalid memory registration mode %d\n", + __func__, memreg); + rc = -EINVAL; + goto out2; } + dprintk("RPC: %s: memory registration strategy is %d\n", + __func__, memreg); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; @@ -1248,7 +1309,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, va, len, DMA_BIDIRECTIONAL); iov->length = len; - if (ia->ri_bind_mem != NULL) { + if (ia->ri_have_dma_lkey) { + *mrp = NULL; + iov->lkey = ia->ri_dma_lkey; + return 0; + } else if (ia->ri_bind_mem != NULL) { *mrp = NULL; iov->lkey = ia->ri_bind_mem->lkey; return 0; -- cgit v1.2.3 From 3197d309f5fb042499b2c4c8f2fcb67372df5201 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:00:20 -0400 Subject: RPC/RDMA: support FRMR client memory registration. Configure, detect and use "fastreg" support from IB/iWARP verbs layer to perform RPC/RDMA memory registration. Make FRMR the default memreg mode (will fall back if not supported by the selected RDMA adapter). This allows full and optimal operation over the cxgb3 adapter, and others. Signed-off-by: Tom Talpey Acked-by: Tom Tucker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/transport.c | 6 +- net/sunrpc/xprtrdma/verbs.c | 167 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 167 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a564c1a39ec5..89970b0a4cc9 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -70,11 +70,7 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; -#if !RPCRDMA_PERSISTENT_REGISTRATION -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */ -#else -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL; -#endif +static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; #ifdef RPC_DEBUG diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 0f3b43148b7f..39a165202d8f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -485,6 +485,26 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) "using slower RPCRDMA_REGISTER\n", __func__); memreg = RPCRDMA_REGISTER; +#endif + } + break; + case RPCRDMA_FRMR: + /* Requires both frmr reg and local dma lkey */ + if ((devattr.device_cap_flags & + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { +#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +#else + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using slower RPCRDMA_REGISTER\n", + __func__); + memreg = RPCRDMA_REGISTER; #endif } break; @@ -501,6 +521,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) switch (memreg) { case RPCRDMA_BOUNCEBUFFERS: case RPCRDMA_REGISTER: + case RPCRDMA_FRMR: break; #if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: @@ -602,6 +623,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + /* Add room for frmr register and invalidate WRs */ + ep->rep_attr.cap.max_send_wr *= 3; + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) + return -EINVAL; + break; case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: /* Add room for mw_binds+unbinds - overkill! */ @@ -684,6 +711,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, break; case RPCRDMA_MTHCAFMR: case RPCRDMA_REGISTER: + case RPCRDMA_FRMR: ep->rep_remote_cma.responder_resources = cdata->max_requests * (RPCRDMA_MAX_DATA_SEGS / 8); break; @@ -935,7 +963,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's or fmr's, if any + * 5. mw's, fmr's or frmr's, if any * Send/recv buffers in req/rep need to be registered */ @@ -943,6 +971,10 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; case RPCRDMA_MTHCAFMR: /* TBD we are perhaps overallocating here */ len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * @@ -991,6 +1023,30 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, INIT_LIST_HEAD(&buf->rb_mws); r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + goto out; + } + r->r.frmr.fr_pgl = + ib_alloc_fast_reg_page_list(ia->ri_id->device, + RPCRDMA_MAX_SEGS); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + break; case RPCRDMA_MTHCAFMR: /* TBD we are perhaps overallocating here */ for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { @@ -1126,6 +1182,15 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) struct rpcrdma_mw, mw_list); list_del(&r->mw_list); switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s:" + " ib_dereg_mr" + " failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + break; case RPCRDMA_MTHCAFMR: rc = ib_dealloc_fmr(r->r.fmr); if (rc) @@ -1228,6 +1293,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; } switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: @@ -1390,6 +1456,96 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) seg->mr_dma, seg->mr_dmalen, seg->mr_dir); } +static int +rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia, + struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr frmr_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (*nsegs > RPCRDMA_MAX_DATA_SEGS) + *nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < *nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments\n", + __func__, seg1->mr_chunk.rl_mw, i); + + /* Bump the key */ + key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + + /* Prepare FRMR WR */ + memset(&frmr_wr, 0, sizeof frmr_wr); + frmr_wr.opcode = IB_WR_FAST_REG_MR; + frmr_wr.send_flags = 0; /* unsignaled */ + frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma; + frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; + frmr_wr.wr.fast_reg.page_list_len = i; + frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; + frmr_wr.wr.fast_reg.access_flags = (writing ? + IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ); + frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); + + if (rc) { + dprintk("RPC: %s: failed ib_post_send for register," + " status %i\n", __func__, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + } + *nsegs = i; + return rc; +} + +static int +rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = 0; /* unsignaled */ + invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + if (rc) + dprintk("RPC: %s: failed ib_post_send for invalidate," + " status %i\n", __func__, rc); + return rc; +} + static int rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, int *nsegs, int writing, struct rpcrdma_ia *ia) @@ -1600,6 +1756,11 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, break; #endif + /* Registration using frmr registration */ + case RPCRDMA_FRMR: + rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); + break; + /* Registration using fmr memory registration */ case RPCRDMA_MTHCAFMR: rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); @@ -1639,6 +1800,10 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, break; #endif + case RPCRDMA_FRMR: + rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); + break; + case RPCRDMA_MTHCAFMR: rc = rpcrdma_deregister_fmr_external(seg, ia); break; -- cgit v1.2.3 From b334eaabf4f92226d2df13c613888a507f03da99 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 9 Oct 2008 15:00:30 -0400 Subject: RPC/RDMA: fix connection IRD/ORD setting This logic sets the connection parameter that configures the local device and informs the remote peer how many concurrent incoming RDMA_READ requests are supported. The original logic didn't really do what was intended for two reasons: - The max number supported by the device is typically smaller than any one factor in the calculation used, and - The field in the connection parameter structure where the value is stored is a u8 and always overflows for the default settings. So what really happens is the value requested for responder resources is the left over 8 bits from the "desired value". If the desired value happened to be a multiple of 256, the result was zero and it wouldn't connect at all. Given the above and the fact that max_requests is almost always larger than the max responder resources supported by the adapter, this patch simplifies this logic and simply requests the max supported by the device, subject to a reasonable limit. This bug was found by Jim Schutt at Sandia. Signed-off-by: Tom Tucker Acked-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 51 +++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 39a165202d8f..e3fe9054fef6 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -705,30 +705,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_remote_cma.private_data_len = 0; /* Client offers RDMA Read but does not initiate */ - switch (ia->ri_memreg_strategy) { - case RPCRDMA_BOUNCEBUFFERS: + ep->rep_remote_cma.initiator_depth = 0; + if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) ep->rep_remote_cma.responder_resources = 0; - break; - case RPCRDMA_MTHCAFMR: - case RPCRDMA_REGISTER: - case RPCRDMA_FRMR: - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 8); - break; - case RPCRDMA_MEMWINDOWS: - case RPCRDMA_MEMWINDOWS_ASYNC: -#if RPCRDMA_PERSISTENT_REGISTRATION - case RPCRDMA_ALLPHYSICAL: -#endif - ep->rep_remote_cma.responder_resources = cdata->max_requests * - (RPCRDMA_MAX_DATA_SEGS / 2); - break; - default: - break; - } - if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom) + else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + ep->rep_remote_cma.responder_resources = 32; + else ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; - ep->rep_remote_cma.initiator_depth = 0; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; @@ -858,14 +841,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { } } - /* Theoretically a client initiator_depth > 0 is not needed, - * but many peers fail to complete the connection unless they - * == responder_resources! */ - if (ep->rep_remote_cma.initiator_depth != - ep->rep_remote_cma.responder_resources) - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - ep->rep_connected = 0; rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -894,14 +869,16 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { if (ep->rep_connected <= 0) { /* Sometimes, the only way to reliably connect to remote * CMs is to use same nonzero values for ORD and IRD. */ - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; - if (ep->rep_remote_cma.initiator_depth == 0) - ++ep->rep_remote_cma.initiator_depth; - if (ep->rep_remote_cma.responder_resources == 0) - ++ep->rep_remote_cma.responder_resources; - if (retry_count++ == 0) + if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && + (ep->rep_remote_cma.responder_resources == 0 || + ep->rep_remote_cma.initiator_depth != + ep->rep_remote_cma.responder_resources)) { + if (ep->rep_remote_cma.responder_resources == 0) + ep->rep_remote_cma.responder_resources = 1; + ep->rep_remote_cma.initiator_depth = + ep->rep_remote_cma.responder_resources; goto retry; + } rc = ep->rep_connected; } else { dprintk("RPC: %s: connected\n", __func__); -- cgit v1.2.3 From 575448bd36208f99fe0dd554a43518d798966740 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:00:40 -0400 Subject: RPC/RDMA: suppress retransmit on RPC/RDMA clients. An RPC/RDMA client cannot retransmit on an unbroken connection, doing so violates its flow control with the server. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 2 ++ net/sunrpc/xprtrdma/transport.c | 16 ++++++++++++---- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e55427f73dfe..721dae795d68 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -681,6 +681,8 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) struct rpc_xprt *xprt = ep->rep_xprt; spin_lock_bh(&xprt->transport_lock); + if (++xprt->connect_cookie == 0) /* maintain a reserved value */ + ++xprt->connect_cookie; if (ep->rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) xprt_wake_pending_tasks(xprt, 0); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 89970b0a4cc9..0aefc6485385 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -587,6 +587,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) } dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); out: + req->rl_connect_cookie = 0; /* our reserved value */ return req->rl_xdr_buf; outfail: @@ -690,13 +691,20 @@ xprt_rdma_send_request(struct rpc_task *task) req->rl_reply->rr_xprt = xprt; } - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) { - xprt_disconnect_done(xprt); - return -ENOTCONN; /* implies disconnect */ - } + /* Must suppress retransmit to maintain credits */ + if (req->rl_connect_cookie == xprt->connect_cookie) + goto drop_connection; + req->rl_connect_cookie = xprt->connect_cookie; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; rqst->rq_bytes_sent = 0; return 0; + +drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ } static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 05b7898e1f4b..2db2344d487e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -181,6 +181,7 @@ struct rpcrdma_req { size_t rl_size; /* actual length of buffer */ unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ + unsigned int rl_connect_cookie; /* retry detection */ struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ -- cgit v1.2.3 From ad0e9e01da4ece70ff524b49c77c5e850d5dd53e Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:00:50 -0400 Subject: RPC/RDMA: maintain the RPC task bytes-sent statistic. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/transport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 0aefc6485385..ec6d1e7a1941 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -699,6 +699,7 @@ xprt_rdma_send_request(struct rpc_task *task) if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; + task->tk_bytes_sent += rqst->rq_snd_buf.len; rqst->rq_bytes_sent = 0; return 0; -- cgit v1.2.3 From fee08caf943e8ed3446ce42fa085b5e7e5f08d92 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:01:00 -0400 Subject: RPC/RDMA: avoid an oops due to disconnect racing with async upcalls. RDMA disconnects yield an upcall from the RDMA connection manager, which can race with rpc transport close, e.g. on ^C of a mount. Ensure any rdma cm_id and qp are fully destroyed before continuing. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e3fe9054fef6..d94f379f36d7 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -565,6 +565,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) return 0; out2: rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; out1: return rc; } @@ -585,15 +586,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) dprintk("RPC: %s: ib_dereg_mr returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp) - rdma_destroy_qp(ia->ri_id); + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { + if (ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; + } if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { rc = ib_dealloc_pd(ia->ri_pd); dprintk("RPC: %s: ib_dealloc_pd returned %i\n", __func__, rc); } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) - rdma_destroy_id(ia->ri_id); } /* @@ -751,21 +754,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) dprintk("RPC: %s: rpcrdma_ep_disconnect" " returned %i\n", __func__, rc); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; } - ep->rep_func = NULL; - /* padding - could be done in rpcrdma_buffer_destroy... */ if (ep->rep_pad_mr) { rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); ep->rep_pad_mr = NULL; } - if (ia->ri_id->qp) { - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - rpcrdma_clean_cq(ep->rep_cq); rc = ib_destroy_cq(ep->rep_cq); if (rc) -- cgit v1.2.3 From 9191ca3b381b15b9a88785a8ae2fa4db8e553b0c Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:01:11 -0400 Subject: RPC/RDMA: adhere to protocol for unpadded client trailing write chunks. The RPC/RDMA protocol allows clients and servers to avoid RDMA operations for data which is purely the result of XDR padding. On the client, automatically insert the necessary padding for such server replies, and optionally don't marshal such chunks. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 21 +++++++++++++++++++-- net/sunrpc/xprtrdma/transport.c | 9 +++++++++ net/sunrpc/xprtrdma/xprt_rdma.h | 5 +++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 721dae795d68..d245c0bf7873 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, } if (xdrbuf->tail[0].iov_len) { + /* the rpcrdma protocol allows us to omit any trailing + * xdr pad bytes, saving the server an RDMA operation. */ + if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) + return n; if (n == nsegs) return 0; seg[n].mr_page = NULL; @@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b * Scatter inline received data back into provided iov's. */ static void -rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { int i, npages, curlen, olen; char *destp; @@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) } else rqst->rq_rcv_buf.tail[0].iov_len = 0; + if (pad) { + /* implicit padding on terminal chunk */ + unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; + while (pad--) + p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + } + if (copy_len) dprintk("RPC: %s: %d bytes in" " %d extra segments (%d lost)\n", @@ -794,14 +805,20 @@ repost: ((unsigned char *)iptr - (unsigned char *)headerp); status = rep->rr_len + rdmalen; r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* special case - last chunk may omit padding */ + if (rdmalen &= 3) { + rdmalen = 4 - rdmalen; + status += rdmalen; + } } else { /* else ordinary inline */ + rdmalen = 0; iptr = (__be32 *)((unsigned char *)headerp + 28); rep->rr_len -= 28; /*sizeof *headerp;*/ status = rep->rr_len; } /* Fix up the rpc results for upper layer */ - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len); + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); break; case __constant_htonl(RDMA_NOMSG): diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ec6d1e7a1941..c7d2380bb5e3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -71,6 +71,7 @@ static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; + int xprt_rdma_pad_optimize = 0; #ifdef RPC_DEBUG @@ -135,6 +136,14 @@ static ctl_table xr_tunables_table[] = { .extra1 = &min_memreg, .extra2 = &max_memreg, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rdma_pad_optimize", + .data = &xprt_rdma_pad_optimize, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0, }, diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2db2344d487e..fde6499a53b2 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -280,6 +280,11 @@ struct rpcrdma_xprt { #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) +/* Setting this to 0 ensures interoperability with early servers. + * Setting this to 1 enhances certain unaligned read/write performance. + * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ +extern int xprt_rdma_pad_optimize; + /* * Interface Adapter calls - xprtrdma/verbs.c */ -- cgit v1.2.3 From 926449ba66ce2a45c619bbe755b00d6bdbf0d83e Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:01:21 -0400 Subject: RPC/RDMA: return a consistent error, when connect fails. The xprt_connect call path does not expect such errors as ECONNREFUSED to be returned from failed transport connection attempts, otherwise it translates them to EIO and signals fatal errors. For example, mount.nfs prints simply "internal error". Translate all such errors to ENOTCONN from RPC/RDMA to match sockets behavior. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index d245c0bf7873..94ecf1b65ff6 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -699,7 +699,7 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) xprt_wake_pending_tasks(xprt, 0); } else { if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, ep->rep_connected); + xprt_wake_pending_tasks(xprt, -ENOTCONN); } spin_unlock_bh(&xprt->transport_lock); } -- cgit v1.2.3 From 1a954051b0cf79bd67e5f9db40333e3a9b1d05d2 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:01:31 -0400 Subject: RPC/RDMA: fix connect/reconnect resource leak. The RPC/RDMA code can leak RDMA connection manager endpoints in certain error cases on connect. Don't signal unwanted events, and be certain to destroy any allocated qp. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d94f379f36d7..a63d0c0ec017 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -338,10 +338,8 @@ connected: wake_up_all(&ep->rep_connect_wait); break; default: - ia->ri_async_rc = -EINVAL; - dprintk("RPC: %s: unexpected CM event %X\n", + dprintk("RPC: %s: unexpected CM event %d\n", __func__, event->event); - complete(&ia->ri_done); break; } @@ -355,6 +353,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rdma_cm_id *id; int rc; + init_completion(&ia->ri_done); + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -427,8 +427,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) struct ib_device_attr devattr; struct rpcrdma_ia *ia = &xprt->rx_ia; - init_completion(&ia->ri_done); - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); @@ -815,6 +813,7 @@ retry: goto out; } /* END TEMP */ + rdma_destroy_qp(ia->ri_id); rdma_destroy_id(ia->ri_id); ia->ri_id = id; } -- cgit v1.2.3 From 5675add36e76b9487e7f9e689f854cb8d6afd9b4 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:01:41 -0400 Subject: RPC/RDMA: harden connection logic against missing/late rdma_cm upcalls. Add defensive timeouts to wait_for_completion() calls in RDMA address resolution, and make them interruptible. Fix the timeout units to milliseconds (formerly jiffies) and move to private header. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtrdma.h | 3 --- net/sunrpc/xprtrdma/verbs.c | 11 +++++++---- net/sunrpc/xprtrdma/xprt_rdma.h | 3 +++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 55a5d92ca1e2..54a379c9e8eb 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -66,9 +66,6 @@ #define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */ -#define RDMA_RESOLVE_TIMEOUT (5*HZ) /* TBD 5 seconds */ -#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ - /* memory registration strategies */ #define RPCRDMA_PERSISTENT_REGISTRATION (1) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index a63d0c0ec017..f46fb93f421b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -284,6 +284,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: + ia->ri_async_rc = 0; complete(&ia->ri_done); break; case RDMA_CM_EVENT_ADDR_ERROR: @@ -363,26 +364,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, return id; } - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; - ia->ri_async_rc = 0; + ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); goto out; } - wait_for_completion(&ia->ri_done); + wait_for_completion_interruptible_timeout(&ia->ri_done, + msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) goto out; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index fde6499a53b2..c7a7eba991bc 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -51,6 +51,9 @@ #include /* RPC/RDMA protocol */ #include /* xprt parameters */ +#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ +#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ + /* * Interface Adapter -- one per transport instance */ -- cgit v1.2.3 From 5f37d561e0f0cd98017c389cbc22080290f11c3c Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:01:52 -0400 Subject: RPC/RDMA: reformat a debug printk to keep lines together. The send marshaling code split a particular dprintk across two lines, which makes it hard to extract from logfiles. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 94ecf1b65ff6..15101f294e00 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -512,8 +512,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen == 0) return -1; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n" - " headerp 0x%p base 0x%p lkey 0x%x\n", + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + " headerp 0x%p base 0x%p lkey 0x%x\n", __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, req->rl_iov.lkey); -- cgit v1.2.3 From b3cd8d45a764e6edb06e7bd386faf99a879569b8 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Thu, 9 Oct 2008 15:02:02 -0400 Subject: RPC/RDMA: optionally emit useful transport info upon connect/disconnect. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c7d2380bb5e3..c2da680273c5 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -784,7 +784,7 @@ static void __exit xprt_rdma_cleanup(void) { int rc; - dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); + dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); #ifdef RPC_DEBUG if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f46fb93f421b..170e69cba6c4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -344,6 +344,27 @@ connected: break; } +#ifdef RPC_DEBUG + if (connstate == 1) { + int ird = attr.max_dest_rd_atomic; + int tird = ep->rep_remote_cma.responder_resources; + printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u " + "on %s, memreg %d slots %d ird %d%s\n", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + ia->ri_id->device->name, + ia->ri_memreg_strategy, + xprt->rx_buf.rb_max_requests, + ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); + } else if (connstate < 0) { + printk(KERN_INFO "rpcrdma: connection to %u.%u.%u.%u:%u " + "closed (%d)\n", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + connstate); + } +#endif + return 0; } -- cgit v1.2.3 From 08ca0dce1eafa419059ac4cad9ed522af7052526 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Fri, 10 Oct 2008 11:32:34 -0400 Subject: RPC/RDMA: correct the reconnect timer backoff The RPC/RDMA code had a constant 5-second reconnect backoff, and always performed it, even when re-establishing a connection to a server after the RPC layer closed it due to being idle. Make it an geometric backoff (up to 30 seconds), and don't delay idle reconnect. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/transport.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c2da680273c5..9839c3d94145 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -463,6 +463,8 @@ xprt_rdma_close(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); dprintk("RPC: %s: closing\n", __func__); + if (r_xprt->rx_ep.rep_connected > 0) + xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); } @@ -490,6 +492,11 @@ xprt_rdma_connect(struct rpc_task *task) /* Reconnect */ schedule_delayed_work(&r_xprt->rdma_connect, xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > (30 * HZ)) + xprt->reestablish_timeout = (30 * HZ); + else if (xprt->reestablish_timeout < (5 * HZ)) + xprt->reestablish_timeout = (5 * HZ); } else { schedule_delayed_work(&r_xprt->rdma_connect, 0); if (!RPC_IS_ASYNC(task)) -- cgit v1.2.3 From c055551e97e1ca00781bc41523f829e05a8afed7 Mon Sep 17 00:00:00 2001 From: Tom Talpey Date: Fri, 10 Oct 2008 11:32:45 -0400 Subject: RPC/RDMA: ensure connection attempt is complete before signalling. The RPC/RDMA connection logic could return early from reconnection attempts, leading to additional spurious retries. Signed-off-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 170e69cba6c4..a5fef5e6c323 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -804,9 +804,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rdma_cm_id *id; int rc = 0; int retry_count = 0; - int reconnect = (ep->rep_connected != 0); - if (reconnect) { + if (ep->rep_connected != 0) { struct rpcrdma_xprt *xprt; retry: rc = rpcrdma_ep_disconnect(ep, ia); @@ -871,9 +870,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { goto out; } - if (reconnect) - return 0; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); /* -- cgit v1.2.3 From 921615f111108258820226a3258a047d9bf1d96a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Oct 2008 19:23:07 -0400 Subject: NFS: Changes to inode->i_nlinks must set the NFS_INO_INVALID_ATTR flag Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6554281e24a2..de3f11e6234e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1141,6 +1141,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; inode->i_uid = fattr->uid; -- cgit v1.2.3 From 4704f0e274829e3af00737d2d9adace2d71a9605 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Oct 2008 19:16:07 -0400 Subject: NFS: Fix the resolution problem with nfs_inode_attrs_need_update() It appears that 'jiffies' timestamps do not have high enough resolution for nfs_inode_attrs_need_update(). One problem is that a GETATTR can be launched within < 1 jiffy of the last operation that updated the attribute. Another problem is that RPC calls can take < 1 jiffy to execute. We can fix this by switching the variables to use a simple global counter that gets incremented every time we start another GETATTR call. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 14 ++++++++++---- fs/nfs/inode.c | 37 ++++++++++++++++++++++++++++++------- include/linux/nfs_fs.h | 10 +++------- include/linux/nfs_xdr.h | 1 + 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 49d565412827..4807074ada8c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -156,6 +156,7 @@ typedef struct { decode_dirent_t decode; int plus; unsigned long timestamp; + unsigned long gencount; int timestamp_valid; } nfs_readdir_descriptor_t; @@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) struct file *file = desc->file; struct inode *inode = file->f_path.dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); - unsigned long timestamp; + unsigned long timestamp, gencount; int error; dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", @@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) again: timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { @@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) goto error; } desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; SetPageUptodate(page); /* Ensure consistent page alignment of the data. @@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc) if (IS_ERR(p)) return PTR_ERR(p); desc->ptr = p; - if (desc->timestamp_valid) + if (desc->timestamp_valid) { desc->entry->fattr->time_start = desc->timestamp; - else + desc->entry->fattr->gencount = desc->gencount; + } else desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; return 0; } @@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, struct rpc_cred *cred = nfs_file_cred(file); struct page *page = NULL; int status; - unsigned long timestamp; + unsigned long timestamp, gencount; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie, page, NFS_SERVER(inode)->dtsize, @@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ if (status >= 0) { desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; if ((status = dir_decode(desc)) == 0) desc->entry->prev_cookie = *desc->dir_cookie; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index de3f11e6234e..116a3bd2bc9b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -305,7 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) init_special_inode(inode, inode->i_mode, fattr->rdev); nfsi->read_cache_jiffies = fattr->time_start; - nfsi->last_updated = now; + nfsi->attr_gencount = fattr->gencount; nfsi->cache_change_attribute = now; inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; @@ -909,6 +909,30 @@ static int nfs_size_need_update(const struct inode *inode, const struct nfs_fatt return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); } +static unsigned long nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + smp_rmb(); + return nfs_attr_generation_counter; +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + unsigned long ret; + smp_rmb(); + ret = ++nfs_attr_generation_counter; + smp_wmb(); + return ret; +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); +} + /** * nfs_inode_attrs_need_update - check if the inode attributes need updating * @inode - pointer to inode @@ -922,8 +946,7 @@ static int nfs_size_need_update(const struct inode *inode, const struct nfs_fatt * catch the case where ctime either didn't change, or went backwards * (if someone reset the clock on the server) by looking at whether * or not this RPC call was started after the inode was last updated. - * Note also the check for jiffy wraparound if the last_updated timestamp - * is later than 'jiffies'. + * Note also the check for wraparound of 'attr_gencount' * * The function returns 'true' if it thinks the attributes in 'fattr' are * more recent than the ones cached in the inode. @@ -933,10 +956,10 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n { const struct nfs_inode *nfsi = NFS_I(inode); - return time_after(fattr->time_start, nfsi->last_updated) || + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || nfs_ctime_need_update(inode, fattr) || nfs_size_need_update(inode, fattr) || - time_after(nfsi->last_updated, jiffies); + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) @@ -1107,7 +1130,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } else if (nfsi->change_attr != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); @@ -1163,7 +1186,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - nfsi->last_updated = now; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index ca563ee13e32..ac8d0233b05c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -137,7 +137,7 @@ struct nfs_inode { unsigned long attrtimeo_timestamp; __u64 change_attr; /* v4 only */ - unsigned long last_updated; + unsigned long attr_gencount; /* "Generation counter" for the attribute cache. This is * bumped whenever we update the metadata on the * server. @@ -344,15 +344,11 @@ extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ct extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode); extern u64 nfs_compat_user_ino64(u64 fileid); +extern void nfs_fattr_init(struct nfs_fattr *fattr); /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ extern __be32 root_nfs_parse_addr(char *name); /*__init*/ - -static inline void nfs_fattr_init(struct nfs_fattr *fattr) -{ - fattr->valid = 0; - fattr->time_start = jiffies; -} +extern unsigned long nfs_inc_attr_generation_counter(void); /* * linux/fs/nfs/file.c diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 6ee6ae3f095c..c1c31acb8a2b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -56,6 +56,7 @@ struct nfs_fattr { __u64 change_attr; /* NFSv4 change attribute */ __u64 pre_change_attr;/* pre-op NFSv4 change attribute */ unsigned long time_start; + unsigned long gencount; }; #define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ -- cgit v1.2.3 From 011935a0a710c20bb7ae63523b78856848db1926 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Oct 2008 19:24:50 -0400 Subject: NFS: Fix a resolution problem with nfs_inode->cache_change_attribute The cache_change_attribute is used to decide whether or not a directory has changed, in which case we may need to look it up again. Again, the use of 'jiffies' leads to an issue of resolution. Once again, the fix is to change nfs_inode->cache_change_attribute, and just make it a simple counter. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/inode.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4807074ada8c..2ab70d46ecbc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -661,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) */ void nfs_force_lookup_revalidate(struct inode *dir) { - NFS_I(dir)->cache_change_attribute = jiffies; + NFS_I(dir)->cache_change_attribute++; } /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 116a3bd2bc9b..b9195c02a863 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -306,7 +306,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; - nfsi->cache_change_attribute = now; inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; inode->i_ctime = fattr->ctime; -- cgit v1.2.3