From 235a09821c2bc71d9d07f12217ce2ac00db99eba Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 30 Mar 2016 17:18:34 +0800 Subject: ceph: multiple filesystem support To access non-default filesystem, we just need to subscribe to mdsmap. and add a new mount option for mds namespace id. Signed-off-by: Yan, Zheng [idryomov@gmail.com: switch to a new libceph API] Signed-off-by: Ilya Dryomov --- fs/ceph/super.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ceph/super.h') diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e705c4d612d7..db2200f5ba80 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -62,6 +62,7 @@ struct ceph_mount_options { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ + int mds_namespace; /* * everything above this point can be memcmp'd; everything below -- cgit v1.2.3 From 77310320c299b0dc050037ff8fc29fd1861fb005 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 8 Apr 2016 15:27:16 +0800 Subject: ceph: renew caps for read/write if mds session got killed. When mds session gets killed, read/write operation may hang. Client waits for Frw caps, but mds does not know what caps client wants. To recover this, client sends an open request to mds. The request will tell mds what caps client wants. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 43 ++++++++++++++++++++++++++++++++---------- fs/ceph/file.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/mds_client.c | 6 +++++- fs/ceph/super.h | 2 ++ 4 files changed, 93 insertions(+), 11 deletions(-) (limited to 'fs/ceph/super.h') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cfaeef18cbca..fab93c66d879 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2317,7 +2317,7 @@ again: /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); - if ((file_wanted & need) == 0) { + if ((file_wanted & need) != need) { dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; @@ -2412,12 +2412,26 @@ again: goto out_unlock; } - if (!__ceph_is_any_caps(ci) && - ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - *err = -EIO; - ret = 1; - goto out_unlock; + if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { + int mds_wanted; + if (ACCESS_ONCE(mdsc->fsc->mount_state) == + CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + *err = -EIO; + ret = 1; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci); + if ((mds_wanted & need) != need) { + dout("get_cap_refs %p caps were dropped" + " (session killed?)\n", inode); + *err = -ESTALE; + ret = 1; + goto out_unlock; + } + if ((mds_wanted & file_wanted) == + (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } dout("get_cap_refs %p have %s needed %s\n", inode, @@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (err == -EAGAIN) continue; if (err < 0) - return err; + ret = err; } else { ret = wait_event_interruptible(ci->i_cap_wq, try_get_cap_refs(ci, need, want, endoff, @@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, continue; if (err < 0) ret = err; - if (ret < 0) - return ret; + } + if (ret < 0) { + if (err == -ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(&ci->vfs_inode); + if (ret == 0) + continue; + } + return ret; } if (ci->i_inline_version != CEPH_INLINE_NONE && @@ -3226,6 +3247,8 @@ retry: if (target < 0) { __ceph_remove_cap(cap, false); + if (!ci->i_auth_cap) + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; goto out_unlock; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 30fd49eb25b4..996e9ec81e4e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -191,6 +191,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) return ret; } +/* + * try renew caps after session gets killed. + */ +int ceph_renew_caps(struct inode *inode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + int err, flags, wanted; + + spin_lock(&ci->i_ceph_lock); + wanted = __ceph_caps_file_wanted(ci); + if (__ceph_is_any_real_caps(ci) && + (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { + int issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + dout("renew caps %p want %s issued %s updating mds_wanted\n", + inode, ceph_cap_string(wanted), ceph_cap_string(issued)); + ceph_check_caps(ci, 0, NULL); + return 0; + } + spin_unlock(&ci->i_ceph_lock); + + flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; +#ifdef O_LAZY + if (wanted & CEPH_CAP_FILE_LAZYIO) + flags |= O_LAZY; +#endif + + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_fmode = -1; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); +out: + dout("renew caps %p open result=%d\n", inode, err); + return err < 0 ? err : 0; +} + /* * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index cff85af425d4..1e5965d17cb1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1133,6 +1133,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; + while (true) { struct rb_node *n = rb_first(&ci->i_cap_flush_tree); if (!n) @@ -1181,7 +1183,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_del(&cf->list); ceph_free_cap_flush(cf); } - while (drop--) + + wake_up_all(&ci->i_cap_wq); + if (drop) iput(inode); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index db2200f5ba80..5fef3a6397db 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -470,6 +470,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -932,6 +933,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; +extern int ceph_renew_caps(struct inode *inode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, -- cgit v1.2.3 From 3f38495409b613071021fca86629df7ae81820ad Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 21 Apr 2016 11:09:55 +0800 Subject: ceph: report mount root in session metadata Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 4 +++- fs/ceph/super.c | 33 +++++++++++++++++++-------------- fs/ceph/super.h | 1 + 3 files changed, 23 insertions(+), 15 deletions(-) (limited to 'fs/ceph/super.h') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index cbe6c0afdadc..047f723bdbe6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -839,12 +839,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 int metadata_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; + struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; void *p; const char* metadata[][2] = { {"hostname", utsname()->nodename}, {"kernel_version", utsname()->release}, - {"entity_id", opt->name ? opt->name : ""}, + {"entity_id", opt->name ? : ""}, + {"root", fsopt->server_path ? : "/"}, {NULL, NULL} }; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d714ab20ad24..91e02481ce06 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -302,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); kfree(args->snapdir_name); + kfree(args->server_path); kfree(args); } @@ -333,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } static int parse_mount_options(struct ceph_mount_options **pfsopt, struct ceph_options **popt, int flags, char *options, - const char *dev_name, - const char **path) + const char *dev_name) { struct ceph_mount_options *fsopt; const char *dev_name_end; @@ -386,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, */ dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - /* skip over leading '/' for path */ - *path = dev_name_end + 1; + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) { + err = -ENOMEM; + goto out; + } } else { - /* path is empty */ dev_name_end = dev_name + strlen(dev_name); - *path = dev_name_end; } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ @@ -401,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, goto out; } dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - dout("server path '%s'\n", *path); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); @@ -793,8 +799,7 @@ out: /* * mount: join the ceph cluster, and open root directory. */ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, - const char *path) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -823,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto fail; } - if (path[0] == 0) { + if (!fsc->mount_options->server_path) { root = fsc->sb->s_root; dget(root); } else { - dout("mount opening base mountpoint\n"); + const char *path = fsc->mount_options->server_path + 1; + dout("mount opening path %s\n", path); root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -943,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - const char *path = NULL; struct ceph_mount_options *fsopt = NULL; struct ceph_options *opt = NULL; @@ -952,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, #ifdef CONFIG_CEPH_FS_POSIX_ACL flags |= MS_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { res = ERR_PTR(err); goto out_final; @@ -995,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } } - res = ceph_real_mount(fsc, path); + res = ceph_real_mount(fsc); if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5fef3a6397db..0ea86406f463 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -70,6 +70,7 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *server_path; /* default "/" */ }; struct ceph_fs_client { -- cgit v1.2.3 From 8974eebd38737c9534d81c4131c5fdb1fe24d3e9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Apr 2016 15:17:40 +0800 Subject: ceph: record 'offset' for each entry of readdir result This is preparation for using hash value as dentry 'offset' Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 83 ++++++++++++++++++++++++++++++++++------------------ fs/ceph/inode.c | 1 + fs/ceph/mds_client.c | 2 ++ fs/ceph/mds_client.h | 1 + fs/ceph/super.h | 1 - 5 files changed, 59 insertions(+), 29 deletions(-) (limited to 'fs/ceph/super.h') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ebcbd1c946b4..6ae635605be5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -277,12 +277,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned frag = fpos_frag(ctx->pos); - int off = fpos_off(ctx->pos); + int i; int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); + dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); if (fi->flags & CEPH_F_ATEND) return 0; @@ -294,7 +294,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 1; - off = 1; } if (ctx->pos == 1) { ino_t ino = parent_ino(file->f_path.dentry); @@ -304,7 +303,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 2; - off = 2; } /* can we use the dcache? */ @@ -320,7 +318,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) if (err != -EAGAIN) return err; frag = fpos_frag(ctx->pos); - off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -386,12 +383,12 @@ more: rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - off = req->r_readdir_offset; - fi->next_offset = off; + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, fi->next_offset); } fi->frag = frag; - fi->offset = fi->next_offset; fi->last_readdir = req; if (req->r_did_prepopulate) { @@ -399,7 +396,8 @@ more: if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ fi->dir_ordered_count = 0; - } else if (ceph_frag_is_leftmost(frag) && off == 2) { + } else if (ceph_frag_is_leftmost(frag) && + fi->next_offset == 2) { /* note dir version at start of readdir so * we can tell if any dentries get dropped */ fi->dir_release_count = req->r_dir_release_cnt; @@ -421,37 +419,54 @@ more: struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + (rinfo->dir_nr-1); err = note_last_dentry(fi, rde->name, rde->name_len, - fi->next_offset + rinfo->dir_nr); + fpos_off(rde->offset) + 1); if (err) return err; } } rinfo = &fi->last_readdir->r_reply_info; - dout("readdir frag %x num %d off %d chunkoff %d\n", frag, - rinfo->dir_nr, off, fi->offset); + dout("readdir frag %x num %d pos %llx chunk first %llx\n", + frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); - ctx->pos = ceph_make_fpos(frag, off); - while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_dir_entry *rde = - rinfo->dir_entries + (off - fi->offset); + i = 0; + /* search start position */ + if (rinfo->dir_nr > 0) { + int step, nr = rinfo->dir_nr; + while (nr > 0) { + step = nr >> 1; + if (rinfo->dir_entries[i + step].offset < ctx->pos) { + i += step + 1; + nr -= step + 1; + } else { + nr = step; + } + } + } + for (; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; - dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, ctx->pos, + BUG_ON(rde->offset < ctx->pos); + + ctx->pos = rde->offset; + dout("readdir (%d/%d) -> %llx '%.*s' %p\n", + i, rinfo->dir_nr, ctx->pos, rde->name_len, rde->name, &rde->inode.in); + BUG_ON(!rde->inode.in); ftype = le32_to_cpu(rde->inode.in->mode) >> 12; vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); ino = ceph_vino_to_ino(vino); + if (!dir_emit(ctx, rde->name, rde->name_len, ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } - off++; ctx->pos++; } @@ -464,8 +479,7 @@ more: /* more frags? */ if (!ceph_frag_is_rightmost(frag)) { frag = ceph_frag_next(frag); - off = 2; - ctx->pos = ceph_make_fpos(frag, off); + ctx->pos = ceph_make_fpos(frag, 2); dout("readdir next frag is %x\n", frag); goto more; } @@ -497,7 +511,7 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi, unsigned frag) +static void reset_readdir(struct ceph_file_info *fi) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -511,6 +525,23 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) fi->flags &= ~CEPH_F_ATEND; } +/* + * discard buffered readdir content on seekdir(0), or seek to new frag, + * or seek prior to current chunk + */ +static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) +{ + struct ceph_mds_reply_info_parsed *rinfo; + if (new_pos == 0) + return true; + if (fpos_frag(new_pos) != fi->frag) + return true; + rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; + if (!rinfo || !rinfo->dir_nr) + return true; + return new_pos < rinfo->dir_entries[0].offset;; +} + static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; @@ -539,13 +570,9 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } retval = offset; - if (offset == 0 || - fpos_frag(offset) != fi->frag || - fpos_off(offset) < fi->offset) { - /* discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk */ + if (need_reset_readdir(fi, offset)) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi, fpos_frag(offset)); + reset_readdir(fi); } else if (fpos_cmp(offset, old_offset) > 0) { /* reset dir_release_count if we did a forward seek */ fi->dir_release_count = 0; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 40d081d7028e..b53c95903aeb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1523,6 +1523,7 @@ retry_lookup: di = dn->d_fsdata; di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + rde->offset = di->offset; update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1c2befcd24fb..48def22fc7b9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -214,6 +214,8 @@ static int parse_reply_info_dir(void **p, void *end, err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; + /* ceph_readdir_prepopulate() will update it */ + rde->offset = 0; i++; num--; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 2a865812a41b..4ce19d852657 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -52,6 +52,7 @@ struct ceph_mds_reply_dir_entry { u32 name_len; struct ceph_mds_reply_lease *lease; struct ceph_mds_reply_info_in inode; + loff_t offset; }; /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0ea86406f463..0628099ba1f2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -635,7 +635,6 @@ struct ceph_file_info { struct ceph_mds_request *last_readdir; /* readdir: position within a frag */ - unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ long long dir_release_count; -- cgit v1.2.3 From f3c4ebe65ea149ec892f94474233cfebe9cbe299 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 29 Apr 2016 11:27:30 +0800 Subject: ceph: using hash value to compose dentry offset If MDS sorts dentries in dirfrag in hash order, we use hash value to compose dentry offset. dentry offset is: (0xff << 52) | ((24 bits hash) << 28) | (the nth entry hash hash collision) This offset is stable across directory fragmentation. This alos means there is no need to reset readdir offset if directory get fragmented in the middle of readdir. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 140 ++++++++++++++++++++++++++++++++----------- fs/ceph/inode.c | 31 ++++++++-- fs/ceph/mds_client.c | 1 + fs/ceph/mds_client.h | 4 +- fs/ceph/super.h | 6 +- include/linux/ceph/ceph_fs.h | 1 + 6 files changed, 136 insertions(+), 47 deletions(-) (limited to 'fs/ceph/super.h') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e954ea2fb710..4850c3624a87 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -69,16 +69,42 @@ out_unlock: } /* - * for readdir, we encode the directory frag and offset within that - * frag into f_pos. + * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) +{ + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return fpos; +} + +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} + static unsigned fpos_frag(loff_t p) { - return p >> 32; + return p >> OFFSET_BITS; } + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + static unsigned fpos_off(loff_t p) { - return p & 0xffffffff; + return p & OFFSET_MASK; } static int fpos_cmp(loff_t l, loff_t r) @@ -177,7 +203,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, u64 idx = 0; int err = 0; - dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); + dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); /* search start position */ if (ctx->pos > 2) { @@ -234,7 +260,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, spin_unlock(&dentry->d_lock); if (emit_dentry) { - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dout(" %llx dentry %p %pd %p\n", di->offset, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, @@ -269,6 +295,16 @@ out: return err; } +static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +{ + if (!fi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + else + return fi->frag != fpos_frag(pos); +} + static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -276,7 +312,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(ctx->pos); int i; int err; u32 ftype; @@ -317,7 +352,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; - frag = fpos_frag(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -325,8 +359,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (fi->frag != frag || fi->last_readdir == NULL) { + if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; + unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -336,6 +371,13 @@ more: fi->last_readdir = NULL; } + if (is_hash_order(ctx->pos)) { + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -373,19 +415,23 @@ more: ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d" - " on frag %x, end=%d, complete=%d\n", err, frag, + dout("readdir got and parsed readdir result=%d on " + "frag %x, end=%d, complete=%d, hash_order=%d\n", + err, frag, (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete); - + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); - /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - fi->next_offset = req->r_readdir_offset; - /* adjust ctx->pos to beginning of frag */ - ctx->pos = ceph_make_fpos(frag, fi->next_offset); + if (!rinfo->hash_order) { + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + fi->next_offset, + false); + } } fi->frag = frag; @@ -411,23 +457,25 @@ more: fi->dir_release_count = 0; } - if (req->r_reply_info.dir_end) { - kfree(fi->last_name); - fi->last_name = NULL; - fi->next_offset = 2; - } else { + /* note next offset and last dentry name */ + if (rinfo->dir_nr > 0) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? + 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(fi, rde->name, rde->name_len, - fpos_off(rde->offset) + 1); + next_offset); if (err) return err; + } else if (req->r_reply_info.dir_end) { + fi->next_offset = 2; + /* keep last name */ } } rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d pos %llx chunk first %llx\n", - frag, rinfo->dir_nr, ctx->pos, + fi->frag, rinfo->dir_nr, ctx->pos, rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); i = 0; @@ -470,16 +518,26 @@ more: ctx->pos++; } - if (fi->last_name) { + if (fi->next_offset > 2) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; goto more; } /* more frags? */ - if (!ceph_frag_is_rightmost(frag)) { - frag = ceph_frag_next(frag); - ctx->pos = ceph_make_fpos(frag, 2); + if (!ceph_frag_is_rightmost(fi->frag)) { + unsigned frag = ceph_frag_next(fi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + fi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); + kfree(fi->last_name); + fi->last_name = NULL; + } dout("readdir next frag is %x\n", frag); goto more; } @@ -532,14 +590,21 @@ static void reset_readdir(struct ceph_file_info *fi) static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) { struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; if (new_pos == 0) return true; - if (fpos_frag(new_pos) != fi->frag) + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sotred in hash order */ + } else if (fi->frag |= fpos_frag(new_pos)) { return true; + } rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; if (!rinfo || !rinfo->dir_nr) return true; - return new_pos < rinfo->dir_entries[0].offset;; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); } static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) @@ -562,17 +627,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { + if (need_reset_readdir(fi, offset)) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; + } + if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; fi->flags &= ~CEPH_F_ATEND; } retval = offset; - - if (need_reset_readdir(fi, offset)) { - dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); - } } out: inode_unlock(inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b53c95903aeb..f51b6fd5f570 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1387,6 +1387,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1394,19 +1395,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, skipped = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - struct ceph_dentry_info *di; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); + if (rinfo->hash_order && req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } + if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - req->r_readdir_offset = 2; + if (!rinfo->hash_order) + req->r_readdir_offset = 2; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { @@ -1424,13 +1433,13 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); req->r_readdir_cache_idx = 0; } cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { @@ -1444,6 +1453,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); + if (rinfo->hash_order) { + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + rde->name, rde->name_len); + hash = ceph_frag_value(hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } + retry_lookup: dn = d_lookup(parent, &dname); dout("d_lookup on parent=%p name=%.*s got %p\n", @@ -1521,9 +1542,7 @@ retry_lookup: dn = realdn; } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - rde->offset = di->offset; + ceph_dentry(dn)->offset = rde->offset; update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 48def22fc7b9..7ad31283d510 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -185,6 +185,7 @@ static int parse_reply_info_dir(void **p, void *end, u16 flags = ceph_decode_16(p); info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); } if (num == 0) goto done; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4ce19d852657..e7d38aac7109 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -81,7 +81,9 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - bool dir_complete, dir_end; + bool dir_complete; + bool dir_end; + bool hash_order; struct ceph_mds_reply_dir_entry *dir_entries; }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0628099ba1f2..c9b671dfff81 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -540,11 +540,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) return (struct ceph_dentry_info *)dentry->d_fsdata; } -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; -} - /* * caps helpers */ @@ -949,6 +944,7 @@ extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index a811c5e98bfa..dfce616002ad 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -357,6 +357,7 @@ extern const char *ceph_mds_op_name(int op); */ #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) union ceph_mds_request_args { struct { -- cgit v1.2.3 From 1b1bc16d66a7c7af3b4f30d1cf5a363168b217f4 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 May 2016 11:40:30 +0800 Subject: ceph: improve fragtree change detection check if number of splits in i_fragtree is equal to number of splits in mds reply Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 24 ++++++++++++++++++++---- fs/ceph/super.h | 1 + 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'fs/ceph/super.h') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 09713f32ea88..7ba8e1c2557b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -326,13 +326,15 @@ static int ceph_fill_fragtree(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; - int i; - u32 id, nsplits; + unsigned i, split_by, nsplits; + u32 id; bool update = false; mutex_lock(&ci->i_fragtree_mutex); nsplits = le32_to_cpu(fragtree->nsplits); - if (nsplits) { + if (nsplits != ci->i_fragtree_nsplits) { + update = true; + } else if (nsplits) { i = prandom_u32() % nsplits; id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) @@ -360,6 +362,13 @@ static int ceph_fill_fragtree(struct inode *inode, rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { id = le32_to_cpu(fragtree->splits[i].frag); + split_by = le32_to_cpu(fragtree->splits[i].by); + if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { + pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", ceph_vinop(inode), + i, nsplits, id, split_by); + continue; + } frag = NULL; while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); @@ -375,6 +384,8 @@ static int ceph_fill_fragtree(struct inode *inode, if (frag->split_by > 0 || !is_frag_child(frag->frag, prev_frag)) { rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; kfree(frag); } frag = NULL; @@ -384,7 +395,9 @@ static int ceph_fill_fragtree(struct inode *inode, if (IS_ERR(frag)) continue; } - frag->split_by = le32_to_cpu(fragtree->splits[i].by); + if (frag->split_by == 0) + ci->i_fragtree_nsplits++; + frag->split_by = split_by; dout(" frag %x split by %d\n", frag->frag, frag->split_by); prev_frag = frag; } @@ -395,6 +408,8 @@ static int ceph_fill_fragtree(struct inode *inode, if (frag->split_by > 0 || !is_frag_child(frag->frag, prev_frag)) { rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; kfree(frag); } } @@ -546,6 +561,7 @@ void ceph_destroy_inode(struct inode *inode) rb_erase(n, &ci->i_fragtree); kfree(frag); } + ci->i_fragtree_nsplits = 0; __ceph_destroy_xattrs(ci); if (ci->i_xattrs.blob) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c9b671dfff81..a268f18d2119 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -297,6 +297,7 @@ struct ceph_inode_info { u64 i_files, i_subdirs; struct rb_root i_fragtree; + int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; -- cgit v1.2.3