From 4d7ace02ba5c6ef1f8eeb32a86fef7c528bd7f36 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 26 Nov 2019 07:24:21 -0500 Subject: ceph: fix mdsmap cluster available check based on laggy number In case max_mds > 1 in the MDS cluster, there is no standby MDS, and all the max_mds MDSs are in up:active state, if one of the up:active MDSs is dead, the m->m_num_laggy in kclient will be 1. Then the mount will fail without considering other healthy MDSs. There may be some MDSs still "in" the cluster but not in up:active state; we will ignore them. Only when all the up:active MDSs in the cluster are laggy will we treat the cluster as unavailable. When max_mds is decreased, the cluster will not stop the extra up:active MDSs immediately, so there will be some latency. During that window the number of up:active MDSs will be larger than max_mds, so the m_info memory would otherwise always be reallocated later. Instead, pick out the up:active MDSs as the m_num_mds and allocate the needed memory once. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/mdsmap.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 0067d767c9ae..3a66f4f926ce 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,8 +25,9 @@ struct ceph_mdsmap { u32 m_session_timeout; /* seconds */ u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; - u32 m_max_mds; /* size of m_addr, m_state arrays */ - int m_num_mds; + u32 m_max_mds; /* expected up:active mds number */ + int m_num_active_mds; /* actual up:active mds number */ + int m_num_mds; /* size of m_info array */ struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ -- cgit v1.2.3 From b38c9eb4757d5bac1eb8634a9516ef918fca2525 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 4 Dec 2019 06:57:39 -0500 Subject: ceph: add possible_max_rank and make the 
code more readable The m_num_mds here is actually the number of MDSs which are in up:active status, so it merely duplicates m_num_active_mds; remove it. Add possible_max_rank to the mdsmap struct; this will be the correct largest possible rank boundary. Remove the special case for one mds in __mdsmap_get_random_mds(), because the valid mds rank may not always be 0. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 2 +- fs/ceph/mds_client.c | 10 ++++----- fs/ceph/mdsmap.c | 49 ++++++++++++++++++--------------------------- include/linux/ceph/mdsmap.h | 10 ++++----- 4 files changed, 31 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index c281f32b54f7..fb7cabd98e7b 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -33,7 +33,7 @@ static int mdsmap_show(struct seq_file *s, void *p) seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); - for (i = 0; i < mdsmap->m_num_mds; i++) { + for (i = 0; i < mdsmap->possible_max_rank; i++) { struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 07ecdfc8438d..aba7a56d055d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -598,7 +598,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; - if (mds >= mdsc->mdsmap->m_num_mds) + if (mds >= mdsc->mdsmap->possible_max_rank) return ERR_PTR(-EINVAL); s = kzalloc(sizeof(*s), GFP_NOFS); @@ -1231,7 +1231,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *ts; int i, mds = session->s_mds; - if (mds >= mdsc->mdsmap->m_num_mds) + if (mds >= mdsc->mdsmap->possible_max_rank) 
return; mi = &mdsc->mdsmap->m_info[mds]; @@ -3785,7 +3785,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, dout("check_new_map new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); - for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { if (!mdsc->sessions[i]) continue; s = mdsc->sessions[i]; @@ -3799,7 +3799,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", ceph_session_state_name(s->s_state)); - if (i >= newmap->m_num_mds) { + if (i >= newmap->possible_max_rank) { /* force close session for stopped mds */ get_session(s); __unregister_session(mdsc, s); @@ -3856,7 +3856,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } } - for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index a77e0ecb9a6b..889627817e52 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -14,22 +14,15 @@ #include "super.h" #define CEPH_MDS_IS_READY(i, ignore_laggy) \ - (m->m_info[i].state > 0 && (ignore_laggy ? true : !m->m_info[i].laggy)) + (m->m_info[i].state > 0 && ignore_laggy ? 
true : !m->m_info[i].laggy) static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) { int n = 0; int i, j; - /* - * special case for one mds, no matter it is laggy or - * not we have no choice - */ - if (1 == m->m_num_mds && m->m_info[0].state > 0) - return 0; - /* count */ - for (i = 0; i < m->m_num_mds; i++) + for (i = 0; i < m->possible_max_rank; i++) if (CEPH_MDS_IS_READY(i, ignore_laggy)) n++; if (n == 0) @@ -37,7 +30,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) /* pick */ n = prandom_u32() % n; - for (j = 0, i = 0; i < m->m_num_mds; i++) { + for (j = 0, i = 0; i < m->possible_max_rank; i++) { if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; if (j > n) @@ -55,10 +48,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) int mds; mds = __mdsmap_get_random_mds(m, false); - if (mds == m->m_num_mds || mds == -1) + if (mds == m->possible_max_rank || mds == -1) mds = __mdsmap_get_random_mds(m, true); - return mds == m->m_num_mds ? -1 : mds; + return mds == m->possible_max_rank ? -1 : mds; } #define __decode_and_drop_type(p, end, type, bad) \ @@ -129,7 +122,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) int err; u8 mdsmap_v, mdsmap_cv; u16 mdsmap_ev; - u32 possible_max_rank; m = kzalloc(sizeof(*m), GFP_NOFS); if (!m) @@ -157,24 +149,23 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_max_mds = ceph_decode_32(p); /* - * pick out the active nodes as the m_num_mds, the m_num_mds - * maybe larger than m_max_mds when decreasing the max_mds in - * cluster side, in other case it should less than or equal - * to m_max_mds. + * pick out the active nodes as the m_num_active_mds, the + * m_num_active_mds maybe larger than m_max_mds when decreasing + * the max_mds in cluster side, in other case it should less + * than or equal to m_max_mds. 
*/ - m->m_num_mds = n = ceph_decode_32(p); - m->m_num_active_mds = m->m_num_mds; + m->m_num_active_mds = n = ceph_decode_32(p); /* - * the possible max rank, it maybe larger than the m->m_num_mds, + * the possible max rank, it maybe larger than the m_num_active_mds, * for example if the mds_max == 2 in the cluster, when the MDS(0) * was laggy and being replaced by a new MDS, we will temporarily * receive a new mds map with n_num_mds == 1 and the active MDS(1), - * and the mds rank >= m->m_num_mds. + * and the mds rank >= m_num_active_mds. */ - possible_max_rank = max((u32)m->m_num_mds, m->m_max_mds); + m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); - m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); + m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS); if (!m->m_info) goto nomem; @@ -248,7 +239,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_mds_state_name(state), laggy ? "(laggy)" : ""); - if (mds < 0 || mds >= possible_max_rank) { + if (mds < 0 || mds >= m->possible_max_rank) { pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); continue; } @@ -318,14 +309,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) for (i = 0; i < n; i++) { s32 mds = ceph_decode_32(p); - if (mds >= 0 && mds < m->m_num_mds) { + if (mds >= 0 && mds < m->possible_max_rank) { if (m->m_info[mds].laggy) num_laggy++; } } m->m_num_laggy = num_laggy; - if (n > m->m_num_mds) { + if (n > m->possible_max_rank) { void *new_m_info = krealloc(m->m_info, n * sizeof(*m->m_info), GFP_NOFS | __GFP_ZERO); @@ -333,7 +324,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) goto nomem; m->m_info = new_m_info; } - m->m_num_mds = n; + m->possible_max_rank = n; } /* inc */ @@ -404,7 +395,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->m_num_mds; i++) + for (i = 0; i < m->possible_max_rank; i++) kfree(m->m_info[i].export_targets); kfree(m->m_info); kfree(m->m_data_pg_pools); @@ 
-420,7 +411,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) return false; if (m->m_num_laggy == m->m_num_active_mds) return false; - for (i = 0; i < m->m_num_mds; i++) { + for (i = 0; i < m->possible_max_rank; i++) { if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) nr_active++; } diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 3a66f4f926ce..35d385296fbb 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -26,8 +26,8 @@ struct ceph_mdsmap { u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; u32 m_max_mds; /* expected up:active mds number */ - int m_num_active_mds; /* actual up:active mds number */ - int m_num_mds; /* size of m_info array */ + u32 m_num_active_mds; /* actual up:active mds number */ + u32 possible_max_rank; /* possible max rank index */ struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ @@ -43,7 +43,7 @@ struct ceph_mdsmap { static inline struct ceph_entity_addr * ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) { - if (w >= m->m_num_mds) + if (w >= m->possible_max_rank) return NULL; return &m->m_info[w].addr; } @@ -51,14 +51,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) { BUG_ON(w < 0); - if (w >= m->m_num_mds) + if (w >= m->possible_max_rank) return CEPH_MDS_STATE_DNE; return m->m_info[w].state; } static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) { - if (w >= 0 && w < m->m_num_mds) + if (w >= 0 && w < m->possible_max_rank) return m->m_info[w].laggy; return false; } -- cgit v1.2.3 From 78beb0ff2feceb1d7568333f93195e1a4d95a49a Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 8 Jan 2020 10:03:53 +0000 Subject: ceph: use copy-from2 op in copy_file_range Instead of using the copy-from operation, switch copy_file_range to the new copy-from2 operation, which allows sending the truncate_seq and truncate_size parameters. 
If an OSD does not support the copy-from2 operation it will return -EOPNOTSUPP. In that case, the kernel client will stop trying to do remote object copies for this fs client and will always use the generic VFS copy_file_range. Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 11 ++++++++++- fs/ceph/super.c | 1 + fs/ceph/super.h | 2 ++ include/linux/ceph/osd_client.h | 1 + include/linux/ceph/rados.h | 2 ++ net/ceph/osd_client.c | 18 ++++++++++++------ 6 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 11929d2bb594..c3b8e8e0bf17 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1974,6 +1974,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) return -EOPNOTSUPP; + if (!src_fsc->have_copy_from2) + return -EOPNOTSUPP; + /* * Striped file layouts require that we copy partial objects, but the * OSD copy-from operation only supports full-object copies. 
Limit @@ -2101,8 +2104,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, &dst_oid, &dst_oloc, CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0); + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, + dst_ci->i_truncate_seq, dst_ci->i_truncate_size, + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); if (err) { + if (err == -EOPNOTSUPP) { + src_fsc->have_copy_from2 = false; + pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); + } dout("ceph_osdc_copy_from returned %d\n", err); if (!ret) ret = err; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 112927dbd2f2..bfb8aead0555 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -718,6 +718,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->sb = NULL; fsc->mount_state = CEPH_MOUNT_MOUNTING; fsc->filp_gen = 1; + fsc->have_copy_from2 = true; atomic_long_set(&fsc->writeback_count, 0); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3bf1a01cd536..1e456a9011bb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -106,6 +106,8 @@ struct ceph_fs_client { unsigned long last_auto_reconnect; bool blacklisted; + bool have_copy_from2; + u32 filp_gen; loff_t max_file_size; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index eaffbdddf89a..5a62dbd3f4c2 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -534,6 +534,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc, struct ceph_object_id *dst_oid, struct ceph_object_locator *dst_oloc, u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, u8 copy_from_flags); /* watch/notify */ diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 3eb0e55665b4..59bdfd470100 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -256,6 +256,7 @@ extern const char *ceph_osd_state_name(int s); \ /* tiering */ \ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 
26), "copy-from") \ + f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \ f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \ @@ -446,6 +447,7 @@ enum { CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to * cloneid */ CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* send truncate_{seq,size} */ }; enum { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ba45b074a362..b68b376d8c2f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -402,7 +402,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_LIST_WATCHERS: ceph_osd_data_release(&op->list_watchers.response_data); break; - case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: ceph_osd_data_release(&op->copy_from.osd_data); break; default: @@ -697,7 +697,7 @@ static void get_num_data_items(struct ceph_osd_request *req, case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: *num_request_data_items += 1; break; @@ -1029,7 +1029,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: break; - case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid); dst->copy_from.src_version = cpu_to_le64(src->copy_from.src_version); @@ -1966,7 +1966,7 @@ static void setup_request_data(struct ceph_osd_request *req) ceph_osdc_msg_data_add(request_msg, &op->notify_ack.request_data); break; - case CEPH_OSD_OP_COPY_FROM: + case CEPH_OSD_OP_COPY_FROM2: ceph_osdc_msg_data_add(request_msg, &op->copy_from.osd_data); break; @@ -5315,6 +5315,7 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, struct ceph_object_locator *src_oloc, u32 
src_fadvise_flags, u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, u8 copy_from_flags) { struct ceph_osd_req_op *op; @@ -5325,7 +5326,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, if (IS_ERR(pages)) return PTR_ERR(pages); - op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags); + op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, + dst_fadvise_flags); op->copy_from.snapid = src_snapid; op->copy_from.src_version = src_version; op->copy_from.flags = copy_from_flags; @@ -5335,6 +5337,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, end = p + PAGE_SIZE; ceph_encode_string(&p, end, src_oid->name, src_oid->name_len); encode_oloc(&p, end, src_oloc); + ceph_encode_32(&p, truncate_seq); + ceph_encode_64(&p, truncate_size); op->indata_len = PAGE_SIZE - (end - p); ceph_osd_data_pages_init(&op->copy_from.osd_data, pages, @@ -5350,6 +5354,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc, struct ceph_object_id *dst_oid, struct ceph_object_locator *dst_oloc, u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, u8 copy_from_flags) { struct ceph_osd_request *req; @@ -5366,7 +5371,8 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc, ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, src_oloc, src_fadvise_flags, - dst_fadvise_flags, copy_from_flags); + dst_fadvise_flags, truncate_seq, + truncate_size, copy_from_flags); if (ret) goto out; -- cgit v1.2.3