From 4d1bf79aff7962ab0654d66127ebb6eec17460ab Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Wed, 15 May 2013 10:38:12 -0600 Subject: ceph: fix up comment for ceph_count_locks() as to which lock to hold Signed-off-by: Jim Schutt Reviewed-by: Alex Elder --- fs/ceph/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ebbf680378e2..89788515a63d 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) } /** - * Must be called with BKL already held. Fills in the passed + * Must be called with lock_flocks() already held. Fills in the passed * counter variables, so you can prepare pagelist metadata before calling * ceph_encode_locks. */ -- cgit v1.2.3 From c213b50b7dcbf06abcfbf1e4eee5b76586718bd9 Mon Sep 17 00:00:00 2001 From: Emil Goode Date: Tue, 28 May 2013 16:59:00 +0200 Subject: ceph: improve error handling in ceph_mdsmap_decode This patch makes the following improvements to the error handling in the ceph_mdsmap_decode function: - Add a NULL check for return value from kcalloc - Make use of the variable err Signed-off-by: Emil Goode Signed-off-by: Sage Weil --- fs/ceph/mdsmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 9278dec9e940..d4d38977dcbb 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -138,6 +138,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_info[mds].export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS); + if (m->m_info[mds].export_targets == NULL) + goto badmem; for (j = 0; j < num_export_targets; j++) m->m_info[mds].export_targets[j] = ceph_decode_32(&pexport_targets); @@ -170,7 +172,7 @@ bad: DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); ceph_mdsmap_destroy(m); - return ERR_PTR(-EINVAL); + return ERR_PTR(err); } void ceph_mdsmap_destroy(struct ceph_mdsmap *m) -- cgit v1.2.3 From 6af8652849a15e407b458a271ef9154e472f6dd4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 29 May 2013 06:46:56 -0500 Subject: ceph: tidy ceph_mdsmap_decode() a little I introduced a new temporary variable "info" instead of "m->m_info[mds]". Also I reversed the if condition and pulled everything in one indent level. Signed-off-by: Dan Carpenter Reviewed-by: Alex Elder --- fs/ceph/mdsmap.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index d4d38977dcbb..132b64eeecd4 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; + struct ceph_mds_info *info; ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); global_id = ceph_decode_64(p); @@ -126,26 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds >= 0 && mds < m->m_max_mds && state > 0) { - m->m_info[mds].global_id = global_id; - m->m_info[mds].state = state; - m->m_info[mds].addr = addr; - m->m_info[mds].laggy = - (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); - m->m_info[mds].num_export_targets = num_export_targets; - if (num_export_targets) { - m->m_info[mds].export_targets = - kcalloc(num_export_targets, sizeof(u32), - GFP_NOFS); - if (m->m_info[mds].export_targets == NULL) - goto badmem; - for (j = 0; j < num_export_targets; j++) - m->m_info[mds].export_targets[j] = - ceph_decode_32(&pexport_targets); - } else { - m->m_info[mds].export_targets = NULL; - } + + if (mds < 0 || mds >= m->m_max_mds || state <= 0) + continue; + + info = &m->m_info[mds]; + info->global_id = global_id; + info->state = state; + info->addr = addr; + info->laggy = (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); + info->num_export_targets = num_export_targets; + if (num_export_targets) { + info->export_targets = kcalloc(num_export_targets, + sizeof(u32), GFP_NOFS); + if (info->export_targets == NULL) + goto badmem; + for (j = 0; j < num_export_targets; j++) + info->export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + info->export_targets = NULL; } } -- cgit v1.2.3 From bb137f84d1d8f692233b590f7cae14abbdc1e0c1 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 3 Jun 2013 18:22:17 +0800 Subject: ceph: fix cap release race ceph_encode_inode_release() can race with ceph_open() and release caps wanted by open files. So it should call __ceph_caps_wanted() to get the wanted caps. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index da0f9b8a3bcb..54c290b083ab 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3042,21 +3042,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, (cap->issued & unless) == 0)) { if ((cap->issued & drop) && (cap->issued & unless) == 0) { - dout("encode_inode_release %p cap %p %s -> " - "%s\n", inode, cap, + int wanted = __ceph_caps_wanted(ci); + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) + wanted |= cap->mds_wanted; + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop)); + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + cap->issued &= ~drop; cap->implemented &= ~drop; - if (ci->i_ceph_flags & CEPH_I_NODELAY) { - int wanted = __ceph_caps_wanted(ci); - dout(" wanted %s -> %s (act %s)\n", - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(cap->mds_wanted & - ~wanted), - ceph_cap_string(wanted)); - cap->mds_wanted &= wanted; - } + cap->mds_wanted = wanted; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, -- cgit v1.2.3 From 3803da4963db01da6a983ab589ebe2e6ccb97ba9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:26:44 +0800 Subject: ceph: reset iov_len when discarding cap release messages Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ceph') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4d2920304be8..ddbd5907d41b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, num = le32_to_cpu(head->num); dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); session->s_num_cap_releases += num; /* requeue completed messages */ -- cgit v1.2.3 From fc2744aa12da7182509b1059aa3ab53754d0c83a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:48:29 +0800 Subject: ceph: fix race between page writeback and truncate The client can receive truncate request from MDS at any time. So the page writeback code need to get i_size, truncate_seq and truncate_size atomically Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/addr.c | 84 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 44 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e68ac101040..3500b74c32ed 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -438,13 +438,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_inode_info *ci; struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; - loff_t page_off = page_offset(page); - int len = PAGE_CACHE_SIZE; - loff_t i_size; - int err = 0; struct ceph_snap_context *snapc, *oldest; - u64 snap_size = 0; + loff_t page_off = page_offset(page); long writeback_stat; + u64 truncate_size, snap_size = 0; + u32 truncate_seq; + int err = 0, len = PAGE_CACHE_SIZE; dout("writepage %p idx %lu\n", page, page->index); @@ -474,13 +473,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + /* is this a partial page at end of file? */ - if (snap_size) - i_size = snap_size; - else - i_size = i_size_read(inode); - if (i_size < page_off + len) - len = i_size - page_off; + if (page_off >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); + goto out; + } + if (snap_size < page_off + len) + len = snap_size - page_off; dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", inode, page, page->index, page_off, len, snapc); @@ -494,7 +500,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, - ci->i_truncate_seq, ci->i_truncate_size, + truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { dout("writepage setting page/mapping error %d %p\n", err, page); @@ -631,25 +637,6 @@ static void writepages_finish(struct ceph_osd_request *req, ceph_osdc_put_request(req); } -static struct ceph_osd_request * -ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, - struct ceph_snap_context *snapc, int num_ops) -{ - struct ceph_fs_client *fsc; - struct ceph_inode_info *ci; - struct ceph_vino vino; - - fsc = ceph_inode_to_client(inode); - ci = ceph_inode(inode); - vino = ceph_vino(inode); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - - return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, offset, len, num_ops, CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK, - snapc, ci->i_truncate_seq, ci->i_truncate_size, true); -} - /* * initiate async writeback */ @@ -658,7 +645,8 @@ static int ceph_writepages_start(struct address_space *mapping, { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; @@ -670,7 +658,8 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync; - u64 snap_size; + u64 truncate_size, snap_size; + u32 truncate_seq; /* * Include a 'sync' in the OSD request if this is a data @@ -685,7 +674,6 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - fsc = ceph_inode_to_client(inode); if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { pr_warning("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ @@ -728,6 +716,14 @@ retry: snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); + + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + if (last_snapc && snapc != last_snapc) { /* if we switched to a newer snapc, restart our scan at the * start of the original file range. */ @@ -739,7 +735,6 @@ retry: while (!done && index <= end) { int num_ops = do_sync ? 2 : 1; - struct ceph_vino vino; unsigned i; int first; pgoff_t next; @@ -833,17 +828,18 @@ get_more_pages: * that it will use. */ if (locked_pages == 0) { - size_t size; - BUG_ON(pages); - /* prepare async write request */ offset = (u64)page_offset(page); len = wsize; - req = ceph_writepages_osd_request(inode, - offset, &len, snapc, - num_ops); - + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, num_ops, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, truncate_seq, + truncate_size, true); if (IS_ERR(req)) { rc = PTR_ERR(req); unlock_page(page); @@ -854,8 +850,8 @@ get_more_pages: req->r_inode = inode; max_pages = calc_pages_for(0, (u64)len); - size = max_pages * sizeof (*pages); - pages = kmalloc(size, GFP_NOFS); + pages = kmalloc(max_pages * sizeof (*pages), + GFP_NOFS); if (!pages) { pool = fsc->wb_pagevec_pool; pages = mempool_alloc(pool, GFP_NOFS); -- cgit v1.2.3 From b8c2f3ae2d9f2b975a0e1a9c5652829ef8a4f06c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:37:11 +0800 Subject: ceph: check migrate seq before changing auth cap We may receive old request reply from the exporter MDS after receiving the importer MDS' cap import message. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 54c290b083ab..790f88b15daf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -612,9 +612,11 @@ retry: __cap_delay_requeue(mdsc, ci); } - if (flags & CEPH_CAP_FLAG_AUTH) - ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) { + if (flags & CEPH_CAP_FLAG_AUTH) { + if (ci->i_auth_cap == NULL || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) + ci->i_auth_cap = cap; + } else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) { -- cgit v1.2.3 From 667ca05cd9f02f0a345446abc362484c019d4d71 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:25:36 +0800 Subject: ceph: clear migrate seq when MDS restarts Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/ceph') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ddbd5907d41b..6272c7884e66 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2455,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ + cap->mseq = 0; /* and migrate_seq */ if (recon_state->flock) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); -- cgit v1.2.3 From 005c46970e3a2a4b95da220eab43b87307646335 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:40:24 +0800 Subject: ceph: move inode to proper flushing list when auth MDS changes Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 790f88b15daf..9a5ccc9e0d62 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1982,8 +1982,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, __ceph_caps_used(ci), __ceph_caps_wanted(ci), -- cgit v1.2.3 From a1dc1937337a93e699eaa56968b7de6e1a9e77cf Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 19 Jun 2013 14:58:10 +0800 Subject: ceph: fix sleeping function called from invalid context. [ 1121.231883] BUG: sleeping function called from invalid context at kernel/rwsem.c:20 [ 1121.231935] in_atomic(): 1, irqs_disabled(): 0, pid: 9831, name: mv [ 1121.231971] 1 lock held by mv/9831: [ 1121.231973] #0: (&(&ci->i_ceph_lock)->rlock){+.+...},at:[] ceph_getxattr+0x58/0x1d0 [ceph] [ 1121.231998] CPU: 3 PID: 9831 Comm: mv Not tainted 3.10.0-rc6+ #215 [ 1121.232000] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015 11/09/2011 [ 1121.232027] ffff88006d355a80 ffff880092f69ce0 ffffffff8168348c ffff880092f69cf8 [ 1121.232045] ffffffff81070435 ffff88006d355a20 ffff880092f69d20 ffffffff816899ba [ 1121.232052] 0000000300000004 ffff8800b76911d0 ffff88006d355a20 ffff880092f69d68 [ 1121.232056] Call Trace: [ 1121.232062] [] dump_stack+0x19/0x1b [ 1121.232067] [] __might_sleep+0xe5/0x110 [ 1121.232071] [] down_read+0x2a/0x98 [ 1121.232080] [] ceph_vxattrcb_layout+0x60/0xf0 [ceph] [ 1121.232088] [] ceph_getxattr+0x9f/0x1d0 [ceph] [ 1121.232093] [] vfs_getxattr+0xa8/0xd0 [ 1121.232097] [] getxattr+0xab/0x1c0 [ 1121.232100] [] ? final_putname+0x22/0x50 [ 1121.232104] [] ? kmem_cache_free+0xb0/0x260 [ 1121.232107] [] ? final_putname+0x22/0x50 [ 1121.232110] [] ? trace_hardirqs_on+0xd/0x10 [ 1121.232114] [] ? sysret_check+0x1b/0x56 [ 1121.232120] [] SyS_fgetxattr+0x6c/0xc0 [ 1121.232125] [] system_call_fastpath+0x16/0x1b [ 1121.232129] BUG: scheduling while atomic: mv/9831/0x10000002 [ 1121.232154] 1 lock held by mv/9831: [ 1121.232156] #0: (&(&ci->i_ceph_lock)->rlock){+.+...}, at: [] ceph_getxattr+0x58/0x1d0 [ceph] I think move the ci->i_ceph_lock down is safe because we can't free ceph_inode_info at there. CC: stable@vger.kernel.org # 3.8+ Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil --- fs/ceph/xattr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9b6b2b6dd164..be661d8f532a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - spin_lock(&ci->i_ceph_lock); - dout("getxattr %p ver=%lld index_ver=%lld\n", inode, - ci->i_xattrs.version, ci->i_xattrs.index_version); /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { err = vxattr->getxattr_cb(ci, value, size); - goto out; + return err; } + spin_lock(&ci->i_ceph_lock); + dout("getxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto get_xattr; -- cgit v1.2.3 From c62988ec0910a2d480fecb2f0140a36fcdc7b691 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 19 Jun 2013 15:12:06 +0800 Subject: ceph: avoid meaningless calling ceph_caps_revoking if sync_mode == WB_SYNC_ALL. Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil --- fs/ceph/addr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3500b74c32ed..afb2fc241061 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -666,8 +666,8 @@ static int ceph_writepages_start(struct address_space *mapping, * integrity write (e.g., O_SYNC write or fsync()), or if our * cap is being revoked. */ - do_sync = wbc->sync_mode == WB_SYNC_ALL; - if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + if ((wbc->sync_mode == WB_SYNC_ALL) || + ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) do_sync = 1; dout("writepages_start %p dosync=%d (mode=%s)\n", inode, do_sync, -- cgit v1.2.3 From 0405a1499df42a2b9fd4906096c6bb950e15e850 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Sun, 23 Jun 2013 09:48:38 -0700 Subject: ceph: remove sb_start/end_write in ceph_aio_write. Either in vfs_write or io_submit,it call file_start/end_write. The different between file_start/end_write and sb_start/end_write is file_ only handle regular file.But i think in ceph_aio_write,it only for regular file. Signed-off-by: Jianpeng Ma Acked-by: Yan, Zheng --- fs/ceph/file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 656e16907430..7c69f4f0dee6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); hold_mutex = true; @@ -809,7 +808,6 @@ retry_snap: out: if (hold_mutex) mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return written ? written : err; -- cgit v1.2.3 From fb3101b6f0db9ae3f35dc8e6ec908d0af8cdf12e Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 25 Jun 2013 14:48:19 +0800 Subject: ceph: Free mdsc if alloc mdsc->mdsmap failed. Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6272c7884e66..3eb1b4470c85 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3042,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) + if (mdsc->mdsmap == NULL) { + kfree(mdsc); return -ENOMEM; + } init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); -- cgit v1.2.3 From 93faca6ef45822b0825bb181859b1a8911e9c4c1 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 26 Jun 2013 11:15:27 +0800 Subject: ceph: Reconstruct the func ceph_reserve_caps. Drop ignored return value. Fix allocation failure case to not leak. Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil --- fs/ceph/caps.c | 21 +++++++-------------- fs/ceph/super.h | 2 +- 2 files changed, 8 insertions(+), 15 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9a5ccc9e0d62..8ec27b130cc9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_mds_client *mdsc, +void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { int i; @@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, int have; int alloc = 0; LIST_HEAD(newcaps); - int ret = 0; dout("reserve caps ctx=%p need=%d\n", ctx, need); @@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) { - ret = -ENOMEM; - goto out_alloc_count; - } + if (!cap) + break; list_add(&cap->caps_item, &newcaps); alloc++; } - BUG_ON(have + alloc != need); + /* we didn't manage to reserve as much as we needed */ + if (have + alloc != need) + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); - return 0; - -out_alloc_count: - /* we didn't manage to reserve as much as we needed */ - pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have); - return ret; } int ceph_unreserve_caps(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7ccfdb4aea2e..dfbb729b3130 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); -extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, +extern void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); -- cgit v1.2.3 From 5446429630257f4723829409337a26c076907d5d Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 1 Jul 2013 18:33:39 -0400 Subject: ceph: avoid accessing invalid memory when mounting ceph with a dev name that starts with a slash, ceph would attempt to access the character before that slash. Since we don't actually own that byte of memory, we would trigger an invalid access: [ 43.499934] BUG: unable to handle kernel paging request at ffff880fa3a97fff [ 43.500984] IP: [] parse_mount_options+0x1a4/0x300 [ 43.501491] PGD 743b067 PUD 10283c4067 PMD 10282a6067 PTE 8000000fa3a97060 [ 43.502301] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 43.503006] Dumping ftrace buffer: [ 43.503596] (ftrace buffer empty) [ 43.504046] CPU: 0 PID: 10879 Comm: mount Tainted: G W 3.10.0-sasha #1129 [ 43.504851] task: ffff880fa625b000 ti: ffff880fa3412000 task.ti: ffff880fa3412000 [ 43.505608] RIP: 0010:[] [] parse_mount_options$ [ 43.506552] RSP: 0018:ffff880fa3413d08 EFLAGS: 00010286 [ 43.507133] RAX: ffff880fa3a98000 RBX: ffff880fa3a98000 RCX: 0000000000000000 [ 43.507893] RDX: ffff880fa3a98001 RSI: 000000000000002f RDI: ffff880fa3a98000 [ 43.508610] RBP: ffff880fa3413d58 R08: 0000000000001f99 R09: ffff880fa3fe64c0 [ 43.509426] R10: ffff880fa3413d98 R11: ffff880fa38710d8 R12: ffff880fa3413da0 [ 43.509792] R13: ffff880fa3a97fff R14: 0000000000000000 R15: ffff880fa3413d90 [ 43.509792] FS: 00007fa9c48757e0(0000) GS:ffff880fd2600000(0000) knlGS:000000000000$ [ 43.509792] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 43.509792] CR2: ffff880fa3a97fff CR3: 0000000fa3bb9000 CR4: 00000000000006b0 [ 43.509792] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 43.509792] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 43.509792] Stack: [ 43.509792] 0000e5180000000e ffffffff85ca1900 ffff880fa38710d8 ffff880fa3413d98 [ 43.509792] 0000000000000120 0000000000000000 ffff880fa3a98000 0000000000000000 [ 43.509792] ffffffff85cf32a0 0000000000000000 ffff880fa3413dc8 ffffffff818f3c72 [ 43.509792] Call Trace: [ 43.509792] [] ceph_mount+0xa2/0x390 [ 43.509792] [] ? pcpu_alloc+0x334/0x3c0 [ 43.509792] [] mount_fs+0x8d/0x1a0 [ 43.509792] [] ? __alloc_percpu+0x10/0x20 [ 43.509792] [] vfs_kern_mount+0x79/0x100 [ 43.509792] [] do_new_mount+0xcd/0x1c0 [ 43.509792] [] do_mount+0x15d/0x210 [ 43.509792] [] ? strndup_user+0x45/0x60 [ 43.509792] [] SyS_mount+0x9d/0xe0 [ 43.509792] [] tracesys+0xdd/0xe2 [ 43.509792] Code: 4c 8b 5d c0 74 0a 48 8d 50 01 49 89 14 24 eb 17 31 c0 48 83 c9 ff $ [ 43.509792] RIP [] parse_mount_options+0x1a4/0x300 [ 43.509792] RSP [ 43.509792] CR2: ffff880fa3a97fff [ 43.509792] ---[ end trace 22469cd81e93af51 ]--- Signed-off-by: Sasha Levin Reviewed-by: Sage Weil --- fs/ceph/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7d377c9a5e35..6627b26a800c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ - if (*dev_name_end != ':') { + if (dev_name_end < dev_name || *dev_name_end != ':') { pr_err("device name is missing path (no : separator in %s)\n", dev_name); goto out; -- cgit v1.2.3 From b415bf4f9fe25f39934f5c464125e4a2dffb6d08 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 2 Jul 2013 12:40:19 +0800 Subject: ceph: fix pending vmtruncate race The locking order for pending vmtruncate is wrong, it can lead to following race: write wmtruncate work ------------------------ ---------------------- lock i_mutex check i_truncate_pending check i_truncate_pending truncate_inode_pages() lock i_mutex (blocked) copy data to page cache unlock i_mutex truncate_inode_pages() The fix is take i_mutex before calling __ceph_do_pending_vmtruncate() Fixes: http://tracker.ceph.com/issues/5453 Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 6 +++++- fs/ceph/file.c | 2 +- fs/ceph/inode.c | 14 ++++++-------- fs/ceph/super.h | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8ec27b130cc9..16266f3e9a33 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2057,7 +2057,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); - __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR)); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7c69f4f0dee6..a44d5153179b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -822,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) int ret; mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode, false); + __ceph_do_pending_vmtruncate(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index be0f7e20d62e..4906ada4a97c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1465,7 +1465,9 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - __ceph_do_pending_vmtruncate(inode, true); + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + mutex_unlock(&inode->i_mutex); iput(inode); } @@ -1492,7 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode) * Make sure any pending truncation is applied before doing anything * that may depend on it. */ -void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock) +void __ceph_do_pending_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); u64 to; @@ -1525,11 +1527,7 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); - if (needlock) - mutex_lock(&inode->i_mutex); truncate_inode_pages(inode->i_mapping, to); - if (needlock) - mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); if (to == ci->i_truncate_size) { @@ -1588,7 +1586,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - __ceph_do_pending_vmtruncate(inode, false); + __ceph_do_pending_vmtruncate(inode); err = inode_change_ok(inode, attr); if (err != 0) @@ -1770,7 +1768,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode, false); + __ceph_do_pending_vmtruncate(inode); return err; out: spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index dfbb729b3130..cbded572345e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, extern int ceph_inode_holds_cap(struct inode *inode, int mask); extern int ceph_inode_set_size(struct inode *inode, loff_t size); -extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock); +extern void __ceph_do_pending_vmtruncate(struct inode *inode); extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_invalidate(struct inode *inode); -- cgit v1.2.3 From b1530f57042297f85330a140a6921b6f95fe74d3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 2 Jul 2013 12:40:20 +0800 Subject: ceph: fix cap revoke race If caps are been revoking by the auth MDS, don't consider them as issued even they are still issued by non-auth MDS. The non-auth MDS should also be revoking/exporting these caps, the client just hasn't received the cap revoke/export message. The race I encountered is: When caps are exporting to new MDS, the client receives cap import message and cap revoke message from the new MDS, then receives cap export message from the old MDS. When the client receives cap revoke message from the new MDS, the revoking caps are still issued by the old MDS, so the client does nothing. Later when the cap export message is received, the client removes the caps issued by the old MDS. (Another way to fix the race is calling ceph_check_caps() in handle_cap_export()) Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 16266f3e9a33..7045a8dfaad4 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -690,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (implemented) *implemented |= cap->implemented; } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } return have; } -- cgit v1.2.3 From 6ee6b95373dfa1d0a4c9bc76689ec10a60c1d6f2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 2 Jul 2013 12:40:21 +0800 Subject: ceph: fix race between cap issue and revoke If we receive new caps from the auth MDS and the non-auth MDS is revoking the newly issued caps, we should release the caps from the non-auth MDS. The scenario is filelock's state changes from SYNC to LOCK. Non-auth MDS revokes Fc cap, the client gets Fc cap from the auth MDS at the same time. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7045a8dfaad4..25442b40c25a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -806,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* * Return true if mask caps are currently being revoked by an MDS. */ -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) { - struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct rb_node *p; - int ret = 0; - spin_lock(&ci->i_ceph_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (__cap_is_valid(cap) && - (cap->implemented & ~cap->issued & mask)) { - ret = 1; - break; - } + if (cap != ocap && __cap_is_valid(cap) && + (cap->implemented & ~cap->issued & mask)) + return 1; } + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); @@ -2488,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } else { dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a -- cgit v1.2.3