From 491205a8b45e3d9b594e1e7a997284f2e82f22e9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 13 May 2013 20:35:37 -0500 Subject: rbd: Use min_t() to fix comparison of distinct pointer types warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/block/rbd.c: In function ‘zero_pages’: drivers/block/rbd.c:1102: warning: comparison of distinct pointer types lacks a cast Remove the hackish casts and use min_t() to fix this. Signed-off-by: Geert Uytterhoeven Reviewed-by: Alex Elder --- drivers/block/rbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index aff789d6fccd..b4f00e22743d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1153,8 +1153,8 @@ static void zero_pages(struct page **pages, u64 offset, u64 end) unsigned long flags; void *kaddr; - page_offset = (size_t)(offset & ~PAGE_MASK); - length = min(PAGE_SIZE - page_offset, (size_t)(end - offset)); + page_offset = offset & ~PAGE_MASK; + length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); local_irq_save(flags); kaddr = kmap_atomic(*page); memset(kaddr + page_offset, 0, length); -- cgit v1.2.3 From 4d1bf79aff7962ab0654d66127ebb6eec17460ab Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Wed, 15 May 2013 10:38:12 -0600 Subject: ceph: fix up comment for ceph_count_locks() as to which lock to hold Signed-off-by: Jim Schutt Reviewed-by: Alex Elder --- fs/ceph/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ebbf680378e2..89788515a63d 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) } /** - * Must be called with BKL already held. Fills in the passed + * Must be called with lock_flocks() already held. Fills in the passed * counter variables, so you can prepare pagelist metadata before calling * ceph_encode_locks. */ -- cgit v1.2.3 From 912c317d4600b81664ad8f3d3ba6c1f2ff4b49c2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 13 May 2013 20:35:38 -0500 Subject: rbd: drop original request earlier for existence check The reference to the original request dropped at the end of rbd_img_obj_exists_callback() corresponds to the reference taken in rbd_img_obj_exists_submit() to account for the stat request referring to it. Move the put of that reference up right after clearing that pointer to make its purpose more obvious. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b4f00e22743d..291802c52c65 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2535,6 +2535,7 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) */ orig_request = obj_request->obj_request; obj_request->obj_request = NULL; + rbd_obj_request_put(orig_request); rbd_assert(orig_request); rbd_assert(orig_request->img_request); @@ -2555,7 +2556,6 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) if (!rbd_dev->parent_overlap) { struct ceph_osd_client *osdc; - rbd_obj_request_put(orig_request); osdc = &rbd_dev->rbd_client->client->osdc; result = rbd_obj_request_submit(osdc, orig_request); if (!result) @@ -2585,7 +2585,6 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) out: if (orig_request->result) rbd_obj_request_complete(orig_request); - rbd_obj_request_put(orig_request); } static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) -- cgit v1.2.3 From c213b50b7dcbf06abcfbf1e4eee5b76586718bd9 Mon Sep 17 00:00:00 2001 From: Emil Goode Date: Tue, 28 May 2013 16:59:00 +0200 Subject: ceph: improve error handling in ceph_mdsmap_decode This patch makes the following improvements to the error handling in the ceph_mdsmap_decode function: - Add a NULL check for return value from kcalloc - Make use of the variable err Signed-off-by: Emil Goode Signed-off-by: Sage Weil --- fs/ceph/mdsmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 9278dec9e940..d4d38977dcbb 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -138,6 +138,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_info[mds].export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS); + if (m->m_info[mds].export_targets == NULL) + goto badmem; for (j = 0; j < num_export_targets; j++) m->m_info[mds].export_targets[j] = ceph_decode_32(&pexport_targets); @@ -170,7 +172,7 @@ bad: DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); ceph_mdsmap_destroy(m); - return ERR_PTR(-EINVAL); + return ERR_PTR(err); } void ceph_mdsmap_destroy(struct ceph_mdsmap *m) -- cgit v1.2.3 From 6af8652849a15e407b458a271ef9154e472f6dd4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 29 May 2013 06:46:56 -0500 Subject: ceph: tidy ceph_mdsmap_decode() a little I introduced a new temporary variable "info" instead of "m->m_info[mds]". Also I reversed the if condition and pulled everything in one indent level. Signed-off-by: Dan Carpenter Reviewed-by: Alex Elder --- fs/ceph/mdsmap.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index d4d38977dcbb..132b64eeecd4 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; + struct ceph_mds_info *info; ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); global_id = ceph_decode_64(p); @@ -126,26 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds >= 0 && mds < m->m_max_mds && state > 0) { - m->m_info[mds].global_id = global_id; - m->m_info[mds].state = state; - m->m_info[mds].addr = addr; - m->m_info[mds].laggy = - (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); - m->m_info[mds].num_export_targets = num_export_targets; - if (num_export_targets) { - m->m_info[mds].export_targets = - kcalloc(num_export_targets, sizeof(u32), - GFP_NOFS); - if (m->m_info[mds].export_targets == NULL) - goto badmem; - for (j = 0; j < num_export_targets; j++) - m->m_info[mds].export_targets[j] = - ceph_decode_32(&pexport_targets); - } else { - m->m_info[mds].export_targets = NULL; - } + + if (mds < 0 || mds >= m->m_max_mds || state <= 0) + continue; + + info = &m->m_info[mds]; + info->global_id = global_id; + info->state = state; + info->addr = addr; + info->laggy = (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); + info->num_export_targets = num_export_targets; + if (num_export_targets) { + info->export_targets = kcalloc(num_export_targets, + sizeof(u32), GFP_NOFS); + if (info->export_targets == NULL) + goto badmem; + for (j = 0; j < num_export_targets; j++) + info->export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + info->export_targets = NULL; } } -- cgit v1.2.3 From 96e4dac66f69d28af2b736e723364efbbdf9fdee Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 22 May 2013 20:54:25 -0500 Subject: libceph: add lingering request reference when registered When an osd request is set to linger, the osd client holds onto the request so it can be re-submitted following certain osd map changes. The osd client holds a reference to the request until it is unregistered. This is used by rbd for watch requests. Currently, the reference is taken when the request is marked with the linger flag. This means that if an error occurs after that time but before the the request completes successfully, that reference is leaked. There's really no reason to take the reference until the request is registered in the the osd client's list of lingering requests, and that only happens when the lingering (watch) request completes successfully. So take that reference only when it gets registered following succesful completion, and drop it (as before) when the request gets unregistered. This avoids the reference problem on error in rbd. Rearrange ceph_osdc_unregister_linger_request() to avoid using the request pointer after it may have been freed. And hold an extra reference in kick_requests() while handling a linger request that has not yet been registered, to ensure it doesn't go away. This resolves: http://tracker.ceph.com/issues/3859 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3a246a6cab47..e0abb83b520e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1174,6 +1174,7 @@ static void __register_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { dout("__register_linger_request %p\n", req); + ceph_osdc_get_request(req); list_add_tail(&req->r_linger_item, &osdc->req_linger); if (req->r_osd) list_add_tail(&req->r_linger_osd, @@ -1196,6 +1197,7 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } + ceph_osdc_put_request(req); } void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, @@ -1203,9 +1205,8 @@ void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, { mutex_lock(&osdc->request_mutex); if (req->r_linger) { - __unregister_linger_request(osdc, req); req->r_linger = 0; - ceph_osdc_put_request(req); + __unregister_linger_request(osdc, req); } mutex_unlock(&osdc->request_mutex); } @@ -1217,11 +1218,6 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, if (!req->r_linger) { dout("set_request_linger %p\n", req); req->r_linger = 1; - /* - * caller is now responsible for calling - * unregister_linger_request - */ - ceph_osdc_get_request(req); } } EXPORT_SYMBOL(ceph_osdc_set_request_linger); @@ -1633,8 +1629,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) dout("%p tid %llu restart on osd%d\n", req, req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); + ceph_osdc_get_request(req); __unregister_request(osdc, req); __register_linger_request(osdc, req); + ceph_osdc_put_request(req); continue; } -- cgit v1.2.3 From e215605417b87732c6debf65da6d953016a1e5bc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 22 May 2013 20:54:25 -0500 Subject: rbd: flush dcache after zeroing page data Neither zero_bio_chain() nor zero_pages() contains a call to flush caches after zeroing a portion of a page. This can cause problems on architectures that have caches that allow virtual address aliasing. This resolves: http://tracker.ceph.com/issues/4777 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 291802c52c65..cb728a01a19f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1126,6 +1126,7 @@ static void zero_bio_chain(struct bio *chain, int start_ofs) buf = bvec_kmap_irq(bv, &flags); memset(buf + remainder, 0, bv->bv_len - remainder); + flush_dcache_page(bv->bv_page); bvec_kunmap_irq(buf, &flags); } pos += bv->bv_len; @@ -1158,6 +1159,7 @@ static void zero_pages(struct page **pages, u64 offset, u64 end) local_irq_save(flags); kaddr = kmap_atomic(*page); memset(kaddr + page_offset, 0, length); + flush_dcache_page(*page); kunmap_atomic(kaddr); local_irq_restore(flags); -- cgit v1.2.3 From 3b5cf2a2f1746a253d56f54ffbb45170c90b1cbd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 May 2013 11:18:59 -0500 Subject: rbd: clean up a few things in the refresh path This includes a few relatively small fixes I found while examining the code that refreshes image information. This resolves: http://tracker.ceph.com/issues/5040 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cb728a01a19f..d4902c52bf26 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2860,7 +2860,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) (unsigned int)opcode); ret = rbd_dev_refresh(rbd_dev); if (ret) - rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret); + rbd_warn(rbd_dev, "header refresh error (%d)\n", ret); rbd_obj_notify_ack(rbd_dev, notify_id); } @@ -3340,8 +3340,8 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) int ret; rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); - mapping_size = rbd_dev->mapping.size; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + mapping_size = rbd_dev->mapping.size; if (rbd_dev->image_format == 1) ret = rbd_dev_v1_header_info(rbd_dev); else @@ -3814,6 +3814,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) void *end; u64 pool_id; char *image_id; + u64 snap_id; u64 overlap; int ret; @@ -3873,24 +3874,56 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) (unsigned long long)pool_id, U32_MAX); goto out_err; } - parent_spec->pool_id = pool_id; image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { ret = PTR_ERR(image_id); goto out_err; } - parent_spec->image_id = image_id; - ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); + ceph_decode_64_safe(&p, end, snap_id, out_err); ceph_decode_64_safe(&p, end, overlap, out_err); - if (overlap) { - rbd_spec_put(rbd_dev->parent_spec); + /* + * The parent won't change (except when the clone is + * flattened, already handled that). So we only need to + * record the parent spec we have not already done so. + */ + if (!rbd_dev->parent_spec) { + parent_spec->pool_id = pool_id; + parent_spec->image_id = image_id; + parent_spec->snap_id = snap_id; rbd_dev->parent_spec = parent_spec; parent_spec = NULL; /* rbd_dev now owns this */ - rbd_dev->parent_overlap = overlap; - } else { - rbd_warn(rbd_dev, "ignoring parent of clone with overlap 0\n"); + } + + /* + * We always update the parent overlap. If it's zero we + * treat it specially. + */ + rbd_dev->parent_overlap = overlap; + smp_mb(); + if (!overlap) { + + /* A null parent_spec indicates it's the initial probe */ + + if (parent_spec) { + /* + * The overlap has become zero, so the clone + * must have been resized down to 0 at some + * point. Treat this the same as a flatten. + */ + rbd_dev_parent_put(rbd_dev); + pr_info("%s: clone image now standalone\n", + rbd_dev->disk->disk_name); + } else { + /* + * For the initial probe, if we find the + * overlap is zero we just pretend there was + * no parent image. + */ + rbd_warn(rbd_dev, "ignoring parent of " + "clone with overlap 0\n"); + } } out: ret = 0; -- cgit v1.2.3 From 08f75463c15e26e9d67a7c992ce7dd8964c6cbdd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 May 2013 11:19:00 -0500 Subject: rbd: protect against duplicate client creation If more than one rbd image has the same ceph cluster configuration (same options, same set of monitors, same keys) they normally share a single rbd client. When an image is getting mapped, rbd looks to see if an existing client can be used, and creates a new one if not. The lookup and creation are not done under a common lock though, so mapping two images concurrently could lead to duplicate clients getting set up needlessly. This isn't a major problem, but it's wasteful and different from what's intended. This patch fixes that by using the control mutex to protect both the lookup and (if needed) creation of the client. It was previously used just when creating. This resolves: http://tracker.ceph.com/issues/3094 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d4902c52bf26..fd2795d1136a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -520,7 +520,7 @@ static const struct block_device_operations rbd_bd_ops = { /* * Initialize an rbd client instance. Success or not, this function - * consumes ceph_opts. + * consumes ceph_opts. Caller holds ctl_mutex. */ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) { @@ -535,30 +535,25 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); if (IS_ERR(rbdc->client)) - goto out_mutex; + goto out_rbdc; ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ ret = ceph_open_session(rbdc->client); if (ret < 0) - goto out_err; + goto out_client; spin_lock(&rbd_client_list_lock); list_add_tail(&rbdc->node, &rbd_client_list); spin_unlock(&rbd_client_list_lock); - mutex_unlock(&ctl_mutex); dout("%s: rbdc %p\n", __func__, rbdc); return rbdc; - -out_err: +out_client: ceph_destroy_client(rbdc->client); -out_mutex: - mutex_unlock(&ctl_mutex); +out_rbdc: kfree(rbdc); out_opt: if (ceph_opts) @@ -682,11 +677,13 @@ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) { struct rbd_client *rbdc; + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); rbdc = rbd_client_find(ceph_opts); if (rbdc) /* using an existing client */ ceph_destroy_options(ceph_opts); else rbdc = rbd_client_create(ceph_opts); + mutex_unlock(&ctl_mutex); return rbdc; } -- cgit v1.2.3 From 4974341eb99861720d54db9337bf1fe78eb8b9d0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 May 2013 11:19:00 -0500 Subject: libceph: print more info for short message header If an osd client response message arrives that has a front section that's too big for the buffer set aside to receive it, a warning gets reported and a new buffer is allocated. The warning says nothing about which connection had the problem. Add the peer type and number to what gets reported, to be a bit more informative. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e0abb83b520e..61147fe70181 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2454,8 +2454,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ceph_msg_revoke_incoming(req->r_reply); if (front > req->r_reply->front.iov_len) { - pr_warning("get_reply front %d > preallocated %d\n", - front, (int)req->r_reply->front.iov_len); + pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n", + front, (int)req->r_reply->front.iov_len, + (unsigned int)con->peer_name.type, + le64_to_cpu(con->peer_name.num)); m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); if (!m) goto out; -- cgit v1.2.3 From 751cc0e3cfabdda87c4c21519253c6751e97a8d4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 May 2013 15:17:01 -0500 Subject: rbd: set removing flag while holding list lock When unmapping a device, its id is supplied, and that is used to look up which rbd device should be unmapped. Looking up the device involves searching the rbd device list while holding a spinlock that protects access to that list. Currently all of this is done under protection of the control lock, but that protection is going away soon. To ensure the rbd_dev is still valid (still on the list) while setting its REMOVING flag, do so while still holding the list lock. To do so, get rid of __rbd_get_dev(), and open code what it did in the one place it was used. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 53 ++++++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fd2795d1136a..9eead4879c90 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5090,23 +5090,6 @@ err_out_module: return (ssize_t)rc; } -static struct rbd_device *__rbd_get_dev(unsigned long dev_id) -{ - struct list_head *tmp; - struct rbd_device *rbd_dev; - - spin_lock(&rbd_dev_list_lock); - list_for_each(tmp, &rbd_dev_list) { - rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->dev_id == dev_id) { - spin_unlock(&rbd_dev_list_lock); - return rbd_dev; - } - } - spin_unlock(&rbd_dev_list_lock); - return NULL; -} - static void rbd_dev_device_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); @@ -5151,7 +5134,8 @@ static ssize_t rbd_remove(struct bus_type *bus, size_t count) { struct rbd_device *rbd_dev = NULL; - int target_id; + struct list_head *tmp; + int dev_id; unsigned long ul; int ret; @@ -5160,26 +5144,33 @@ static ssize_t rbd_remove(struct bus_type *bus, return ret; /* convert to int; abort if we lost anything in the conversion */ - target_id = (int) ul; - if (target_id != ul) + dev_id = (int)ul; + if (dev_id != ul) return -EINVAL; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbd_dev = __rbd_get_dev(target_id); - if (!rbd_dev) { - ret = -ENOENT; - goto done; + ret = -ENOENT; + spin_lock(&rbd_dev_list_lock); + list_for_each(tmp, &rbd_dev_list) { + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_dev->dev_id == dev_id) { + ret = 0; + break; + } } - - spin_lock_irq(&rbd_dev->lock); - if (rbd_dev->open_count) - ret = -EBUSY; - else - set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); - spin_unlock_irq(&rbd_dev->lock); + if (!ret) { + spin_lock_irq(&rbd_dev->lock); + if (rbd_dev->open_count) + ret = -EBUSY; + else + set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); + spin_unlock_irq(&rbd_dev->lock); + } + spin_unlock(&rbd_dev_list_lock); if (ret < 0) goto done; + rbd_bus_del_dev(rbd_dev); ret = rbd_dev_header_watch_sync(rbd_dev, false); if (ret) -- cgit v1.2.3 From 82a442d239695a242c4d584464c9606322cd02aa Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 May 2013 17:40:44 -0500 Subject: rbd: protect against concurrent unmaps Make sure two concurrent unmap operations on the same rbd device won't collide, by only proceeding with the removal and cleanup of a device if is not already underway. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9eead4879c90..305c740778c6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5137,6 +5137,7 @@ static ssize_t rbd_remove(struct bus_type *bus, struct list_head *tmp; int dev_id; unsigned long ul; + bool already = false; int ret; ret = strict_strtoul(buf, 10, &ul); @@ -5164,11 +5165,12 @@ static ssize_t rbd_remove(struct bus_type *bus, if (rbd_dev->open_count) ret = -EBUSY; else - set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); + already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, + &rbd_dev->flags); spin_unlock_irq(&rbd_dev->lock); } spin_unlock(&rbd_dev_list_lock); - if (ret < 0) + if (ret < 0 || already) goto done; rbd_bus_del_dev(rbd_dev); -- cgit v1.2.3 From 1ba0f1e7975ad07557f7a931522bdcd813ae35f6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 May 2013 15:17:01 -0500 Subject: rbd: don't hold ctl_mutex to get/put device When an rbd device is first getting mapped, its device registration is protected the control mutex. There is no need to do that though, because the device has already been assigned an id that's guaranteed to be unique. An unmap of an rbd device won't proceed if the device has a non-zero open count or is already being unmapped. So there's no need to hold the control mutex in that case either. Finally, an rbd device can't be opened if it is being removed, and it won't go away if there is a non-zero open count. So here too there's no need to hold the control mutex while getting or putting a reference to an rbd device's Linux device structure. Drop the mutex calls in these cases. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 305c740778c6..d735eb13847d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -489,10 +489,8 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) if (removing) return -ENOENT; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); (void) get_device(&rbd_dev->dev); set_device_ro(bdev, rbd_dev->mapping.read_only); - mutex_unlock(&ctl_mutex); return 0; } @@ -507,9 +505,7 @@ static void rbd_release(struct gendisk *disk, fmode_t mode) spin_unlock_irq(&rbd_dev->lock); rbd_assert(open_count_before > 0); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); put_device(&rbd_dev->dev); - mutex_unlock(&ctl_mutex); } static const struct block_device_operations rbd_bd_ops = { @@ -4332,8 +4328,6 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) struct device *dev; int ret; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - dev = &rbd_dev->dev; dev->bus = &rbd_bus_type; dev->type = &rbd_device_type; @@ -4342,8 +4336,6 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) dev_set_name(dev, "%d", rbd_dev->dev_id); ret = device_register(dev); - mutex_unlock(&ctl_mutex); - return ret; } @@ -5149,8 +5141,6 @@ static ssize_t rbd_remove(struct bus_type *bus, if (dev_id != ul) return -EINVAL; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - ret = -ENOENT; spin_lock(&rbd_dev_list_lock); list_for_each(tmp, &rbd_dev_list) { @@ -5171,7 +5161,7 @@ static ssize_t rbd_remove(struct bus_type *bus, } spin_unlock(&rbd_dev_list_lock); if (ret < 0 || already) - goto done; + return ret; rbd_bus_del_dev(rbd_dev); ret = rbd_dev_header_watch_sync(rbd_dev, false); @@ -5179,11 +5169,8 @@ static ssize_t rbd_remove(struct bus_type *bus, rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); rbd_dev_image_release(rbd_dev); module_put(THIS_MODULE); - ret = count; -done: - mutex_unlock(&ctl_mutex); - return ret; + return count; } /* -- cgit v1.2.3 From cfbf6377b696d88461eef6966bef9e6184111183 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 May 2013 17:40:45 -0500 Subject: rbd: use rwsem to protect header updates Updating an image header needs to be protected to ensure it's done consistently. However distinct headers can be updated concurrently without a problem. Instead of using the global control lock to serialize headder updates, just rely on the header semaphore. (It's already used, this just moves it out to cover a broader section of the code.) That leaves the control mutex protecting only the creation of rbd clients, so rename it. This resolves: http://tracker.ceph.com/issues/5222 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d735eb13847d..44c44c6071ed 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -372,7 +372,7 @@ enum rbd_dev_flags { RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ }; -static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ +static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ static LIST_HEAD(rbd_dev_list); /* devices */ static DEFINE_SPINLOCK(rbd_dev_list_lock); @@ -516,7 +516,7 @@ static const struct block_device_operations rbd_bd_ops = { /* * Initialize an rbd client instance. Success or not, this function - * consumes ceph_opts. Caller holds ctl_mutex. + * consumes ceph_opts. Caller holds client_mutex. */ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) { @@ -673,13 +673,13 @@ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) { struct rbd_client *rbdc; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); rbdc = rbd_client_find(ceph_opts); if (rbdc) /* using an existing client */ ceph_destroy_options(ceph_opts); else rbdc = rbd_client_create(ceph_opts); - mutex_unlock(&ctl_mutex); + mutex_unlock(&client_mutex); return rbdc; } @@ -833,7 +833,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, /* We won't fail any more, fill in the header */ - down_write(&rbd_dev->header_rwsem); if (first_time) { header->object_prefix = object_prefix; header->obj_order = ondisk->options.order; @@ -862,8 +861,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, if (rbd_dev->mapping.size != header->image_size) rbd_dev->mapping.size = header->image_size; - up_write(&rbd_dev->header_rwsem); - return 0; out_2big: ret = -EIO; @@ -3333,7 +3330,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) int ret; rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + down_write(&rbd_dev->header_rwsem); mapping_size = rbd_dev->mapping.size; if (rbd_dev->image_format == 1) ret = rbd_dev_v1_header_info(rbd_dev); @@ -3343,7 +3340,8 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) /* If it's a mapped snapshot, validate its EXISTS flag */ rbd_exists_validate(rbd_dev); - mutex_unlock(&ctl_mutex); + up_write(&rbd_dev->header_rwsem); + if (mapping_size != rbd_dev->mapping.size) { sector_t size; @@ -4272,16 +4270,14 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) bool first_time = rbd_dev->header.object_prefix == NULL; int ret; - down_write(&rbd_dev->header_rwsem); - ret = rbd_dev_v2_image_size(rbd_dev); if (ret) - goto out; + return ret; if (first_time) { ret = rbd_dev_v2_header_onetime(rbd_dev); if (ret) - goto out; + return ret; } /* @@ -4296,7 +4292,7 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) ret = rbd_dev_v2_parent_info(rbd_dev); if (ret) - goto out; + return ret; /* * Print a warning if this is the initial probe and @@ -4317,8 +4313,6 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) ret = rbd_dev_v2_snap_context(rbd_dev); dout("rbd_dev_v2_snap_context returned %d\n", ret); -out: - up_write(&rbd_dev->header_rwsem); return ret; } -- cgit v1.2.3 From d552c6191bcd952991ffdfff585c8849e8be911d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 May 2013 20:13:09 -0500 Subject: rbd: take a little credit Add a name to the list of authors. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 44c44c6071ed..14c6dc92ef0e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5272,6 +5272,7 @@ static void __exit rbd_exit(void) module_init(rbd_init); module_exit(rbd_exit); +MODULE_AUTHOR("Alex Elder "); MODULE_AUTHOR("Sage Weil "); MODULE_AUTHOR("Yehuda Sadeh "); MODULE_DESCRIPTION("rados block device"); -- cgit v1.2.3 From eb845ff13a44477f8a411baedbf11d678b9daf0a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 15:54:44 +0800 Subject: libceph: fix safe completion handle_reply() calls complete_request() only if the first OSD reply has ONDISK flag. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 1 - net/ceph/osd_client.c | 17 ++++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 186db0bf4951..ce6df39f60ff 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -145,7 +145,6 @@ struct ceph_osd_request { s32 r_reply_op_result[CEPH_OSD_MAX_OP]; int r_got_reply; int r_linger; - int r_completed; struct ceph_osd_client *r_osdc; struct kref r_kref; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 61147fe70181..3480b058794b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1522,6 +1522,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, for (i = 0; i < numops; i++) req->r_reply_op_result[i] = ceph_decode_32(&p); + already_completed = req->r_got_reply; + if (!req->r_got_reply) { req->r_result = result; @@ -1552,16 +1554,14 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, ((flags & CEPH_OSD_FLAG_WRITE) == 0)) __unregister_request(osdc, req); - already_completed = req->r_completed; - req->r_completed = 1; mutex_unlock(&osdc->request_mutex); - if (already_completed) - goto done; - if (req->r_callback) - req->r_callback(req, msg); - else - complete_all(&req->r_completion); + if (!already_completed) { + if (req->r_callback) + req->r_callback(req, msg); + else + complete_all(&req->r_completion); + } if (flags & CEPH_OSD_FLAG_ONDISK) complete_request(req); @@ -2121,7 +2121,6 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, __register_request(osdc, req); req->r_sent = 0; req->r_got_reply = 0; - req->r_completed = 0; rc = __map_request(osdc, req, 0); if (rc < 0) { if (nofail) { -- cgit v1.2.3 From ccca4e37b1a912da3db68aee826557ea66145273 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 2 Jun 2013 18:40:23 +0800 Subject: libceph: fix truncate size calculation check the "not truncated yet" case Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3480b058794b..540dd29c9210 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -733,12 +733,14 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, object_size = le32_to_cpu(layout->fl_object_size); object_base = off - objoff; - if (truncate_size <= object_base) { - truncate_size = 0; - } else { - truncate_size -= object_base; - if (truncate_size > object_size) - truncate_size = object_size; + if (!(truncate_seq == 1 && truncate_size == -1ULL)) { + if (truncate_size <= object_base) { + truncate_size = 0; + } else { + truncate_size -= object_base; + if (truncate_size > object_size) + truncate_size = object_size; + } } osd_req_op_extent_init(req, 0, opcode, objoff, objlen, -- cgit v1.2.3 From bb137f84d1d8f692233b590f7cae14abbdc1e0c1 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 3 Jun 2013 18:22:17 +0800 Subject: ceph: fix cap release race ceph_encode_inode_release() can race with ceph_open() and release caps wanted by open files. So it should call __ceph_caps_wanted() to get the wanted caps. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index da0f9b8a3bcb..54c290b083ab 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3042,21 +3042,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, (cap->issued & unless) == 0)) { if ((cap->issued & drop) && (cap->issued & unless) == 0) { - dout("encode_inode_release %p cap %p %s -> " - "%s\n", inode, cap, + int wanted = __ceph_caps_wanted(ci); + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) + wanted |= cap->mds_wanted; + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop)); + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + cap->issued &= ~drop; cap->implemented &= ~drop; - if (ci->i_ceph_flags & CEPH_I_NODELAY) { - int wanted = __ceph_caps_wanted(ci); - dout(" wanted %s -> %s (act %s)\n", - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(cap->mds_wanted & - ~wanted), - ceph_cap_string(wanted)); - cap->mds_wanted &= wanted; - } + cap->mds_wanted = wanted; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, -- cgit v1.2.3 From 3803da4963db01da6a983ab589ebe2e6ccb97ba9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:26:44 +0800 Subject: ceph: reset iov_len when discarding cap release messages Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4d2920304be8..ddbd5907d41b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, num = le32_to_cpu(head->num); dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); session->s_num_cap_releases += num; /* requeue completed messages */ -- cgit v1.2.3 From fc2744aa12da7182509b1059aa3ab53754d0c83a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:48:29 +0800 Subject: ceph: fix race between page writeback and truncate The client can receive truncate request from MDS at any time. So the page writeback code need to get i_size, truncate_seq and truncate_size atomically Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/addr.c | 84 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e68ac101040..3500b74c32ed 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -438,13 +438,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_inode_info *ci; struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; - loff_t page_off = page_offset(page); - int len = PAGE_CACHE_SIZE; - loff_t i_size; - int err = 0; struct ceph_snap_context *snapc, *oldest; - u64 snap_size = 0; + loff_t page_off = page_offset(page); long writeback_stat; + u64 truncate_size, snap_size = 0; + u32 truncate_seq; + int err = 0, len = PAGE_CACHE_SIZE; dout("writepage %p idx %lu\n", page, page->index); @@ -474,13 +473,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + /* is this a partial page at end of file? */ - if (snap_size) - i_size = snap_size; - else - i_size = i_size_read(inode); - if (i_size < page_off + len) - len = i_size - page_off; + if (page_off >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); + goto out; + } + if (snap_size < page_off + len) + len = snap_size - page_off; dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", inode, page, page->index, page_off, len, snapc); @@ -494,7 +500,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, - ci->i_truncate_seq, ci->i_truncate_size, + truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { dout("writepage setting page/mapping error %d %p\n", err, page); @@ -631,25 +637,6 @@ static void writepages_finish(struct ceph_osd_request *req, ceph_osdc_put_request(req); } -static struct ceph_osd_request * -ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, - struct ceph_snap_context *snapc, int num_ops) -{ - struct ceph_fs_client *fsc; - struct ceph_inode_info *ci; - struct ceph_vino vino; - - fsc = ceph_inode_to_client(inode); - ci = ceph_inode(inode); - vino = ceph_vino(inode); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - - return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, offset, len, num_ops, CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK, - snapc, ci->i_truncate_seq, ci->i_truncate_size, true); -} - /* * initiate async writeback */ @@ -658,7 +645,8 @@ static int ceph_writepages_start(struct address_space *mapping, { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; @@ -670,7 +658,8 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync; - u64 snap_size; + u64 truncate_size, snap_size; + u32 truncate_seq; /* * Include a 'sync' in the OSD request if this is a data @@ -685,7 +674,6 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - fsc = ceph_inode_to_client(inode); if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { pr_warning("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ @@ -728,6 +716,14 @@ retry: snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); + + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + if (last_snapc && snapc != last_snapc) { /* if we switched to a newer snapc, restart our scan at the * start of the original file range. */ @@ -739,7 +735,6 @@ retry: while (!done && index <= end) { int num_ops = do_sync ? 2 : 1; - struct ceph_vino vino; unsigned i; int first; pgoff_t next; @@ -833,17 +828,18 @@ get_more_pages: * that it will use. */ if (locked_pages == 0) { - size_t size; - BUG_ON(pages); - /* prepare async write request */ offset = (u64)page_offset(page); len = wsize; - req = ceph_writepages_osd_request(inode, - offset, &len, snapc, - num_ops); - + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, num_ops, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, truncate_seq, + truncate_size, true); if (IS_ERR(req)) { rc = PTR_ERR(req); unlock_page(page); @@ -854,8 +850,8 @@ get_more_pages: req->r_inode = inode; max_pages = calc_pages_for(0, (u64)len); - size = max_pages * sizeof (*pages); - pages = kmalloc(size, GFP_NOFS); + pages = kmalloc(max_pages * sizeof (*pages), + GFP_NOFS); if (!pages) { pool = fsc->wb_pagevec_pool; pages = mempool_alloc(pool, GFP_NOFS); -- cgit v1.2.3 From b8c2f3ae2d9f2b975a0e1a9c5652829ef8a4f06c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:37:11 +0800 Subject: ceph: check migrate seq before changing auth cap We may receive old request reply from the exporter MDS after receiving the importer MDS' cap import message. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 54c290b083ab..790f88b15daf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -612,9 +612,11 @@ retry: __cap_delay_requeue(mdsc, ci); } - if (flags & CEPH_CAP_FLAG_AUTH) - ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) { + if (flags & CEPH_CAP_FLAG_AUTH) { + if (ci->i_auth_cap == NULL || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) + ci->i_auth_cap = cap; + } else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) { -- cgit v1.2.3 From 667ca05cd9f02f0a345446abc362484c019d4d71 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:25:36 +0800 Subject: ceph: clear migrate seq when MDS restarts Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ddbd5907d41b..6272c7884e66 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2455,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ + cap->mseq = 0; /* and migrate_seq */ if (recon_state->flock) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); -- cgit v1.2.3 From e976cad0f0dbe5440a4ca38e29e1f932d9319125 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 9 Jun 2013 08:40:39 -0700 Subject: rbd: fix a couple warnings gcc isn't quite smart enough and generates these warnings: drivers/block/rbd.c: In function 'rbd_img_request_fill': drivers/block/rbd.c:1266:22: warning: 'bio_list' may be used uninitialized in this function [-Wmaybe-uninitialized] drivers/block/rbd.c:2186:14: note: 'bio_list' was declared here drivers/block/rbd.c:2247:10: warning: 'pages' may be used uninitialized in this function [-Wmaybe-uninitialized] even though they are initialized for their respective code paths. Signed-off-by: Sage Weil --- drivers/block/rbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 14c6dc92ef0e..4ad2ad9a5bb0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2163,9 +2163,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; bool write_request = img_request_write_test(img_request); - struct bio *bio_list; + struct bio *bio_list = 0; unsigned int bio_offset = 0; - struct page **pages; + struct page **pages = 0; u64 img_offset; u64 resid; u16 opcode; -- cgit v1.2.3 From 005c46970e3a2a4b95da220eab43b87307646335 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 31 May 2013 16:40:24 +0800 Subject: ceph: move inode to proper flushing list when auth MDS changes Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 790f88b15daf..9a5ccc9e0d62 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1982,8 +1982,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, __ceph_caps_used(ci), __ceph_caps_wanted(ci), -- cgit v1.2.3 From a1dc1937337a93e699eaa56968b7de6e1a9e77cf Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 19 Jun 2013 14:58:10 +0800 Subject: ceph: fix sleeping function called from invalid context. [ 1121.231883] BUG: sleeping function called from invalid context at kernel/rwsem.c:20 [ 1121.231935] in_atomic(): 1, irqs_disabled(): 0, pid: 9831, name: mv [ 1121.231971] 1 lock held by mv/9831: [ 1121.231973] #0: (&(&ci->i_ceph_lock)->rlock){+.+...},at:[] ceph_getxattr+0x58/0x1d0 [ceph] [ 1121.231998] CPU: 3 PID: 9831 Comm: mv Not tainted 3.10.0-rc6+ #215 [ 1121.232000] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015 11/09/2011 [ 1121.232027] ffff88006d355a80 ffff880092f69ce0 ffffffff8168348c ffff880092f69cf8 [ 1121.232045] ffffffff81070435 ffff88006d355a20 ffff880092f69d20 ffffffff816899ba [ 1121.232052] 0000000300000004 ffff8800b76911d0 ffff88006d355a20 ffff880092f69d68 [ 1121.232056] Call Trace: [ 1121.232062] [] dump_stack+0x19/0x1b [ 1121.232067] [] __might_sleep+0xe5/0x110 [ 1121.232071] [] down_read+0x2a/0x98 [ 1121.232080] [] ceph_vxattrcb_layout+0x60/0xf0 [ceph] [ 1121.232088] [] ceph_getxattr+0x9f/0x1d0 [ceph] [ 1121.232093] [] vfs_getxattr+0xa8/0xd0 [ 1121.232097] [] getxattr+0xab/0x1c0 [ 1121.232100] [] ? final_putname+0x22/0x50 [ 1121.232104] [] ? kmem_cache_free+0xb0/0x260 [ 1121.232107] [] ? final_putname+0x22/0x50 [ 1121.232110] [] ? trace_hardirqs_on+0xd/0x10 [ 1121.232114] [] ? sysret_check+0x1b/0x56 [ 1121.232120] [] SyS_fgetxattr+0x6c/0xc0 [ 1121.232125] [] system_call_fastpath+0x16/0x1b [ 1121.232129] BUG: scheduling while atomic: mv/9831/0x10000002 [ 1121.232154] 1 lock held by mv/9831: [ 1121.232156] #0: (&(&ci->i_ceph_lock)->rlock){+.+...}, at: [] ceph_getxattr+0x58/0x1d0 [ceph] I think move the ci->i_ceph_lock down is safe because we can't free ceph_inode_info at there. CC: stable@vger.kernel.org # 3.8+ Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil --- fs/ceph/xattr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9b6b2b6dd164..be661d8f532a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - spin_lock(&ci->i_ceph_lock); - dout("getxattr %p ver=%lld index_ver=%lld\n", inode, - ci->i_xattrs.version, ci->i_xattrs.index_version); /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { err = vxattr->getxattr_cb(ci, value, size); - goto out; + return err; } + spin_lock(&ci->i_ceph_lock); + dout("getxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto get_xattr; -- cgit v1.2.3 From c62988ec0910a2d480fecb2f0140a36fcdc7b691 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 19 Jun 2013 15:12:06 +0800 Subject: ceph: avoid meaningless calling ceph_caps_revoking if sync_mode == WB_SYNC_ALL. Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil --- fs/ceph/addr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3500b74c32ed..afb2fc241061 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -666,8 +666,8 @@ static int ceph_writepages_start(struct address_space *mapping, * integrity write (e.g., O_SYNC write or fsync()), or if our * cap is being revoked. */ - do_sync = wbc->sync_mode == WB_SYNC_ALL; - if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + if ((wbc->sync_mode == WB_SYNC_ALL) || + ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) do_sync = 1; dout("writepages_start %p dosync=%d (mode=%s)\n", inode, do_sync, -- cgit v1.2.3 From 0405a1499df42a2b9fd4906096c6bb950e15e850 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Sun, 23 Jun 2013 09:48:38 -0700 Subject: ceph: remove sb_start/end_write in ceph_aio_write. Either in vfs_write or io_submit,it call file_start/end_write. The different between file_start/end_write and sb_start/end_write is file_ only handle regular file.But i think in ceph_aio_write,it only for regular file. Signed-off-by: Jianpeng Ma Acked-by: Yan, Zheng --- fs/ceph/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 656e16907430..7c69f4f0dee6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); hold_mutex = true; @@ -809,7 +808,6 @@ retry_snap: out: if (hold_mutex) mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return written ? written : err; -- cgit v1.2.3 From fb3101b6f0db9ae3f35dc8e6ec908d0af8cdf12e Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 25 Jun 2013 14:48:19 +0800 Subject: ceph: Free mdsc if alloc mdsc->mdsmap failed. Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6272c7884e66..3eb1b4470c85 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3042,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) + if (mdsc->mdsmap == NULL) { + kfree(mdsc); return -ENOMEM; + } init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); -- cgit v1.2.3 From 93faca6ef45822b0825bb181859b1a8911e9c4c1 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 26 Jun 2013 11:15:27 +0800 Subject: ceph: Reconstruct the func ceph_reserve_caps. Drop ignored return value. Fix allocation failure case to not leak. Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil --- fs/ceph/caps.c | 21 +++++++-------------- fs/ceph/super.h | 2 +- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9a5ccc9e0d62..8ec27b130cc9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_mds_client *mdsc, +void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { int i; @@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, int have; int alloc = 0; LIST_HEAD(newcaps); - int ret = 0; dout("reserve caps ctx=%p need=%d\n", ctx, need); @@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) { - ret = -ENOMEM; - goto out_alloc_count; - } + if (!cap) + break; list_add(&cap->caps_item, &newcaps); alloc++; } - BUG_ON(have + alloc != need); + /* we didn't manage to reserve as much as we needed */ + if (have + alloc != need) + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); - return 0; - -out_alloc_count: - /* we didn't manage to reserve as much as we needed */ - pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have); - return ret; } int ceph_unreserve_caps(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7ccfdb4aea2e..dfbb729b3130 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); -extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, +extern void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); -- cgit v1.2.3 From 2cb33cac622afde897aa02d3dcd9fbba8bae839e Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 20 Jun 2013 13:13:59 -0700 Subject: libceph: Fix NULL pointer dereference in auth client code A malicious monitor can craft an auth reply message that could cause a NULL function pointer dereference in the client's kernel. To prevent this, the auth_none protocol handler needs an empty ceph_auth_client_ops->build_request() function. CVE-2013-1059 Signed-off-by: Tyler Hicks Reported-by: Chanam Park Reviewed-by: Seth Arnold Reviewed-by: Sage Weil Cc: stable@vger.kernel.org --- net/ceph/auth_none.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 925ca583c09c..8c93fa8d81bc 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -39,6 +39,11 @@ static int should_authenticate(struct ceph_auth_client *ac) return xi->starting; } +static int build_request(struct ceph_auth_client *ac, void *buf, void *end) +{ + return 0; +} + /* * the generic auth code decode the global_id, and we carry no actual * authenticate state, so nothing happens here. @@ -106,6 +111,7 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = { .destroy = destroy, .is_authenticated = is_authenticated, .should_authenticate = should_authenticate, + .build_request = build_request, .handle_reply = handle_reply, .create_authorizer = ceph_auth_none_create_authorizer, .destroy_authorizer = ceph_auth_none_destroy_authorizer, -- cgit v1.2.3 From 5446429630257f4723829409337a26c076907d5d Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 1 Jul 2013 18:33:39 -0400 Subject: ceph: avoid accessing invalid memory when mounting ceph with a dev name that starts with a slash, ceph would attempt to access the character before that slash. Since we don't actually own that byte of memory, we would trigger an invalid access: [ 43.499934] BUG: unable to handle kernel paging request at ffff880fa3a97fff [ 43.500984] IP: [] parse_mount_options+0x1a4/0x300 [ 43.501491] PGD 743b067 PUD 10283c4067 PMD 10282a6067 PTE 8000000fa3a97060 [ 43.502301] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 43.503006] Dumping ftrace buffer: [ 43.503596] (ftrace buffer empty) [ 43.504046] CPU: 0 PID: 10879 Comm: mount Tainted: G W 3.10.0-sasha #1129 [ 43.504851] task: ffff880fa625b000 ti: ffff880fa3412000 task.ti: ffff880fa3412000 [ 43.505608] RIP: 0010:[] [] parse_mount_options$ [ 43.506552] RSP: 0018:ffff880fa3413d08 EFLAGS: 00010286 [ 43.507133] RAX: ffff880fa3a98000 RBX: ffff880fa3a98000 RCX: 0000000000000000 [ 43.507893] RDX: ffff880fa3a98001 RSI: 000000000000002f RDI: ffff880fa3a98000 [ 43.508610] RBP: ffff880fa3413d58 R08: 0000000000001f99 R09: ffff880fa3fe64c0 [ 43.509426] R10: ffff880fa3413d98 R11: ffff880fa38710d8 R12: ffff880fa3413da0 [ 43.509792] R13: ffff880fa3a97fff R14: 0000000000000000 R15: ffff880fa3413d90 [ 43.509792] FS: 00007fa9c48757e0(0000) GS:ffff880fd2600000(0000) knlGS:000000000000$ [ 43.509792] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 43.509792] CR2: ffff880fa3a97fff CR3: 0000000fa3bb9000 CR4: 00000000000006b0 [ 43.509792] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 43.509792] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 43.509792] Stack: [ 43.509792] 0000e5180000000e ffffffff85ca1900 ffff880fa38710d8 ffff880fa3413d98 [ 43.509792] 0000000000000120 0000000000000000 ffff880fa3a98000 0000000000000000 [ 43.509792] ffffffff85cf32a0 0000000000000000 ffff880fa3413dc8 ffffffff818f3c72 [ 43.509792] Call Trace: [ 43.509792] [] ceph_mount+0xa2/0x390 [ 43.509792] [] ? pcpu_alloc+0x334/0x3c0 [ 43.509792] [] mount_fs+0x8d/0x1a0 [ 43.509792] [] ? __alloc_percpu+0x10/0x20 [ 43.509792] [] vfs_kern_mount+0x79/0x100 [ 43.509792] [] do_new_mount+0xcd/0x1c0 [ 43.509792] [] do_mount+0x15d/0x210 [ 43.509792] [] ? strndup_user+0x45/0x60 [ 43.509792] [] SyS_mount+0x9d/0xe0 [ 43.509792] [] tracesys+0xdd/0xe2 [ 43.509792] Code: 4c 8b 5d c0 74 0a 48 8d 50 01 49 89 14 24 eb 17 31 c0 48 83 c9 ff $ [ 43.509792] RIP [] parse_mount_options+0x1a4/0x300 [ 43.509792] RSP [ 43.509792] CR2: ffff880fa3a97fff [ 43.509792] ---[ end trace 22469cd81e93af51 ]--- Signed-off-by: Sasha Levin Reviewed-by: Sage Weil --- fs/ceph/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7d377c9a5e35..6627b26a800c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ - if (*dev_name_end != ':') { + if (dev_name_end < dev_name || *dev_name_end != ':') { pr_err("device name is missing path (no : separator in %s)\n", dev_name); goto out; -- cgit v1.2.3 From b415bf4f9fe25f39934f5c464125e4a2dffb6d08 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 2 Jul 2013 12:40:19 +0800 Subject: ceph: fix pending vmtruncate race The locking order for pending vmtruncate is wrong, it can lead to following race: write wmtruncate work ------------------------ ---------------------- lock i_mutex check i_truncate_pending check i_truncate_pending truncate_inode_pages() lock i_mutex (blocked) copy data to page cache unlock i_mutex truncate_inode_pages() The fix is take i_mutex before calling __ceph_do_pending_vmtruncate() Fixes: http://tracker.ceph.com/issues/5453 Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 6 +++++- fs/ceph/file.c | 2 +- fs/ceph/inode.c | 14 ++++++-------- fs/ceph/super.h | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8ec27b130cc9..16266f3e9a33 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2057,7 +2057,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); - __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR)); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7c69f4f0dee6..a44d5153179b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -822,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) int ret; mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode, false); + __ceph_do_pending_vmtruncate(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index be0f7e20d62e..4906ada4a97c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1465,7 +1465,9 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - __ceph_do_pending_vmtruncate(inode, true); + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + mutex_unlock(&inode->i_mutex); iput(inode); } @@ -1492,7 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode) * Make sure any pending truncation is applied before doing anything * that may depend on it. */ -void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock) +void __ceph_do_pending_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); u64 to; @@ -1525,11 +1527,7 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); - if (needlock) - mutex_lock(&inode->i_mutex); truncate_inode_pages(inode->i_mapping, to); - if (needlock) - mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); if (to == ci->i_truncate_size) { @@ -1588,7 +1586,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - __ceph_do_pending_vmtruncate(inode, false); + __ceph_do_pending_vmtruncate(inode); err = inode_change_ok(inode, attr); if (err != 0) @@ -1770,7 +1768,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode, false); + __ceph_do_pending_vmtruncate(inode); return err; out: spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index dfbb729b3130..cbded572345e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, extern int ceph_inode_holds_cap(struct inode *inode, int mask); extern int ceph_inode_set_size(struct inode *inode, loff_t size); -extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock); +extern void __ceph_do_pending_vmtruncate(struct inode *inode); extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_invalidate(struct inode *inode); -- cgit v1.2.3 From b1530f57042297f85330a140a6921b6f95fe74d3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 2 Jul 2013 12:40:20 +0800 Subject: ceph: fix cap revoke race If caps are been revoking by the auth MDS, don't consider them as issued even they are still issued by non-auth MDS. The non-auth MDS should also be revoking/exporting these caps, the client just hasn't received the cap revoke/export message. The race I encountered is: When caps are exporting to new MDS, the client receives cap import message and cap revoke message from the new MDS, then receives cap export message from the old MDS. When the client receives cap revoke message from the new MDS, the revoking caps are still issued by the old MDS, so the client does nothing. Later when the cap export message is received, the client removes the caps issued by the old MDS. (Another way to fix the race is calling ceph_check_caps() in handle_cap_export()) Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 16266f3e9a33..7045a8dfaad4 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -690,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (implemented) *implemented |= cap->implemented; } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } return have; } -- cgit v1.2.3 From 6ee6b95373dfa1d0a4c9bc76689ec10a60c1d6f2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 2 Jul 2013 12:40:21 +0800 Subject: ceph: fix race between cap issue and revoke If we receive new caps from the auth MDS and the non-auth MDS is revoking the newly issued caps, we should release the caps from the non-auth MDS. The scenario is filelock's state changes from SYNC to LOCK. Non-auth MDS revokes Fc cap, the client gets Fc cap from the auth MDS at the same time. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7045a8dfaad4..25442b40c25a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -806,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* * Return true if mask caps are currently being revoked by an MDS. */ -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) { - struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct rb_node *p; - int ret = 0; - spin_lock(&ci->i_ceph_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (__cap_is_valid(cap) && - (cap->implemented & ~cap->issued & mask)) { - ret = 1; - break; - } + if (cap != ocap && __cap_is_valid(cap) && + (cap->implemented & ~cap->issued & mask)) + return 1; } + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); @@ -2488,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } else { dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a -- cgit v1.2.3 From 61c5d6bf7074ee32d014dcdf7698dc8c59eb712d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 24 Jun 2013 14:41:27 +0800 Subject: libceph: call r_unsafe_callback when unsafe reply is received We can't use !req->r_sent to check if OSD request is sent for the first time, this is because __cancel_request() zeros req->r_sent when OSD map changes. Rather than adding a new variable to struct ceph_osd_request to indicate if it's sent for the first time, We can call the unsafe callback only when unsafe OSD reply is received. If OSD's first reply is safe, just skip calling the unsafe callback. The purpose of unsafe callback is adding unsafe request to a list, so that fsync(2) can wait for the safe reply. fsync(2) doesn't need to wait for a write(2) that hasn't returned yet. So it's OK to add request to the unsafe list when the first OSD reply is received. (ceph_sync_write() returns after receiving the first OSD reply) Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 540dd29c9210..dd47889adc4a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1337,10 +1337,6 @@ static void __send_request(struct ceph_osd_client *osdc, ceph_msg_get(req->r_request); /* send consumes a ref */ - /* Mark the request unsafe if this is the first timet's being sent. */ - - if (!req->r_sent && req->r_unsafe_callback) - req->r_unsafe_callback(req, true); req->r_sent = req->r_osd->o_incarnation; ceph_con_send(&req->r_osd->o_con, req->r_request); @@ -1431,8 +1427,6 @@ static void handle_osds_timeout(struct work_struct *work) static void complete_request(struct ceph_osd_request *req) { - if (req->r_unsafe_callback) - req->r_unsafe_callback(req, false); complete_all(&req->r_safe_completion); /* fsync waiter */ } @@ -1559,14 +1553,20 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, mutex_unlock(&osdc->request_mutex); if (!already_completed) { + if (req->r_unsafe_callback && + result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) + req->r_unsafe_callback(req, true); if (req->r_callback) req->r_callback(req, msg); else complete_all(&req->r_completion); } - if (flags & CEPH_OSD_FLAG_ONDISK) + if (flags & CEPH_OSD_FLAG_ONDISK) { + if (req->r_unsafe_callback && already_completed) + req->r_unsafe_callback(req, false); complete_request(req); + } done: dout("req=%p req->r_linger=%d\n", req, req->r_linger); -- cgit v1.2.3 From 8b8cf8917f9b5d74e04f281272d8719ce335a497 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Fri, 28 Jun 2013 13:13:16 -0700 Subject: libceph: fix invalid unsigned->signed conversion for timespec encoding __kernel_time_t is a long, which cannot hold a U32_MAX on 32-bit architectures. Just drop this check as it has limited value. This fixes a crash like: [ 957.905812] kernel BUG at /srv/autobuild-ceph/gitbuilder.git/build/include/linux/ceph/decode.h:164! [ 957.914849] Internal error: Oops - BUG: 0 [#1] SMP ARM [ 957.919978] Modules linked in: rbd libceph libcrc32c ipmi_devintf ipmi_si ipmi_msghandler nfsd nfs_acl auth_rpcgss nfs fscache lockd sunrpc [ 957.932547] CPU: 1 Tainted: G W (3.9.0-ceph-19bb6a83-highbank #1) [ 957.939881] PC is at ceph_osdc_build_request+0x8c/0x4f8 [libceph] [ 957.945967] LR is at 0xec520904 [ 957.949103] pc : [] lr : [] psr: 20000153 [ 957.949103] sp : ec753df8 ip : 00000001 fp : ec53e100 [ 957.960571] r10: ebef25c0 r9 : ec5fa400 r8 : ecbcc000 [ 957.965788] r7 : 00000000 r6 : 00000000 r5 : ffffffff r4 : 00000020 [ 957.972307] r3 : 51cc8143 r2 : ec520900 r1 : ec753e58 r0 : ec520908 [ 957.978827] Flags: nzCv IRQs on FIQs off Mode SVC_32 ISA ARM Segment user [ 957.986039] Control: 10c5387d Table: 2c59c04a DAC: 00000015 [ 957.991777] Process rbd (pid: 2138, stack limit = 0xec752238) [ 957.997514] Stack: (0xec753df8 to 0xec754000) [ 958.001864] 3de0: 00000001 00000001 [ 958.010032] 3e00: 00000001 bf139744 ecbcc000 ec55a0a0 00000024 00000000 ebef25c0 fffffffe [ 958.018204] 3e20: ffffffff 00000000 00000000 00000001 ec5fa400 ebef25c0 ec53e100 bf166b68 [ 958.026377] 3e40: 00000000 0000220f fffffffe ffffffff ec753e58 bf13ff24 51cc8143 05b25ed2 [ 958.034548] 3e60: 00000001 00000000 00000000 bf1688d4 00000001 00000000 00000000 00000000 [ 958.042720] 3e80: 00000001 00000060 ec5fa400 ed53d200 ed439600 ed439300 00000001 00000060 [ 958.050888] 3ea0: ec5fa400 ed53d200 00000000 bf16a320 00000000 ec53e100 00000040 ec753eb8 [ 958.059059] 3ec0: ec51df00 ed53d7c0 ed53d200 ed53d7c0 00000000 ed53d7c0 ec5fa400 bf16ed70 [ 958.067230] 3ee0: 00000000 00000060 00000002 ed53d200 00000000 bf16acf4 ed53d7c0 ec752000 [ 958.075402] 3f00: ed980e50 e954f5d8 00000000 00000060 ed53d240 ed53d258 ec753f80 c04f44a8 [ 958.083574] 3f20: edb7910c ec664700 01ade920 c02e4c44 00000060 c016b3dc ec51de40 01adfb84 [ 958.091745] 3f40: 00000060 ec752000 ec753f80 ec752000 00000060 c0108444 00000007 ec51de48 [ 958.099914] 3f60: ed0eb8c0 00000000 00000000 ec51de40 01adfb84 00000001 00000060 c0108858 [ 958.108085] 3f80: 00000000 00000000 51cc8143 00000060 01adfb84 00000007 00000004 c000dd68 [ 958.116257] 3fa0: 00000000 c000dbc0 00000060 01adfb84 00000007 01adfb84 00000060 01adfb80 [ 958.124429] 3fc0: 00000060 01adfb84 00000007 00000004 beded1a8 00000000 01adf2f0 01ade920 [ 958.132599] 3fe0: 00000000 beded180 b6811324 b6811334 800f0010 00000007 2e7f5821 2e7f5c21 [ 958.140815] [] (ceph_osdc_build_request+0x8c/0x4f8 [libceph]) from [] (rbd_osd_req_format_write+0x50/0x7c [rbd]) [ 958.152739] [] (rbd_osd_req_format_write+0x50/0x7c [rbd]) from [] (rbd_dev_header_watch_sync+0xe0/0x204 [rbd]) [ 958.164486] [] (rbd_dev_header_watch_sync+0xe0/0x204 [rbd]) from [] (rbd_dev_image_probe+0x23c/0x850 [rbd]) [ 958.175967] [] (rbd_dev_image_probe+0x23c/0x850 [rbd]) from [] (rbd_add+0x3c0/0x918 [rbd]) [ 958.185975] [] (rbd_add+0x3c0/0x918 [rbd]) from [] (bus_attr_store+0x20/0x2c) [ 958.194850] [] (bus_attr_store+0x20/0x2c) from [] (sysfs_write_file+0x168/0x198) [ 958.203984] [] (sysfs_write_file+0x168/0x198) from [] (vfs_write+0x9c/0x170) [ 958.212768] [] (vfs_write+0x9c/0x170) from [] (sys_write+0x3c/0x70) [ 958.220768] [] (sys_write+0x3c/0x70) from [] (ret_fast_syscall+0x0/0x30) [ 958.229199] Code: e59d1058 e5913000 e3530000 ba000114 (e7f001f2) CC: stable@vger.kernel.org # 3.4+ Signed-off-by: Josh Durgin Reviewed-by: Sage Weil --- include/linux/ceph/decode.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 379f71508995..0442c3d800f0 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -160,11 +160,6 @@ static inline void ceph_decode_timespec(struct timespec *ts, static inline void ceph_encode_timespec(struct ceph_timespec *tv, const struct timespec *ts) { - BUG_ON(ts->tv_sec < 0); - BUG_ON(ts->tv_sec > (__kernel_time_t)U32_MAX); - BUG_ON(ts->tv_nsec < 0); - BUG_ON(ts->tv_nsec > (long)U32_MAX); - tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); } -- cgit v1.2.3