32 files changed, 4196 insertions, 2280 deletions
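The header changes in this diff turn RBD_FEATURE_LAYERING into a proper bit flag (1<<0), add RBD_FEATURE_STRIPINGV2 (1<<1), and introduce an RBD_FEATURES_SUPPORTED mask naming what this client implements. A standalone sketch of the kind of incompatible-feature check such a mask supports follows; the macro values mirror the diff, but rbd_check_features() and its return convention are illustrative assumptions, not code from the patch.

#include <stdio.h>
#include <stdint.h>

#define RBD_FEATURE_LAYERING	(1ULL << 0)
#define RBD_FEATURE_STRIPINGV2	(1ULL << 1)
#define RBD_FEATURES_SUPPORTED	(RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Reject an image whose feature mask contains bits we do not implement. */
static int rbd_check_features(uint64_t image_features)
{
	uint64_t unsupported = image_features & ~RBD_FEATURES_SUPPORTED;

	if (unsupported) {
		fprintf(stderr, "image uses unsupported features 0x%llx\n",
			(unsigned long long)unsupported);
		return -1;	/* the driver would refuse the mapping */
	}
	return 0;
}

int main(void)
{
	/* Layering alone is fine; an unknown bit (1 << 2) is rejected. */
	printf("%d %d\n", rbd_check_features(RBD_FEATURE_LAYERING),
	       rbd_check_features(1ULL << 2));
	return 0;
}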
diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd index cd9213ccf3dc..0a306476424e 100644 --- a/Documentation/ABI/testing/sysfs-bus-rbd +++ b/Documentation/ABI/testing/sysfs-bus-rbd @@ -66,27 +66,7 @@ current_snap The current snapshot for which the device is mapped. -snap_* - - A directory per each snapshot - parent Information identifying the pool, image, and snapshot id for the parent image in a layered rbd image (format 2 only). - -Entries under /sys/bus/rbd/devices/<dev-id>/snap_<snap-name> -------------------------------------------------------------- - -snap_id - - The rados internal snapshot id assigned for this snapshot - -snap_size - - The size of the image when this snapshot was taken. - -snap_features - - A hexadecimal encoding of the feature bits for this snapshot. - diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b7b7a88d9f68..c2ca1818f335 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1,3 +1,4 @@ + /* rbd.c -- Export ceph rados objects as a Linux block device @@ -32,12 +33,14 @@ #include <linux/ceph/mon_client.h> #include <linux/ceph/decode.h> #include <linux/parser.h> +#include <linux/bsearch.h> #include <linux/kernel.h> #include <linux/device.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/blkdev.h> +#include <linux/slab.h> #include "rbd_types.h" @@ -52,13 +55,6 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) -/* It might be useful to have these defined elsewhere */ - -#define U8_MAX ((u8) (~0U)) -#define U16_MAX ((u16) (~0U)) -#define U32_MAX ((u32) (~0U)) -#define U64_MAX ((u64) (~0ULL)) - #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -72,6 +68,8 @@ #define RBD_SNAP_HEAD_NAME "-" +#define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ + /* This allows a single page to hold an image name sent by OSD */ #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) #define RBD_IMAGE_ID_LEN_MAX 64 @@ -80,11 +78,14 @@ /* Feature bits */ -#define RBD_FEATURE_LAYERING 1 +#define RBD_FEATURE_LAYERING (1<<0) +#define RBD_FEATURE_STRIPINGV2 (1<<1) +#define RBD_FEATURES_ALL \ + (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) /* Features supported by this (client software) implementation. 
*/ -#define RBD_FEATURES_ALL (0) +#define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) /* * An RBD device name will be "rbd#", where the "rbd" comes from @@ -112,7 +113,8 @@ struct rbd_image_header { char *snap_names; u64 *snap_sizes; - u64 obj_version; + u64 stripe_unit; + u64 stripe_count; }; /* @@ -142,13 +144,13 @@ struct rbd_image_header { */ struct rbd_spec { u64 pool_id; - char *pool_name; + const char *pool_name; - char *image_id; - char *image_name; + const char *image_id; + const char *image_name; u64 snap_id; - char *snap_name; + const char *snap_name; struct kref kref; }; @@ -174,13 +176,44 @@ enum obj_request_type { OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES }; +enum obj_req_flags { + OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ + OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ + OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ + OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ +}; + struct rbd_obj_request { const char *object_name; u64 offset; /* object start byte */ u64 length; /* bytes from offset */ + unsigned long flags; - struct rbd_img_request *img_request; - struct list_head links; /* img_request->obj_requests */ + /* + * An object request associated with an image will have its + * img_data flag set; a standalone object request will not. + * + * A standalone object request will have which == BAD_WHICH + * and a null obj_request pointer. + * + * An object request initiated in support of a layered image + * object (to check for its existence before a write) will + * have which == BAD_WHICH and a non-null obj_request pointer. + * + * Finally, an object request for rbd image data will have + * which != BAD_WHICH, and will have a non-null img_request + * pointer. The value of which will be in the range + * 0..(img_request->obj_request_count-1). 
+ */ + union { + struct rbd_obj_request *obj_request; /* STAT op */ + struct { + struct rbd_img_request *img_request; + u64 img_offset; + /* links for img_request->obj_requests list */ + struct list_head links; + }; + }; u32 which; /* posn image request list */ enum obj_request_type type; @@ -191,13 +224,12 @@ struct rbd_obj_request { u32 page_count; }; }; + struct page **copyup_pages; struct ceph_osd_request *osd_req; u64 xferred; /* bytes transferred */ - u64 version; int result; - atomic_t done; rbd_obj_callback_t callback; struct completion completion; @@ -205,19 +237,31 @@ struct rbd_obj_request { struct kref kref; }; +enum img_req_flags { + IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ + IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ + IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ +}; + struct rbd_img_request { - struct request *rq; struct rbd_device *rbd_dev; u64 offset; /* starting image byte offset */ u64 length; /* byte count from offset */ - bool write_request; /* false for read */ + unsigned long flags; union { + u64 snap_id; /* for reads */ struct ceph_snap_context *snapc; /* for writes */ - u64 snap_id; /* for reads */ }; + union { + struct request *rq; /* block request */ + struct rbd_obj_request *obj_request; /* obj req initiator */ + }; + struct page **copyup_pages; spinlock_t completion_lock;/* protects next_completion */ u32 next_completion; rbd_img_callback_t callback; + u64 xferred;/* aggregate bytes transferred */ + int result; /* first nonzero obj_request result */ u32 obj_request_count; struct list_head obj_requests; /* rbd_obj_request structs */ @@ -232,15 +276,6 @@ struct rbd_img_request { #define for_each_obj_request_safe(ireq, oreq, n) \ list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) -struct rbd_snap { - struct device dev; - const char *name; - u64 size; - struct list_head node; - u64 id; - u64 features; -}; - struct rbd_mapping { u64 size; u64 features; @@ -276,6 +311,7 @@ struct rbd_device { struct rbd_spec *parent_spec; u64 parent_overlap; + struct rbd_device *parent; /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -284,9 +320,6 @@ struct rbd_device { struct list_head node; - /* list of snapshots */ - struct list_head snaps; - /* sysfs related */ struct device dev; unsigned long open_count; /* protected by lock */ @@ -312,16 +345,21 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock); static LIST_HEAD(rbd_client_list); /* clients */ static DEFINE_SPINLOCK(rbd_client_list_lock); -static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); -static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); +/* Slab caches for frequently-allocated structures */ + +static struct kmem_cache *rbd_img_request_cache; +static struct kmem_cache *rbd_obj_request_cache; +static struct kmem_cache *rbd_segment_name_cache; -static void rbd_dev_release(struct device *dev); -static void rbd_remove_snap_dev(struct rbd_snap *snap); +static int rbd_img_request_submit(struct rbd_img_request *img_request); + +static void rbd_dev_device_release(struct device *dev); static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -383,8 +421,19 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
# define rbd_assert(expr) ((void) 0) #endif /* !RBD_DEBUG */ -static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); +static void rbd_img_parent_read(struct rbd_obj_request *obj_request); +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); + +static int rbd_dev_refresh(struct rbd_device *rbd_dev); +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev); +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, + u64 snap_id); +static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, + u8 *order, u64 *snap_size); +static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_features); +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -484,6 +533,13 @@ out_opt: return ERR_PTR(ret); } +static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) +{ + kref_get(&rbdc->kref); + + return rbdc; +} + /* * Find a ceph client with specific addr and configuration. If * found, bump its reference count. @@ -499,7 +555,8 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) spin_lock(&rbd_client_list_lock); list_for_each_entry(client_node, &rbd_client_list, node) { if (!ceph_compare_options(ceph_opts, client_node->client)) { - kref_get(&client_node->kref); + __rbd_get_client(client_node); + found = true; break; } @@ -722,7 +779,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_sizes[i] = le64_to_cpu(ondisk->snaps[i].image_size); } else { - WARN_ON(ondisk->snap_names_len); header->snap_names = NULL; header->snap_sizes = NULL; } @@ -735,18 +791,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header, /* Allocate and fill in the snapshot context */ header->image_size = le64_to_cpu(ondisk->image_size); - size = sizeof (struct ceph_snap_context); - size += snap_count * sizeof (header->snapc->snaps[0]); - header->snapc = kzalloc(size, GFP_KERNEL); + + header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); if (!header->snapc) goto out_err; - - atomic_set(&header->snapc->nref, 1); header->snapc->seq = le64_to_cpu(ondisk->snap_seq); - header->snapc->num_snaps = snap_count; for (i = 0; i < snap_count; i++) - header->snapc->snaps[i] = - le64_to_cpu(ondisk->snaps[i].id); + header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); return 0; @@ -761,70 +812,174 @@ out_err: return -ENOMEM; } -static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) +static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) { - struct rbd_snap *snap; + const char *snap_name; + rbd_assert(which < rbd_dev->header.snapc->num_snaps); + + /* Skip over names until we find the one we are looking for */ + + snap_name = rbd_dev->header.snap_names; + while (which--) + snap_name += strlen(snap_name) + 1; + + return kstrdup(snap_name, GFP_KERNEL); +} + +/* + * Snapshot id comparison function for use with qsort()/bsearch(). + * Note that result is for snapshots in *descending* order. + */ +static int snapid_compare_reverse(const void *s1, const void *s2) +{ + u64 snap_id1 = *(u64 *)s1; + u64 snap_id2 = *(u64 *)s2; + + if (snap_id1 < snap_id2) + return 1; + return snap_id1 == snap_id2 ? 0 : -1; +} + +/* + * Search a snapshot context to see if the given snapshot id is + * present. 
+ * + * Returns the position of the snapshot id in the array if it's found, + * or BAD_SNAP_INDEX otherwise. + * + * Note: The snapshot array is kept sorted (by the osd) in + * reverse order, highest snapshot id first. + */ +static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) +{ + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + u64 *found; + + found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, + sizeof (snap_id), snapid_compare_reverse); + + return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; +} + +static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, + u64 snap_id) +{ + u32 which; + + which = rbd_dev_snap_index(rbd_dev, snap_id); + if (which == BAD_SNAP_INDEX) + return NULL; + + return _rbd_dev_v1_snap_name(rbd_dev, which); +} + +static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) +{ if (snap_id == CEPH_NOSNAP) return RBD_SNAP_HEAD_NAME; - list_for_each_entry(snap, &rbd_dev->snaps, node) - if (snap_id == snap->id) - return snap->name; + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + if (rbd_dev->image_format == 1) + return rbd_dev_v1_snap_name(rbd_dev, snap_id); - return NULL; + return rbd_dev_v2_snap_name(rbd_dev, snap_id); } -static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) +static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_size) { + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + if (snap_id == CEPH_NOSNAP) { + *snap_size = rbd_dev->header.image_size; + } else if (rbd_dev->image_format == 1) { + u32 which; - struct rbd_snap *snap; + which = rbd_dev_snap_index(rbd_dev, snap_id); + if (which == BAD_SNAP_INDEX) + return -ENOENT; - list_for_each_entry(snap, &rbd_dev->snaps, node) { - if (!strcmp(snap_name, snap->name)) { - rbd_dev->spec->snap_id = snap->id; - rbd_dev->mapping.size = snap->size; - rbd_dev->mapping.features = snap->features; + *snap_size = rbd_dev->header.snap_sizes[which]; + } else { + u64 size = 0; + int ret; - return 0; - } + ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); + if (ret) + return ret; + + *snap_size = size; } + return 0; +} + +static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_features) +{ + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + if (snap_id == CEPH_NOSNAP) { + *snap_features = rbd_dev->header.features; + } else if (rbd_dev->image_format == 1) { + *snap_features = 0; /* No features for format 1 */ + } else { + u64 features = 0; + int ret; + + ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); + if (ret) + return ret; - return -ENOENT; + *snap_features = features; + } + return 0; } -static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) +static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) { + const char *snap_name = rbd_dev->spec->snap_name; + u64 snap_id; + u64 size = 0; + u64 features = 0; int ret; - if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, - sizeof (RBD_SNAP_HEAD_NAME))) { - rbd_dev->spec->snap_id = CEPH_NOSNAP; - rbd_dev->mapping.size = rbd_dev->header.image_size; - rbd_dev->mapping.features = rbd_dev->header.features; - ret = 0; + if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) { + snap_id = rbd_snap_id_by_name(rbd_dev, snap_name); + if (snap_id == CEPH_NOSNAP) + return -ENOENT; } else { - ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); - if (ret < 0) - goto done; - rbd_dev->mapping.read_only = true; + snap_id = CEPH_NOSNAP; } - set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); -done: - 
return ret; + ret = rbd_snap_size(rbd_dev, snap_id, &size); + if (ret) + return ret; + ret = rbd_snap_features(rbd_dev, snap_id, &features); + if (ret) + return ret; + + rbd_dev->mapping.size = size; + rbd_dev->mapping.features = features; + + /* If we are mapping a snapshot it must be marked read-only */ + + if (snap_id != CEPH_NOSNAP) + rbd_dev->mapping.read_only = true; + + return 0; } -static void rbd_header_free(struct rbd_image_header *header) +static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) { - kfree(header->object_prefix); - header->object_prefix = NULL; - kfree(header->snap_sizes); - header->snap_sizes = NULL; - kfree(header->snap_names); - header->snap_names = NULL; - ceph_put_snap_context(header->snapc); - header->snapc = NULL; + rbd_dev->mapping.size = 0; + rbd_dev->mapping.features = 0; + rbd_dev->mapping.read_only = true; +} + +static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev) +{ + rbd_dev->mapping.size = 0; + rbd_dev->mapping.features = 0; + rbd_dev->mapping.read_only = true; } static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) @@ -833,7 +988,7 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) u64 segment; int ret; - name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); + name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); if (!name) return NULL; segment = offset >> rbd_dev->header.obj_order; @@ -849,6 +1004,13 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) return name; } +static void rbd_segment_name_free(const char *name) +{ + /* The explicit cast here is needed to drop the const qualifier */ + + kmem_cache_free(rbd_segment_name_cache, (void *)name); +} + static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) { u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; @@ -921,6 +1083,37 @@ static void zero_bio_chain(struct bio *chain, int start_ofs) } /* + * similar to zero_bio_chain(), zeros data defined by a page array, + * starting at the given byte offset from the start of the array and + * continuing up to the given end offset. The pages array is + * assumed to be big enough to hold all bytes up to the end. + */ +static void zero_pages(struct page **pages, u64 offset, u64 end) +{ + struct page **page = &pages[offset >> PAGE_SHIFT]; + + rbd_assert(end > offset); + rbd_assert(end - offset <= (u64)SIZE_MAX); + while (offset < end) { + size_t page_offset; + size_t length; + unsigned long flags; + void *kaddr; + + page_offset = (size_t)(offset & ~PAGE_MASK); + length = min(PAGE_SIZE - page_offset, (size_t)(end - offset)); + local_irq_save(flags); + kaddr = kmap_atomic(*page); + memset(kaddr + page_offset, 0, length); + kunmap_atomic(kaddr); + local_irq_restore(flags); + + offset += length; + page++; + } +} + +/* * Clone a portion of a bio, starting at the given byte offset * and continuing for the number of bytes indicated. */ @@ -1064,6 +1257,77 @@ out_err: return NULL; } +/* + * The default/initial value for all object request flags is 0. For + * each flag, once its value is set to 1 it is never reset to 0 + * again. 
+ */ +static void obj_request_img_data_set(struct rbd_obj_request *obj_request) +{ + if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { + struct rbd_device *rbd_dev; + + rbd_dev = obj_request->img_request->rbd_dev; + rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", + obj_request); + } +} + +static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; +} + +static void obj_request_done_set(struct rbd_obj_request *obj_request) +{ + if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { + struct rbd_device *rbd_dev = NULL; + + if (obj_request_img_data_test(obj_request)) + rbd_dev = obj_request->img_request->rbd_dev; + rbd_warn(rbd_dev, "obj_request %p already marked done\n", + obj_request); + } +} + +static bool obj_request_done_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; +} + +/* + * This sets the KNOWN flag after (possibly) setting the EXISTS + * flag. The latter is set based on the "exists" value provided. + * + * Note that for our purposes once an object exists it never goes + * away again. It's possible that the responses from two existence + * checks are separated by the creation of the target object, and + * the first ("doesn't exist") response arrives *after* the second + * ("does exist"). In that case we ignore the second one. + */ +static void obj_request_existence_set(struct rbd_obj_request *obj_request, + bool exists) +{ + if (exists) + set_bit(OBJ_REQ_EXISTS, &obj_request->flags); + set_bit(OBJ_REQ_KNOWN, &obj_request->flags); + smp_mb(); +} + +static bool obj_request_known_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; +} + +static bool obj_request_exists_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; +} + static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { dout("%s: obj %p (was %d)\n", __func__, obj_request, @@ -1101,9 +1365,11 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, { rbd_assert(obj_request->img_request == NULL); - rbd_obj_request_get(obj_request); + /* Image request now owns object's original reference */ obj_request->img_request = img_request; obj_request->which = img_request->obj_request_count; + rbd_assert(!obj_request_img_data_test(obj_request)); + obj_request_img_data_set(obj_request); rbd_assert(obj_request->which != BAD_WHICH); img_request->obj_request_count++; list_add_tail(&obj_request->links, &img_request->obj_requests); @@ -1123,6 +1389,7 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, img_request->obj_request_count--; rbd_assert(obj_request->which == img_request->obj_request_count); obj_request->which = BAD_WHICH; + rbd_assert(obj_request_img_data_test(obj_request)); rbd_assert(obj_request->img_request == img_request); obj_request->img_request = NULL; obj_request->callback = NULL; @@ -1141,76 +1408,6 @@ static bool obj_request_type_valid(enum obj_request_type type) { } } -static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) 
-{ - struct ceph_osd_req_op *op; - va_list args; - size_t size; - - op = kzalloc(sizeof (*op), GFP_NOIO); - if (!op) - return NULL; - op->op = opcode; - va_start(args, opcode); - switch (opcode) { - case CEPH_OSD_OP_READ: - case CEPH_OSD_OP_WRITE: - /* rbd_osd_req_op_create(READ, offset, length) */ - /* rbd_osd_req_op_create(WRITE, offset, length) */ - op->extent.offset = va_arg(args, u64); - op->extent.length = va_arg(args, u64); - if (opcode == CEPH_OSD_OP_WRITE) - op->payload_len = op->extent.length; - break; - case CEPH_OSD_OP_STAT: - break; - case CEPH_OSD_OP_CALL: - /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ - op->cls.class_name = va_arg(args, char *); - size = strlen(op->cls.class_name); - rbd_assert(size <= (size_t) U8_MAX); - op->cls.class_len = size; - op->payload_len = size; - - op->cls.method_name = va_arg(args, char *); - size = strlen(op->cls.method_name); - rbd_assert(size <= (size_t) U8_MAX); - op->cls.method_len = size; - op->payload_len += size; - - op->cls.argc = 0; - op->cls.indata = va_arg(args, void *); - size = va_arg(args, size_t); - rbd_assert(size <= (size_t) U32_MAX); - op->cls.indata_len = (u32) size; - op->payload_len += size; - break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ - /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ - op->watch.cookie = va_arg(args, u64); - op->watch.ver = va_arg(args, u64); - op->watch.ver = cpu_to_le64(op->watch.ver); - if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) - op->watch.flag = (u8) 1; - break; - default: - rbd_warn(NULL, "unsupported opcode %hu\n", opcode); - kfree(op); - op = NULL; - break; - } - va_end(args); - - return op; -} - -static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) -{ - kfree(op); -} - static int rbd_obj_request_submit(struct ceph_osd_client *osdc, struct rbd_obj_request *obj_request) { @@ -1221,7 +1418,24 @@ static int rbd_obj_request_submit(struct ceph_osd_client *osdc, static void rbd_img_request_complete(struct rbd_img_request *img_request) { + dout("%s: img %p\n", __func__, img_request); + + /* + * If no error occurred, compute the aggregate transfer + * count for the image request. We could instead use + * atomic64_cmpxchg() to update it as each object request + * completes; not clear which way is better off hand. + */ + if (!img_request->result) { + struct rbd_obj_request *obj_request; + u64 xferred = 0; + + for_each_obj_request(img_request, obj_request) + xferred += obj_request->xferred; + img_request->xferred = xferred; + } + if (img_request->callback) img_request->callback(img_request); else @@ -1237,39 +1451,56 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) return wait_for_completion_interruptible(&obj_request->completion); } -static void obj_request_done_init(struct rbd_obj_request *obj_request) +/* + * The default/initial value for all image request flags is 0. Each + * is conditionally set to 1 at image request initialization time + * and currently never changes thereafter. 
+ */ +static void img_request_write_set(struct rbd_img_request *img_request) { - atomic_set(&obj_request->done, 0); - smp_wmb(); + set_bit(IMG_REQ_WRITE, &img_request->flags); + smp_mb(); } -static void obj_request_done_set(struct rbd_obj_request *obj_request) +static bool img_request_write_test(struct rbd_img_request *img_request) { - int done; + smp_mb(); + return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; +} - done = atomic_inc_return(&obj_request->done); - if (done > 1) { - struct rbd_img_request *img_request = obj_request->img_request; - struct rbd_device *rbd_dev; +static void img_request_child_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_CHILD, &img_request->flags); + smp_mb(); +} - rbd_dev = img_request ? img_request->rbd_dev : NULL; - rbd_warn(rbd_dev, "obj_request %p was already done\n", - obj_request); - } +static bool img_request_child_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; } -static bool obj_request_done_test(struct rbd_obj_request *obj_request) +static void img_request_layered_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_LAYERED, &img_request->flags); + smp_mb(); +} + +static bool img_request_layered_test(struct rbd_img_request *img_request) { smp_mb(); - return atomic_read(&obj_request->done) != 0; + return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; } static void rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) { + u64 xferred = obj_request->xferred; + u64 length = obj_request->length; + dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, obj_request, obj_request->img_request, obj_request->result, - obj_request->xferred, obj_request->length); + xferred, length); /* * ENOENT means a hole in the image. We zero-fill the * entire length of the request. A short read also implies @@ -1277,15 +1508,20 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) * update the xferred count to indicate the whole request * was satisfied. 
*/ - BUG_ON(obj_request->type != OBJ_REQUEST_BIO); + rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); if (obj_request->result == -ENOENT) { - zero_bio_chain(obj_request->bio_list, 0); + if (obj_request->type == OBJ_REQUEST_BIO) + zero_bio_chain(obj_request->bio_list, 0); + else + zero_pages(obj_request->pages, 0, length); obj_request->result = 0; - obj_request->xferred = obj_request->length; - } else if (obj_request->xferred < obj_request->length && - !obj_request->result) { - zero_bio_chain(obj_request->bio_list, obj_request->xferred); - obj_request->xferred = obj_request->length; + obj_request->xferred = length; + } else if (xferred < length && !obj_request->result) { + if (obj_request->type == OBJ_REQUEST_BIO) + zero_bio_chain(obj_request->bio_list, xferred); + else + zero_pages(obj_request->pages, xferred, length); + obj_request->xferred = length; } obj_request_done_set(obj_request); } @@ -1308,9 +1544,23 @@ static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { - dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, - obj_request->result, obj_request->xferred, obj_request->length); - if (obj_request->img_request) + struct rbd_img_request *img_request = NULL; + struct rbd_device *rbd_dev = NULL; + bool layered = false; + + if (obj_request_img_data_test(obj_request)) { + img_request = obj_request->img_request; + layered = img_request && img_request_layered_test(img_request); + rbd_dev = img_request->rbd_dev; + } + + dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, + obj_request, img_request, obj_request->result, + obj_request->xferred, obj_request->length); + if (layered && obj_request->result == -ENOENT && + obj_request->img_offset < rbd_dev->parent_overlap) + rbd_img_parent_read(obj_request); + else if (img_request) rbd_img_obj_request_read_callback(obj_request); else obj_request_done_set(obj_request); @@ -1321,9 +1571,8 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) dout("%s: obj %p result %d %llu\n", __func__, obj_request, obj_request->result, obj_request->length); /* - * There is no such thing as a successful short write. - * Our xferred value is the number of bytes transferred - * back. Set it to our originally-requested length. + * There is no such thing as a successful short write. Set + * it to our originally-requested length. */ obj_request->xferred = obj_request->length; obj_request_done_set(obj_request); @@ -1347,22 +1596,25 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); rbd_assert(osd_req == obj_request->osd_req); - rbd_assert(!!obj_request->img_request ^ - (obj_request->which == BAD_WHICH)); + if (obj_request_img_data_test(obj_request)) { + rbd_assert(obj_request->img_request); + rbd_assert(obj_request->which != BAD_WHICH); + } else { + rbd_assert(obj_request->which == BAD_WHICH); + } if (osd_req->r_result < 0) obj_request->result = osd_req->r_result; - obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); - WARN_ON(osd_req->r_num_ops != 1); /* For now */ + BUG_ON(osd_req->r_num_ops > 2); /* * We support a 64-bit length, but ultimately it has to be * passed to blk_end_request(), which takes an unsigned int. 
*/ obj_request->xferred = osd_req->r_reply_op_len[0]; - rbd_assert(obj_request->xferred < (u64) UINT_MAX); - opcode = osd_req->r_request_ops[0].op; + rbd_assert(obj_request->xferred < (u64)UINT_MAX); + opcode = osd_req->r_ops[0].op; switch (opcode) { case CEPH_OSD_OP_READ: rbd_osd_read_callback(obj_request); @@ -1388,28 +1640,49 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, rbd_obj_request_complete(obj_request); } +static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_osd_request *osd_req = obj_request->osd_req; + u64 snap_id; + + rbd_assert(osd_req != NULL); + + snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; + ceph_osdc_build_request(osd_req, obj_request->offset, + NULL, snap_id, NULL); +} + +static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_osd_request *osd_req = obj_request->osd_req; + struct ceph_snap_context *snapc; + struct timespec mtime = CURRENT_TIME; + + rbd_assert(osd_req != NULL); + + snapc = img_request ? img_request->snapc : NULL; + ceph_osdc_build_request(osd_req, obj_request->offset, + snapc, CEPH_NOSNAP, &mtime); +} + static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, bool write_request, - struct rbd_obj_request *obj_request, - struct ceph_osd_req_op *op) + struct rbd_obj_request *obj_request) { - struct rbd_img_request *img_request = obj_request->img_request; struct ceph_snap_context *snapc = NULL; struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; - struct timespec now; - struct timespec *mtime; - u64 snap_id = CEPH_NOSNAP; - u64 offset = obj_request->offset; - u64 length = obj_request->length; - if (img_request) { - rbd_assert(img_request->write_request == write_request); - if (img_request->write_request) + if (obj_request_img_data_test(obj_request)) { + struct rbd_img_request *img_request = obj_request->img_request; + + rbd_assert(write_request == + img_request_write_test(img_request)); + if (write_request) snapc = img_request->snapc; - else - snap_id = img_request->snap_id; } /* Allocate and initialize the request, for the single op */ @@ -1419,31 +1692,10 @@ static struct ceph_osd_request *rbd_osd_req_create( if (!osd_req) return NULL; /* ENOMEM */ - rbd_assert(obj_request_type_valid(obj_request->type)); - switch (obj_request->type) { - case OBJ_REQUEST_NODATA: - break; /* Nothing to do */ - case OBJ_REQUEST_BIO: - rbd_assert(obj_request->bio_list != NULL); - osd_req->r_bio = obj_request->bio_list; - break; - case OBJ_REQUEST_PAGES: - osd_req->r_pages = obj_request->pages; - osd_req->r_num_pages = obj_request->page_count; - osd_req->r_page_alignment = offset & ~PAGE_MASK; - break; - } - - if (write_request) { + if (write_request) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; - now = CURRENT_TIME; - mtime = &now; - } else { + else osd_req->r_flags = CEPH_OSD_FLAG_READ; - mtime = NULL; /* not needed for reads */ - offset = 0; /* These are not used... 
*/ - length = 0; /* ...for osd read requests */ - } osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; @@ -1454,14 +1706,51 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_file_layout = rbd_dev->layout; /* struct */ - /* osd_req will get its own reference to snapc (if non-null) */ + return osd_req; +} - ceph_osdc_build_request(osd_req, offset, length, 1, op, - snapc, snap_id, mtime); +/* + * Create a copyup osd request based on the information in the + * object request supplied. A copyup request has two osd ops, + * a copyup method call, and a "normal" write request. + */ +static struct ceph_osd_request * +rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + struct ceph_snap_context *snapc; + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + struct ceph_osd_request *osd_req; + + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + rbd_assert(img_request); + rbd_assert(img_request_write_test(img_request)); + + /* Allocate and initialize the request, for the two ops */ + + snapc = img_request->snapc; + rbd_dev = img_request->rbd_dev; + osdc = &rbd_dev->rbd_client->client->osdc; + osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); + if (!osd_req) + return NULL; /* ENOMEM */ + + osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + osd_req->r_callback = rbd_osd_req_callback; + osd_req->r_priv = obj_request; + + osd_req->r_oid_len = strlen(obj_request->object_name); + rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); + memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); + + osd_req->r_file_layout = rbd_dev->layout; /* struct */ return osd_req; } + static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) { ceph_osdc_put_request(osd_req); @@ -1480,18 +1769,23 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, rbd_assert(obj_request_type_valid(type)); size = strlen(object_name) + 1; - obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); - if (!obj_request) + name = kmalloc(size, GFP_KERNEL); + if (!name) return NULL; - name = (char *)(obj_request + 1); + obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL); + if (!obj_request) { + kfree(name); + return NULL; + } + obj_request->object_name = memcpy(name, object_name, size); obj_request->offset = offset; obj_request->length = length; + obj_request->flags = 0; obj_request->which = BAD_WHICH; obj_request->type = type; INIT_LIST_HEAD(&obj_request->links); - obj_request_done_init(obj_request); init_completion(&obj_request->completion); kref_init(&obj_request->kref); @@ -1530,7 +1824,9 @@ static void rbd_obj_request_destroy(struct kref *kref) break; } - kfree(obj_request); + kfree(obj_request->object_name); + obj_request->object_name = NULL; + kmem_cache_free(rbd_obj_request_cache, obj_request); } /* @@ -1541,37 +1837,40 @@ static void rbd_obj_request_destroy(struct kref *kref) static struct rbd_img_request *rbd_img_request_create( struct rbd_device *rbd_dev, u64 offset, u64 length, - bool write_request) + bool write_request, + bool child_request) { struct rbd_img_request *img_request; - struct ceph_snap_context *snapc = NULL; - img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); + img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC); if (!img_request) return NULL; if (write_request) { down_read(&rbd_dev->header_rwsem); - snapc = 
ceph_get_snap_context(rbd_dev->header.snapc); + ceph_get_snap_context(rbd_dev->header.snapc); up_read(&rbd_dev->header_rwsem); - if (WARN_ON(!snapc)) { - kfree(img_request); - return NULL; /* Shouldn't happen */ - } } img_request->rq = NULL; img_request->rbd_dev = rbd_dev; img_request->offset = offset; img_request->length = length; - img_request->write_request = write_request; - if (write_request) - img_request->snapc = snapc; - else + img_request->flags = 0; + if (write_request) { + img_request_write_set(img_request); + img_request->snapc = rbd_dev->header.snapc; + } else { img_request->snap_id = rbd_dev->spec->snap_id; + } + if (child_request) + img_request_child_set(img_request); + if (rbd_dev->parent_spec) + img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; img_request->callback = NULL; + img_request->result = 0; img_request->obj_request_count = 0; INIT_LIST_HEAD(&img_request->obj_requests); kref_init(&img_request->kref); @@ -1600,78 +1899,204 @@ static void rbd_img_request_destroy(struct kref *kref) rbd_img_obj_request_del(img_request, obj_request); rbd_assert(img_request->obj_request_count == 0); - if (img_request->write_request) + if (img_request_write_test(img_request)) ceph_put_snap_context(img_request->snapc); - kfree(img_request); + if (img_request_child_test(img_request)) + rbd_obj_request_put(img_request->obj_request); + + kmem_cache_free(rbd_img_request_cache, img_request); +} + +static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + unsigned int xferred; + int result; + bool more; + + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + + rbd_assert(obj_request->xferred <= (u64)UINT_MAX); + xferred = (unsigned int)obj_request->xferred; + result = obj_request->result; + if (result) { + struct rbd_device *rbd_dev = img_request->rbd_dev; + + rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", + img_request_write_test(img_request) ? 
"write" : "read", + obj_request->length, obj_request->img_offset, + obj_request->offset); + rbd_warn(rbd_dev, " result %d xferred %x\n", + result, xferred); + if (!img_request->result) + img_request->result = result; + } + + /* Image object requests don't own their page array */ + + if (obj_request->type == OBJ_REQUEST_PAGES) { + obj_request->pages = NULL; + obj_request->page_count = 0; + } + + if (img_request_child_test(img_request)) { + rbd_assert(img_request->obj_request != NULL); + more = obj_request->which < img_request->obj_request_count - 1; + } else { + rbd_assert(img_request->rq != NULL); + more = blk_end_request(img_request->rq, result, xferred); + } + + return more; } -static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, - struct bio *bio_list) +static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + u32 which = obj_request->which; + bool more = true; + + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); + rbd_assert(img_request != NULL); + rbd_assert(img_request->obj_request_count > 0); + rbd_assert(which != BAD_WHICH); + rbd_assert(which < img_request->obj_request_count); + rbd_assert(which >= img_request->next_completion); + + spin_lock_irq(&img_request->completion_lock); + if (which != img_request->next_completion) + goto out; + + for_each_obj_request_from(img_request, obj_request) { + rbd_assert(more); + rbd_assert(which < img_request->obj_request_count); + + if (!obj_request_done_test(obj_request)) + break; + more = rbd_img_obj_end_request(obj_request); + which++; + } + + rbd_assert(more ^ (which == img_request->obj_request_count)); + img_request->next_completion = which; +out: + spin_unlock_irq(&img_request->completion_lock); + + if (!more) + rbd_img_request_complete(img_request); +} + +/* + * Split up an image request into one or more object requests, each + * to a different object. The "type" parameter indicates whether + * "data_desc" is the pointer to the head of a list of bio + * structures, or the base of a page array. In either case this + * function assumes data_desc describes memory sufficient to hold + * all data described by the image request. + */ +static int rbd_img_request_fill(struct rbd_img_request *img_request, + enum obj_request_type type, + void *data_desc) { struct rbd_device *rbd_dev = img_request->rbd_dev; struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; - unsigned int bio_offset; - u64 image_offset; + bool write_request = img_request_write_test(img_request); + struct bio *bio_list; + unsigned int bio_offset = 0; + struct page **pages; + u64 img_offset; u64 resid; u16 opcode; - dout("%s: img %p bio %p\n", __func__, img_request, bio_list); + dout("%s: img %p type %d data_desc %p\n", __func__, img_request, + (int)type, data_desc); - opcode = img_request->write_request ? CEPH_OSD_OP_WRITE - : CEPH_OSD_OP_READ; - bio_offset = 0; - image_offset = img_request->offset; - rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT); + opcode = write_request ? 
CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; + img_offset = img_request->offset; resid = img_request->length; rbd_assert(resid > 0); + + if (type == OBJ_REQUEST_BIO) { + bio_list = data_desc; + rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); + } else { + rbd_assert(type == OBJ_REQUEST_PAGES); + pages = data_desc; + } + while (resid) { + struct ceph_osd_request *osd_req; const char *object_name; - unsigned int clone_size; - struct ceph_osd_req_op *op; u64 offset; u64 length; - object_name = rbd_segment_name(rbd_dev, image_offset); + object_name = rbd_segment_name(rbd_dev, img_offset); if (!object_name) goto out_unwind; - offset = rbd_segment_offset(rbd_dev, image_offset); - length = rbd_segment_length(rbd_dev, image_offset, resid); + offset = rbd_segment_offset(rbd_dev, img_offset); + length = rbd_segment_length(rbd_dev, img_offset, resid); obj_request = rbd_obj_request_create(object_name, - offset, length, - OBJ_REQUEST_BIO); - kfree(object_name); /* object request has its own copy */ + offset, length, type); + /* object request has its own copy of the object name */ + rbd_segment_name_free(object_name); if (!obj_request) goto out_unwind; - rbd_assert(length <= (u64) UINT_MAX); - clone_size = (unsigned int) length; - obj_request->bio_list = bio_chain_clone_range(&bio_list, - &bio_offset, clone_size, - GFP_ATOMIC); - if (!obj_request->bio_list) - goto out_partial; + if (type == OBJ_REQUEST_BIO) { + unsigned int clone_size; + + rbd_assert(length <= (u64)UINT_MAX); + clone_size = (unsigned int)length; + obj_request->bio_list = + bio_chain_clone_range(&bio_list, + &bio_offset, + clone_size, + GFP_ATOMIC); + if (!obj_request->bio_list) + goto out_partial; + } else { + unsigned int page_count; + + obj_request->pages = pages; + page_count = (u32)calc_pages_for(offset, length); + obj_request->page_count = page_count; + if ((offset + length) & ~PAGE_MASK) + page_count--; /* more on last page */ + pages += page_count; + } - /* - * Build up the op to use in building the osd - * request. Note that the contents of the op are - * copied by rbd_osd_req_create(). 
- */ - op = rbd_osd_req_op_create(opcode, offset, length); - if (!op) - goto out_partial; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, - img_request->write_request, - obj_request, op); - rbd_osd_req_op_destroy(op); - if (!obj_request->osd_req) + osd_req = rbd_osd_req_create(rbd_dev, write_request, + obj_request); + if (!osd_req) goto out_partial; - /* status and version are initially zero-filled */ + obj_request->osd_req = osd_req; + obj_request->callback = rbd_img_obj_callback; + osd_req_op_extent_init(osd_req, 0, opcode, offset, length, + 0, 0); + if (type == OBJ_REQUEST_BIO) + osd_req_op_extent_osd_data_bio(osd_req, 0, + obj_request->bio_list, length); + else + osd_req_op_extent_osd_data_pages(osd_req, 0, + obj_request->pages, length, + offset & ~PAGE_MASK, false, false); + + if (write_request) + rbd_osd_req_format_write(obj_request); + else + rbd_osd_req_format_read(obj_request); + + obj_request->img_offset = img_offset; rbd_img_obj_request_add(img_request, obj_request); - image_offset += length; + img_offset += length; resid -= length; } @@ -1686,61 +2111,389 @@ out_unwind: return -ENOMEM; } -static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) +static void +rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; - u32 which = obj_request->which; - bool more = true; + struct rbd_device *rbd_dev; + u64 length; + u32 page_count; + rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request_img_data_test(obj_request)); img_request = obj_request->img_request; + rbd_assert(img_request); - dout("%s: img %p obj %p\n", __func__, img_request, obj_request); + rbd_dev = img_request->rbd_dev; + rbd_assert(rbd_dev); + length = (u64)1 << rbd_dev->header.obj_order; + page_count = (u32)calc_pages_for(0, length); + + rbd_assert(obj_request->copyup_pages); + ceph_release_page_vector(obj_request->copyup_pages, page_count); + obj_request->copyup_pages = NULL; + + /* + * We want the transfer count to reflect the size of the + * original write request. There is no such thing as a + * successful short write, so if the request was successful + * we can just set it to the originally-requested length. 
+ */ + if (!obj_request->result) + obj_request->xferred = obj_request->length; + + /* Finish up with the normal image object callback */ + + rbd_img_obj_callback(obj_request); +} + +static void +rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) +{ + struct rbd_obj_request *orig_request; + struct ceph_osd_request *osd_req; + struct ceph_osd_client *osdc; + struct rbd_device *rbd_dev; + struct page **pages; + int result; + u64 obj_size; + u64 xferred; + + rbd_assert(img_request_child_test(img_request)); + + /* First get what we need from the image request */ + + pages = img_request->copyup_pages; + rbd_assert(pages != NULL); + img_request->copyup_pages = NULL; + + orig_request = img_request->obj_request; + rbd_assert(orig_request != NULL); + rbd_assert(orig_request->type == OBJ_REQUEST_BIO); + result = img_request->result; + obj_size = img_request->length; + xferred = img_request->xferred; + + rbd_dev = img_request->rbd_dev; + rbd_assert(rbd_dev); + rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); + + rbd_img_request_put(img_request); + + if (result) + goto out_err; + + /* Allocate the new copyup osd request for the original request */ + + result = -ENOMEM; + rbd_assert(!orig_request->osd_req); + osd_req = rbd_osd_req_create_copyup(orig_request); + if (!osd_req) + goto out_err; + orig_request->osd_req = osd_req; + orig_request->copyup_pages = pages; + + /* Initialize the copyup op */ + + osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); + osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, + false, false); + + /* Then the original write request op */ + + osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, + orig_request->offset, + orig_request->length, 0, 0); + osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, + orig_request->length); + + rbd_osd_req_format_write(orig_request); + + /* All set, send it off. */ + + orig_request->callback = rbd_img_obj_copyup_callback; + osdc = &rbd_dev->rbd_client->client->osdc; + result = rbd_obj_request_submit(osdc, orig_request); + if (!result) + return; +out_err: + /* Record the error code and complete the request */ + + orig_request->result = result; + orig_request->xferred = 0; + obj_request_done_set(orig_request); + rbd_obj_request_complete(orig_request); +} + +/* + * Read from the parent image the range of data that covers the + * entire target of the given object request. This is used for + * satisfying a layered image write request when the target of an + * object request from the image request does not exist. + * + * A page array big enough to hold the returned data is allocated + * and supplied to rbd_img_request_fill() as the "data descriptor." + * When the read completes, this page array will be transferred to + * the original object request for the copyup operation. + * + * If an error occurs, record it as the result of the original + * object request and mark it done so it gets completed. 
+ */ +static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = NULL; + struct rbd_img_request *parent_request = NULL; + struct rbd_device *rbd_dev; + u64 img_offset; + u64 length; + struct page **pages = NULL; + u32 page_count; + int result; + + rbd_assert(obj_request_img_data_test(obj_request)); + rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + + img_request = obj_request->img_request; rbd_assert(img_request != NULL); - rbd_assert(img_request->rq != NULL); - rbd_assert(img_request->obj_request_count > 0); - rbd_assert(which != BAD_WHICH); - rbd_assert(which < img_request->obj_request_count); - rbd_assert(which >= img_request->next_completion); + rbd_dev = img_request->rbd_dev; + rbd_assert(rbd_dev->parent != NULL); - spin_lock_irq(&img_request->completion_lock); - if (which != img_request->next_completion) - goto out; + /* + * First things first. The original osd request is of no + * use to us any more, we'll need a new one that can hold + * the two ops in a copyup request. We'll get that later, + * but for now we can release the old one. + */ + rbd_osd_req_destroy(obj_request->osd_req); + obj_request->osd_req = NULL; - for_each_obj_request_from(img_request, obj_request) { - unsigned int xferred; - int result; + /* + * Determine the byte range covered by the object in the + * child image to which the original request was to be sent. + */ + img_offset = obj_request->img_offset - obj_request->offset; + length = (u64)1 << rbd_dev->header.obj_order; - rbd_assert(more); - rbd_assert(which < img_request->obj_request_count); + /* + * There is no defined parent data beyond the parent + * overlap, so limit what we read at that boundary if + * necessary. + */ + if (img_offset + length > rbd_dev->parent_overlap) { + rbd_assert(img_offset < rbd_dev->parent_overlap); + length = rbd_dev->parent_overlap - img_offset; + } - if (!obj_request_done_test(obj_request)) - break; + /* + * Allocate a page array big enough to receive the data read + * from the parent. + */ + page_count = (u32)calc_pages_for(0, length); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) { + result = PTR_ERR(pages); + pages = NULL; + goto out_err; + } - rbd_assert(obj_request->xferred <= (u64) UINT_MAX); - xferred = (unsigned int) obj_request->xferred; - result = (int) obj_request->result; - if (result) - rbd_warn(NULL, "obj_request %s result %d xferred %u\n", - img_request->write_request ? 
"write" : "read", - result, xferred); + result = -ENOMEM; + parent_request = rbd_img_request_create(rbd_dev->parent, + img_offset, length, + false, true); + if (!parent_request) + goto out_err; + rbd_obj_request_get(obj_request); + parent_request->obj_request = obj_request; - more = blk_end_request(img_request->rq, result, xferred); - which++; + result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); + if (result) + goto out_err; + parent_request->copyup_pages = pages; + + parent_request->callback = rbd_img_obj_parent_read_full_callback; + result = rbd_img_request_submit(parent_request); + if (!result) + return 0; + + parent_request->copyup_pages = NULL; + parent_request->obj_request = NULL; + rbd_obj_request_put(obj_request); +out_err: + if (pages) + ceph_release_page_vector(pages, page_count); + if (parent_request) + rbd_img_request_put(parent_request); + obj_request->result = result; + obj_request->xferred = 0; + obj_request_done_set(obj_request); + + return result; +} + +static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_obj_request *orig_request; + int result; + + rbd_assert(!obj_request_img_data_test(obj_request)); + + /* + * All we need from the object request is the original + * request and the result of the STAT op. Grab those, then + * we're done with the request. + */ + orig_request = obj_request->obj_request; + obj_request->obj_request = NULL; + rbd_assert(orig_request); + rbd_assert(orig_request->img_request); + + result = obj_request->result; + obj_request->result = 0; + + dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, + obj_request, orig_request, result, + obj_request->xferred, obj_request->length); + rbd_obj_request_put(obj_request); + + rbd_assert(orig_request); + rbd_assert(orig_request->img_request); + + /* + * Our only purpose here is to determine whether the object + * exists, and we don't want to treat the non-existence as + * an error. If something else comes back, transfer the + * error to the original request and complete it now. + */ + if (!result) { + obj_request_existence_set(orig_request, true); + } else if (result == -ENOENT) { + obj_request_existence_set(orig_request, false); + } else if (result) { + orig_request->result = result; + goto out; } - rbd_assert(more ^ (which == img_request->obj_request_count)); - img_request->next_completion = which; + /* + * Resubmit the original request now that we have recorded + * whether the target object exists. 
+ */ + orig_request->result = rbd_img_obj_request_submit(orig_request); out: - spin_unlock_irq(&img_request->completion_lock); + if (orig_request->result) + rbd_obj_request_complete(orig_request); + rbd_obj_request_put(orig_request); +} - if (!more) - rbd_img_request_complete(img_request); +static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) +{ + struct rbd_obj_request *stat_request; + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + struct page **pages = NULL; + u32 page_count; + size_t size; + int ret; + + /* + * The response data for a STAT call consists of: + * le64 length; + * struct { + * le32 tv_sec; + * le32 tv_nsec; + * } mtime; + */ + size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); + page_count = (u32)calc_pages_for(0, size); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = -ENOMEM; + stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, + OBJ_REQUEST_PAGES); + if (!stat_request) + goto out; + + rbd_obj_request_get(obj_request); + stat_request->obj_request = obj_request; + stat_request->pages = pages; + stat_request->page_count = page_count; + + rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, + stat_request); + if (!stat_request->osd_req) + goto out; + stat_request->callback = rbd_img_obj_exists_callback; + + osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); + osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, + false, false); + rbd_osd_req_format_read(stat_request); + + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, stat_request); +out: + if (ret) + rbd_obj_request_put(obj_request); + + return ret; +} + +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + struct rbd_device *rbd_dev; + bool known; + + rbd_assert(obj_request_img_data_test(obj_request)); + + img_request = obj_request->img_request; + rbd_assert(img_request); + rbd_dev = img_request->rbd_dev; + + /* + * Only writes to layered images need special handling. + * Reads and non-layered writes are simple object requests. + * Layered writes that start beyond the end of the overlap + * with the parent have no parent data, so they too are + * simple object requests. Finally, if the target object is + * known to already exist, its parent data has already been + * copied, so a write to the object can also be handled as a + * simple object request. + */ + if (!img_request_write_test(img_request) || + !img_request_layered_test(img_request) || + rbd_dev->parent_overlap <= obj_request->img_offset || + ((known = obj_request_known_test(obj_request)) && + obj_request_exists_test(obj_request))) { + + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + + rbd_dev = obj_request->img_request->rbd_dev; + osdc = &rbd_dev->rbd_client->client->osdc; + + return rbd_obj_request_submit(osdc, obj_request); + } + + /* + * It's a layered write. The target object might exist but + * we may not know that yet. If we know it doesn't exist, + * start by reading the data for the full target object from + * the parent so we can use it for a copyup to the target. + */ + if (known) + return rbd_img_obj_parent_read_full(obj_request); + + /* We don't know whether the target exists. Go find out. 
*/ + + return rbd_img_obj_exists_submit(obj_request); } static int rbd_img_request_submit(struct rbd_img_request *img_request) { - struct rbd_device *rbd_dev = img_request->rbd_dev; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; struct rbd_obj_request *next_obj_request; @@ -1748,27 +2501,105 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) for_each_obj_request_safe(img_request, obj_request, next_obj_request) { int ret; - obj_request->callback = rbd_img_obj_callback; - ret = rbd_obj_request_submit(osdc, obj_request); + ret = rbd_img_obj_request_submit(obj_request); if (ret) return ret; - /* - * The image request has its own reference to each - * of its object requests, so we can safely drop the - * initial one here. - */ - rbd_obj_request_put(obj_request); } return 0; } -static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, - u64 ver, u64 notify_id) +static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) { struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; - struct ceph_osd_client *osdc; + struct rbd_device *rbd_dev; + u64 obj_end; + + rbd_assert(img_request_child_test(img_request)); + + obj_request = img_request->obj_request; + rbd_assert(obj_request); + rbd_assert(obj_request->img_request); + + obj_request->result = img_request->result; + if (obj_request->result) + goto out; + + /* + * We need to zero anything beyond the parent overlap + * boundary. Since rbd_img_obj_request_read_callback() + * will zero anything beyond the end of a short read, an + * easy way to do this is to pretend the data from the + * parent came up short--ending at the overlap boundary. + */ + rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); + obj_end = obj_request->img_offset + obj_request->length; + rbd_dev = obj_request->img_request->rbd_dev; + if (obj_end > rbd_dev->parent_overlap) { + u64 xferred = 0; + + if (obj_request->img_offset < rbd_dev->parent_overlap) + xferred = rbd_dev->parent_overlap - + obj_request->img_offset; + + obj_request->xferred = min(img_request->xferred, xferred); + } else { + obj_request->xferred = img_request->xferred; + } +out: + rbd_img_request_put(img_request); + rbd_img_obj_request_read_callback(obj_request); + rbd_obj_request_complete(obj_request); +} + +static void rbd_img_parent_read(struct rbd_obj_request *obj_request) +{ + struct rbd_device *rbd_dev; + struct rbd_img_request *img_request; + int result; + + rbd_assert(obj_request_img_data_test(obj_request)); + rbd_assert(obj_request->img_request != NULL); + rbd_assert(obj_request->result == (s32) -ENOENT); + rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + + rbd_dev = obj_request->img_request->rbd_dev; + rbd_assert(rbd_dev->parent != NULL); + /* rbd_read_finish(obj_request, obj_request->length); */ + img_request = rbd_img_request_create(rbd_dev->parent, + obj_request->img_offset, + obj_request->length, + false, true); + result = -ENOMEM; + if (!img_request) + goto out_err; + + rbd_obj_request_get(obj_request); + img_request->obj_request = obj_request; + + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + obj_request->bio_list); + if (result) + goto out_err; + + img_request->callback = rbd_img_parent_read_callback; + result = rbd_img_request_submit(img_request); + if (result) + goto out_err; + + return; +out_err: + if (img_request) + rbd_img_request_put(img_request); + obj_request->result = result; + obj_request->xferred = 0; + obj_request_done_set(obj_request); +} + 
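[Editor's aside, not part of the patch: the clamping done in
rbd_img_parent_read_callback() above is easier to see as a standalone
helper.  This sketch assumes only that img_offset + length cannot
wrap, which the rbd_assert() above guarantees:

	static u64 clamp_to_parent_overlap(u64 img_offset, u64 length,
					   u64 parent_overlap, u64 xferred)
	{
		u64 obj_end = img_offset + length;

		if (obj_end <= parent_overlap)
			return xferred;		/* wholly within the overlap */
		if (img_offset >= parent_overlap)
			return 0;		/* wholly beyond the overlap */
		/* straddles the boundary: treat the parent read as short */
		return min(xferred, parent_overlap - img_offset);
	}

Everything past the returned byte count is then zeroed by
rbd_img_obj_request_read_callback(), exactly as for a short read.]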
+static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id) +{ + struct rbd_obj_request *obj_request; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; int ret; obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, @@ -1777,17 +2608,15 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, return -ENOMEM; ret = -ENOMEM; - op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); - if (!op) - goto out; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); if (!obj_request->osd_req) goto out; - - osdc = &rbd_dev->rbd_client->client->osdc; obj_request->callback = rbd_obj_request_put; + + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, + notify_id, 0, 0); + rbd_osd_req_format_read(obj_request); + ret = rbd_obj_request_submit(osdc, obj_request); out: if (ret) @@ -1799,21 +2628,16 @@ out: static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { struct rbd_device *rbd_dev = (struct rbd_device *)data; - u64 hver; - int rc; if (!rbd_dev) return; dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, - rbd_dev->header_name, (unsigned long long) notify_id, - (unsigned int) opcode); - rc = rbd_dev_refresh(rbd_dev, &hver); - if (rc) - rbd_warn(rbd_dev, "got notification but failed to " - " update snaps: %d\n", rc); + rbd_dev->header_name, (unsigned long long)notify_id, + (unsigned int)opcode); + (void)rbd_dev_refresh(rbd_dev); - rbd_obj_notify_ack(rbd_dev, hver, notify_id); + rbd_obj_notify_ack(rbd_dev, notify_id); } /* @@ -1824,7 +2648,6 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; int ret; rbd_assert(start ^ !!rbd_dev->watch_event); @@ -1844,14 +2667,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request) goto out_cancel; - op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, - rbd_dev->header.obj_version, start); - if (!op) - goto out_cancel; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); if (!obj_request->osd_req) goto out_cancel; @@ -1860,6 +2676,11 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (start) ceph_osdc_set_request_linger(osdc, obj_request->osd_req); else ceph_osdc_unregister_linger_request(osdc, rbd_dev->watch_request->osd_req); + + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, 0, start); + rbd_osd_req_format_write(obj_request); + ret = rbd_obj_request_submit(osdc, obj_request); if (ret) goto out_cancel; @@ -1899,40 +2720,38 @@ out_cancel: } /* - * Synchronous osd object method call + * Synchronous osd object method call. Returns the number of bytes + * returned in the inbound buffer, or a negative error code.
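+ * A typical call -- the get_object_prefix request made below, for
+ * instance -- passes no outbound data and a caller-owned reply
+ * buffer:
+ *
+ *	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+ *			"rbd", "get_object_prefix", NULL, 0,
+ *			reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
+ *
+ * On success, ret is the number of reply bytes valid in reply_buf.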
*/ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, const char *object_name, const char *class_name, const char *method_name, - const char *outbound, + const void *outbound, size_t outbound_size, - char *inbound, - size_t inbound_size, - u64 *version) + void *inbound, + size_t inbound_size) { + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_client *osdc; - struct ceph_osd_req_op *op; struct page **pages; u32 page_count; int ret; /* - * Method calls are ultimately read operations but they - * don't involve object data (so no offset or length). - * The result should placed into the inbound buffer - * provided. They also supply outbound data--parameters for - * the object method. Currently if this is present it will - * be a snapshot id. + * Method calls are ultimately read operations. The result + * should be placed into the inbound buffer provided. They + * also supply outbound data--parameters for the object + * method. Currently if this is present it will be a + * snapshot id. */ - page_count = (u32) calc_pages_for(0, inbound_size); + page_count = (u32)calc_pages_for(0, inbound_size); pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); ret = -ENOMEM; - obj_request = rbd_obj_request_create(object_name, 0, 0, + obj_request = rbd_obj_request_create(object_name, 0, inbound_size, OBJ_REQUEST_PAGES); if (!obj_request) goto out; @@ -1940,17 +2759,29 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, - method_name, outbound, outbound_size); - if (!op) - goto out; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); if (!obj_request->osd_req) goto out; - osdc = &rbd_dev->rbd_client->client->osdc; + osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, + class_name, method_name); + if (outbound_size) { + struct ceph_pagelist *pagelist; + + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + if (!pagelist) + goto out; + + ceph_pagelist_init(pagelist); + ceph_pagelist_append(pagelist, outbound, outbound_size); + osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, + pagelist); + } + osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, + obj_request->pages, inbound_size, + 0, false, false); + rbd_osd_req_format_read(obj_request); + ret = rbd_obj_request_submit(osdc, obj_request); if (ret) goto out; @@ -1961,10 +2792,10 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, ret = obj_request->result; if (ret < 0) goto out; - ret = 0; + + rbd_assert(obj_request->xferred < (u64)INT_MAX); + ret = (int)obj_request->xferred; ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); - if (version) - *version = obj_request->version; out: if (obj_request) rbd_obj_request_put(obj_request); @@ -2034,18 +2865,22 @@ static void rbd_request_fn(struct request_queue *q) } result = -EINVAL; - if (WARN_ON(offset && length > U64_MAX - offset + 1)) + if (offset && length > U64_MAX - offset + 1) { + rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", + offset, length); goto end_request; /* Shouldn't happen */ + } result = -ENOMEM; img_request = rbd_img_request_create(rbd_dev, offset, length, - write_request); + write_request, false); if (!img_request) goto end_request; img_request->rq = rq;
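+		/*
+		 * Editor's note: building the request is a two-step
+		 * affair: fill maps the request's bio chain onto
+		 * per-object requests, and submit dispatches each of
+		 * them, possibly via the layered-write paths in
+		 * rbd_img_obj_request_submit() above.
+		 */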
- result = rbd_img_request_fill_bio(img_request, rq->bio); + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + rq->bio); if (!result) result = rbd_img_request_submit(img_request); if (result) @@ -2053,8 +2888,10 @@ static void rbd_request_fn(struct request_queue *q) end_request: spin_lock_irq(q->queue_lock); if (result < 0) { - rbd_warn(rbd_dev, "obj_request %s result %d\n", - write_request ? "write" : "read", result); + rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", + write_request ? "write" : "read", + length, offset, result); + __blk_end_request_all(rq, result); } } @@ -2113,22 +2950,22 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) if (!disk) return; - if (disk->flags & GENHD_FL_UP) + rbd_dev->disk = NULL; + if (disk->flags & GENHD_FL_UP) { del_gendisk(disk); - if (disk->queue) - blk_cleanup_queue(disk->queue); + if (disk->queue) + blk_cleanup_queue(disk->queue); + } put_disk(disk); } static int rbd_obj_read_sync(struct rbd_device *rbd_dev, const char *object_name, - u64 offset, u64 length, - char *buf, u64 *version) + u64 offset, u64 length, void *buf) { - struct ceph_osd_req_op *op; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_client *osdc; struct page **pages = NULL; u32 page_count; size_t size; @@ -2148,16 +2985,19 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length); - if (!op) - goto out; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); if (!obj_request->osd_req) goto out; - osdc = &rbd_dev->rbd_client->client->osdc; + osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, + offset, length, 0, 0); + osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, + obj_request->pages, + obj_request->length, + obj_request->offset & ~PAGE_MASK, + false, false); + rbd_osd_req_format_read(obj_request); + ret = rbd_obj_request_submit(osdc, obj_request); if (ret) goto out; @@ -2172,10 +3012,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); size = (size_t) obj_request->xferred; ceph_copy_from_page_vector(pages, buf, 0, size); - rbd_assert(size <= (size_t) INT_MAX); - ret = (int) size; - if (version) - *version = obj_request->version; + rbd_assert(size <= (size_t)INT_MAX); + ret = (int)size; out: if (obj_request) rbd_obj_request_put(obj_request); @@ -2196,7 +3034,7 @@ out: * Returns a pointer-coded errno if a failure occurs. 
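 * (Pointer-coded: the return value is either a valid pointer or an
 * ERR_PTR()-encoded errno; callers such as rbd_read_header() below
 * distinguish the two with IS_ERR() and recover the code with
 * PTR_ERR().)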
*/ static struct rbd_image_header_ondisk * -rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) +rbd_dev_v1_header_read(struct rbd_device *rbd_dev) { struct rbd_image_header_ondisk *ondisk = NULL; u32 snap_count = 0; @@ -2224,11 +3062,10 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) return ERR_PTR(-ENOMEM); ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, - 0, size, - (char *) ondisk, version); + 0, size, ondisk); if (ret < 0) goto out_err; - if (WARN_ON((size_t) ret < size)) { + if ((size_t)ret < size) { ret = -ENXIO; rbd_warn(rbd_dev, "short header read (want %zd got %d)", size, ret); @@ -2260,46 +3097,36 @@ static int rbd_read_header(struct rbd_device *rbd_dev, struct rbd_image_header *header) { struct rbd_image_header_ondisk *ondisk; - u64 ver = 0; int ret; - ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); + ondisk = rbd_dev_v1_header_read(rbd_dev); if (IS_ERR(ondisk)) return PTR_ERR(ondisk); ret = rbd_header_from_disk(header, ondisk); - if (ret >= 0) - header->obj_version = ver; kfree(ondisk); return ret; } -static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) -{ - struct rbd_snap *snap; - struct rbd_snap *next; - - list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) - rbd_remove_snap_dev(snap); -} - static void rbd_update_mapping_size(struct rbd_device *rbd_dev) { - sector_t size; - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) return; - size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; - dout("setting size to %llu sectors", (unsigned long long) size); - rbd_dev->mapping.size = (u64) size; - set_capacity(rbd_dev->disk, size); + if (rbd_dev->mapping.size != rbd_dev->header.image_size) { + sector_t size; + + rbd_dev->mapping.size = rbd_dev->header.image_size; + size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; + dout("setting size to %llu sectors", (unsigned long long)size); + set_capacity(rbd_dev->disk, size); + } } /* * only read the first part of the ondisk header, without the snaps info */ -static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) +static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) { int ret; struct rbd_image_header h; @@ -2320,37 +3147,61 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) /* osd requests may still refer to snapc */ ceph_put_snap_context(rbd_dev->header.snapc); - if (hver) - *hver = h.obj_version; - rbd_dev->header.obj_version = h.obj_version; rbd_dev->header.image_size = h.image_size; rbd_dev->header.snapc = h.snapc; rbd_dev->header.snap_names = h.snap_names; rbd_dev->header.snap_sizes = h.snap_sizes; /* Free the extra copy of the object prefix */ - WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); + if (strcmp(rbd_dev->header.object_prefix, h.object_prefix)) + rbd_warn(rbd_dev, "object prefix changed (ignoring)"); kfree(h.object_prefix); - ret = rbd_dev_snaps_update(rbd_dev); - if (!ret) - ret = rbd_dev_snaps_register(rbd_dev); - up_write(&rbd_dev->header_rwsem); return ret; } -static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) +/* + * Clear the rbd device's EXISTS flag if the snapshot it's mapped to + * has disappeared from the (just updated) snapshot context. 
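+ * A mapped snapshot can be deleted on the server side at any time;
+ * once its id no longer appears in the context there is no point
+ * sending it further requests, so the flag is cleared here.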
+ */ +static void rbd_exists_validate(struct rbd_device *rbd_dev) +{ + u64 snap_id; + + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) + return; + + snap_id = rbd_dev->spec->snap_id; + if (snap_id == CEPH_NOSNAP) + return; + + if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); +} + +static int rbd_dev_refresh(struct rbd_device *rbd_dev) { + u64 image_size; int ret; rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + image_size = rbd_dev->header.image_size; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); if (rbd_dev->image_format == 1) - ret = rbd_dev_v1_refresh(rbd_dev, hver); + ret = rbd_dev_v1_refresh(rbd_dev); else - ret = rbd_dev_v2_refresh(rbd_dev, hver); + ret = rbd_dev_v2_refresh(rbd_dev); + + /* If it's a mapped snapshot, validate its EXISTS flag */ + + rbd_exists_validate(rbd_dev); mutex_unlock(&ctl_mutex); + if (ret) + rbd_warn(rbd_dev, "got notification but failed to " + "update snaps: %d\n", ret); + if (image_size != rbd_dev->header.image_size) + revalidate_disk(rbd_dev->disk); return ret; } @@ -2394,8 +3245,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->disk = disk; - set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); - return 0; out_disk: put_disk(disk); @@ -2416,13 +3265,9 @@ static ssize_t rbd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - sector_t size; - down_read(&rbd_dev->header_rwsem); - size = get_capacity(rbd_dev->disk); - up_read(&rbd_dev->header_rwsem); - - return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); + return sprintf(buf, "%llu\n", + (unsigned long long)rbd_dev->mapping.size); } /* @@ -2435,7 +3280,7 @@ static ssize_t rbd_features_show(struct device *dev, struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "0x%016llx\n", - (unsigned long long) rbd_dev->mapping.features); + (unsigned long long)rbd_dev->mapping.features); } static ssize_t rbd_major_show(struct device *dev, @@ -2443,7 +3288,11 @@ static ssize_t rbd_major_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%d\n", rbd_dev->major); + if (rbd_dev->major) + return sprintf(buf, "%d\n", rbd_dev->major); + + return sprintf(buf, "(none)\n"); + } static ssize_t rbd_client_id_show(struct device *dev, @@ -2469,7 +3318,7 @@ static ssize_t rbd_pool_id_show(struct device *dev, struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%llu\n", - (unsigned long long) rbd_dev->spec->pool_id); + (unsigned long long) rbd_dev->spec->pool_id); } static ssize_t rbd_name_show(struct device *dev, @@ -2555,7 +3404,7 @@ static ssize_t rbd_image_refresh(struct device *dev, struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); int ret; - ret = rbd_dev_refresh(rbd_dev, NULL); + ret = rbd_dev_refresh(rbd_dev); return ret < 0 ?
ret : size; } @@ -2606,71 +3455,6 @@ static struct device_type rbd_device_type = { .release = rbd_sysfs_dev_release, }; - -/* - sysfs - snapshots -*/ - -static ssize_t rbd_snap_size_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - - return sprintf(buf, "%llu\n", (unsigned long long)snap->size); -} - -static ssize_t rbd_snap_id_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - - return sprintf(buf, "%llu\n", (unsigned long long)snap->id); -} - -static ssize_t rbd_snap_features_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - - return sprintf(buf, "0x%016llx\n", - (unsigned long long) snap->features); -} - -static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); -static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); -static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); - -static struct attribute *rbd_snap_attrs[] = { - &dev_attr_snap_size.attr, - &dev_attr_snap_id.attr, - &dev_attr_snap_features.attr, - NULL, -}; - -static struct attribute_group rbd_snap_attr_group = { - .attrs = rbd_snap_attrs, -}; - -static void rbd_snap_dev_release(struct device *dev) -{ - struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - kfree(snap->name); - kfree(snap); -} - -static const struct attribute_group *rbd_snap_attr_groups[] = { - &rbd_snap_attr_group, - NULL -}; - -static struct device_type rbd_snap_device_type = { - .groups = rbd_snap_attr_groups, - .release = rbd_snap_dev_release, -}; - static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) { kref_get(&spec->kref); @@ -2694,8 +3478,6 @@ static struct rbd_spec *rbd_spec_alloc(void) return NULL; kref_init(&spec->kref); - rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */ - return spec; } @@ -2722,7 +3504,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, spin_lock_init(&rbd_dev->lock); rbd_dev->flags = 0; INIT_LIST_HEAD(&rbd_dev->node); - INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); rbd_dev->spec = spec; @@ -2740,96 +3521,11 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, static void rbd_dev_destroy(struct rbd_device *rbd_dev) { - rbd_spec_put(rbd_dev->parent_spec); - kfree(rbd_dev->header_name); rbd_put_client(rbd_dev->rbd_client); rbd_spec_put(rbd_dev->spec); kfree(rbd_dev); } -static bool rbd_snap_registered(struct rbd_snap *snap) -{ - bool ret = snap->dev.type == &rbd_snap_device_type; - bool reg = device_is_registered(&snap->dev); - - rbd_assert(!ret ^ reg); - - return ret; -} - -static void rbd_remove_snap_dev(struct rbd_snap *snap) -{ - list_del(&snap->node); - if (device_is_registered(&snap->dev)) - device_unregister(&snap->dev); -} - -static int rbd_register_snap_dev(struct rbd_snap *snap, - struct device *parent) -{ - struct device *dev = &snap->dev; - int ret; - - dev->type = &rbd_snap_device_type; - dev->parent = parent; - dev->release = rbd_snap_dev_release; - dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); - dout("%s: registering device for snapshot %s\n", __func__, snap->name); - - ret = device_register(dev); - - return ret; -} - -static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, - const char *snap_name, - u64 snap_id, u64 snap_size, - u64 snap_features) -{ - struct rbd_snap *snap; - int ret; - - snap = 
kzalloc(sizeof (*snap), GFP_KERNEL); - if (!snap) - return ERR_PTR(-ENOMEM); - - ret = -ENOMEM; - snap->name = kstrdup(snap_name, GFP_KERNEL); - if (!snap->name) - goto err; - - snap->id = snap_id; - snap->size = snap_size; - snap->features = snap_features; - - return snap; - -err: - kfree(snap->name); - kfree(snap); - - return ERR_PTR(ret); -} - -static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, - u64 *snap_size, u64 *snap_features) -{ - char *snap_name; - - rbd_assert(which < rbd_dev->header.snapc->num_snaps); - - *snap_size = rbd_dev->header.snap_sizes[which]; - *snap_features = 0; /* No features for v1 */ - - /* Skip over names until we find the one we are looking for */ - - snap_name = rbd_dev->header.snap_names; - while (which--) - snap_name += strlen(snap_name) + 1; - - return snap_name; -} - /* * Get the size and object order for an image snapshot, or if * snap_id is CEPH_NOSNAP, gets this information for the base @@ -2847,18 +3543,21 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_size", - (char *) &snapid, sizeof (snapid), - (char *) &size_buf, sizeof (size_buf), NULL); + &snapid, sizeof (snapid), + &size_buf, sizeof (size_buf)); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; + if (ret < sizeof (size_buf)) + return -ERANGE; - *order = size_buf.order; + if (order) + *order = size_buf.order; *snap_size = le64_to_cpu(size_buf.size); dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", - (unsigned long long) snap_id, (unsigned int) *order, - (unsigned long long) *snap_size); + (unsigned long long)snap_id, (unsigned int)*order, + (unsigned long long)*snap_size); return 0; } @@ -2881,17 +3580,16 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) return -ENOMEM; ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, - "rbd", "get_object_prefix", - NULL, 0, - reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); + "rbd", "get_object_prefix", NULL, 0, + reply_buf, RBD_OBJ_PREFIX_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; p = reply_buf; rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, - p + RBD_OBJ_PREFIX_LEN_MAX, - NULL, GFP_NOIO); + p + ret, NULL, GFP_NOIO); + ret = 0; if (IS_ERR(rbd_dev->header.object_prefix)) { ret = PTR_ERR(rbd_dev->header.object_prefix); @@ -2899,7 +3597,6 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) } else { dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); } - out: kfree(reply_buf); @@ -2913,29 +3610,30 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, struct { __le64 features; __le64 incompat; - } features_buf = { 0 }; + } __attribute__ ((packed)) features_buf = { 0 }; u64 incompat; int ret; ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_features", - (char *) &snapid, sizeof (snapid), - (char *) &features_buf, sizeof (features_buf), - NULL); + &snapid, sizeof (snapid), + &features_buf, sizeof (features_buf)); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; + if (ret < sizeof (features_buf)) + return -ERANGE; incompat = le64_to_cpu(features_buf.incompat); - if (incompat & ~RBD_FEATURES_ALL) + if (incompat & ~RBD_FEATURES_SUPPORTED) return -ENXIO; *snap_features = le64_to_cpu(features_buf.features); dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", - (unsigned long long) snap_id, - 
(unsigned long long) *snap_features, - (unsigned long long) le64_to_cpu(features_buf.incompat)); + (unsigned long long)snap_id, + (unsigned long long)*snap_features, + (unsigned long long)le64_to_cpu(features_buf.incompat)); return 0; } @@ -2975,15 +3673,15 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) snapid = cpu_to_le64(CEPH_NOSNAP); ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_parent", - (char *) &snapid, sizeof (snapid), - (char *) reply_buf, size, NULL); + &snapid, sizeof (snapid), + reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out_err; - ret = -ERANGE; p = reply_buf; - end = (char *) reply_buf + size; + end = reply_buf + ret; + ret = -ERANGE; ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ @@ -2991,8 +3689,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) /* The ceph file layout needs to fit pool id in 32 bits */ ret = -EIO; - if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) - goto out; + if (parent_spec->pool_id > (u64)U32_MAX) { + rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", + (unsigned long long)parent_spec->pool_id, U32_MAX); + goto out_err; + } image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { @@ -3015,6 +3716,56 @@ out_err: return ret; } +static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) +{ + struct { + __le64 stripe_unit; + __le64 stripe_count; + } __attribute__ ((packed)) striping_info_buf = { 0 }; + size_t size = sizeof (striping_info_buf); + void *p; + u64 obj_size; + u64 stripe_unit; + u64 stripe_count; + int ret; + + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + "rbd", "get_stripe_unit_count", NULL, 0, + (char *)&striping_info_buf, size); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) + return ret; + if (ret < size) + return -ERANGE; + + /* + * We don't actually support the "fancy striping" feature + * (STRIPINGV2) yet, but if the striping sizes are the + * defaults the behavior is the same as before. So find + * out, and only fail if the image has non-default values. 
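+ * (Editor's note: the defaults checked below are a stripe unit equal
+ * to the object size -- 1 << obj_order, i.e. 4 MiB for the common
+ * order of 22 -- and a stripe count of 1.)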
+ */ + ret = -EINVAL; + obj_size = (u64)1 << rbd_dev->header.obj_order; + p = &striping_info_buf; + stripe_unit = ceph_decode_64(&p); + if (stripe_unit != obj_size) { + rbd_warn(rbd_dev, "unsupported stripe unit " + "(got %llu want %llu)", + stripe_unit, obj_size); + return -EINVAL; + } + stripe_count = ceph_decode_64(&p); + if (stripe_count != 1) { + rbd_warn(rbd_dev, "unsupported stripe count " + "(got %llu want 1)", stripe_count); + return -EINVAL; + } + rbd_dev->header.stripe_unit = stripe_unit; + rbd_dev->header.stripe_count = stripe_count; + + return 0; +} + static char *rbd_dev_image_name(struct rbd_device *rbd_dev) { size_t image_id_size; @@ -3036,8 +3787,8 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) return NULL; p = image_id; - end = (char *) image_id + image_id_size; - ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); + end = image_id + image_id_size; + ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; reply_buf = kmalloc(size, GFP_KERNEL); @@ -3047,11 +3798,12 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, "rbd", "dir_get_name", image_id, image_id_size, - (char *) reply_buf, size, NULL); + reply_buf, size); if (ret < 0) goto out; p = reply_buf; - end = (char *) reply_buf + size; + end = reply_buf + ret; + image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); if (IS_ERR(image_name)) image_name = NULL; @@ -3064,69 +3816,134 @@ out: return image_name; } +static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) +{ + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + const char *snap_name; + u32 which = 0; + + /* Skip over names until we find the one we are looking for */ + + snap_name = rbd_dev->header.snap_names; + while (which < snapc->num_snaps) { + if (!strcmp(name, snap_name)) + return snapc->snaps[which]; + snap_name += strlen(snap_name) + 1; + which++; + } + return CEPH_NOSNAP; +} + +static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) +{ + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + u32 which; + bool found = false; + u64 snap_id; + + for (which = 0; !found && which < snapc->num_snaps; which++) { + const char *snap_name; + + snap_id = snapc->snaps[which]; + snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); + if (IS_ERR(snap_name)) + break; + found = !strcmp(name, snap_name); + kfree(snap_name); + } + return found ? snap_id : CEPH_NOSNAP; +} + /* - * When a parent image gets probed, we only have the pool, image, - * and snapshot ids but not the names of any of them. This call - * is made later to fill in those names. It has to be done after - * rbd_dev_snaps_update() has completed because some of the - * information (in particular, snapshot name) is not available - * until then. + * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if + * no snapshot by that name is found, or if an error occurs. */ -static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) { - struct ceph_osd_client *osdc; - const char *name; - void *reply_buf = NULL; + if (rbd_dev->image_format == 1) + return rbd_v1_snap_id_by_name(rbd_dev, name); + + return rbd_v2_snap_id_by_name(rbd_dev, name); +} + +/* + * When an rbd image has a parent image, it is identified by the + * pool, image, and snapshot ids (not names). 
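+ * (Editor's illustration, with made-up values: a freshly probed
+ * parent spec might hold pool_id 2, image_id "1028ab4c2d" and
+ * snap_id 4, with pool_name, image_name and snap_name all NULL.)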
This function fills + * in the names for those ids. (It's OK if we can't figure out the + * name for an image id, but the pool and snapshot ids should always + * exist and have names.) All names in an rbd spec are dynamically + * allocated. + * + * When an image being mapped (not a parent) is probed, we have the + * pool name and pool id, image name and image id, and the snapshot + * name. The only thing we're missing is the snapshot id. + */ +static int rbd_dev_spec_update(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_spec *spec = rbd_dev->spec; + const char *pool_name; + const char *image_name; + const char *snap_name; int ret; - if (rbd_dev->spec->pool_name) - return 0; /* Already have the names */ + /* + * An image being mapped will have the pool name (etc.), but + * we need to look up the snapshot id. + */ + if (spec->pool_name) { + if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { + u64 snap_id; + + snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); + if (snap_id == CEPH_NOSNAP) + return -ENOENT; + spec->snap_id = snap_id; + } else { + spec->snap_id = CEPH_NOSNAP; + } - /* Look up the pool name */ + return 0; + } - osdc = &rbd_dev->rbd_client->client->osdc; - name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); - if (!name) { - rbd_warn(rbd_dev, "there is no pool with id %llu", - rbd_dev->spec->pool_id); /* Really a BUG() */ + /* Get the pool name; we have to make our own copy of this */ + + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); + if (!pool_name) { + rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); return -EIO; } - - rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); - if (!rbd_dev->spec->pool_name) + pool_name = kstrdup(pool_name, GFP_KERNEL); + if (!pool_name) return -ENOMEM; /* Fetch the image name; tolerate failure here */ - name = rbd_dev_image_name(rbd_dev); - if (name) - rbd_dev->spec->image_name = (char *) name; - else + image_name = rbd_dev_image_name(rbd_dev); + if (!image_name) rbd_warn(rbd_dev, "unable to get image name"); - /* Look up the snapshot name. 
*/ + /* Look up the snapshot name, and make a copy */ - name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); - if (!name) { - rbd_warn(rbd_dev, "no snapshot with id %llu", - rbd_dev->spec->snap_id); /* Really a BUG() */ - ret = -EIO; + snap_name = rbd_snap_name(rbd_dev, spec->snap_id); + if (!snap_name) { + ret = -ENOMEM; goto out_err; } - rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); - if(!rbd_dev->spec->snap_name) - goto out_err; + + spec->pool_name = pool_name; + spec->image_name = image_name; + spec->snap_name = snap_name; return 0; out_err: - kfree(reply_buf); - kfree(rbd_dev->spec->pool_name); - rbd_dev->spec->pool_name = NULL; + kfree(image_name); + kfree(pool_name); return ret; } -static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) +static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) { size_t size; int ret; @@ -3151,16 +3968,15 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) return -ENOMEM; ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, - "rbd", "get_snapcontext", - NULL, 0, - reply_buf, size, ver); + "rbd", "get_snapcontext", NULL, 0, + reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; - ret = -ERANGE; p = reply_buf; - end = (char *) reply_buf + size; + end = reply_buf + ret; + ret = -ERANGE; ceph_decode_64_safe(&p, end, seq, out); ceph_decode_32_safe(&p, end, snap_count, out); @@ -3177,37 +3993,33 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) } if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) goto out; + ret = 0; - size = sizeof (struct ceph_snap_context) + - snap_count * sizeof (snapc->snaps[0]); - snapc = kmalloc(size, GFP_KERNEL); + snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); if (!snapc) { ret = -ENOMEM; goto out; } - - atomic_set(&snapc->nref, 1); snapc->seq = seq; - snapc->num_snaps = snap_count; for (i = 0; i < snap_count; i++) snapc->snaps[i] = ceph_decode_64(&p); rbd_dev->header.snapc = snapc; dout(" snap context seq = %llu, snap_count = %u\n", - (unsigned long long) seq, (unsigned int) snap_count); - + (unsigned long long)seq, (unsigned int)snap_count); out: kfree(reply_buf); - return 0; + return ret; } -static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, + u64 snap_id) { size_t size; void *reply_buf; - __le64 snap_id; + __le64 snapid; int ret; void *p; void *end; @@ -3218,236 +4030,52 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) if (!reply_buf) return ERR_PTR(-ENOMEM); - snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); + snapid = cpu_to_le64(snap_id); ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", - (char *) &snap_id, sizeof (snap_id), - reply_buf, size, NULL); + &snapid, sizeof (snapid), + reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); - if (ret < 0) + if (ret < 0) { + snap_name = ERR_PTR(ret); goto out; + } p = reply_buf; - end = (char *) reply_buf + size; + end = reply_buf + ret; snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); - if (IS_ERR(snap_name)) { - ret = PTR_ERR(snap_name); + if (IS_ERR(snap_name)) goto out; - } else { - dout(" snap_id 0x%016llx snap_name = %s\n", - (unsigned long long) le64_to_cpu(snap_id), snap_name); - } - kfree(reply_buf); - return snap_name; + dout(" snap_id 0x%016llx snap_name = %s\n", + (unsigned long long)snap_id, snap_name); out: 
kfree(reply_buf); - return ERR_PTR(ret); -} - -static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, - u64 *snap_size, u64 *snap_features) -{ - u64 snap_id; - u8 order; - int ret; - - snap_id = rbd_dev->header.snapc->snaps[which]; - ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); - if (ret) - return ERR_PTR(ret); - ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); - if (ret) - return ERR_PTR(ret); - - return rbd_dev_v2_snap_name(rbd_dev, which); -} - -static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, - u64 *snap_size, u64 *snap_features) -{ - if (rbd_dev->image_format == 1) - return rbd_dev_v1_snap_info(rbd_dev, which, - snap_size, snap_features); - if (rbd_dev->image_format == 2) - return rbd_dev_v2_snap_info(rbd_dev, which, - snap_size, snap_features); - return ERR_PTR(-EINVAL); + return snap_name; } -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev) { int ret; - __u8 obj_order; down_write(&rbd_dev->header_rwsem); - /* Grab old order first, to see if it changes */ - - obj_order = rbd_dev->header.obj_order, ret = rbd_dev_v2_image_size(rbd_dev); if (ret) goto out; - if (rbd_dev->header.obj_order != obj_order) { - ret = -EIO; - goto out; - } rbd_update_mapping_size(rbd_dev); - ret = rbd_dev_v2_snap_context(rbd_dev, hver); + ret = rbd_dev_v2_snap_context(rbd_dev); dout("rbd_dev_v2_snap_context returned %d\n", ret); if (ret) goto out; - ret = rbd_dev_snaps_update(rbd_dev); - dout("rbd_dev_snaps_update returned %d\n", ret); - if (ret) - goto out; - ret = rbd_dev_snaps_register(rbd_dev); - dout("rbd_dev_snaps_register returned %d\n", ret); out: up_write(&rbd_dev->header_rwsem); return ret; } -/* - * Scan the rbd device's current snapshot list and compare it to the - * newly-received snapshot context. Remove any existing snapshots - * not present in the new snapshot context. Add a new snapshot for - * any snaphots in the snapshot context not in the current list. - * And verify there are no changes to snapshots we already know - * about. - * - * Assumes the snapshots in the snapshot context are sorted by - * snapshot id, highest id first. (Snapshots in the rbd_dev's list - * are also maintained in that order.) - */ -static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) -{ - struct ceph_snap_context *snapc = rbd_dev->header.snapc; - const u32 snap_count = snapc->num_snaps; - struct list_head *head = &rbd_dev->snaps; - struct list_head *links = head->next; - u32 index = 0; - - dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); - while (index < snap_count || links != head) { - u64 snap_id; - struct rbd_snap *snap; - char *snap_name; - u64 snap_size = 0; - u64 snap_features = 0; - - snap_id = index < snap_count ? snapc->snaps[index] - : CEPH_NOSNAP; - snap = links != head ? list_entry(links, struct rbd_snap, node) - : NULL; - rbd_assert(!snap || snap->id != CEPH_NOSNAP); - - if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { - struct list_head *next = links->next; - - /* - * A previously-existing snapshot is not in - * the new snap context. - * - * If the now missing snapshot is the one the - * image is mapped to, clear its exists flag - * so we can avoid sending any more requests - * to it. - */ - if (rbd_dev->spec->snap_id == snap->id) - clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); - rbd_remove_snap_dev(snap); - dout("%ssnap id %llu has been removed\n", - rbd_dev->spec->snap_id == snap->id ? 
- "mapped " : "", - (unsigned long long) snap->id); - - /* Done with this list entry; advance */ - - links = next; - continue; - } - - snap_name = rbd_dev_snap_info(rbd_dev, index, - &snap_size, &snap_features); - if (IS_ERR(snap_name)) - return PTR_ERR(snap_name); - - dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, - (unsigned long long) snap_id); - if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { - struct rbd_snap *new_snap; - - /* We haven't seen this snapshot before */ - - new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, - snap_id, snap_size, snap_features); - if (IS_ERR(new_snap)) { - int err = PTR_ERR(new_snap); - - dout(" failed to add dev, error %d\n", err); - - return err; - } - - /* New goes before existing, or at end of list */ - - dout(" added dev%s\n", snap ? "" : " at end\n"); - if (snap) - list_add_tail(&new_snap->node, &snap->node); - else - list_add_tail(&new_snap->node, head); - } else { - /* Already have this one */ - - dout(" already present\n"); - - rbd_assert(snap->size == snap_size); - rbd_assert(!strcmp(snap->name, snap_name)); - rbd_assert(snap->features == snap_features); - - /* Done with this list entry; advance */ - - links = links->next; - } - - /* Advance to the next entry in the snapshot context */ - - index++; - } - dout("%s: done\n", __func__); - - return 0; -} - -/* - * Scan the list of snapshots and register the devices for any that - * have not already been registered. - */ -static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) -{ - struct rbd_snap *snap; - int ret = 0; - - dout("%s:\n", __func__); - if (WARN_ON(!device_is_registered(&rbd_dev->dev))) - return -EIO; - - list_for_each_entry(snap, &rbd_dev->snaps, node) { - if (!rbd_snap_registered(snap)) { - ret = rbd_register_snap_dev(snap, &rbd_dev->dev); - if (ret < 0) - break; - } - } - dout("%s: returning %d\n", __func__, ret); - - return ret; -} - static int rbd_bus_add_dev(struct rbd_device *rbd_dev) { struct device *dev; @@ -3459,7 +4087,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) dev->bus = &rbd_bus_type; dev->type = &rbd_device_type; dev->parent = &rbd_root_dev; - dev->release = rbd_dev_release; + dev->release = rbd_dev_device_release; dev_set_name(dev, "%d", rbd_dev->dev_id); ret = device_register(dev); @@ -3673,6 +4301,7 @@ static int rbd_add_parse_args(const char *buf, size_t len; char *options; const char *mon_addrs; + char *snap_name; size_t mon_addrs_size; struct rbd_spec *spec = NULL; struct rbd_options *rbd_opts = NULL; @@ -3731,10 +4360,11 @@ static int rbd_add_parse_args(const char *buf, ret = -ENAMETOOLONG; goto out_err; } - spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); - if (!spec->snap_name) + snap_name = kmemdup(buf, len + 1, GFP_KERNEL); + if (!snap_name) goto out_mem; - *(spec->snap_name + len) = '\0'; + *(snap_name + len) = '\0'; + spec->snap_name = snap_name; /* Initialize all rbd options to the defaults */ @@ -3788,15 +4418,19 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) size_t size; char *object_name; void *response; - void *p; + char *image_id; /* * When probing a parent image, the image id is already * known (and the image name likely is not). There's no - * need to fetch the image id again in this case. + * need to fetch the image id again in this case. We + * do still need to set the image format though. */ - if (rbd_dev->spec->image_id) + if (rbd_dev->spec->image_id) { + rbd_dev->image_format = *rbd_dev->spec->image_id ? 
2 : 1; + return 0; + } /* * First, see if the format 2 image id file exists, and if @@ -3818,23 +4452,32 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) goto out; } + /* If it doesn't exist we'll assume it's a format 1 image */ + ret = rbd_obj_method_sync(rbd_dev, object_name, - "rbd", "get_id", - NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX, NULL); + "rbd", "get_id", NULL, 0, + response, RBD_IMAGE_ID_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); - if (ret < 0) - goto out; - - p = response; - rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, - p + RBD_IMAGE_ID_LEN_MAX, + if (ret == -ENOENT) { + image_id = kstrdup("", GFP_KERNEL); + ret = image_id ? 0 : -ENOMEM; + if (!ret) + rbd_dev->image_format = 1; + } else if (ret > sizeof (__le32)) { + void *p = response; + + image_id = ceph_extract_encoded_string(&p, p + ret, NULL, GFP_NOIO); - if (IS_ERR(rbd_dev->spec->image_id)) { - ret = PTR_ERR(rbd_dev->spec->image_id); - rbd_dev->spec->image_id = NULL; + ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0; + if (!ret) + rbd_dev->image_format = 2; } else { - dout("image_id is %s\n", rbd_dev->spec->image_id); + ret = -EINVAL; + } + + if (!ret) { + rbd_dev->spec->image_id = image_id; + dout("image_id is %s\n", image_id); } out: kfree(response); @@ -3843,27 +4486,30 @@ out: return ret; } -static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) +/* Undo whatever state changes are made by v1 or v2 image probe */ + +static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { - int ret; - size_t size; + struct rbd_image_header *header; - /* Version 1 images have no id; empty string is used */ + rbd_dev_remove_parent(rbd_dev); + rbd_spec_put(rbd_dev->parent_spec); + rbd_dev->parent_spec = NULL; + rbd_dev->parent_overlap = 0; - rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); - if (!rbd_dev->spec->image_id) - return -ENOMEM; + /* Free dynamic fields from the header, then zero it out */ - /* Record the header object name for this rbd image. */ + header = &rbd_dev->header; + ceph_put_snap_context(header->snapc); + kfree(header->snap_sizes); + kfree(header->snap_names); + kfree(header->object_prefix); + memset(header, 0, sizeof (*header)); +} - size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); - rbd_dev->header_name = kmalloc(size, GFP_KERNEL); - if (!rbd_dev->header_name) { - ret = -ENOMEM; - goto out_err; - } - sprintf(rbd_dev->header_name, "%s%s", - rbd_dev->spec->image_name, RBD_SUFFIX); +static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) +{ + int ret; /* Populate rbd image metadata */ @@ -3876,8 +4522,6 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) rbd_dev->parent_spec = NULL; rbd_dev->parent_overlap = 0; - rbd_dev->image_format = 1; - dout("discovered version 1 image, header name is %s\n", rbd_dev->header_name); @@ -3894,43 +4538,45 @@ out_err: static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) { - size_t size; int ret; - u64 ver = 0; - - /* - * Image id was filled in by the caller. Record the header - * object name for this rbd image. - */ - size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); - rbd_dev->header_name = kmalloc(size, GFP_KERNEL); - if (!rbd_dev->header_name) - return -ENOMEM; - sprintf(rbd_dev->header_name, "%s%s", - RBD_HEADER_PREFIX, rbd_dev->spec->image_id); - - /* Get the size and object order for the image */ ret = rbd_dev_v2_image_size(rbd_dev); - if (ret < 0) + if (ret) goto out_err; /* Get the object prefix (a.k.a. 
block_name) for the image */ ret = rbd_dev_v2_object_prefix(rbd_dev); - if (ret < 0) + if (ret) goto out_err; /* Get and check the features for the image */ ret = rbd_dev_v2_features(rbd_dev); - if (ret < 0) + if (ret) goto out_err; /* If the image supports layering, get the parent info */ if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { ret = rbd_dev_v2_parent_info(rbd_dev); + if (ret) + goto out_err; + + /* + * Don't print a warning for parent images. We can + * tell at this point because we won't know its pool + * name yet (just its pool id). + */ + if (rbd_dev->spec->pool_name) + rbd_warn(rbd_dev, "WARNING: kernel layering " + "is EXPERIMENTAL!"); + } + + /* If the image supports fancy striping, get its parameters */ + + if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { + ret = rbd_dev_v2_striping_info(rbd_dev); if (ret < 0) goto out_err; } @@ -3942,12 +4588,9 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) /* Get the snapshot context, plus the header version */ - ret = rbd_dev_v2_snap_context(rbd_dev, &ver); + ret = rbd_dev_v2_snap_context(rbd_dev); if (ret) goto out_err; - rbd_dev->header.obj_version = ver; - - rbd_dev->image_format = 2; dout("discovered version 2 image, header name is %s\n", rbd_dev->header_name); @@ -3965,22 +4608,54 @@ out_err: return ret; } -static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) +static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) { + struct rbd_device *parent = NULL; + struct rbd_spec *parent_spec; + struct rbd_client *rbdc; int ret; - /* no need to lock here, as rbd_dev is not registered yet */ - ret = rbd_dev_snaps_update(rbd_dev); - if (ret) - return ret; + if (!rbd_dev->parent_spec) + return 0; + /* + * We need to pass a reference to the client and the parent + * spec when creating the parent rbd_dev. Images related by + * parent/child relationships always share both. + */ + parent_spec = rbd_spec_get(rbd_dev->parent_spec); + rbdc = __rbd_get_client(rbd_dev->rbd_client); - ret = rbd_dev_probe_update_spec(rbd_dev); - if (ret) - goto err_out_snaps; + ret = -ENOMEM; + parent = rbd_dev_create(rbdc, parent_spec); + if (!parent) + goto out_err; + + ret = rbd_dev_image_probe(parent); + if (ret < 0) + goto out_err; + rbd_dev->parent = parent; + + return 0; +out_err: + if (parent) { + rbd_spec_put(rbd_dev->parent_spec); + kfree(rbd_dev->header_name); + rbd_dev_destroy(parent); + } else { + rbd_put_client(rbdc); + rbd_spec_put(parent_spec); + } + + return ret; +} + +static int rbd_dev_device_setup(struct rbd_device *rbd_dev) +{ + int ret; - ret = rbd_dev_set_mapping(rbd_dev); + ret = rbd_dev_mapping_set(rbd_dev); if (ret) - goto err_out_snaps; + return ret; /* generate unique id: find highest unique id, add one */ rbd_dev_id_get(rbd_dev); @@ -4007,54 +4682,81 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_disk; - /* - * At this point cleanup in the event of an error is the job - * of the sysfs code (initiated by rbd_bus_del_dev()). - */ - down_write(&rbd_dev->header_rwsem); - ret = rbd_dev_snaps_register(rbd_dev); - up_write(&rbd_dev->header_rwsem); - if (ret) - goto err_out_bus; - - ret = rbd_dev_header_watch_sync(rbd_dev, 1); - if (ret) - goto err_out_bus; - /* Everything's ready. Announce the disk to the world. 
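(Capacity is handed to the block layer in 512-byte sectors --
SECTOR_SIZE, as defined near the top of this file -- so an 8 GiB
mapping, for example, becomes 16777216 sectors.)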
*/ + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); add_disk(rbd_dev->disk); pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, (unsigned long long) rbd_dev->mapping.size); return ret; -err_out_bus: - /* this will also clean up rest of rbd_dev stuff */ - rbd_bus_del_dev(rbd_dev); - - return ret; err_out_disk: rbd_free_disk(rbd_dev); err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); -err_out_snaps: - rbd_remove_all_snaps(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); return ret; } +static int rbd_dev_header_name(struct rbd_device *rbd_dev) +{ + struct rbd_spec *spec = rbd_dev->spec; + size_t size; + + /* Record the header object name for this rbd image. */ + + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + + if (rbd_dev->image_format == 1) + size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); + else + size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); + + rbd_dev->header_name = kmalloc(size, GFP_KERNEL); + if (!rbd_dev->header_name) + return -ENOMEM; + + if (rbd_dev->image_format == 1) + sprintf(rbd_dev->header_name, "%s%s", + spec->image_name, RBD_SUFFIX); + else + sprintf(rbd_dev->header_name, "%s%s", + RBD_HEADER_PREFIX, spec->image_id); + return 0; +} + +static void rbd_dev_image_release(struct rbd_device *rbd_dev) +{ + int ret; + + rbd_dev_unprobe(rbd_dev); + ret = rbd_dev_header_watch_sync(rbd_dev, 0); + if (ret) + rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); + kfree(rbd_dev->header_name); + rbd_dev->header_name = NULL; + rbd_dev->image_format = 0; + kfree(rbd_dev->spec->image_id); + rbd_dev->spec->image_id = NULL; + + rbd_dev_destroy(rbd_dev); +} + /* * Probe for the existence of the header object for the given rbd * device. For format 2 images this includes determining the image * id. */ -static int rbd_dev_probe(struct rbd_device *rbd_dev) +static int rbd_dev_image_probe(struct rbd_device *rbd_dev) { int ret; + int tmp; /* * Get the id from the image id object. 
If it's not a @@ -4063,18 +4765,48 @@ static int rbd_dev_probe(struct rbd_device *rbd_dev) */ ret = rbd_dev_image_id(rbd_dev); if (ret) + return ret; + rbd_assert(rbd_dev->spec->image_id); + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + + ret = rbd_dev_header_name(rbd_dev); + if (ret) + goto err_out_format; + + ret = rbd_dev_header_watch_sync(rbd_dev, 1); + if (ret) + goto out_header_name; + + if (rbd_dev->image_format == 1) ret = rbd_dev_v1_probe(rbd_dev); else ret = rbd_dev_v2_probe(rbd_dev); - if (ret) { - dout("probe failed, returning %d\n", ret); - - return ret; - } + if (ret) + goto err_out_watch; - ret = rbd_dev_probe_finish(rbd_dev); + ret = rbd_dev_spec_update(rbd_dev); if (ret) - rbd_header_free(&rbd_dev->header); + goto err_out_probe; + + ret = rbd_dev_probe_parent(rbd_dev); + if (!ret) + return 0; + +err_out_probe: + rbd_dev_unprobe(rbd_dev); +err_out_watch: + tmp = rbd_dev_header_watch_sync(rbd_dev, 0); + if (tmp) + rbd_warn(rbd_dev, "unable to tear down watch request\n"); +out_header_name: + kfree(rbd_dev->header_name); + rbd_dev->header_name = NULL; +err_out_format: + rbd_dev->image_format = 0; + kfree(rbd_dev->spec->image_id); + rbd_dev->spec->image_id = NULL; + + dout("probe failed, returning %d\n", ret); return ret; } @@ -4111,11 +4843,13 @@ static ssize_t rbd_add(struct bus_type *bus, rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); if (rc < 0) goto err_out_client; - spec->pool_id = (u64) rc; + spec->pool_id = (u64)rc; /* The ceph file layout needs to fit pool id in 32 bits */ - if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { + if (spec->pool_id > (u64)U32_MAX) { + rbd_warn(NULL, "pool id too large (%llu > %u)\n", + (unsigned long long)spec->pool_id, U32_MAX); rc = -EIO; goto err_out_client; } @@ -4130,11 +4864,15 @@ static ssize_t rbd_add(struct bus_type *bus, kfree(rbd_opts); rbd_opts = NULL; /* done with this */ - rc = rbd_dev_probe(rbd_dev); + rc = rbd_dev_image_probe(rbd_dev); if (rc < 0) goto err_out_rbd_dev; - return count; + rc = rbd_dev_device_setup(rbd_dev); + if (!rc) + return count; + + rbd_dev_image_release(rbd_dev); err_out_rbd_dev: rbd_dev_destroy(rbd_dev); err_out_client: @@ -4149,7 +4887,7 @@ err_out_module: dout("Error adding device %s\n", buf); - return (ssize_t) rc; + return (ssize_t)rc; } static struct rbd_device *__rbd_get_dev(unsigned long dev_id) @@ -4169,27 +4907,43 @@ static struct rbd_device *__rbd_get_dev(unsigned long dev_id) return NULL; } -static void rbd_dev_release(struct device *dev) +static void rbd_dev_device_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - if (rbd_dev->watch_event) - rbd_dev_header_watch_sync(rbd_dev, 0); - - /* clean up and free blkdev */ rbd_free_disk(rbd_dev); + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + rbd_dev_clear_mapping(rbd_dev); unregister_blkdev(rbd_dev->major, rbd_dev->name); - - /* release allocated disk header fields */ - rbd_header_free(&rbd_dev->header); - - /* done with the id, and with the rbd_dev */ + rbd_dev->major = 0; rbd_dev_id_put(rbd_dev); - rbd_assert(rbd_dev->rbd_client != NULL); - rbd_dev_destroy(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); +} - /* release module ref */ - module_put(THIS_MODULE); +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) +{ + while (rbd_dev->parent) { + struct rbd_device *first = rbd_dev; + struct rbd_device *second = first->parent; + struct rbd_device *third; + + /* + * Follow to the parent with no grandparent and + * remove it. 
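+	 * Teardown is therefore bottom-up: for a chain base -> A -> B,
+	 * B is released first, then A, and the loop ends once the base
+	 * device has no parent left.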
+ */ + while (second && (third = second->parent)) { + first = second; + second = third; + } + rbd_assert(second); + rbd_dev_image_release(second); + first->parent = NULL; + first->parent_overlap = 0; + + rbd_assert(first->parent_spec); + rbd_spec_put(first->parent_spec); + first->parent_spec = NULL; + } } static ssize_t rbd_remove(struct bus_type *bus, @@ -4197,13 +4951,13 @@ static ssize_t rbd_remove(struct bus_type *bus, size_t count) { struct rbd_device *rbd_dev = NULL; - int target_id, rc; + int target_id; unsigned long ul; - int ret = count; + int ret; - rc = strict_strtoul(buf, 10, &ul); - if (rc) - return rc; + ret = strict_strtoul(buf, 10, &ul); + if (ret) + return ret; /* convert to int; abort if we lost anything in the conversion */ target_id = (int) ul; @@ -4226,10 +4980,10 @@ static ssize_t rbd_remove(struct bus_type *bus, spin_unlock_irq(&rbd_dev->lock); if (ret < 0) goto done; - - rbd_remove_all_snaps(rbd_dev); + ret = count; rbd_bus_del_dev(rbd_dev); - + rbd_dev_image_release(rbd_dev); + module_put(THIS_MODULE); done: mutex_unlock(&ctl_mutex); @@ -4261,6 +5015,56 @@ static void rbd_sysfs_cleanup(void) device_unregister(&rbd_root_dev); } +static int rbd_slab_init(void) +{ + rbd_assert(!rbd_img_request_cache); + rbd_img_request_cache = kmem_cache_create("rbd_img_request", + sizeof (struct rbd_img_request), + __alignof__(struct rbd_img_request), + 0, NULL); + if (!rbd_img_request_cache) + return -ENOMEM; + + rbd_assert(!rbd_obj_request_cache); + rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", + sizeof (struct rbd_obj_request), + __alignof__(struct rbd_obj_request), + 0, NULL); + if (!rbd_obj_request_cache) + goto out_err; + + rbd_assert(!rbd_segment_name_cache); + rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", + MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); + if (rbd_segment_name_cache) + return 0; +out_err: + if (rbd_obj_request_cache) { + kmem_cache_destroy(rbd_obj_request_cache); + rbd_obj_request_cache = NULL; + } + + kmem_cache_destroy(rbd_img_request_cache); + rbd_img_request_cache = NULL; + + return -ENOMEM; +} + +static void rbd_slab_exit(void) +{ + rbd_assert(rbd_segment_name_cache); + kmem_cache_destroy(rbd_segment_name_cache); + rbd_segment_name_cache = NULL; + + rbd_assert(rbd_obj_request_cache); + kmem_cache_destroy(rbd_obj_request_cache); + rbd_obj_request_cache = NULL; + + rbd_assert(rbd_img_request_cache); + kmem_cache_destroy(rbd_img_request_cache); + rbd_img_request_cache = NULL; +} + static int __init rbd_init(void) { int rc; @@ -4270,16 +5074,22 @@ static int __init rbd_init(void) return -EINVAL; } - rc = rbd_sysfs_init(); + rc = rbd_slab_init(); if (rc) return rc; - pr_info("loaded " RBD_DRV_NAME_LONG "\n"); - return 0; + rc = rbd_sysfs_init(); + if (rc) + rbd_slab_exit(); + else + pr_info("loaded " RBD_DRV_NAME_LONG "\n"); + + return rc; } static void __exit rbd_exit(void) { rbd_sysfs_cleanup(); + rbd_slab_exit(); } module_init(rbd_init); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a60ea977af6f..3e68ac101040 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -236,15 +236,21 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; + struct ceph_osd_data *osd_data; int rc = req->r_result; int bytes = le32_to_cpu(msg->hdr.data_len); + int num_pages; int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - for (i = 0; i < 
req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { - struct page *page = req->r_pages[i]; + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + for (i = 0; i < num_pages; i++) { + struct page *page = osd_data->pages[i]; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ @@ -257,8 +263,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) SetPageUptodate(page); unlock_page(page); page_cache_release(page); + bytes -= PAGE_CACHE_SIZE; } - kfree(req->r_pages); + kfree(osd_data->pages); } static void ceph_unlock_page_vector(struct page **pages, int num_pages) @@ -279,6 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) &ceph_inode_to_client(inode)->client->osdc; struct ceph_inode_info *ci = ceph_inode(inode); struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_vino vino; struct ceph_osd_request *req; u64 off; u64 len; @@ -303,18 +311,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) len = nr_pages << PAGE_CACHE_SHIFT; dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, off, len); - - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), - off, &len, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, + vino = ceph_vino(inode); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, + 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, - NULL, false, 0); + false); if (IS_ERR(req)) return PTR_ERR(req); /* build page vector */ - nr_pages = len >> PAGE_CACHE_SHIFT; + nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); ret = -ENOMEM; if (!pages) @@ -336,11 +343,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_pages = pages; - req->r_num_pages = nr_pages; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -373,7 +381,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + dout("readpages %p file %p nr_pages %d max %d\n", inode, + file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); @@ -548,17 +557,23 @@ static void writepages_finish(struct ceph_osd_request *req, { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_data *osd_data; unsigned wrote; struct page *page; + int num_pages; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; int rc = req->r_result; - u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); + u64 bytes = req->r_ops[0].extent.length; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); if (rc >= 
0) { /* * Assume we wrote the pages we originally sent. The @@ -566,7 +581,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. */ - wrote = req->r_num_pages; + wrote = num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -575,8 +590,8 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_num_pages; i++) { - page = req->r_pages[i]; + for (i = 0; i < num_pages; i++) { + page = osd_data->pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -605,32 +620,34 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); - ceph_release_pages(req->r_pages, req->r_num_pages); - if (req->r_pages_from_pool) - mempool_free(req->r_pages, + ceph_release_pages(osd_data->pages, num_pages); + if (osd_data->pages_from_pool) + mempool_free(osd_data->pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else - kfree(req->r_pages); + kfree(osd_data->pages); ceph_osdc_put_request(req); } -/* - * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_num_pages - * may be less than the maximum write size. - */ -static void alloc_page_vec(struct ceph_fs_client *fsc, - struct ceph_osd_request *req) +static struct ceph_osd_request * +ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, + struct ceph_snap_context *snapc, int num_ops) { - req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, - GFP_NOFS); - if (!req->r_pages) { - req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); - req->r_pages_from_pool = 1; - WARN_ON(!req->r_pages); - } + struct ceph_fs_client *fsc; + struct ceph_inode_info *ci; + struct ceph_vino vino; + + fsc = ceph_inode_to_client(inode); + ci = ceph_inode(inode); + vino = ceph_vino(inode); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + + return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, offset, len, num_ops, CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK, + snapc, ci->i_truncate_seq, ci->i_truncate_size, true); } /* @@ -653,7 +670,7 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync; - u64 snap_size = 0; + u64 snap_size; /* * Include a 'sync' in the OSD request if this is a data @@ -699,6 +716,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); + snap_size = 0; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -706,6 +724,8 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } + if (snap_size == 0) + snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); if (last_snapc && snapc != last_snapc) { @@ -718,10 +738,14 @@ retry: last_snapc = snapc; while (!done && index <= end) { + int num_ops = do_sync ? 
2 : 1; + struct ceph_vino vino; unsigned i; int first; pgoff_t next; int pvec_pages, locked_pages; + struct page **pages = NULL; + mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; int want; u64 offset, len; @@ -773,11 +797,8 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if ((snap_size && page_offset(page) > snap_size) || - (!snap_size && - page_offset(page) > i_size_read(inode))) { - dout("%p page eof %llu\n", page, snap_size ? - snap_size : i_size_read(inode)); + if (page_offset(page) >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); done = 1; unlock_page(page); break; @@ -805,22 +826,23 @@ get_more_pages: break; } - /* ok */ + /* + * We have something to write. If this is + * the first locked page this time through, + * allocate an osd request and a page array + * that it will use. + */ if (locked_pages == 0) { + size_t size; + + BUG_ON(pages); + /* prepare async write request */ - offset = (u64) page_offset(page); + offset = (u64)page_offset(page); len = wsize; - req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, - ceph_vino(inode), - offset, &len, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, - snapc, do_sync, - ci->i_truncate_seq, - ci->i_truncate_size, - &inode->i_mtime, true, 0); + req = ceph_writepages_osd_request(inode, + offset, &len, snapc, + num_ops); if (IS_ERR(req)) { rc = PTR_ERR(req); @@ -828,11 +850,17 @@ get_more_pages: break; } - max_pages = req->r_num_pages; - - alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; + + max_pages = calc_pages_for(0, (u64)len); + size = max_pages * sizeof (*pages); + pages = kmalloc(size, GFP_NOFS); + if (!pages) { + pool = fsc->wb_pagevec_pool; + pages = mempool_alloc(pool, GFP_NOFS); + BUG_ON(!pages); + } } /* note position of first page in pvec */ @@ -850,7 +878,7 @@ get_more_pages: } set_page_writeback(page); - req->r_pages[locked_pages] = page; + pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -879,18 +907,27 @@ get_more_pages: pvec.nr -= i-first; } - /* submit the write */ - offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; - len = min((snap_size ? 
snap_size : i_size_read(inode)) - offset, + /* Format the osd request message and submit the write */ + + offset = page_offset(pages[0]); + len = min(snap_size - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - /* revise final length, page count */ - req->r_num_pages = locked_pages; - req->r_request_ops[0].extent.length = cpu_to_le64(len); - req->r_request_ops[0].payload_len = cpu_to_le32(len); - req->r_request->hdr.data_len = cpu_to_le32(len); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + !!pool, false); + + pages = NULL; /* request message now owns the pages array */ + pool = NULL; + + /* Update the write op length in case we changed it */ + + osd_req_op_extent_update(req, 0, len); + + vino = ceph_vino(inode); + ceph_osdc_build_request(req, offset, snapc, vino.snap, + &inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); @@ -1067,51 +1104,23 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_file_info *fi = file->private_data; struct page *page; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - int r, want, got = 0; - - if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; - - dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, len, inode->i_size); - r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); - if (r < 0) - return r; - dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); - if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { - ceph_put_cap_refs(ci, got); - return -EAGAIN; - } + int r; do { /* get a page */ page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) { - r = -ENOMEM; - break; - } + if (!page) + return -ENOMEM; + *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); - if (r) - page_cache_release(page); } while (r == -EAGAIN); - if (r) { - ceph_put_cap_refs(ci, got); - } else { - *pagep = page; - *(int *)fsdata = got; - } return r; } @@ -1125,12 +1134,10 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; - int got = (unsigned long)fsdata; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); @@ -1153,19 +1160,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, up_read(&mdsc->snap_rwsem); page_cache_release(page); - if (copied > 0) { - int dirty; - spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&ci->i_ceph_lock); - if (dirty) - __mark_inode_dirty(inode, dirty); - } - - dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); - ceph_put_cap_refs(ci, got); - if (check_cap) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); diff --git 
a/fs/ceph/caps.c b/fs/ceph/caps.c index 78e2f575247d..da0f9b8a3bcb 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -490,15 +490,17 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_rdcache_gen++; /* - * if we are newly issued FILE_SHARED, clear D_COMPLETE; we + * if we are newly issued FILE_SHARED, mark dir not complete; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) - ceph_dir_clear_complete(&ci->vfs_inode); + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + __ceph_dir_clear_complete(ci); + } } } @@ -553,6 +555,7 @@ retry: cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; + cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); @@ -628,7 +631,10 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - cap->mds_wanted |= wanted; + if (mseq > cap->mseq) + cap->mds_wanted = wanted; + else + cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; @@ -997,9 +1003,9 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -static void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) +void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; @@ -2046,6 +2052,13 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } + /* finish pending truncate */ + while (ci->i_truncate_pending) { + spin_unlock(&ci->i_ceph_lock); + __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR)); + spin_lock(&ci->i_ceph_lock); + } + if (need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", @@ -2067,12 +2080,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, } have = __ceph_caps_issued(ci, &implemented); - /* - * disallow writes while a truncate is pending - */ - if (ci->i_truncate_pending) - have &= ~CEPH_CAP_FILE_WR; - if ((have & need) == need) { /* * Look at (implemented & ~have & not) so that we keep waiting diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6d797f46d772..f02d82b7933e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -107,7 +107,7 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * D_COMPLETE tells indicates we have all dentries in the dir. It is + * Complete dir indicates that we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). 
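The completeness tracking that replaces D_COMPLETE in this series is a pair of counters: i_release_count is bumped whenever a dentry is dropped, and i_complete_count records the release count observed when the directory was last known complete, so the dir is complete exactly when the two match. A minimal userspace sketch of that pattern, with C11 atomics standing in for the kernel's atomic_t:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct dir_state {
        atomic_int release_count;  /* bumped when any dentry is dropped */
        atomic_int complete_count; /* snapshot taken when dir went complete */
    };

    static void dir_clear_complete(struct dir_state *d)
    {
        /* invalidates every in-flight readdir's snapshot */
        atomic_fetch_add(&d->release_count, 1);
    }

    static void dir_set_complete(struct dir_state *d, int snapshot)
    {
        atomic_store(&d->complete_count, snapshot);
    }

    static bool dir_is_complete(struct dir_state *d)
    {
        return atomic_load(&d->complete_count) ==
               atomic_load(&d->release_count);
    }

    int main(void)
    {
        struct dir_state d;
        int snap;

        atomic_init(&d.release_count, 1);
        atomic_init(&d.complete_count, 0);

        snap = atomic_load(&d.release_count); /* readdir begins */
        /* ... dcache is fully populated ... */
        dir_set_complete(&d, snap);
        printf("complete: %d\n", dir_is_complete(&d)); /* 1 */

        dir_clear_complete(&d); /* a dentry is pruned meanwhile */
        printf("complete: %d\n", dir_is_complete(&d)); /* 0 */
        return 0;
    }

Because the release count only ever grows, a stale snapshot can never match it again, which is why ceph_readdir() records fi->dir_release_count up front and compares it only at the end.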
*/ @@ -198,8 +198,8 @@ more: filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_test_complete(dir)) { - dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } @@ -258,7 +258,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filp->f_pos == 0) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - fi->dir_release_count = ci->i_release_count; + fi->dir_release_count = atomic_read(&ci->i_release_count); dout("readdir off 0 -> '.'\n"); if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), @@ -284,7 +284,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - ceph_dir_test_complete(inode) && + __ceph_dir_is_complete(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { spin_unlock(&ci->i_ceph_lock); err = __dcache_readdir(filp, dirent, filldir); @@ -350,7 +350,8 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude D_COMPLETE */ + /* preclude from marking dir complete */ + fi->dir_release_count--; } /* note next offset and last dentry name */ @@ -428,8 +429,9 @@ more: * the complete dir contents in our cache. */ spin_lock(&ci->i_ceph_lock); - if (ci->i_release_count == fi->dir_release_count) { - ceph_dir_set_complete(inode); + if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { + dout(" marking %p complete\n", inode); + __ceph_dir_set_complete(ci, fi->dir_release_count); ci->i_max_offset = filp->f_pos; } spin_unlock(&ci->i_ceph_lock); @@ -604,7 +606,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - ceph_dir_test_complete(dir) && + __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); @@ -1065,44 +1067,6 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, } /* - * Set/clear/test dir complete flag on the dir's dentry. - */ -void ceph_dir_set_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry) && - ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { - dout(" marking %p (%p) complete\n", inode, dentry); - set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); -} - -void ceph_dir_clear_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) complete\n", inode, dentry); - set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); -} - -bool ceph_dir_test_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) NOT complete\n", inode, dentry); - clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); - return false; -} - -/* * When the VFS prunes a dentry from the cache, we need to clear the * complete flag on the parent directory. 
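(The reason d_prune must do this: if the parent stayed marked complete after a hashed dentry was pruned, a later lookup could be answered with a false ENOENT straight from the dcache, as in the ceph_lookup() fast path above. An unhashed dentry was never visible to that fast path, so pruning it does not affect completeness.)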
* @@ -1110,15 +1074,13 @@ bool ceph_dir_test_complete(struct inode *inode) */ static void ceph_d_prune(struct dentry *dentry) { - struct ceph_dentry_info *di; - dout("ceph_d_prune %p\n", dentry); /* do we have a valid parent? */ if (IS_ROOT(dentry)) return; - /* if we are not hashed, we don't affect D_COMPLETE */ + /* if we are not hashed, we don't affect dir's completeness */ if (d_unhashed(dentry)) return; @@ -1126,8 +1088,7 @@ static void ceph_d_prune(struct dentry *dentry) * we hold d_lock, so d_parent is stable, and d_fsdata is never * cleared until d_release */ - di = ceph_dentry(dentry->d_parent); - clear_bit(CEPH_D_COMPLETE, &di->flags); + ceph_dir_clear_complete(dentry->d_parent->d_inode); } /* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bf338d9b67e3..d70830c66833 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -446,19 +446,35 @@ done: } /* - * Write commit callback, called if we requested both an ACK and - * ONDISK commit reply from the OSD. + * Write commit request unsafe callback, called to tell us when a + * request is unsafe (that is, in flight--has been handed to the + * messenger to send to its target osd). It is called again when + * we've received a response message indicating the request is + * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request + * is completed early (and unsuccessfully) due to a timeout or + * interrupt. + * + * This is used if we requested both an ACK and ONDISK commit reply + * from the OSD. */ -static void sync_write_commit(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_inode); - dout("sync_write_commit %p tid %llu\n", req, req->r_tid); - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, + unsafe ? "un" : ""); + if (unsafe) { + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); + spin_unlock(&ci->i_unsafe_lock); + } else { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } } /* @@ -470,36 +486,33 @@ static void sync_write_commit(struct ceph_osd_request *req, * objects, rollback on failure, etc.) */ static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t *offset) + size_t left, loff_t pos, loff_t *ppos) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; struct ceph_osd_request *req; + int num_ops = 1; struct page **pages; int num_pages; - long long unsigned pos; u64 len; int written = 0; int flags; - int do_sync = 0; int check_caps = 0; int page_align, io_align; unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; + bool own_pages = false; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, *offset, + dout("sync_write on file %p %lld~%u %s\n", file, pos, (unsigned)left, (file->f_flags & O_DIRECT) ? 
"O_DIRECT" : ""); - if (file->f_flags & O_APPEND) - pos = i_size_read(inode); - else - pos = *offset; - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -516,7 +529,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) flags |= CEPH_OSD_FLAG_ACK; else - do_sync = 1; + num_ops++; /* Also include a 'startsync' command. */ /* * we may need to do multiple writes here if we span an object @@ -526,25 +539,20 @@ more: io_align = pos & ~PAGE_MASK; buf_align = (unsigned long)data & ~PAGE_MASK; len = left; - if (file->f_flags & O_DIRECT) { - /* write from beginning of first page, regardless of - io alignment */ - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - num_pages = calc_pages_for((unsigned long)data, len); - } else { - page_align = pos & ~PAGE_MASK; - num_pages = calc_pages_for(pos, len); - } + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - ceph_vino(inode), pos, &len, - CEPH_OSD_OP_WRITE, flags, - ci->i_snap_realm->cached_context, - do_sync, + vino, pos, &len, num_ops, + CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, page_align); + false); if (IS_ERR(req)) return PTR_ERR(req); + /* write from beginning of first page, regardless of io alignment */ + page_align = file->f_flags & O_DIRECT ? buf_align : io_align; + num_pages = calc_pages_for(page_align, len); if (file->f_flags & O_DIRECT) { pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { @@ -572,36 +580,20 @@ more: if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */ - req->r_safe_callback = sync_write_commit; - req->r_own_pages = 1; + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; + own_pages = true; } } - req->r_pages = pages; - req->r_num_pages = num_pages; - req->r_inode = inode; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, own_pages); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) { - if (req->r_safe_callback) { - /* - * Add to inode unsafe list only after we - * start_request so that a tid has been assigned. 
- */ - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_item, - &ci->i_unsafe_writes); - spin_unlock(&ci->i_unsafe_lock); - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - } - + if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - if (ret < 0 && req->r_safe_callback) { - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); - } - } if (file->f_flags & O_DIRECT) ceph_put_page_vector(pages, num_pages, false); @@ -614,12 +606,12 @@ out: pos += len; written += len; left -= len; - data += written; + data += len; if (left) goto more; ret = written; - *offset = pos; + *ppos = pos; if (pos > i_size_read(inode)) check_caps = ceph_inode_set_size(inode, pos); if (check_caps) @@ -653,7 +645,6 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: - __ceph_do_pending_vmtruncate(inode); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else @@ -717,55 +708,75 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; - loff_t endoff = pos + iov->iov_len; - int got = 0; - int ret, err, written; + ssize_t count, written = 0; + int err, want, got; + bool hold_mutex; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; -retry_snap: - written = 0; - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) - return -ENOSPC; - __ceph_do_pending_vmtruncate(inode); + sb_start_write(inode->i_sb); + mutex_lock(&inode->i_mutex); + hold_mutex = true; - /* - * try to do a buffered write. if we don't have sufficient - * caps, we'll get -EAGAIN from generic_file_aio_write, or a - * short write if we only get caps for some pages. - */ - if (!(iocb->ki_filp->f_flags & O_DIRECT) && - !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && - !(fi->flags & CEPH_F_SYNC)) { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (ret >= 0) - written = ret; - - if ((ret >= 0 || ret == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + written - 1, 1); - if (err < 0) - ret = err; - } - if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) - goto out; + err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (err) + goto out; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = file->f_mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = file_remove_suid(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + +retry_snap: + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + err = -ENOSPC; + goto out; } - dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos + written, - (unsigned)iov->iov_len - written, inode->i_size); - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); - if (ret < 0) + dout("aio_write %p %llx.%llx %llu~%zd getting caps. 
i_size %llu\n", + inode, ceph_vinop(inode), pos, count, inode->i_size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + got = 0; + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); + if (err < 0) goto out; - dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos + written, - (unsigned)iov->iov_len - written, ceph_cap_string(got)); - ret = ceph_sync_write(file, iov->iov_base + written, - iov->iov_len - written, &iocb->ki_pos); - if (ret >= 0) { + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", + inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) { + mutex_unlock(&inode->i_mutex); + written = ceph_sync_write(file, iov->iov_base, count, + pos, &iocb->ki_pos); + } else { + written = generic_file_buffered_write(iocb, iov, nr_segs, + pos, &iocb->ki_pos, + count, 0); + mutex_unlock(&inode->i_mutex); + } + hold_mutex = false; + + if (written >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); @@ -773,18 +784,34 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); } + dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos + written, - (unsigned)iov->iov_len - written, ceph_cap_string(got)); + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); ceph_put_cap_refs(ci, got); -out: - if (ret == -EOLDSNAPC) { + + if (written >= 0 && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, pos, pos + written - 1, 1); + if (err < 0) + written = err; + } + + if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); + mutex_lock(&inode->i_mutex); + hold_mutex = true; goto retry_snap; } +out: + if (hold_mutex) + mutex_unlock(&inode->i_mutex); + sb_end_write(inode->i_sb); + current->backing_dev_info = NULL; - return ret; + return written ? 
written : err; } /* @@ -796,7 +823,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) int ret; mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); + __ceph_do_pending_vmtruncate(inode, false); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 851814d951cd..be0f7e20d62e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -302,7 +302,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - ci->i_release_count = 0; + atomic_set(&ci->i_release_count, 1); + atomic_set(&ci->i_complete_count, 0); ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); @@ -561,7 +562,6 @@ static int fill_inode(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); int i; int issued = 0, implemented; - int updating_inode = 0; struct timespec mtime, atime, ctime; u32 nsplits; struct ceph_buffer *xattr_blob = NULL; @@ -601,7 +601,6 @@ static int fill_inode(struct inode *inode, (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; - updating_inode = 1; issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); @@ -717,6 +716,17 @@ static int fill_inode(struct inode *inode, ceph_vinop(inode), inode->i_mode); } + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); + ci->i_max_offset = 2; + } no_change: spin_unlock(&ci->i_ceph_lock); @@ -767,19 +777,6 @@ no_change: __ceph_get_fmode(ci, cap_fmode); } - /* set dir completion flag? */ - if (S_ISDIR(inode->i_mode) && - updating_inode && /* didn't jump to no_change */ - ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - !ceph_dir_test_complete(inode)) { - dout(" marking %p complete (empty)\n", inode); - ceph_dir_set_complete(inode); - ci->i_max_offset = 2; - } - /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); @@ -861,7 +858,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) di = ceph_dentry(dn); spin_lock(&ci->i_ceph_lock); - if (!ceph_dir_test_complete(inode)) { + if (!__ceph_dir_is_complete(ci)) { spin_unlock(&ci->i_ceph_lock); return; } @@ -1065,8 +1062,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* * d_move() puts the renamed dentry at the end of * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when holding - * D_COMPLETE. + * directory offset so we can behave when dir is + * complete. */ ceph_set_dentry_offset(req->r_old_dentry); dout("dn %p gets new offset %lld\n", req->r_old_dentry, @@ -1457,7 +1454,7 @@ out: /* - * called by trunc_wq; take i_mutex ourselves + * called by trunc_wq; * * We also truncate in a separate thread as well. 
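__ceph_do_pending_vmtruncate() now takes a needlock flag instead of requiring i_mutex on entry: callers that already hold i_mutex pass false, while paths such as the trunc_wq worker pass true, and the lock is taken only around the page-cache truncation itself. A compact userspace sketch of this conditional-locking convention, with a pthread mutex assumed in place of i_mutex:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void do_pending_truncate(bool needlock)
    {
        /* ... recheck i_truncate_pending under the inode spinlock ... */
        if (needlock)
            pthread_mutex_lock(&i_mutex);
        puts("truncate_inode_pages(...)"); /* the only step needing i_mutex */
        if (needlock)
            pthread_mutex_unlock(&i_mutex);
        /* ... retry if another truncation raced in ... */
    }

    int main(void)
    {
        do_pending_truncate(true);     /* from the trunc_wq worker */

        pthread_mutex_lock(&i_mutex);  /* e.g. the llseek/setattr path */
        do_pending_truncate(false);
        pthread_mutex_unlock(&i_mutex);
        return 0;
    }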
*/ @@ -1468,9 +1465,7 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); - mutex_unlock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode, true); iput(inode); } @@ -1494,12 +1489,10 @@ void ceph_queue_vmtruncate(struct inode *inode) } /* - * called with i_mutex held. - * * Make sure any pending truncation is applied before doing anything * that may depend on it. */ -void __ceph_do_pending_vmtruncate(struct inode *inode) +void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock) { struct ceph_inode_info *ci = ceph_inode(inode); u64 to; @@ -1532,7 +1525,11 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); + if (needlock) + mutex_lock(&inode->i_mutex); truncate_inode_pages(inode->i_mapping, to); + if (needlock) + mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); if (to == ci->i_truncate_size) { @@ -1563,6 +1560,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, .follow_link = ceph_sym_follow_link, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, }; /* @@ -1585,7 +1588,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - __ceph_do_pending_vmtruncate(inode); + __ceph_do_pending_vmtruncate(inode, false); err = inode_change_ok(inode, attr); if (err != 0) @@ -1767,7 +1770,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode); + __ceph_do_pending_vmtruncate(inode, false); return err; out: spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 4a989345b37b..e0b4ef31d3c8 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -208,8 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout, - osdc->osdmap); + + ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, + ceph_file_layout_pg_pool(ci->i_layout)); dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 442880d099c9..4f22671a5bd4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -265,7 +265,8 @@ static int parse_reply_info_extra(void **p, void *end, { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_READDIR) + else if (info->head->op == CEPH_MDS_OP_READDIR || + info->head->op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_dir(p, end, info, features); else if (info->head->op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); @@ -364,9 +365,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) - s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( - s->s_mdsc->fsc->client->monc.auth, - s->s_auth.authorizer); + ceph_auth_destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + 
s->s_auth.authorizer); kfree(s); } } @@ -1196,6 +1197,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ + __queue_cap_release(session, ceph_ino(inode), cap->cap_id, + cap->mseq, cap->issue_seq); __ceph_remove_cap(cap); } else { /* try to drop referring dentries */ @@ -1718,8 +1721,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - msg->pages = req->r_pages; - msg->nr_pages = req->r_num_pages; + if (req->r_data_len) { + /* outbound data set only by ceph_sync_setxattr() */ + BUG_ON(!req->r_pages); + ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); + } + msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); @@ -1913,6 +1920,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc, req = list_entry(tmp_list.next, struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); + dout(" wake request %p tid %llu\n", req, req->r_tid); __do_request(mdsc, req); } } @@ -2026,20 +2034,16 @@ out: } /* - * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir's completeness, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { struct inode *inode = req->r_locked_dir; - struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); - spin_lock(&ci->i_ceph_lock); - ceph_dir_clear_complete(inode); - ci->i_release_count++; - spin_unlock(&ci->i_ceph_lock); + dout("invalidate_dir_request %p (complete, lease(s))\n", inode); + ceph_dir_clear_complete(inode); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); if (req->r_old_dentry) @@ -2599,11 +2603,13 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail; } - reply->pagelist = pagelist; if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); - reply->hdr.data_len = cpu_to_le32(pagelist->length); - reply->nr_pages = calc_pages_for(0, pagelist->length); + if (pagelist->length) { + /* set up outbound data if we have any */ + reply->hdr.data_len = cpu_to_le32(pagelist->length); + ceph_msg_data_add_pagelist(reply, pagelist); + } ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -3433,13 +3439,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &s->s_auth; if (force_new && auth->authorizer) { - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(ac, auth->authorizer); auth->authorizer = NULL; } - if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { - int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); if (ret) return ERR_PTR(ret); } @@ -3455,7 +3465,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - return ac->ops->verify_authorizer_reply(ac, s->s_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, 
s->s_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -3464,12 +3474,32 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - if (ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) +{ + struct ceph_msg *msg; + int type = (int) le16_to_cpu(hdr->type); + int front_len = (int) le32_to_cpu(hdr->front_len); + + if (con->in_msg) + return con->in_msg; + + *skip = 0; + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return NULL; + } + + return msg; +} + static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, @@ -3478,6 +3508,7 @@ static const struct ceph_connection_operations mds_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, + .alloc_msg = mds_alloc_msg, }; /* eof */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 0d3c9240c61b..9278dec9e940 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) { int n = 0; int i; - char r; + + /* special case for one mds */ + if (1 == m->m_max_mds && m->m_info[0].state > 0) + return 0; /* count */ for (i = 0; i < m->m_max_mds; i++) @@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return -1; /* pick */ - get_random_bytes(&r, 1); - n = r % n; + n = prandom_u32() % n; i = 0; for (i = 0; n > 0; i++, n--) while (m->m_info[i].state <= 0) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index cbb2f54a3019..f01645a27752 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -332,10 +332,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) err = -ENOMEM; if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; - snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); + snapc = ceph_create_snap_context(num, GFP_NOFS); if (!snapc) goto fail; - atomic_set(&snapc->nref, 1); /* build (reverse sorted) snap vector */ num = 0; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6ddc0bca56b2..7d377c9a5e35 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -479,6 +479,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; const unsigned required_features = 0; + int page_count; + size_t size; int err = -ENOMEM; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); @@ -522,8 +524,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, /* set up mempools */ err = -ENOMEM; - fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, - fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); + page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; + size = sizeof (struct page *) * (page_count ? 
page_count : 1); + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); if (!fsc->wb_pagevec_pool) goto fail_trunc_wq; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c7b309723dcc..8696be2ff679 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -204,7 +204,6 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { - unsigned long flags; struct ceph_mds_session *lease_session; u32 lease_gen, lease_shared_gen; u32 lease_seq; @@ -215,18 +214,6 @@ struct ceph_dentry_info { u64 offset; }; -/* - * dentry flags - * - * The locking for D_COMPLETE is a bit odd: - * - we can clear it at almost any time (see ceph_d_prune) - * - it is only meaningful if: - * - we hold dir inode i_ceph_lock - * - we hold dir FILE_SHARED caps - * - the dentry D_COMPLETE is set - */ -#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */ - struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -257,7 +244,8 @@ struct ceph_inode_info { u32 i_time_warp_seq; unsigned i_ceph_flags; - unsigned long i_release_count; + atomic_t i_release_count; + atomic_t i_complete_count; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; @@ -267,7 +255,7 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ + u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -436,33 +424,31 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ -static inline void ceph_i_clear(struct inode *inode, unsigned mask) +static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, + int release_count) { - struct ceph_inode_info *ci = ceph_inode(inode); - - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags &= ~mask; - spin_unlock(&ci->i_ceph_lock); + atomic_set(&ci->i_complete_count, release_count); } -static inline void ceph_i_set(struct inode *inode, unsigned mask) +static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); + atomic_inc(&ci->i_release_count); +} - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags |= mask; - spin_unlock(&ci->i_ceph_lock); +static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) +{ + return atomic_read(&ci->i_complete_count) == + atomic_read(&ci->i_release_count); } -static inline bool ceph_i_test(struct inode *inode, unsigned mask) +static inline void ceph_dir_clear_complete(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - bool r; + __ceph_dir_clear_complete(ceph_inode(inode)); +} - spin_lock(&ci->i_ceph_lock); - r = (ci->i_ceph_flags & mask) == mask; - spin_unlock(&ci->i_ceph_lock); - return r; +static inline bool ceph_dir_is_complete(struct inode *inode) +{ + return __ceph_dir_is_complete(ceph_inode(inode)); } @@ -489,13 +475,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) } /* - * set/clear directory D_COMPLETE flag - */ -void ceph_dir_set_complete(struct inode *inode); -void ceph_dir_clear_complete(struct inode *inode); -bool ceph_dir_test_complete(struct inode *inode); - -/* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) @@ -584,7 +563,7 @@ struct ceph_file_info { u64 next_offset; /* offset 
of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ - unsigned long dir_release_count; + int dir_release_count; /* used for -o dirstat read() on directory thing */ char *dir_info; @@ -713,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, extern int ceph_inode_holds_cap(struct inode *inode, int mask); extern int ceph_inode_set_size(struct inode *inode, loff_t size); -extern void __ceph_do_pending_vmtruncate(struct inode *inode); +extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock); extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_invalidate(struct inode *inode); @@ -755,6 +734,8 @@ static inline void ceph_remove_cap(struct ceph_cap *cap) extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); +extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, + u64 cap_id, u32 migrate_seq, u32 issue_seq); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index d4080f309b56..5f3386844134 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -52,6 +52,9 @@ struct ceph_auth_client_ops { */ int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); + /* ensure that an existing authorizer is up to date */ + int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len); void (*destroy_authorizer)(struct ceph_auth_client *ac, @@ -75,6 +78,8 @@ struct ceph_auth_client { u64 global_id; /* our unique id in system */ const struct ceph_crypto_key *key; /* our secret key */ unsigned want_keys; /* which services we want */ + + struct mutex mutex; }; extern struct ceph_auth_client *ceph_auth_init(const char *name, @@ -94,5 +99,18 @@ extern int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len); extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); +extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *auth); +extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a); +extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *a); +extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + size_t len); +extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type); #endif diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 76554cecaab2..4c42080347af 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -41,6 +41,7 @@ */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ @@ -51,6 +52,7 @@ #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC) diff --git a/include/linux/ceph/decode.h 
b/include/linux/ceph/decode.h index 360d9d08ca9e..379f71508995 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -8,6 +8,23 @@ #include <linux/ceph/types.h> +/* This seemed to be the easiest place to define these */ + +#define U8_MAX ((u8)(~0U)) +#define U16_MAX ((u16)(~0U)) +#define U32_MAX ((u32)(~0U)) +#define U64_MAX ((u64)(~0ULL)) + +#define S8_MAX ((s8)(U8_MAX >> 1)) +#define S16_MAX ((s16)(U16_MAX >> 1)) +#define S32_MAX ((s32)(U32_MAX >> 1)) +#define S64_MAX ((s64)(U64_MAX >> 1LL)) + +#define S8_MIN ((s8)(-S8_MAX - 1)) +#define S16_MIN ((s16)(-S16_MAX - 1)) +#define S32_MIN ((s32)(-S32_MAX - 1)) +#define S64_MIN ((s64)(-S64_MAX - 1LL)) + /* * in all cases, * void **p pointer to position pointer @@ -137,14 +154,19 @@ bad: static inline void ceph_decode_timespec(struct timespec *ts, const struct ceph_timespec *tv) { - ts->tv_sec = le32_to_cpu(tv->tv_sec); - ts->tv_nsec = le32_to_cpu(tv->tv_nsec); + ts->tv_sec = (__kernel_time_t)le32_to_cpu(tv->tv_sec); + ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec); } static inline void ceph_encode_timespec(struct ceph_timespec *tv, const struct timespec *ts) { - tv->tv_sec = cpu_to_le32(ts->tv_sec); - tv->tv_nsec = cpu_to_le32(ts->tv_nsec); + BUG_ON(ts->tv_sec < 0); + BUG_ON(ts->tv_sec > (__kernel_time_t)U32_MAX); + BUG_ON(ts->tv_nsec < 0); + BUG_ON(ts->tv_nsec > (long)U32_MAX); + + tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); + tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); } /* diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 29818fc3fa49..2e3024881a5e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -66,6 +66,7 @@ struct ceph_options { #define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) +#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) #define CEPH_AUTH_NAME_DEFAULT "guest" @@ -156,31 +157,11 @@ struct ceph_snap_context { u64 snaps[]; }; -static inline struct ceph_snap_context * -ceph_get_snap_context(struct ceph_snap_context *sc) -{ - /* - printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)+1); - */ - if (sc) - atomic_inc(&sc->nref); - return sc; -} - -static inline void ceph_put_snap_context(struct ceph_snap_context *sc) -{ - if (!sc) - return; - /* - printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)-1); - */ - if (atomic_dec_and_test(&sc->nref)) { - /*printk(" deleting snap_context %p\n", sc);*/ - kfree(sc); - } -} +extern struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, + gfp_t gfp_flags); +extern struct ceph_snap_context *ceph_get_snap_context( + struct ceph_snap_context *sc); +extern void ceph_put_snap_context(struct ceph_snap_context *sc); /* * calculate the number of pages a given length and offset map onto, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 60903e0f665c..7c1420bb1dce 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,6 +64,77 @@ struct ceph_messenger { u32 required_features; }; +enum ceph_msg_data_type { + CEPH_MSG_DATA_NONE, /* message contains no data payload */ + CEPH_MSG_DATA_PAGES, /* data source/destination is a page array */ + CEPH_MSG_DATA_PAGELIST, /* data source/destination is a pagelist */ +#ifdef CONFIG_BLOCK + CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */ +#endif /* CONFIG_BLOCK */ +}; + +static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) 
+{ + switch (type) { + case CEPH_MSG_DATA_NONE: + case CEPH_MSG_DATA_PAGES: + case CEPH_MSG_DATA_PAGELIST: +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: +#endif /* CONFIG_BLOCK */ + return true; + default: + return false; + } +} + +struct ceph_msg_data { + struct list_head links; /* ceph_msg->data */ + enum ceph_msg_data_type type; + union { +#ifdef CONFIG_BLOCK + struct { + struct bio *bio; + size_t bio_length; + }; +#endif /* CONFIG_BLOCK */ + struct { + struct page **pages; /* NOT OWNER. */ + size_t length; /* total # bytes */ + unsigned int alignment; /* first page */ + }; + struct ceph_pagelist *pagelist; + }; +}; + +struct ceph_msg_data_cursor { + size_t total_resid; /* across all data items */ + struct list_head *data_head; /* = &ceph_msg->data */ + + struct ceph_msg_data *data; /* current data item */ + size_t resid; /* bytes not yet consumed */ + bool last_piece; /* current is last piece */ + bool need_crc; /* crc update needed */ + union { +#ifdef CONFIG_BLOCK + struct { /* bio */ + struct bio *bio; /* bio from list */ + unsigned int vector_index; /* vector from bio */ + unsigned int vector_offset; /* bytes from vector */ + }; +#endif /* CONFIG_BLOCK */ + struct { /* pages */ + unsigned int page_offset; /* offset in page */ + unsigned short page_index; /* index in array */ + unsigned short page_count; /* pages in array */ + }; + struct { /* pagelist */ + struct page *page; /* page from list */ + size_t offset; /* bytes from list */ + }; + }; +}; + /* * a single message. it contains a header (src, dest, message type, etc.), * footer (crc values, mainly), a "front" message body, and possibly a @@ -74,21 +145,15 @@ struct ceph_msg { struct ceph_msg_footer footer; /* footer */ struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; - struct page **pages; /* data payload. NOT OWNER. 
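Both ceph_msg_data and ceph_msg_data_cursor above lean on anonymous struct/union members, so one cursor can carry bio, page-array, or pagelist traversal state without naming the arms. A compact userspace imitation of that shape (names hypothetical; C11/GNU anonymous members, as the kernel itself uses):

#include <stddef.h>
#include <stdio.h>

/* Miniature of the cursor pattern: a type tag selects which
 * anonymous-union arm holds the traversal state. */
enum data_type { DATA_PAGES, DATA_PAGELIST };

struct cursor {
	size_t resid;			/* bytes not yet consumed */
	enum data_type type;
	union {
		struct {		/* pages */
			unsigned short page_index;
			unsigned int page_offset;
		};
		struct {		/* pagelist */
			size_t offset;
		};
	};
};

int main(void)
{
	struct cursor c = { .resid = 8192, .type = DATA_PAGES,
			    .page_offset = 100 };

	if (c.type == DATA_PAGES)
		printf("pages cursor: index %u offset %u, %zu left\n",
		       (unsigned)c.page_index, c.page_offset, c.resid);
	return 0;
}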
*/ - unsigned nr_pages; /* size of page array */ - unsigned page_alignment; /* io offset in first page */ - struct ceph_pagelist *pagelist; /* instead of pages */ + + size_t data_length; + struct list_head data; + struct ceph_msg_data_cursor cursor; struct ceph_connection *con; - struct list_head list_head; + struct list_head list_head; /* links for connection lists */ struct kref kref; -#ifdef CONFIG_BLOCK - struct bio *bio; /* instead of pages/pagelist */ - struct bio *bio_iter; /* bio iterator */ - int bio_seg; /* current bio segment */ -#endif /* CONFIG_BLOCK */ - struct ceph_pagelist *trail; /* the trailing part of the data */ bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; @@ -98,12 +163,6 @@ struct ceph_msg { struct ceph_msgpool *pool; }; -struct ceph_msg_pos { - int page, page_pos; /* which page; offset in page */ - int data_pos; /* offset in data payload */ - bool did_page_crc; /* true if we've calculated crc for current page */ -}; - /* ceph connection fault delay defaults, for exponential backoff */ #define BASE_DELAY_INTERVAL (HZ/2) #define MAX_DELAY_INTERVAL (5 * 60 * HZ) @@ -161,7 +220,6 @@ struct ceph_connection { struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ bool out_msg_done; - struct ceph_msg_pos out_msg_pos; struct kvec out_kvec[8], /* sending header/footer data */ *out_kvec_cur; @@ -175,7 +233,6 @@ struct ceph_connection { /* message in temps */ struct ceph_msg_header in_hdr; struct ceph_msg *in_msg; - struct ceph_msg_pos in_msg_pos; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ char in_tag; /* protocol control byte */ @@ -218,6 +275,15 @@ extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); extern void ceph_con_keepalive(struct ceph_connection *con); +extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, + size_t length, size_t alignment); +extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, + struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK +extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, + size_t length); +#endif /* CONFIG_BLOCK */ + extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); extern void ceph_msg_kfree(struct ceph_msg *m); diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 680d3d648cac..3d94a73b5f30 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -87,6 +87,7 @@ struct ceph_entity_inst { #define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ #define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ +#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ /* diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 1dd5d466b6f9..186db0bf4951 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -29,6 +29,7 @@ struct ceph_authorizer; */ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, struct ceph_msg *); +typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); /* a given osd we're communicating with */ struct ceph_osd { @@ -48,7 +49,67 @@ struct ceph_osd { }; -#define CEPH_OSD_MAX_OP 10 +#define CEPH_OSD_MAX_OP 2 + +enum ceph_osd_data_type { + CEPH_OSD_DATA_TYPE_NONE = 0, + CEPH_OSD_DATA_TYPE_PAGES, + CEPH_OSD_DATA_TYPE_PAGELIST, +#ifdef CONFIG_BLOCK + CEPH_OSD_DATA_TYPE_BIO, +#endif /* CONFIG_BLOCK */ +}; + +struct ceph_osd_data { + enum ceph_osd_data_type 
type; + union { + struct { + struct page **pages; + u64 length; + u32 alignment; + bool pages_from_pool; + bool own_pages; + }; + struct ceph_pagelist *pagelist; +#ifdef CONFIG_BLOCK + struct { + struct bio *bio; /* list of bios */ + size_t bio_length; /* total in list */ + }; +#endif /* CONFIG_BLOCK */ + }; +}; + +struct ceph_osd_req_op { + u16 op; /* CEPH_OSD_OP_* */ + u32 payload_len; + union { + struct ceph_osd_data raw_data_in; + struct { + u64 offset, length; + u64 truncate_size; + u32 truncate_seq; + struct ceph_osd_data osd_data; + } extent; + struct { + const char *class_name; + const char *method_name; + struct ceph_osd_data request_info; + struct ceph_osd_data request_data; + struct ceph_osd_data response_data; + __u8 class_len; + __u8 method_len; + __u8 argc; + } cls; + struct { + u64 cookie; + u64 ver; + u32 prot_ver; + u32 timeout; + __u8 flag; + } watch; + }; +}; /* an in-flight request */ struct ceph_osd_request { @@ -63,15 +124,14 @@ struct ceph_osd_request { int r_pg_osds[CEPH_PG_MAX_SIZE]; int r_num_pg_osds; - struct ceph_connection *r_con_filling_msg; - struct ceph_msg *r_request, *r_reply; int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ - int r_num_ops; - /* encoded message content */ - struct ceph_osd_op *r_request_ops; + /* request osd ops array */ + unsigned int r_num_ops; + struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; + /* these are updated on each send */ __le32 *r_request_osdmap_epoch; __le32 *r_request_flags; @@ -85,12 +145,14 @@ struct ceph_osd_request { s32 r_reply_op_result[CEPH_OSD_MAX_OP]; int r_got_reply; int r_linger; + int r_completed; struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; struct completion r_completion, r_safe_completion; - ceph_osdc_callback_t r_callback, r_safe_callback; + ceph_osdc_callback_t r_callback; + ceph_osdc_unsafe_callback_t r_unsafe_callback; struct ceph_eversion r_reassert_version; struct list_head r_unsafe_item; @@ -104,16 +166,6 @@ struct ceph_osd_request { struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ - unsigned r_num_pages; /* size of page array (follows) */ - unsigned r_page_alignment; /* io offset in first page */ - struct page **r_pages; /* pages for data payload */ - int r_pages_from_pool; - int r_own_pages; /* if true, i own page list */ -#ifdef CONFIG_BLOCK - struct bio *r_bio; /* instead of pages */ -#endif - - struct ceph_pagelist r_trail; /* trailing part of the data */ }; struct ceph_osd_event { @@ -172,48 +224,8 @@ struct ceph_osd_client { struct workqueue_struct *notify_wq; }; -struct ceph_osd_req_op { - u16 op; /* CEPH_OSD_OP_* */ - u32 payload_len; - union { - struct { - u64 offset, length; - u64 truncate_size; - u32 truncate_seq; - } extent; - struct { - const char *name; - const char *val; - u32 name_len; - u32 value_len; - __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ - __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ - } xattr; - struct { - const char *class_name; - const char *method_name; - const char *indata; - u32 indata_len; - __u8 class_len; - __u8 method_len; - __u8 argc; - } cls; - struct { - u64 cookie; - u64 count; - } pgls; - struct { - u64 snapid; - } snap; - struct { - u64 cookie; - u64 ver; - u32 prot_ver; - u32 timeout; - __u8 flag; - } watch; - }; -}; +extern int ceph_osdc_setup(void); +extern void ceph_osdc_cleanup(void); extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); @@ -224,16 +236,71 @@ extern void 
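With the ops now embedded in the request as a fixed r_ops[] array of tagged unions, callers index them directly; the debugfs change further down reads req->r_ops[i].op without le16_to_cpu, since the array holds host-order values. A hedged sketch of walking such an array (assumes a populated struct ceph_osd_request *req):

/* Sketch: walking the embedded ops array, as osdc_show() now does. */
unsigned int i;

for (i = 0; i < req->r_num_ops; i++) {
	u16 opcode = req->r_ops[i].op;	/* host order; no byte swap */

	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		pr_info("extent op: off %llu len %llu\n",
			req->r_ops[i].extent.offset,
			req->r_ops[i].extent.length);
		break;
	default:
		pr_info("op 0x%x\n", opcode);
		break;
	}
}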
ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +extern void osd_req_op_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode); + +extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, + unsigned int which, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); + +extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq); +extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length); + +extern struct ceph_osd_data *osd_req_op_extent_osd_data( + struct ceph_osd_request *osd_req, + unsigned int which); +extern struct ceph_osd_data *osd_req_op_cls_response_data( + struct ceph_osd_request *osd_req, + unsigned int which); + +extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, + unsigned int which, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); +extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, + unsigned int which, + struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK +extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, + unsigned int which, + struct bio *bio, size_t bio_length); +#endif /* CONFIG_BLOCK */ + +extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, + unsigned int which, + struct ceph_pagelist *pagelist); +extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, + unsigned int which, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); +extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, + unsigned int which, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); + +extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + const char *class, const char *method); +extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 cookie, u64 version, int flag); + extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - unsigned int num_op, + unsigned int num_ops, bool use_mempool, gfp_t gfp_flags); -extern void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, - unsigned int num_op, - struct ceph_osd_req_op *src_ops, +extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime); @@ -241,12 +308,11 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 offset, u64 *len, int op, int flags, + u64 offset, u64 *len, + int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, - int do_sync, u32 truncate_seq, - u64 truncate_size, - struct timespec *mtime, - bool use_mempool, int page_align); + u32 truncate_seq, u64 truncate_size, + bool use_mempool); extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, struct ceph_osd_request *req); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index c819190d1642..d05cc4451af6 100644 --- a/include/linux/ceph/osdmap.h +++ 
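Putting the per-op declarations above together, a single-op read might be assembled roughly as below. This is a fragment sketched against the signatures just declared; osdc, pages, offset, and length are assumed to exist, and error handling is trimmed:

/* Sketch: one-op read with data attached to the op, not the request. */
struct ceph_osd_request *req;

req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOFS);
if (!req)
	return -ENOMEM;

osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ,
		       offset, length, 0, 0);		/* no truncate */
osd_req_op_extent_osd_data_pages(req, 0, pages, length,
				 offset & ~PAGE_MASK,	/* first-page align */
				 false, false);		/* not pool/owned */

ceph_osdc_build_request(req, offset, NULL, CEPH_NOSNAP, NULL);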
b/include/linux/ceph/osdmap.h @@ -3,6 +3,7 @@ #include <linux/rbtree.h> #include <linux/ceph/types.h> +#include <linux/ceph/decode.h> #include <linux/ceph/ceph_fs.h> #include <linux/crush/crush.h> @@ -119,6 +120,29 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, return &map->osd_addr[osd]; } +static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) +{ + __u8 version; + + if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) { + pr_warning("incomplete pg encoding"); + + return -EINVAL; + } + version = ceph_decode_8(p); + if (version > 1) { + pr_warning("do not understand pg encoding %d > 1", + (int)version); + return -EINVAL; + } + + pgid->pool = ceph_decode_64(p); + pgid->seed = ceph_decode_32(p); + *p += 4; /* skip deprecated preferred value */ + + return 0; +} + extern struct ceph_osdmap *osdmap_decode(void **p, void *end); extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map, @@ -131,10 +155,8 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -extern int ceph_calc_object_layout(struct ceph_pg *pg, - const char *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap); +extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, + struct ceph_osdmap *osdmap, uint64_t pool); extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, int *acting); diff --git a/net/ceph/Makefile b/net/ceph/Makefile index e87ef435e11b..958d9856912c 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ - pagevec.o + pagevec.o snapshot.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index b4bf4ac090f1..6b923bcaa2a4 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -47,6 +47,7 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp if (!ac) goto out; + mutex_init(&ac->mutex); ac->negotiating = true; if (name) ac->name = name; @@ -73,10 +74,12 @@ void ceph_auth_destroy(struct ceph_auth_client *ac) */ void ceph_auth_reset(struct ceph_auth_client *ac) { + mutex_lock(&ac->mutex); dout("auth_reset %p\n", ac); if (ac->ops && !ac->negotiating) ac->ops->reset(ac); ac->negotiating = true; + mutex_unlock(&ac->mutex); } int ceph_entity_name_encode(const char *name, void **p, void *end) @@ -102,6 +105,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) int i, num; int ret; + mutex_lock(&ac->mutex); dout("auth_build_hello\n"); monhdr->have_version = 0; monhdr->session_mon = cpu_to_le16(-1); @@ -122,15 +126,19 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) ret = ceph_entity_name_encode(ac->name, &p, end); if (ret < 0) - return ret; + goto out; ceph_decode_need(&p, end, sizeof(u64), bad); ceph_encode_64(&p, ac->global_id); ceph_encode_32(&lenp, p - lenp - sizeof(u32)); - return p - buf; + ret = p - buf; +out: + mutex_unlock(&ac->mutex); + return ret; bad: - return -ERANGE; + ret = -ERANGE; + goto out; } static int ceph_build_auth_request(struct ceph_auth_client *ac, @@ -151,11 +159,13 @@ static int ceph_build_auth_request(struct ceph_auth_client *ac, if (ret < 0) { pr_err("error %d building auth method %s request\n", ret, ac->ops->name); - return ret; + goto out; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - return 
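ceph_decode_pgid() consumes exactly 1 + 8 + 4 + 4 bytes: a version byte, a 64-bit pool, a 32-bit seed, and a skipped legacy "preferred" field. A standalone userspace re-creation of that layout (little-endian host assumed; the kernel decode helpers are approximated with memcpy):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct pg { uint64_t pool; uint32_t seed; };

static int decode_pgid(const uint8_t **p, const uint8_t *end, struct pg *pgid)
{
	if (end - *p < 1 + 8 + 4 + 4)
		return -1;			/* incomplete encoding */
	if (**p > 1)
		return -1;			/* unknown version */
	(*p)++;
	memcpy(&pgid->pool, *p, 8); *p += 8;	/* assumes LE host */
	memcpy(&pgid->seed, *p, 4); *p += 4;
	*p += 4;				/* skip deprecated preferred */
	return 0;
}

int main(void)
{
	uint8_t buf[17] = { 1,				/* v1 */
			    2, 0, 0, 0, 0, 0, 0, 0,	/* pool 2 */
			    0x34, 0x12, 0, 0,		/* seed 0x1234 */
			    0xff, 0xff, 0xff, 0xff };	/* preferred = -1 */
	const uint8_t *p = buf;
	struct pg pgid;

	if (!decode_pgid(&p, buf + sizeof(buf), &pgid))
		printf("pool %llu seed 0x%x\n",
		       (unsigned long long)pgid.pool, (unsigned)pgid.seed);
	return 0;
}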
p + ret - msg_buf; + ret = p + ret - msg_buf; +out: + return ret; } /* @@ -176,6 +186,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, int result_msg_len; int ret = -EINVAL; + mutex_lock(&ac->mutex); dout("handle_auth_reply %p %p\n", p, end); ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); protocol = ceph_decode_32(&p); @@ -227,33 +238,103 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ret = ac->ops->handle_reply(ac, result, payload, payload_end); if (ret == -EAGAIN) { - return ceph_build_auth_request(ac, reply_buf, reply_len); + ret = ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - return ret; } - return 0; -bad: - pr_err("failed to decode auth msg\n"); out: + mutex_unlock(&ac->mutex); return ret; + +bad: + pr_err("failed to decode auth msg\n"); + ret = -EINVAL; + goto out; } int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len) { + int ret = 0; + + mutex_lock(&ac->mutex); if (!ac->protocol) - return ceph_auth_build_hello(ac, msg_buf, msg_len); - BUG_ON(!ac->ops); - if (ac->ops->should_authenticate(ac)) - return ceph_build_auth_request(ac, msg_buf, msg_len); - return 0; + ret = ceph_auth_build_hello(ac, msg_buf, msg_len); + else if (ac->ops->should_authenticate(ac)) + ret = ceph_build_auth_request(ac, msg_buf, msg_len); + mutex_unlock(&ac->mutex); + return ret; } int ceph_auth_is_authenticated(struct ceph_auth_client *ac) { - if (!ac->ops) - return 0; - return ac->ops->is_authenticated(ac); + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops) + ret = ac->ops->is_authenticated(ac); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_is_authenticated); + +int ceph_auth_create_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *auth) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->create_authorizer) + ret = ac->ops->create_authorizer(ac, peer_type, auth); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_create_authorizer); + +void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->destroy_authorizer) + ac->ops->destroy_authorizer(ac, a); + mutex_unlock(&ac->mutex); +} +EXPORT_SYMBOL(ceph_auth_destroy_authorizer); + +int ceph_auth_update_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *a) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, a); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_update_authorizer); + +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->verify_authorizer_reply) + ret = ac->ops->verify_authorizer_reply(ac, a, len); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply); + +void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) +{ + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + mutex_unlock(&ac->mutex); } +EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index a16bf14eb027..96238ba95f2b 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -298,6 +298,7 @@ static int 
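The pattern of the auth changes above: every ac->ops indirection becomes an exported wrapper that takes ac->mutex and tolerates a NULL ops table. From a caller's point of view (mon_client, updated later in this patch):

/* Before: caller peeked at the ops table directly, with no locking. */
if (monc->auth->ops && monc->auth->ops->is_authenticated(monc->auth))
	__send_subscribe(monc);

/* After: the wrapper serializes on ac->mutex and handles NULL ops. */
if (ceph_auth_is_authenticated(monc->auth))
	__send_subscribe(monc);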
ceph_x_build_authorizer(struct ceph_auth_client *ac, return -ENOMEM; } au->service = th->service; + au->secret_id = th->secret_id; msg_a = au->buf->vec.iov_base; msg_a->struct_v = 1; @@ -555,6 +556,26 @@ static int ceph_x_create_authorizer( return 0; } +static int ceph_x_update_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth) +{ + struct ceph_x_authorizer *au; + struct ceph_x_ticket_handler *th; + + th = get_ticket_handler(ac, peer_type); + if (IS_ERR(th)) + return PTR_ERR(th); + + au = (struct ceph_x_authorizer *)auth->authorizer; + if (au->secret_id < th->secret_id) { + dout("ceph_x_update_authorizer service %u secret %llu < %llu\n", + au->service, au->secret_id, th->secret_id); + return ceph_x_build_authorizer(ac, th, au); + } + return 0; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len) { @@ -630,7 +651,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - remove_ticket_handler(ac, th); + memset(&th->validity, 0, sizeof(th->validity)); } @@ -641,6 +662,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = { .build_request = ceph_x_build_request, .handle_reply = ceph_x_handle_reply, .create_authorizer = ceph_x_create_authorizer, + .update_authorizer = ceph_x_update_authorizer, .verify_authorizer_reply = ceph_x_verify_authorizer_reply, .destroy_authorizer = ceph_x_destroy_authorizer, .invalidate_authorizer = ceph_x_invalidate_authorizer, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index f459e93b774f..c5a058da7ac8 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -29,6 +29,7 @@ struct ceph_x_authorizer { struct ceph_buffer *buf; unsigned int service; u64 nonce; + u64 secret_id; char reply_buf[128]; /* big enough for encrypted blob */ }; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index e65e6e4be38b..34b11ee8124e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -606,11 +606,17 @@ static int __init init_ceph_lib(void) if (ret < 0) goto out_crypto; + ret = ceph_osdc_setup(); + if (ret < 0) + goto out_msgr; + pr_info("loaded (mon/osd proto %d/%d)\n", CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); return 0; +out_msgr: + ceph_msgr_exit(); out_crypto: ceph_crypto_shutdown(); out_debugfs: @@ -622,6 +628,7 @@ out: static void __exit exit_ceph_lib(void) { dout("exit_ceph_lib\n"); + ceph_osdc_cleanup(); ceph_msgr_exit(); ceph_crypto_shutdown(); ceph_debugfs_cleanup(); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 00d051f4894e..83661cdc0766 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -123,8 +123,8 @@ static int osdc_show(struct seq_file *s, void *pp) mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { struct ceph_osd_request *req; + unsigned int i; int opcode; - int i; req = rb_entry(p, struct ceph_osd_request, r_node); @@ -142,7 +142,7 @@ static int osdc_show(struct seq_file *s, void *pp) seq_printf(s, "\t"); for (i = 0; i < req->r_num_ops; i++) { - opcode = le16_to_cpu(req->r_request_ops[i].op); + opcode = req->r_ops[i].op; seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); } diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2c0669fb54e3..eb0a46a49bd4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -21,6 +21,9 @@ #include <linux/ceph/pagelist.h> #include <linux/export.h> +#define list_entry_next(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + /* * 
Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable @@ -149,6 +152,11 @@ static bool con_flag_test_and_set(struct ceph_connection *con, return test_and_set_bit(con_flag, &con->flags); } +/* Slab caches for frequently-allocated structures */ + +static struct kmem_cache *ceph_msg_cache; +static struct kmem_cache *ceph_msg_data_cache; + /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; @@ -223,6 +231,41 @@ static void encode_my_addr(struct ceph_messenger *msgr) */ static struct workqueue_struct *ceph_msgr_wq; +static int ceph_msgr_slab_init(void) +{ + BUG_ON(ceph_msg_cache); + ceph_msg_cache = kmem_cache_create("ceph_msg", + sizeof (struct ceph_msg), + __alignof__(struct ceph_msg), 0, NULL); + + if (!ceph_msg_cache) + return -ENOMEM; + + BUG_ON(ceph_msg_data_cache); + ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", + sizeof (struct ceph_msg_data), + __alignof__(struct ceph_msg_data), + 0, NULL); + if (ceph_msg_data_cache) + return 0; + + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; + + return -ENOMEM; +} + +static void ceph_msgr_slab_exit(void) +{ + BUG_ON(!ceph_msg_data_cache); + kmem_cache_destroy(ceph_msg_data_cache); + ceph_msg_data_cache = NULL; + + BUG_ON(!ceph_msg_cache); + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; +} + static void _ceph_msgr_exit(void) { if (ceph_msgr_wq) { @@ -230,6 +273,8 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } + ceph_msgr_slab_exit(); + BUG_ON(zero_page == NULL); kunmap(zero_page); page_cache_release(zero_page); @@ -242,6 +287,9 @@ int ceph_msgr_init(void) zero_page = ZERO_PAGE(0); page_cache_get(zero_page); + if (ceph_msgr_slab_init()) + return -ENOMEM; + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (ceph_msgr_wq) return 0; @@ -471,6 +519,22 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) return r; } +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + void *kaddr; + int ret; + + BUG_ON(page_offset + length > PAGE_SIZE); + + kaddr = kmap(page); + BUG_ON(!kaddr); + ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length); + kunmap(page); + + return ret; +} + /* * write something. @more is true if caller will be sending more data * shortly. @@ -493,7 +557,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, } static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) + int offset, size_t size, bool more) { int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); int ret; @@ -697,50 +761,397 @@ static void con_out_kvec_add(struct ceph_connection *con, } #ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) + +/* + * For a bio data item, a piece is whatever remains of the next + * entry in the current bio iovec, or the first entry in the next + * bio in the list. 
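Message and data-item structures move onto the dedicated slab caches created and torn down in the init/exit pair above. The allocation side, which ceph_msg_new() adopts further down, gets zeroed fields for free; a condensed sketch of the pairing:

/* Sketch: the allocate/free pairing built on the caches above. */
struct ceph_msg *m;

m = kmem_cache_zalloc(ceph_msg_cache, GFP_NOFS);  /* all fields zeroed */
if (!m)
	return NULL;
/* ... only non-zero fields need explicit initialization ... */

kmem_cache_free(ceph_msg_cache, m);		  /* instead of kfree(m) */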
+ */ +static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) { - if (!bio) { - *iter = NULL; - *seg = 0; - return; + struct ceph_msg_data *data = cursor->data; + struct bio *bio; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = data->bio; + BUG_ON(!bio); + BUG_ON(!bio->bi_vcnt); + + cursor->resid = min(length, data->bio_length); + cursor->bio = bio; + cursor->vector_index = 0; + cursor->vector_offset = 0; + cursor->last_piece = length <= bio->bi_io_vec[0].bv_len; +} + +static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, + size_t *length) +{ + struct ceph_msg_data *data = cursor->data; + struct bio *bio; + struct bio_vec *bio_vec; + unsigned int index; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + index = cursor->vector_index; + BUG_ON(index >= (unsigned int) bio->bi_vcnt); + + bio_vec = &bio->bi_io_vec[index]; + BUG_ON(cursor->vector_offset >= bio_vec->bv_len); + *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset); + BUG_ON(*page_offset >= PAGE_SIZE); + if (cursor->last_piece) /* pagelist offset is always 0 */ + *length = cursor->resid; + else + *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); + BUG_ON(*length > cursor->resid); + BUG_ON(*page_offset + *length > PAGE_SIZE); + + return bio_vec->bv_page; +} + +static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct bio *bio; + struct bio_vec *bio_vec; + unsigned int index; + + BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + index = cursor->vector_index; + BUG_ON(index >= (unsigned int) bio->bi_vcnt); + bio_vec = &bio->bi_io_vec[index]; + + /* Advance the cursor offset */ + + BUG_ON(cursor->resid < bytes); + cursor->resid -= bytes; + cursor->vector_offset += bytes; + if (cursor->vector_offset < bio_vec->bv_len) + return false; /* more bytes to process in this segment */ + BUG_ON(cursor->vector_offset != bio_vec->bv_len); + + /* Move on to the next segment, and possibly the next bio */ + + if (++index == (unsigned int) bio->bi_vcnt) { + bio = bio->bi_next; + index = 0; } - *iter = bio; - *seg = bio->bi_idx; + cursor->bio = bio; + cursor->vector_index = index; + cursor->vector_offset = 0; + + if (!cursor->last_piece) { + BUG_ON(!cursor->resid); + BUG_ON(!bio); + /* A short read is OK, so use <= rather than == */ + if (cursor->resid <= bio->bi_io_vec[index].bv_len) + cursor->last_piece = true; + } + + return true; } +#endif /* CONFIG_BLOCK */ -static void iter_bio_next(struct bio **bio_iter, int *seg) +/* + * For a page array, a piece comes from the first page in the array + * that has not already been fully consumed. 
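A userspace sketch of the chain walk ceph_msg_data_bio_advance() performs above: exhaust each io vector in turn, then hop to bi_next once the last vector of the current bio is consumed (the structures here are stand-ins for struct bio):

#include <stdio.h>

struct mini_bio {
	struct mini_bio *next;		/* bi_next */
	unsigned int vcnt;		/* bi_vcnt */
	unsigned int len[4];		/* bv_len per vector */
};

int main(void)
{
	struct mini_bio b2 = { NULL, 1, { 512 } };
	struct mini_bio b1 = { &b2, 2, { 4096, 1024 } };
	struct mini_bio *bio = &b1;
	unsigned int index = 0;

	while (bio) {
		printf("consumed %u bytes (vector %u of bio %p)\n",
		       bio->len[index], index, (void *)bio);
		if (++index == bio->vcnt) {	/* move to the next bio */
			bio = bio->next;
			index = 0;
		}
	}
	return 0;
}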
+ */ +static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) { - if (*bio_iter == NULL) - return; + struct ceph_msg_data *data = cursor->data; + int page_count; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); - BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + BUG_ON(!data->pages); + BUG_ON(!data->length); - (*seg)++; - if (*seg == (*bio_iter)->bi_vcnt) - init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); + cursor->resid = min(length, data->length); + page_count = calc_pages_for(data->alignment, (u64)data->length); + cursor->page_offset = data->alignment & ~PAGE_MASK; + cursor->page_index = 0; + BUG_ON(page_count > (int)USHRT_MAX); + cursor->page_count = (unsigned short)page_count; + BUG_ON(length > SIZE_MAX - cursor->page_offset); + cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; } -#endif -static void prepare_write_message_data(struct ceph_connection *con) +static struct page * +ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) { - struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data *data = cursor->data; - BUG_ON(!msg); - BUG_ON(!msg->hdr.data_len); + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_index >= cursor->page_count); + BUG_ON(cursor->page_offset >= PAGE_SIZE); + + *page_offset = cursor->page_offset; + if (cursor->last_piece) + *length = cursor->resid; + else + *length = PAGE_SIZE - *page_offset; + + return data->pages[cursor->page_index]; +} + +static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); + + /* Advance the cursor page offset */ + + cursor->resid -= bytes; + cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; + if (!bytes || cursor->page_offset) + return false; /* more bytes to process in the current page */ + + /* Move on to the next page; offset is already at 0 */ + + BUG_ON(cursor->page_index >= cursor->page_count); + cursor->page_index++; + cursor->last_piece = cursor->resid <= PAGE_SIZE; + + return true; +} + +/* + * For a pagelist, a piece is whatever remains to be consumed in the + * first page in the list, or the front of the next page. 
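The page-array cursor above derives every position from the initial alignment plus bytes consumed, and a piece never crosses a page boundary. The same arithmetic, checked standalone (PAGE_SIZE assumed 4096):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	size_t resid = 10000;			 /* bytes in the data item */
	size_t page_offset = 100 & ~PAGE_MASK;	 /* from data->alignment */
	unsigned int page_index = 0;

	while (resid) {
		/* a piece is capped at the end of the current page */
		size_t length = resid < PAGE_SIZE - page_offset ?
					resid : PAGE_SIZE - page_offset;
		printf("page %u offset %zu len %zu\n",
		       page_index, page_offset, length);

		/* the advance step, as in ceph_msg_data_pages_advance() */
		resid -= length;
		page_offset = (page_offset + length) & ~PAGE_MASK;
		if (!page_offset)
			page_index++;
	}
	return 0;
}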
+ */ +static void +ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + struct page *page; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + if (!length) + return; /* pagelist can be assigned but empty */ + + BUG_ON(list_empty(&pagelist->head)); + page = list_first_entry(&pagelist->head, struct page, lru); + + cursor->resid = min(length, pagelist->length); + cursor->page = page; + cursor->offset = 0; + cursor->last_piece = cursor->resid <= PAGE_SIZE; +} + +static struct page * +ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); - /* initialize page iterator */ - con->out_msg_pos.page = 0; - if (msg->pages) - con->out_msg_pos.page_pos = msg->page_alignment; + pagelist = data->pagelist; + BUG_ON(!pagelist); + + BUG_ON(!cursor->page); + BUG_ON(cursor->offset + cursor->resid != pagelist->length); + + /* offset of first page in pagelist is always 0 */ + *page_offset = cursor->offset & ~PAGE_MASK; + if (cursor->last_piece) + *length = cursor->resid; else - con->out_msg_pos.page_pos = 0; + *length = PAGE_SIZE - *page_offset; + + return cursor->page; +} + +static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + BUG_ON(cursor->offset + cursor->resid != pagelist->length); + BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); + + /* Advance the cursor offset */ + + cursor->resid -= bytes; + cursor->offset += bytes; + /* offset of first page in pagelist is always 0 */ + if (!bytes || cursor->offset & ~PAGE_MASK) + return false; /* more bytes to process in the current page */ + + /* Move on to the next page */ + + BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); + cursor->page = list_entry_next(cursor->page, lru); + cursor->last_piece = cursor->resid <= PAGE_SIZE; + + return true; +} + +/* + * Message data is handled (sent or received) in pieces, where each + * piece resides on a single page. The network layer might not + * consume an entire piece at once. A data item's cursor keeps + * track of which piece is next to process and how much remains to + * be processed in that piece. It also tracks whether the current + * piece is the last one in the data item. 
+ */ +static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) +{ + size_t length = cursor->total_resid; + + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + ceph_msg_data_pagelist_cursor_init(cursor, length); + break; + case CEPH_MSG_DATA_PAGES: + ceph_msg_data_pages_cursor_init(cursor, length); + break; #ifdef CONFIG_BLOCK - if (msg->bio) - init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); -#endif - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = false; - con->out_more = 1; /* data + footer will follow */ + case CEPH_MSG_DATA_BIO: + ceph_msg_data_bio_cursor_init(cursor, length); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + /* BUG(); */ + break; + } + cursor->need_crc = true; +} + +static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +{ + struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data *data; + + BUG_ON(!length); + BUG_ON(length > msg->data_length); + BUG_ON(list_empty(&msg->data)); + + cursor->data_head = &msg->data; + cursor->total_resid = length; + data = list_first_entry(&msg->data, struct ceph_msg_data, links); + cursor->data = data; + + __ceph_msg_data_cursor_init(cursor); +} + +/* + * Return the page containing the next piece to process for a given + * data item, and supply the page offset and length of that piece. + * Indicate whether this is the last piece in this data item. + */ +static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) +{ + struct page *page; + + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + page = ceph_msg_data_pagelist_next(cursor, page_offset, length); + break; + case CEPH_MSG_DATA_PAGES: + page = ceph_msg_data_pages_next(cursor, page_offset, length); + break; +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: + page = ceph_msg_data_bio_next(cursor, page_offset, length); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + page = NULL; + break; + } + BUG_ON(!page); + BUG_ON(*page_offset + *length > PAGE_SIZE); + BUG_ON(!*length); + if (last_piece) + *last_piece = cursor->last_piece; + + return page; +} + +/* + * Returns true if the result moves the cursor on to the next piece + * of the data item. 
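Both the send and receive paths below reduce to the same consumption loop over these two primitives. Schematically, with consume() standing in for ceph_tcp_sendpage()/ceph_tcp_recvpage(), either of which may take less than a full piece:

/* Sketch: the canonical cursor loop shared by
 * write_partial_message_data() and read_partial_msg_data(). */
while (cursor->resid) {
	size_t page_offset, length;
	bool last_piece;
	struct page *page;
	int ret;

	page = ceph_msg_data_next(cursor, &page_offset, &length, &last_piece);
	ret = consume(con->sock, page, page_offset, length); /* hypothetical */
	if (ret <= 0)
		return ret;		/* 0: socket full; <0: error */
	ceph_msg_data_advance(cursor, (size_t)ret);
}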
+ */ +static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + bool new_piece; + + BUG_ON(bytes > cursor->resid); + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); + break; + case CEPH_MSG_DATA_PAGES: + new_piece = ceph_msg_data_pages_advance(cursor, bytes); + break; +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: + new_piece = ceph_msg_data_bio_advance(cursor, bytes); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + BUG(); + break; + } + cursor->total_resid -= bytes; + + if (!cursor->resid && cursor->total_resid) { + WARN_ON(!cursor->last_piece); + BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); + cursor->data = list_entry_next(cursor->data, links); + __ceph_msg_data_cursor_init(cursor); + new_piece = true; + } + cursor->need_crc = new_piece; + + return new_piece; +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + BUG_ON(!msg); + BUG_ON(!data_len); + + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(msg, (size_t)data_len); } /* @@ -803,16 +1214,12 @@ static void prepare_write_message(struct ceph_connection *con) m->hdr.seq = cpu_to_le64(++con->out_seq); m->needs_out_seq = false; } -#ifdef CONFIG_BLOCK - else - m->bio_iter = NULL; -#endif + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - le32_to_cpu(m->hdr.data_len), - m->nr_pages); + m->data_length); BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ @@ -843,11 +1250,13 @@ static void prepare_write_message(struct ceph_connection *con) /* is there a data payload? */ con->out_msg->footer.data_crc = 0; - if (m->hdr.data_len) - prepare_write_message_data(con); - else + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->out_more = 1; /* data + footer will follow */ + } else { /* no, queue up footer too and be done */ prepare_write_message_footer(con); + } con_flag_set(con, CON_FLAG_WRITE_PENDING); } @@ -874,6 +1283,24 @@ static void prepare_write_ack(struct ceph_connection *con) } /* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + +/* * Prepare to write keepalive byte. */ static void prepare_write_keepalive(struct ceph_connection *con) @@ -1022,35 +1449,19 @@ out: return ret; /* done! 
*/ } -static void out_msg_pos_next(struct ceph_connection *con, struct page *page, - size_t len, size_t sent, bool in_trail) +static u32 ceph_crc32c_page(u32 crc, struct page *page, + unsigned int page_offset, + unsigned int length) { - struct ceph_msg *msg = con->out_msg; + char *kaddr; - BUG_ON(!msg); - BUG_ON(!sent); - - con->out_msg_pos.data_pos += sent; - con->out_msg_pos.page_pos += sent; - if (sent < len) - return; + kaddr = kmap(page); + BUG_ON(kaddr == NULL); + crc = crc32c(crc, kaddr + page_offset, length); + kunmap(page); - BUG_ON(sent != len); - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = false; - if (in_trail) - list_move_tail(&page->lru, - &msg->trail->head); - else if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); -#ifdef CONFIG_BLOCK - else if (msg->bio) - iter_bio_next(&msg->bio_iter, &msg->bio_seg); -#endif + return crc; } - /* * Write as much message data payload as we can. If we finish, queue * up the footer. @@ -1058,21 +1469,17 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, * 0 -> socket full, but more to do * <0 -> error */ -static int write_partial_msg_pages(struct ceph_connection *con) +static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; - unsigned int data_len = le32_to_cpu(msg->hdr.data_len); - size_t len; + struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !con->msgr->nocrc; - int ret; - int total_max_write; - bool in_trail = false; - const size_t trail_len = (msg->trail ? msg->trail->length : 0); - const size_t trail_off = data_len - trail_len; + u32 crc; - dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, msg, con->out_msg_pos.page, msg->nr_pages, - con->out_msg_pos.page_pos); + dout("%s %p msg %p\n", __func__, con, msg); + + if (list_empty(&msg->data)) + return -EINVAL; /* * Iterate through each page that contains data to be @@ -1082,72 +1489,41 @@ static int write_partial_msg_pages(struct ceph_connection *con) * need to map the page. If we have no pages, they have * been revoked, so use the zero page. */ - while (data_len > con->out_msg_pos.data_pos) { - struct page *page = NULL; - int max_write = PAGE_SIZE; - int bio_offset = 0; - - in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; - if (!in_trail) - total_max_write = trail_off - con->out_msg_pos.data_pos; - - if (in_trail) { - total_max_write = data_len - con->out_msg_pos.data_pos; - - page = list_first_entry(&msg->trail->head, - struct page, lru); - } else if (msg->pages) { - page = msg->pages[con->out_msg_pos.page]; - } else if (msg->pagelist) { - page = list_first_entry(&msg->pagelist->head, - struct page, lru); -#ifdef CONFIG_BLOCK - } else if (msg->bio) { - struct bio_vec *bv; + crc = do_datacrc ? 
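The rewritten send loop (continuing below) keeps the running data CRC in a local and folds it back into the footer only when the loop stops; the need_crc flag ensures each piece is hashed once, via the ceph_crc32c_page() helper above. Schematically, with the socket I/O elided:

/* Schematic of the new CRC handling in write_partial_message_data():
 * one local accumulator, updated once per piece, written back to the
 * footer only on stall, error, or completion. */
u32 crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;

while (cursor->resid) {
	/* ... ceph_msg_data_next() + ceph_tcp_sendpage(), yielding
	 * page, page_offset, length for the piece just sent ... */
	if (do_datacrc && cursor->need_crc)
		crc = ceph_crc32c_page(crc, page, page_offset, length);
	ceph_msg_data_advance(cursor, (size_t)ret);
}
if (do_datacrc)
	msg->footer.data_crc = cpu_to_le32(crc);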
le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->resid) { + struct page *page; + size_t page_offset; + size_t length; + bool last_piece; + bool need_crc; + int ret; - bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); - page = bv->bv_page; - bio_offset = bv->bv_offset; - max_write = bv->bv_len; -#endif - } else { - page = zero_page; - } - len = min_t(int, max_write - con->out_msg_pos.page_pos, - total_max_write); - - if (do_datacrc && !con->out_msg_pos.did_page_crc) { - void *base; - u32 crc = le32_to_cpu(msg->footer.data_crc); - char *kaddr; - - kaddr = kmap(page); - BUG_ON(kaddr == NULL); - base = kaddr + con->out_msg_pos.page_pos + bio_offset; - crc = crc32c(crc, base, len); - kunmap(page); - msg->footer.data_crc = cpu_to_le32(crc); - con->out_msg_pos.did_page_crc = true; - } - ret = ceph_tcp_sendpage(con->sock, page, - con->out_msg_pos.page_pos + bio_offset, - len, 1); - if (ret <= 0) - goto out; + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, + &last_piece); + ret = ceph_tcp_sendpage(con->sock, page, page_offset, + length, last_piece); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); - out_msg_pos_next(con, page, len, (size_t) ret, in_trail); + return ret; + } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret); } - dout("write_partial_msg_pages %p msg %p done\n", con, msg); + dout("%s %p msg %p done\n", __func__, con, msg); /* prepare and queue up footer, too */ - if (!do_datacrc) + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); prepare_write_message_footer(con); - ret = 1; -out: - return ret; + + return 1; /* must return > 0 to indicate success */ } /* @@ -1160,7 +1536,7 @@ static int write_partial_skip(struct ceph_connection *con) while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); + ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); if (ret <= 0) goto out; con->out_skip -= ret; @@ -1191,6 +1567,13 @@ static void prepare_read_ack(struct ceph_connection *con) con->in_base_pos = 0; } +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_SEQ; +} + static void prepare_read_tag(struct ceph_connection *con) { dout("prepare_read_tag %p\n", con); @@ -1597,7 +1980,6 @@ static int process_connect(struct ceph_connection *con) con->error_msg = "connect authorization failure"; return -1; } - con->auth_retry = 1; con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -1668,6 +2050,7 @@ static int process_connect(struct ceph_connection *con) prepare_read_connect(con); break; + case CEPH_MSGR_TAG_SEQ: case CEPH_MSGR_TAG_READY: if (req_feat & ~server_feat) { pr_err("%s%lld %s protocol feature mismatch," @@ -1682,7 +2065,7 @@ static int process_connect(struct ceph_connection *con) WARN_ON(con->state != CON_STATE_NEGOTIATING); con->state = CON_STATE_OPEN; - + con->auth_retry = 0; /* we authenticated; clear flag */ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; con->peer_features = server_feat; @@ -1698,7 +2081,12 @@ static int process_connect(struct ceph_connection *con) con->delay = 0; /* reset backoff memory */ - prepare_read_tag(con); + if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { + 
prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } break; case CEPH_MSGR_TAG_WAIT: @@ -1732,7 +2120,6 @@ static int read_partial_ack(struct ceph_connection *con) return read_partial(con, end, size, &con->in_temp_ack); } - /* * We can finally discard anything that's been acked. */ @@ -1757,8 +2144,6 @@ static void process_ack(struct ceph_connection *con) } - - static int read_partial_message_section(struct ceph_connection *con, struct kvec *section, unsigned int sec_len, u32 *crc) @@ -1782,77 +2167,49 @@ static int read_partial_message_section(struct ceph_connection *con, return 1; } -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); - -static int read_partial_message_pages(struct ceph_connection *con, - struct page **pages, - unsigned int data_len, bool do_datacrc) +static int read_partial_msg_data(struct ceph_connection *con) { - void *p; + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + const bool do_datacrc = !con->msgr->nocrc; + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; int ret; - int left; - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); - /* (page) data */ - BUG_ON(pages == NULL); - p = kmap(pages[con->in_msg_pos.page]); - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && do_datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(pages[con->in_msg_pos.page]); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == PAGE_SIZE) { - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.page++; - } - - return ret; -} - -#ifdef CONFIG_BLOCK -static int read_partial_message_bio(struct ceph_connection *con, - struct bio **bio_iter, int *bio_seg, - unsigned int data_len, bool do_datacrc) -{ - struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); - void *p; - int ret, left; + BUG_ON(!msg); + if (list_empty(&msg->data)) + return -EIO; - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(bv->bv_len - con->in_msg_pos.page_pos)); + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->resid) { + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, + NULL); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; - p = kmap(bv->bv_page) + bv->bv_offset; + return ret; + } - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && do_datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(bv->bv_page); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == bv->bv_len) { - con->in_msg_pos.page_pos = 0; - iter_bio_next(bio_iter, bio_seg); + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret); } + if (do_datacrc) + con->in_data_crc = crc; - return ret; + return 1; /* must return > 0 to indicate success */ } -#endif /* * read (part of) a message. 
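The TAG_SEQ handshake added above lets a reconnecting peer advertise the last message sequence it saw, so already-delivered messages can be trimmed rather than resent; each side transmits that number as a bare little-endian u64 (see prepare_write_seq()/prepare_read_seq()), which is why try_read below funnels the tag through read_partial_ack(). The encoding, demonstrated standalone with the byte swaps done by hand:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t in_seq_acked = 42;
	uint8_t wire[8];
	uint64_t decoded = 0;
	int i;

	for (i = 0; i < 8; i++)			/* cpu_to_le64 */
		wire[i] = (uint8_t)(in_seq_acked >> (8 * i));
	for (i = 7; i >= 0; i--)		/* le64_to_cpu */
		decoded = (decoded << 8) | wire[i];

	printf("seq %llu round-trips as %llu\n",
	       (unsigned long long)in_seq_acked,
	       (unsigned long long)decoded);
	return 0;
}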
*/ +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); + static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; @@ -1885,7 +2242,7 @@ static int read_partial_message(struct ceph_connection *con) if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_DATA_LEN) + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) return -EIO; data_len = le32_to_cpu(con->in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) @@ -1914,14 +2271,22 @@ static int read_partial_message(struct ceph_connection *con) int skip = 0; dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - con->in_hdr.front_len, con->in_hdr.data_len); + front_len, data_len); ret = ceph_con_in_msg_alloc(con, &skip); if (ret < 0) return ret; + + BUG_ON(!con->in_msg ^ skip); + if (con->in_msg && data_len > con->in_msg->data_length) { + pr_warning("%s skipping long message (%u > %zd)\n", + __func__, data_len, con->in_msg->data_length); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + skip = 1; + } if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); - BUG_ON(con->in_msg); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; @@ -1936,17 +2301,10 @@ static int read_partial_message(struct ceph_connection *con) if (m->middle) m->middle->vec.iov_len = 0; - con->in_msg_pos.page = 0; - if (m->pages) - con->in_msg_pos.page_pos = m->page_alignment; - else - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.data_pos = 0; + /* prepare for data payload, if any */ -#ifdef CONFIG_BLOCK - if (m->bio) - init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); -#endif + if (data_len) + prepare_message_data(con->in_msg, data_len); } /* front */ @@ -1965,24 +2323,10 @@ static int read_partial_message(struct ceph_connection *con) } /* (page) data */ - while (con->in_msg_pos.data_pos < data_len) { - if (m->pages) { - ret = read_partial_message_pages(con, m->pages, - data_len, do_datacrc); - if (ret <= 0) - return ret; -#ifdef CONFIG_BLOCK - } else if (m->bio) { - BUG_ON(!m->bio_iter); - ret = read_partial_message_bio(con, - &m->bio_iter, &m->bio_seg, - data_len, do_datacrc); - if (ret <= 0) - return ret; -#endif - } else { - BUG_ON(1); - } + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; } /* footer */ @@ -2108,13 +2452,13 @@ more_kvec: goto do_next; } - ret = write_partial_msg_pages(con); + ret = write_partial_message_data(con); if (ret == 1) goto more_kvec; /* we need to send the footer, too! 
*/ if (ret == 0) goto out; if (ret < 0) { - dout("try_write write_partial_msg_pages err %d\n", + dout("try_write write_partial_message_data err %d\n", ret); goto out; } @@ -2266,7 +2610,12 @@ more: prepare_read_tag(con); goto more; } - if (con->in_tag == CEPH_MSGR_TAG_ACK) { + if (con->in_tag == CEPH_MSGR_TAG_ACK || + con->in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ ret = read_partial_ack(con); if (ret <= 0) goto out; @@ -2672,6 +3021,88 @@ void ceph_con_keepalive(struct ceph_connection *con) } EXPORT_SYMBOL(ceph_con_keepalive); +static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) +{ + struct ceph_msg_data *data; + + if (WARN_ON(!ceph_msg_data_type_valid(type))) + return NULL; + + data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); + if (data) + data->type = type; + INIT_LIST_HEAD(&data->links); + + return data; +} + +static void ceph_msg_data_destroy(struct ceph_msg_data *data) +{ + if (!data) + return; + + WARN_ON(!list_empty(&data->links)); + if (data->type == CEPH_MSG_DATA_PAGELIST) { + ceph_pagelist_release(data->pagelist); + kfree(data->pagelist); + } + kmem_cache_free(ceph_msg_data_cache, data); +} + +void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, + size_t length, size_t alignment) +{ + struct ceph_msg_data *data; + + BUG_ON(!pages); + BUG_ON(!length); + + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); + BUG_ON(!data); + data->pages = pages; + data->length = length; + data->alignment = alignment & ~PAGE_MASK; + + list_add_tail(&data->links, &msg->data); + msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pages); + +void ceph_msg_data_add_pagelist(struct ceph_msg *msg, + struct ceph_pagelist *pagelist) +{ + struct ceph_msg_data *data; + + BUG_ON(!pagelist); + BUG_ON(!pagelist->length); + + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); + BUG_ON(!data); + data->pagelist = pagelist; + + list_add_tail(&data->links, &msg->data); + msg->data_length += pagelist->length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pagelist); + +#ifdef CONFIG_BLOCK +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, + size_t length) +{ + struct ceph_msg_data *data; + + BUG_ON(!bio); + + data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); + BUG_ON(!data); + data->bio = bio; + data->bio_length = length; + + list_add_tail(&data->links, &msg->data); + msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_bio); +#endif /* CONFIG_BLOCK */ /* * construct a new message with given type, size @@ -2682,49 +3113,20 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, { struct ceph_msg *m; - m = kmalloc(sizeof(*m), flags); + m = kmem_cache_zalloc(ceph_msg_cache, flags); if (m == NULL) goto out; - kref_init(&m->kref); - m->con = NULL; - INIT_LIST_HEAD(&m->list_head); - - m->hdr.tid = 0; m->hdr.type = cpu_to_le16(type); m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); - m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); - m->hdr.middle_len = 0; - m->hdr.data_len = 0; - m->hdr.data_off = 0; - m->hdr.reserved = 0; - m->footer.front_crc = 0; - m->footer.middle_crc = 0; - m->footer.data_crc = 0; - m->footer.flags = 0; - m->front_max = front_len; - m->front_is_vmalloc = false; - m->more_to_follow = false; - m->ack_stamp = 0; - m->pool = NULL; - - /* middle */ - m->middle = NULL; - /* data */ - m->nr_pages = 0; - m->page_alignment = 0; - m->pages = NULL; - m->pagelist = NULL; -#ifdef CONFIG_BLOCK - m->bio = NULL; - m->bio_iter = 
NULL; - m->bio_seg = 0; -#endif /* CONFIG_BLOCK */ - m->trail = NULL; + INIT_LIST_HEAD(&m->list_head); + kref_init(&m->kref); + INIT_LIST_HEAD(&m->data); /* front */ + m->front_max = front_len; if (front_len) { if (front_len > PAGE_CACHE_SIZE) { m->front.iov_base = __vmalloc(front_len, flags, @@ -2802,49 +3204,37 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) { struct ceph_msg_header *hdr = &con->in_hdr; - int type = le16_to_cpu(hdr->type); - int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); + struct ceph_msg *msg; int ret = 0; BUG_ON(con->in_msg != NULL); + BUG_ON(!con->ops->alloc_msg); - if (con->ops->alloc_msg) { - struct ceph_msg *msg; - - mutex_unlock(&con->mutex); - msg = con->ops->alloc_msg(con, hdr, skip); - mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { - if (msg) - ceph_msg_put(msg); - return -EAGAIN; - } - con->in_msg = msg; - if (con->in_msg) { - con->in_msg->con = con->ops->get(con); - BUG_ON(con->in_msg->con == NULL); - } - if (*skip) { - con->in_msg = NULL; - return 0; - } - if (!con->in_msg) { - con->error_msg = - "error allocating memory for incoming message"; - return -ENOMEM; - } + mutex_unlock(&con->mutex); + msg = con->ops->alloc_msg(con, hdr, skip); + mutex_lock(&con->mutex); + if (con->state != CON_STATE_OPEN) { + if (msg) + ceph_msg_put(msg); + return -EAGAIN; } - if (!con->in_msg) { - con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); - if (!con->in_msg) { - pr_err("unable to allocate msg type %d len %d\n", - type, front_len); - return -ENOMEM; - } + if (msg) { + BUG_ON(*skip); + con->in_msg = msg; con->in_msg->con = con->ops->get(con); BUG_ON(con->in_msg->con == NULL); - con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); + } else { + /* + * Null message pointer means either we should skip + * this message or we couldn't allocate memory. The + * former is not an error. 
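ceph_con_in_msg_alloc() now requires an alloc_msg hook and disambiguates a NULL return exactly as the comment above spells out: NULL with *skip set is a deliberate drop, NULL without it is ENOMEM. A sketch of a conforming hook (the hook name and the message-type filter are hypothetical; only the contract is taken from the code above):

static struct ceph_msg *example_alloc_msg(struct ceph_connection *con,
					  struct ceph_msg_header *hdr,
					  int *skip)
{
	int type = le16_to_cpu(hdr->type);
	int front_len = le32_to_cpu(hdr->front_len);

	if (type != CEPH_MSG_OSD_OPREPLY) {
		*skip = 1;
		return NULL;	/* NULL + *skip: skip, not an error */
	}
	*skip = 0;
	return ceph_msg_new(type, front_len, GFP_NOFS, false);
				/* NULL here is treated as ENOMEM */
}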
+ */ + if (*skip) + return 0; + con->error_msg = "error allocating memory for incoming message"; + + return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); @@ -2870,7 +3260,7 @@ void ceph_msg_kfree(struct ceph_msg *m) vfree(m->front.iov_base); else kfree(m->front.iov_base); - kfree(m); + kmem_cache_free(ceph_msg_cache, m); } /* @@ -2879,6 +3269,9 @@ void ceph_msg_kfree(struct ceph_msg *m) void ceph_msg_last_put(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); + LIST_HEAD(data); + struct list_head *links; + struct list_head *next; dout("ceph_msg_put last one on %p\n", m); WARN_ON(!list_empty(&m->list_head)); @@ -2888,16 +3281,16 @@ void ceph_msg_last_put(struct kref *kref) ceph_buffer_put(m->middle); m->middle = NULL; } - m->nr_pages = 0; - m->pages = NULL; - if (m->pagelist) { - ceph_pagelist_release(m->pagelist); - kfree(m->pagelist); - m->pagelist = NULL; - } + list_splice_init(&m->data, &data); + list_for_each_safe(links, next, &data) { + struct ceph_msg_data *data; - m->trail = NULL; + data = list_entry(links, struct ceph_msg_data, links); + list_del_init(links); + ceph_msg_data_destroy(data); + } + m->data_length = 0; if (m->pool) ceph_msgpool_put(m->pool, m); @@ -2908,8 +3301,8 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, - msg->front_max, msg->nr_pages); + pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, + msg->front_max, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index aef5b1062bee..1fe25cd29d0e 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -737,7 +737,7 @@ static void delayed_work(struct work_struct *work) __validate_auth(monc); - if (monc->auth->ops->is_authenticated(monc->auth)) + if (ceph_auth_is_authenticated(monc->auth)) __send_subscribe(monc); } __schedule_delayed(monc); @@ -892,8 +892,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); had_debugfs_info = have_debugfs_info(monc); - if (monc->auth->ops) - was_auth = monc->auth->ops->is_authenticated(monc->auth); + was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, @@ -904,7 +903,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, wake_up_all(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); - } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { + } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d730dd4d8eb2..a3395fdfbd4f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1,3 +1,4 @@ + #include <linux/ceph/ceph_debug.h> #include <linux/module.h> @@ -21,6 +22,8 @@ #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 +static struct kmem_cache *ceph_osd_request_cache; + static const struct ceph_connection_operations osd_con_ops; static void __send_queued(struct ceph_osd_client *osdc); @@ -32,12 +35,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, static void __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); -static int op_has_extent(int op) -{ 
- return (op == CEPH_OSD_OP_READ || - op == CEPH_OSD_OP_WRITE); -} - /* * Implement client access to distributed object storage cluster. * @@ -63,53 +60,238 @@ static int op_has_extent(int op) * * fill osd op in request message. */ -static int calc_layout(struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) +static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, + u64 *objnum, u64 *objoff, u64 *objlen) { u64 orig_len = *plen; - u64 bno = 0; - u64 objoff = 0; - u64 objlen = 0; int r; /* object extent? */ - r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno, - &objoff, &objlen); + r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum, + objoff, objlen); if (r < 0) return r; - if (objlen < orig_len) { - *plen = objlen; + if (*objlen < orig_len) { + *plen = *objlen; dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); } - if (op_has_extent(op->op)) { - u32 osize = le32_to_cpu(layout->fl_object_size); - op->extent.offset = objoff; - op->extent.length = objlen; - if (op->extent.truncate_size <= off - objoff) { - op->extent.truncate_size = 0; - } else { - op->extent.truncate_size -= off - objoff; - if (op->extent.truncate_size > osize) - op->extent.truncate_size = osize; - } + dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); + + return 0; +} + +static void ceph_osd_data_init(struct ceph_osd_data *osd_data) +{ + memset(osd_data, 0, sizeof (*osd_data)); + osd_data->type = CEPH_OSD_DATA_TYPE_NONE; +} + +static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, + struct page **pages, u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->length = length; + osd_data->alignment = alignment; + osd_data->pages_from_pool = pages_from_pool; + osd_data->own_pages = own_pages; +} + +static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, + struct ceph_pagelist *pagelist) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; + osd_data->pagelist = pagelist; +} + +#ifdef CONFIG_BLOCK +static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, + struct bio *bio, size_t bio_length) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_BIO; + osd_data->bio = bio; + osd_data->bio_length = bio_length; +} +#endif /* CONFIG_BLOCK */ + +#define osd_req_op_data(oreq, whch, typ, fld) \ + ({ \ + BUG_ON(whch >= (oreq)->r_num_ops); \ + &(oreq)->r_ops[whch].typ.fld; \ + }) + +static struct ceph_osd_data * +osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + return &osd_req->r_ops[which].raw_data_in; +} + +struct ceph_osd_data * +osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + return osd_req_op_data(osd_req, which, extent, osd_data); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data); + +struct ceph_osd_data * +osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + return osd_req_op_data(osd_req, which, cls, response_data); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? 
*/ + +void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, + u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_raw_data_in(osd_req, which); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_raw_data_in_pages); + +void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, + u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); + +void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); + +#ifdef CONFIG_BLOCK +void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, + unsigned int which, struct bio *bio, size_t bio_length) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_bio_init(osd_data, bio, bio_length); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); +#endif /* CONFIG_BLOCK */ + +static void osd_req_op_cls_request_info_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_info); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} + +void osd_req_op_cls_request_data_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); + +void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); + +void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, response_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data_pages); + +static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) +{ + switch (osd_data->type) { + case CEPH_OSD_DATA_TYPE_NONE: + return 0; + case CEPH_OSD_DATA_TYPE_PAGES: + return osd_data->length; + case CEPH_OSD_DATA_TYPE_PAGELIST: + return (u64)osd_data->pagelist->length; +#ifdef CONFIG_BLOCK + case CEPH_OSD_DATA_TYPE_BIO: + return (u64)osd_data->bio_length; +#endif /* CONFIG_BLOCK */ + default: + WARN(true, "unrecognized data 
type %d\n", (int)osd_data->type); + return 0; } - req->r_num_pages = calc_pages_for(off, *plen); - req->r_page_alignment = off & ~PAGE_MASK; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = *plen; +} - dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", - bno, objoff, objlen, req->r_num_pages); +static void ceph_osd_data_release(struct ceph_osd_data *osd_data) +{ + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) { + int num_pages; - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); - req->r_oid_len = strlen(req->r_oid); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + ceph_release_page_vector(osd_data->pages, num_pages); + } + ceph_osd_data_init(osd_data); +} + +static void osd_req_op_data_release(struct ceph_osd_request *osd_req, + unsigned int which) +{ + struct ceph_osd_req_op *op; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; - return r; + switch (op->op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + ceph_osd_data_release(&op->extent.osd_data); + break; + case CEPH_OSD_OP_CALL: + ceph_osd_data_release(&op->cls.request_info); + ceph_osd_data_release(&op->cls.request_data); + ceph_osd_data_release(&op->cls.response_data); + break; + default: + break; + } } /* @@ -117,30 +299,26 @@ static int calc_layout(struct ceph_vino vino, */ void ceph_osdc_release_request(struct kref *kref) { - struct ceph_osd_request *req = container_of(kref, - struct ceph_osd_request, - r_kref); + struct ceph_osd_request *req; + unsigned int which; + req = container_of(kref, struct ceph_osd_request, r_kref); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_con_filling_msg) { - dout("%s revoking msg %p from con %p\n", __func__, - req->r_reply, req->r_con_filling_msg); + if (req->r_reply) { ceph_msg_revoke_incoming(req->r_reply); - req->r_con_filling_msg->ops->put(req->r_con_filling_msg); - req->r_con_filling_msg = NULL; - } - if (req->r_reply) ceph_msg_put(req->r_reply); - if (req->r_own_pages) - ceph_release_page_vector(req->r_pages, - req->r_num_pages); + } + + for (which = 0; which < req->r_num_ops; which++) + osd_req_op_data_release(req, which); + ceph_put_snap_context(req->r_snapc); - ceph_pagelist_release(&req->r_trail); if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else - kfree(req); + kmem_cache_free(ceph_osd_request_cache, req); + } EXPORT_SYMBOL(ceph_osdc_release_request); @@ -154,6 +332,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_msg *msg; size_t msg_size; + BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); + BUG_ON(num_ops > CEPH_OSD_MAX_OP); + msg_size = 4 + 4 + 8 + 8 + 4+8; msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pg_t */ @@ -168,13 +349,14 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req = mempool_alloc(osdc->req_mempool, gfp_flags); memset(req, 0, sizeof(*req)); } else { - req = kzalloc(sizeof(*req), gfp_flags); + req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); } if (req == NULL) return NULL; req->r_osdc = osdc; req->r_mempool = use_mempool; + req->r_num_ops = num_ops; kref_init(&req->r_kref); init_completion(&req->r_completion); @@ -198,8 +380,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; - ceph_pagelist_init(&req->r_trail); - /* create request message; allow space for oid */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); @@ -218,60 +398,24 @@ struct 
ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); -static void osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, - struct ceph_osd_req_op *src) +static bool osd_req_opcode_valid(u16 opcode) { - dst->op = cpu_to_le16(src->op); - - switch (src->op) { - case CEPH_OSD_OP_STAT: - break; + switch (opcode) { case CEPH_OSD_OP_READ: - case CEPH_OSD_OP_WRITE: - dst->extent.offset = - cpu_to_le64(src->extent.offset); - dst->extent.length = - cpu_to_le64(src->extent.length); - dst->extent.truncate_size = - cpu_to_le64(src->extent.truncate_size); - dst->extent.truncate_seq = - cpu_to_le32(src->extent.truncate_seq); - break; - case CEPH_OSD_OP_CALL: - dst->cls.class_len = src->cls.class_len; - dst->cls.method_len = src->cls.method_len; - dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); - - ceph_pagelist_append(&req->r_trail, src->cls.class_name, - src->cls.class_len); - ceph_pagelist_append(&req->r_trail, src->cls.method_name, - src->cls.method_len); - ceph_pagelist_append(&req->r_trail, src->cls.indata, - src->cls.indata_len); - break; - case CEPH_OSD_OP_STARTSYNC: - break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - dst->watch.cookie = cpu_to_le64(src->watch.cookie); - dst->watch.ver = cpu_to_le64(src->watch.ver); - dst->watch.flag = src->watch.flag; - break; - default: - pr_err("unrecognized osd opcode %d\n", dst->op); - WARN_ON(1); - break; + case CEPH_OSD_OP_STAT: case CEPH_OSD_OP_MAPEXT: case CEPH_OSD_OP_MASKTRUNC: case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_NOTIFY: + case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_ASSERT_VER: + case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_TRUNCATE: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_DELETE: case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_STARTSYNC: case CEPH_OSD_OP_SETTRUNC: case CEPH_OSD_OP_TRIMTRUNC: case CEPH_OSD_OP_TMAPUP: @@ -279,11 +423,11 @@ static void osd_req_encode_op(struct ceph_osd_request *req, case CEPH_OSD_OP_TMAPGET: case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_ROLLBACK: + case CEPH_OSD_OP_WATCH: case CEPH_OSD_OP_OMAPGETKEYS: case CEPH_OSD_OP_OMAPGETVALS: case CEPH_OSD_OP_OMAPGETHEADER: case CEPH_OSD_OP_OMAPGETVALSBYKEYS: - case CEPH_OSD_OP_MODE_RD: case CEPH_OSD_OP_OMAPSETVALS: case CEPH_OSD_OP_OMAPSETHEADER: case CEPH_OSD_OP_OMAPCLEAR: @@ -314,113 +458,233 @@ static void osd_req_encode_op(struct ceph_osd_request *req, case CEPH_OSD_OP_RDUNLOCK: case CEPH_OSD_OP_UPLOCK: case CEPH_OSD_OP_DNLOCK: + case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_PGLS: case CEPH_OSD_OP_PGLS_FILTER: - pr_err("unsupported osd opcode %s\n", - ceph_osd_op_name(dst->op)); - WARN_ON(1); - break; + return true; + default: + return false; } - dst->payload_len = cpu_to_le32(src->payload_len); } /* - * build new request AND message - * + * This is an osd op init function for opcodes that have no data or + * other information associated with them. It also serves as a + * common init routine for all the other init functions, below. 
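+ * + * As a sketch (hypothetical caller; the names are borrowed from + * ceph_osdc_new_request() below), a two-op request would combine + * these helpers like so: + * + * osd_req_op_extent_init(req, 0, CEPH_OSD_OP_WRITE, objoff, objlen, + * truncate_size, truncate_seq); + * osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);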
*/ -void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, unsigned int num_ops, - struct ceph_osd_req_op *src_ops, - struct ceph_snap_context *snapc, u64 snap_id, - struct timespec *mtime) +static struct ceph_osd_req_op * +_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode) { - struct ceph_msg *msg = req->r_request; - struct ceph_osd_req_op *src_op; - void *p; - size_t msg_size; - int flags = req->r_flags; - u64 data_len; - int i; + struct ceph_osd_req_op *op; - req->r_num_ops = num_ops; - req->r_snapid = snap_id; - req->r_snapc = ceph_get_snap_context(snapc); + BUG_ON(which >= osd_req->r_num_ops); + BUG_ON(!osd_req_opcode_valid(opcode)); - /* encode request */ - msg->hdr.version = cpu_to_le16(4); + op = &osd_req->r_ops[which]; + memset(op, 0, sizeof (*op)); + op->op = opcode; - p = msg->front.iov_base; - ceph_encode_32(&p, 1); /* client_inc is always 1 */ - req->r_request_osdmap_epoch = p; - p += 4; - req->r_request_flags = p; - p += 4; - if (req->r_flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(p, mtime); - p += sizeof(struct ceph_timespec); - req->r_request_reassert_version = p; - p += sizeof(struct ceph_eversion); /* will get filled in */ + return op; +} - /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); - req->r_request_pool = p; - p += 8; - ceph_encode_32(&p, -1); /* preferred */ - ceph_encode_32(&p, 0); /* key len */ +void osd_req_op_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode) +{ + (void)_osd_req_op_init(osd_req, which, opcode); +} +EXPORT_SYMBOL(osd_req_op_init); - ceph_encode_8(&p, 1); - req->r_request_pgid = p; - p += 8 + 4; - ceph_encode_32(&p, -1); /* preferred */ +void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + size_t payload_len = 0; - /* oid */ - ceph_encode_32(&p, req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); - p += req->r_oid_len; + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); - /* ops */ - ceph_encode_16(&p, num_ops); - src_op = src_ops; - req->r_request_ops = p; - for (i = 0; i < num_ops; i++, src_op++) { - osd_req_encode_op(req, p, src_op); - p += sizeof(struct ceph_osd_op); - } + op->extent.offset = offset; + op->extent.length = length; + op->extent.truncate_size = truncate_size; + op->extent.truncate_seq = truncate_seq; + if (opcode == CEPH_OSD_OP_WRITE) + payload_len += length; - /* snaps */ - ceph_encode_64(&p, req->r_snapid); - ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); - ceph_encode_32(&p, req->r_snapc ? 
req->r_snapc->num_snaps : 0); - if (req->r_snapc) { - for (i = 0; i < snapc->num_snaps; i++) { - ceph_encode_64(&p, req->r_snapc->snaps[i]); - } + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_extent_init); + +void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length) +{ + struct ceph_osd_req_op *op; + u64 previous; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; + previous = op->extent.length; + + if (length == previous) + return; /* Nothing to do */ + BUG_ON(length > previous); + + op->extent.length = length; + op->payload_len -= previous - length; +} +EXPORT_SYMBOL(osd_req_op_extent_update); + +void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode, const char *class, const char *method) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_pagelist *pagelist; + size_t payload_len = 0; + size_t size; + + BUG_ON(opcode != CEPH_OSD_OP_CALL); + + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + BUG_ON(!pagelist); + ceph_pagelist_init(pagelist); + + op->cls.class_name = class; + size = strlen(class); + BUG_ON(size > (size_t) U8_MAX); + op->cls.class_len = size; + ceph_pagelist_append(pagelist, class, size); + payload_len += size; + + op->cls.method_name = method; + size = strlen(method); + BUG_ON(size > (size_t) U8_MAX); + op->cls.method_len = size; + ceph_pagelist_append(pagelist, method, size); + payload_len += size; + + osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); + + op->cls.argc = 0; /* currently unused */ + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_cls_init); + +void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 cookie, u64 version, int flag) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + + BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + + op->watch.cookie = cookie; + op->watch.ver = version; + if (opcode == CEPH_OSD_OP_WATCH && flag) + op->watch.flag = (u8)1; +} +EXPORT_SYMBOL(osd_req_op_watch_init); + +static void ceph_osdc_msg_data_add(struct ceph_msg *msg, + struct ceph_osd_data *osd_data) +{ + u64 length = ceph_osd_data_length(osd_data); + + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + BUG_ON(length > (u64) SIZE_MAX); + if (length) + ceph_msg_data_add_pages(msg, osd_data->pages, + length, osd_data->alignment); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { + BUG_ON(!length); + ceph_msg_data_add_pagelist(msg, osd_data->pagelist); +#ifdef CONFIG_BLOCK + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { + ceph_msg_data_add_bio(msg, osd_data->bio, length); +#endif + } else { + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } +} - req->r_request_attempts = p; - p += 4; +static u64 osd_req_encode_op(struct ceph_osd_request *req, + struct ceph_osd_op *dst, unsigned int which) +{ + struct ceph_osd_req_op *src; + struct ceph_osd_data *osd_data; + u64 request_data_len = 0; + u64 data_length; - data_len = req->r_trail.length; - if (flags & CEPH_OSD_FLAG_WRITE) { - req->r_request->hdr.data_off = cpu_to_le16(off); - data_len += len; + BUG_ON(which >= req->r_num_ops); + src = &req->r_ops[which]; + if (WARN_ON(!osd_req_opcode_valid(src->op))) { + pr_err("unrecognized osd opcode %d\n", src->op); + + return 0; } - req->r_request->hdr.data_len = cpu_to_le32(data_len); - req->r_request->page_alignment = req->r_page_alignment; - BUG_ON(p > msg->front.iov_base + 
msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); + switch (src->op) { + case CEPH_OSD_OP_STAT: + osd_data = &src->raw_data_in; + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + if (src->op == CEPH_OSD_OP_WRITE) + request_data_len = src->extent.length; + dst->extent.offset = cpu_to_le64(src->extent.offset); + dst->extent.length = cpu_to_le64(src->extent.length); + dst->extent.truncate_size = + cpu_to_le64(src->extent.truncate_size); + dst->extent.truncate_seq = + cpu_to_le32(src->extent.truncate_seq); + osd_data = &src->extent.osd_data; + if (src->op == CEPH_OSD_OP_WRITE) + ceph_osdc_msg_data_add(req->r_request, osd_data); + else + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; + case CEPH_OSD_OP_CALL: + dst->cls.class_len = src->cls.class_len; + dst->cls.method_len = src->cls.method_len; + osd_data = &src->cls.request_info; + ceph_osdc_msg_data_add(req->r_request, osd_data); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); + request_data_len = osd_data->pagelist->length; + + osd_data = &src->cls.request_data; + data_length = ceph_osd_data_length(osd_data); + if (data_length) { + BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); + dst->cls.indata_len = cpu_to_le32(data_length); + ceph_osdc_msg_data_add(req->r_request, osd_data); + src->payload_len += data_length; + request_data_len += data_length; + } + osd_data = &src->cls.response_data; + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; + case CEPH_OSD_OP_STARTSYNC: + break; + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + dst->watch.cookie = cpu_to_le64(src->watch.cookie); + dst->watch.ver = cpu_to_le64(src->watch.ver); + dst->watch.flag = src->watch.flag; + break; + default: + pr_err("unsupported osd opcode %s\n", + ceph_osd_op_name(src->op)); + WARN_ON(1); - dout("build_request msg_size was %d num_ops %d\n", (int)msg_size, - num_ops); - return; + return 0; + } + dst->op = cpu_to_le16(src->op); + dst->payload_len = cpu_to_le32(src->payload_len); + + return request_data_len; } -EXPORT_SYMBOL(ceph_osdc_build_request); /* * build new request AND message, calculate layout, and adjust file @@ -436,51 +700,63 @@ EXPORT_SYMBOL(ceph_osdc_build_request); struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 off, u64 *plen, + u64 off, u64 *plen, int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, - int do_sync, u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, - bool use_mempool, - int page_align) + bool use_mempool) { - struct ceph_osd_req_op ops[2]; struct ceph_osd_request *req; - unsigned int num_op = 1; + u64 objnum = 0; + u64 objoff = 0; + u64 objlen = 0; + u32 object_size; + u64 object_base; int r; - memset(&ops, 0, sizeof ops); - - ops[0].op = opcode; - ops[0].extent.truncate_seq = truncate_seq; - ops[0].extent.truncate_size = truncate_size; - - if (do_sync) { - ops[1].op = CEPH_OSD_OP_STARTSYNC; - num_op++; - } + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); - req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, + req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); + req->r_flags = flags; /* calculate max write size */ - r = calc_layout(vino, layout, off, plen, req, ops); - if (r < 0) + r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); + 
if (r < 0) { + ceph_osdc_put_request(req); return ERR_PTR(r); - req->r_file_layout = *layout; /* keep a copy */ + } - /* in case it differs from natural (file) alignment that - calc_layout filled in for us */ - req->r_num_pages = calc_pages_for(page_align, *plen); - req->r_page_alignment = page_align; + object_size = le32_to_cpu(layout->fl_object_size); + object_base = off - objoff; + if (truncate_size <= object_base) { + truncate_size = 0; + } else { + truncate_size -= object_base; + if (truncate_size > object_size) + truncate_size = object_size; + } + + osd_req_op_extent_init(req, 0, opcode, objoff, objlen, + truncate_size, truncate_seq); + + /* + * A second op in the ops array means the caller wants to + * also issue a 'startsync' command so that the + * osd will flush data quickly. + */ + if (num_ops > 1) + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + + req->r_file_layout = *layout; /* keep a copy */ - ceph_osdc_build_request(req, off, *plen, num_op, ops, - snapc, vino.snap, mtime); + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", + vino.ino, objnum); + req->r_oid_len = strlen(req->r_oid); return req; } @@ -558,21 +834,46 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd) { struct ceph_osd_request *req, *nreq; + LIST_HEAD(resend); int err; dout("__kick_osd_requests osd%d\n", osd->o_osd); err = __reset_osd(osdc, osd); if (err) return; - + /* + * Build up a list of requests to resend by traversing the + * osd's list of requests. Requests for a given object are + * sent in tid order, and that is also the order they're + * kept on this list. Therefore all requests that are in + * flight will be found first, followed by all requests that + * have not yet been sent. And to resend requests while + * preserving this order we will want to put any sent + * requests back on the front of the osd client's unsent + * list. + * + * So we build a separate ordered list of already-sent + * requests for the affected osd and splice it onto the + * front of the osd client's unsent list. Once we've seen a + * request that has not yet been sent we're done. Those + * requests are already sitting right where they belong. + */ list_for_each_entry(req, &osd->o_requests, r_osd_item) { - list_move(&req->r_req_lru_item, &osdc->req_unsent); - dout("requeued %p tid %llu osd%d\n", req, req->r_tid, + if (!req->r_sent) + break; + list_move_tail(&req->r_req_lru_item, &resend); + dout("requeueing %p tid %llu osd%d\n", req, req->r_tid, osd->o_osd); if (!req->r_linger) req->r_flags |= CEPH_OSD_FLAG_RETRY; } + list_splice(&resend, &osdc->req_unsent); + /* + * Linger requests are re-registered before sending, which + * sets up a new tid for each. We add them to the unsent + * list at the end to keep things in tid order.
+ */ list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, r_linger_osd) { /* @@ -581,8 +882,8 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, */ BUG_ON(!list_empty(&req->r_req_lru_item)); __register_request(osdc, req); - list_add(&req->r_req_lru_item, &osdc->req_unsent); - list_add(&req->r_osd_item, &req->r_osd->o_requests); + list_add_tail(&req->r_req_lru_item, &osdc->req_unsent); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); __unregister_linger_request(osdc, req); dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, osd->o_osd); @@ -654,8 +955,7 @@ static void put_osd(struct ceph_osd *osd) if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } } @@ -820,14 +1120,6 @@ static void __register_request(struct ceph_osd_client *osdc, } } -static void register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - mutex_lock(&osdc->request_mutex); - __register_request(osdc, req); - mutex_unlock(&osdc->request_mutex); -} - /* * called under osdc->request_mutex */ @@ -952,8 +1244,8 @@ static int __map_request(struct ceph_osd_client *osdc, int err; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_object_layout(&pgid, req->r_oid, - &req->r_file_layout, osdc->osdmap); + err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, + ceph_file_layout_pg_pool(req->r_file_layout)); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; } @@ -1007,10 +1299,10 @@ static int __map_request(struct ceph_osd_client *osdc, if (req->r_osd) { __remove_osd_from_lru(req->r_osd); - list_add(&req->r_osd_item, &req->r_osd->o_requests); - list_move(&req->r_req_lru_item, &osdc->req_unsent); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); + list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); } else { - list_move(&req->r_req_lru_item, &osdc->req_notarget); + list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); } err = 1; /* osd or pg changed */ @@ -1045,8 +1337,14 @@ static void __send_request(struct ceph_osd_client *osdc, list_move_tail(&req->r_req_lru_item, &osdc->req_lru); ceph_msg_get(req->r_request); /* send consumes a ref */ - ceph_con_send(&req->r_osd->o_con, req->r_request); + + /* Mark the request unsafe if this is the first time it's being sent. */ + + if (!req->r_sent && req->r_unsafe_callback) + req->r_unsafe_callback(req, true); req->r_sent = req->r_osd->o_incarnation; + + ceph_con_send(&req->r_osd->o_con, req->r_request); } /* @@ -1134,31 +1432,11 @@ static void handle_osds_timeout(struct work_struct *work) static void complete_request(struct ceph_osd_request *req) { - if (req->r_safe_callback) - req->r_safe_callback(req, NULL); + if (req->r_unsafe_callback) + req->r_unsafe_callback(req, false); complete_all(&req->r_safe_completion); /* fsync waiter */ } -static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid) -{ - __u8 v; - - ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad); - v = ceph_decode_8(p); - if (v > 1) { - pr_warning("do not understand pg encoding %d > 1", v); - return -EINVAL; - } - pgid->pool = ceph_decode_64(p); - pgid->seed = ceph_decode_32(p); - *p += 4; - return 0; - -bad: - pr_warning("incomplete pg encoding"); - return -EINVAL; -} - /* * handle osd op reply.
either call the callback if it is specified, * or do the completion to wake up the waiting thread. @@ -1170,7 +1448,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, struct ceph_osd_request *req; u64 tid; int object_len; - int numops, payload_len, flags; + unsigned int numops; + int payload_len, flags; s32 result; s32 retry_attempt; struct ceph_pg pg; @@ -1178,7 +1457,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, u32 reassert_epoch; u64 reassert_version; u32 osdmap_epoch; - int i; + int already_completed; + u32 bytes; + unsigned int i; tid = le64_to_cpu(msg->hdr.tid); dout("handle_reply %p tid %llu\n", msg, tid); @@ -1191,7 +1472,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, ceph_decode_need(&p, end, object_len, bad); p += object_len; - err = __decode_pgid(&p, end, &pg); + err = ceph_decode_pgid(&p, end, &pg); if (err) goto bad; @@ -1207,8 +1488,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, req = __lookup_request(osdc, tid); if (req == NULL) { dout("handle_reply tid %llu dne\n", tid); - mutex_unlock(&osdc->request_mutex); - return; + goto bad_mutex; } ceph_osdc_get_request(req); @@ -1233,9 +1513,10 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, payload_len += len; p += sizeof(*op); } - if (payload_len != le32_to_cpu(msg->hdr.data_len)) { + bytes = le32_to_cpu(msg->hdr.data_len); + if (payload_len != bytes) { pr_warning("sum of op payload lens %d != data_len %d", - payload_len, le32_to_cpu(msg->hdr.data_len)); + payload_len, bytes); goto bad_put; } @@ -1244,21 +1525,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, for (i = 0; i < numops; i++) req->r_reply_op_result[i] = ceph_decode_32(&p); - /* - * if this connection filled our message, drop our reference now, to - * avoid a (safe but slower) revoke later. 
- */ - if (req->r_con_filling_msg == con && req->r_reply == msg) { - dout(" dropping con_filling_msg ref %p\n", con); - req->r_con_filling_msg = NULL; - con->ops->put(con); - } - if (!req->r_got_reply) { - unsigned int bytes; req->r_result = result; - bytes = le32_to_cpu(msg->hdr.data_len); dout("handle_reply result %d bytes %d\n", req->r_result, bytes); if (req->r_result == 0) @@ -1286,7 +1555,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, ((flags & CEPH_OSD_FLAG_WRITE) == 0)) __unregister_request(osdc, req); + already_completed = req->r_completed; + req->r_completed = 1; mutex_unlock(&osdc->request_mutex); + if (already_completed) + goto done; if (req->r_callback) req->r_callback(req, msg); @@ -1303,6 +1576,8 @@ done: bad_put: ceph_osdc_put_request(req); +bad_mutex: + mutex_unlock(&osdc->request_mutex); bad: pr_err("corrupt osd_op_reply got %d %d\n", (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); @@ -1736,6 +2011,104 @@ bad: } /* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, + struct ceph_snap_context *snapc, u64 snap_id, + struct timespec *mtime) +{ + struct ceph_msg *msg = req->r_request; + void *p; + size_t msg_size; + int flags = req->r_flags; + u64 data_len; + unsigned int i; + + req->r_snapid = snap_id; + req->r_snapc = ceph_get_snap_context(snapc); + + /* encode request */ + msg->hdr.version = cpu_to_le16(4); + + p = msg->front.iov_base; + ceph_encode_32(&p, 1); /* client_inc is always 1 */ + req->r_request_osdmap_epoch = p; + p += 4; + req->r_request_flags = p; + p += 4; + if (req->r_flags & CEPH_OSD_FLAG_WRITE) + ceph_encode_timespec(p, mtime); + p += sizeof(struct ceph_timespec); + req->r_request_reassert_version = p; + p += sizeof(struct ceph_eversion); /* will get filled in */ + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + req->r_request_pool = p; + p += 8; + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + ceph_encode_8(&p, 1); + req->r_request_pgid = p; + p += 8 + 4; + ceph_encode_32(&p, -1); /* preferred */ + + /* oid */ + ceph_encode_32(&p, req->r_oid_len); + memcpy(p, req->r_oid, req->r_oid_len); + dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); + p += req->r_oid_len; + + /* ops--can imply data */ + ceph_encode_16(&p, (u16)req->r_num_ops); + data_len = 0; + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(req, p, i); + p += sizeof(struct ceph_osd_op); + } + + /* snaps */ + ceph_encode_64(&p, req->r_snapid); + ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); + ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); + if (req->r_snapc) { + for (i = 0; i < snapc->num_snaps; i++) { + ceph_encode_64(&p, req->r_snapc->snaps[i]); + } + } + + req->r_request_attempts = p; + p += 4; + + /* data */ + if (flags & CEPH_OSD_FLAG_WRITE) { + u16 data_off; + + /* + * The header "data_off" is a hint to the receiver + * allowing it to align received data into its + * buffers such that there's no need to re-copy + * it before writing it to disk (direct I/O). 
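+ * + * For example (sketch): a write beginning at file offset 0x12010 + * carries a data_off hint of 0x2010, since only the low 16 bits + * survive the cast below.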
+ */ + data_off = (u16) (off & 0xffff); + req->r_request->hdr.data_off = cpu_to_le16(data_off); + } + req->r_request->hdr.data_len = cpu_to_le32(data_len); + + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); + msg_size = p - msg->front.iov_base; + msg->front.iov_len = msg_size; + msg->hdr.front_len = cpu_to_le32(msg_size); + + dout("build_request msg_size was %d\n", (int)msg_size); +} +EXPORT_SYMBOL(ceph_osdc_build_request); + +/* * Register request, send initial attempt. */ int ceph_osdc_start_request(struct ceph_osd_client *osdc, @@ -1744,41 +2117,26 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, { int rc = 0; - req->r_request->pages = req->r_pages; - req->r_request->nr_pages = req->r_num_pages; -#ifdef CONFIG_BLOCK - req->r_request->bio = req->r_bio; -#endif - req->r_request->trail = &req->r_trail; - - register_request(osdc, req); - down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - /* - * a racing kick_requests() may have sent the message for us - * while we dropped request_mutex above, so only send now if - * the request still han't been touched yet. - */ - if (req->r_sent == 0) { - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } - goto out_unlock; - } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } else { - __send_request(osdc, req); + __register_request(osdc, req); + WARN_ON(req->r_sent); + rc = __map_request(osdc, req, 0); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; } - rc = 0; + goto out_unlock; } - + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + __send_queued(osdc); + } + rc = 0; out_unlock: mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); @@ -1940,18 +2298,22 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, truncate_seq, truncate_size, NULL, - false, page_align); + NULL, truncate_seq, truncate_size, + false); if (IS_ERR(req)) return PTR_ERR(req); /* it may be a short read due to an object boundary */ - req->r_pages = pages; - dout("readpages final extent is %llu~%llu (%d pages align %d)\n", - off, *plen, req->r_num_pages, page_align); + osd_req_op_extent_osd_data_pages(req, 0, + pages, *plen, page_align, false, false); + + dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", + off, *plen, *plen, page_align); + + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -1978,20 +2340,21 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, int rc = 0; int page_align = off & ~PAGE_MASK; - BUG_ON(vino.snap != CEPH_NOSNAP); - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, + BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - snapc, 0, - truncate_seq, truncate_size, mtime, - true, page_align); + snapc, truncate_seq, truncate_size, + true); if (IS_ERR(req)) 
return PTR_ERR(req); /* it may be a short write due to an object boundary */ - req->r_pages = pages; - dout("writepages %llu~%llu (%d pages)\n", off, len, - req->r_num_pages); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, false); + dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); + + ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); rc = ceph_osdc_start_request(osdc, req, true); if (!rc) @@ -2005,6 +2368,26 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, } EXPORT_SYMBOL(ceph_osdc_writepages); +int ceph_osdc_setup(void) +{ + BUG_ON(ceph_osd_request_cache); + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", + sizeof (struct ceph_osd_request), + __alignof__(struct ceph_osd_request), + 0, NULL); + + return ceph_osd_request_cache ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(ceph_osdc_setup); + +void ceph_osdc_cleanup(void) +{ + BUG_ON(!ceph_osd_request_cache); + kmem_cache_destroy(ceph_osd_request_cache); + ceph_osd_request_cache = NULL; +} +EXPORT_SYMBOL(ceph_osdc_cleanup); + /* * handle incoming message */ @@ -2064,13 +2447,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, goto out; } - if (req->r_con_filling_msg) { + if (req->r_reply->con) dout("%s revoking msg %p from old con %p\n", __func__, - req->r_reply, req->r_con_filling_msg); - ceph_msg_revoke_incoming(req->r_reply); - req->r_con_filling_msg->ops->put(req->r_con_filling_msg); - req->r_con_filling_msg = NULL; - } + req->r_reply, req->r_reply->con); + ceph_msg_revoke_incoming(req->r_reply); if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", @@ -2084,26 +2464,29 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - int want = calc_pages_for(req->r_page_alignment, data_len); - - if (req->r_pages && unlikely(req->r_num_pages < want)) { - pr_warning("tid %lld reply has %d bytes %d pages, we" - " had only %d pages ready\n", tid, data_len, - want, req->r_num_pages); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; + struct ceph_osd_data *osd_data; + + /* + * XXX This is assuming there is only one op containing + * XXX page data. Probably OK for reads, but this + * XXX ought to be done more generally. 
+ */ + osd_data = osd_req_op_extent_osd_data(req, 0); + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + if (osd_data->pages && + unlikely(osd_data->length < data_len)) { + + pr_warning("tid %lld reply has %d bytes " + "we had only %llu bytes ready\n", + tid, data_len, osd_data->length); + *skip = 1; + ceph_msg_put(m); + m = NULL; + goto out; + } } - m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; - m->page_alignment = req->r_page_alignment; -#ifdef CONFIG_BLOCK - m->bio = req->r_bio; -#endif } *skip = 0; - req->r_con_filling_msg = con->ops->get(con); dout("get_reply tid %lld %p\n", tid, m); out: @@ -2168,13 +2551,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &o->o_auth; if (force_new && auth->authorizer) { - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(ac, auth->authorizer); auth->authorizer = NULL; } - if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { - int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); if (ret) return ERR_PTR(ret); } @@ -2190,11 +2577,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - /* - * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, - * XXX which do we do: succeed or fail? - */ - return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -2203,9 +2586,7 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - if (ac->ops && ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); - + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); return ceph_monc_validate_auth(&osdc->client->monc); } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4543b9aba40c..603ddd92db19 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -654,24 +654,6 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } -static int __decode_pgid(void **p, void *end, struct ceph_pg *pg) -{ - u8 v; - - ceph_decode_need(p, end, 1+8+4+4, bad); - v = ceph_decode_8(p); - if (v != 1) - goto bad; - pg->pool = ceph_decode_64(p); - pg->seed = ceph_decode_32(p); - *p += 4; /* skip preferred */ - return 0; - -bad: - dout("error decoding pgid\n"); - return -EINVAL; -} - /* * decode a full map. */ @@ -765,7 +747,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) struct ceph_pg pgid; struct ceph_pg_mapping *pg; - err = __decode_pgid(p, end, &pgid); + err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; ceph_decode_need(p, end, sizeof(u32), bad); @@ -983,7 +965,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_pg pgid; u32 pglen; - err = __decode_pgid(p, end, &pgid); + err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; ceph_decode_need(p, end, sizeof(u32), bad); @@ -1111,27 +1093,22 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); * calculate an object layout (i.e. 
pgid) from an oid, * file_layout, and osdmap */ -int ceph_calc_object_layout(struct ceph_pg *pg, - const char *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap) +int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, + struct ceph_osdmap *osdmap, uint64_t pool) { - unsigned int num, num_mask; - struct ceph_pg_pool_info *pool; + struct ceph_pg_pool_info *pool_info; BUG_ON(!osdmap); - pg->pool = le32_to_cpu(fl->fl_pg_pool); - pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool); - if (!pool) + pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); + if (!pool_info) return -EIO; - pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid)); - num = pool->pg_num; - num_mask = pool->pg_num_mask; + pg->pool = pool; + pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); - dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed); + dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); return 0; } -EXPORT_SYMBOL(ceph_calc_object_layout); +EXPORT_SYMBOL(ceph_calc_ceph_pg); /* * Calculate raw osd vector for the given pgid. Return pointer to osd diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c new file mode 100644 index 000000000000..154683f5f14c --- /dev/null +++ b/net/ceph/snapshot.c @@ -0,0 +1,78 @@ +/* + * snapshot.c Ceph snapshot context utility routines (part of libceph) + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <stddef.h> + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/ceph/libceph.h> + +/* + * Ceph snapshot contexts are reference counted objects, and the + * returned structure holds a single reference. Acquire additional + * references with ceph_get_snap_context(), and release them with + * ceph_put_snap_context(). When the reference count reaches zero + * the entire structure is freed. + */ + +/* + * Create a new ceph snapshot context large enough to hold the + * indicated number of snapshot ids (which can be 0). Caller has + * to fill in snapc->seq and snapc->snaps[0..snap_count-1]. + * + * Returns a null pointer if an error occurs. 
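+ * + * A minimal usage sketch (hypothetical caller; the snapshot ids are + * invented and error handling is trimmed): + * + * snapc = ceph_create_snap_context(2, GFP_KERNEL); + * if (!snapc) + * return -ENOMEM; + * snapc->seq = 103; + * snapc->snaps[0] = 103; (snapshot ids are held newest first) + * snapc->snaps[1] = 100; + * (use snapc, e.g. pass it to ceph_osdc_new_request()) + * ceph_put_snap_context(snapc);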
+ */ +struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, + gfp_t gfp_flags) +{ + struct ceph_snap_context *snapc; + size_t size; + + size = sizeof (struct ceph_snap_context); + size += snap_count * sizeof (snapc->snaps[0]); + snapc = kzalloc(size, gfp_flags); + if (!snapc) + return NULL; + + atomic_set(&snapc->nref, 1); + snapc->num_snaps = snap_count; + + return snapc; +} +EXPORT_SYMBOL(ceph_create_snap_context); + +struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) +{ + if (sc) + atomic_inc(&sc->nref); + return sc; +} +EXPORT_SYMBOL(ceph_get_snap_context); + +void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} +EXPORT_SYMBOL(ceph_put_snap_context);
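As a closing illustration of the reworked data-path API these patches convert callers to, here is a minimal sketch of a one-op synchronous read, modeled on ceph_osdc_readpages() above. It is a sketch only: osdc, layout, vino, pages, off, plen and the truncation values are assumed to be set up by a hypothetical caller, ceph_osdc_wait_request() is the existing synchronous wait helper, and error handling is abbreviated.

	struct ceph_osd_request *req;
	u64 len = *plen;
	int ret;

	/* one extent op; a read carries no snapshot context */
	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, truncate_seq, truncate_size, false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* attach the destination pages to op 0 ... */
	osd_req_op_extent_osd_data_pages(req, 0, pages, len,
					 off & ~PAGE_MASK, false, false);

	/* ... then encode the request message */
	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);

	ret = ceph_osdc_start_request(osdc, req, false);
	if (!ret)
		ret = ceph_osdc_wait_request(osdc, req);
	ceph_osdc_put_request(req);
	return ret;

The design point this makes visible: pages no longer ride on the request itself (the old r_pages/r_num_pages fields); data is attached to a specific op, and ceph_osdc_build_request() is now invoked by the caller once the ops are fully described.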