From d24cdcd3e40a6825135498e11c20c7976b9bf545 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Jan 2017 12:06:09 +0100 Subject: libceph: use BUG() instead of BUG_ON(1) I ran into this compile warning, which is the result of BUG_ON(1) not always leading to the compiler treating the code path as unreachable: include/linux/ceph/osdmap.h: In function 'ceph_can_shift_osds': include/linux/ceph/osdmap.h:62:1: error: control reaches end of non-void function [-Werror=return-type] Using BUG() here avoids the warning. Signed-off-by: Arnd Bergmann Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 9a9041784dcf..412906609954 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) case CEPH_POOL_TYPE_EC: return false; default: - BUG_ON(1); + BUG(); } } -- cgit v1.2.3 From 66a0e2d579dbec5c676cfe446234ffebb267c564 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 31 Jan 2017 15:55:06 +0100 Subject: crush: remove mutable part of CRUSH map Then add it to the working state. It would be very nice if we didn't have to take a lock to calculate a crush placement. By moving the permutation array into the working data, we can treat the CRUSH map as immutable. Reflects ceph.git commit cbcd039651c0569551cb90d26ce27e1432671f2a. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 1 + include/linux/crush/crush.h | 41 +++++++-- include/linux/crush/mapper.h | 4 +- net/ceph/crush/crush.c | 5 -- net/ceph/crush/mapper.c | 206 +++++++++++++++++++++++++++++-------------- net/ceph/osdmap.c | 47 ++++++++-- 6 files changed, 218 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 412906609954..cef1cab789b9 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -175,6 +175,7 @@ struct ceph_osdmap { struct mutex crush_scratch_mutex; int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; + void *crush_workspace; }; static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index be8f12b8f195..fbecbd089d75 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -135,13 +135,6 @@ struct crush_bucket { __u32 size; /* num items */ __s32 *items; - /* - * cached random permutation: used for uniform bucket and for - * the linear search fallback for the other bucket types. - */ - __u32 perm_x; /* @x for which *perm is defined */ - __u32 perm_n; /* num elements of *perm that are permuted/defined */ - __u32 *perm; }; struct crush_bucket_uniform { @@ -211,6 +204,21 @@ struct crush_map { * device fails. */ __u8 chooseleaf_stable; + /* + * This value is calculated after decode or construction by + * the builder. It is exposed here (rather than having a + * 'build CRUSH working space' function) so that callers can + * reserve a static buffer, allocate space on the stack, or + * otherwise avoid calling into the heap allocator if they + * want to. The size of the working space depends on the map, + * while the size of the scratch vector passed to the mapper + * depends on the size of the desired result set. + * + * Nothing stops the caller from allocating both in one swell + * foop and passing in two points, though. + */ + size_t working_size; + #ifndef __KERNEL__ /* * version 0 (original) of straw_calc has various flaws. version 1 @@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i) return ((i+1) << 1)-1; } +/* + * These data structures are private to the CRUSH implementation. They + * are exposed in this header file because builder needs their + * definitions to calculate the total working size. + * + * Moving this out of the crush map allow us to treat the CRUSH map as + * immutable within the mapper and removes the requirement for a CRUSH + * map lock. + */ +struct crush_work_bucket { + __u32 perm_x; /* @x for which *perm is defined */ + __u32 perm_n; /* num elements of *perm that are permuted/defined */ + __u32 *perm; /* Permutation of the bucket's items */ +}; + +struct crush_work { + struct crush_work_bucket **work; /* Per-bucket working store */ +}; + #endif diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index 5dfd5b1125d2..3303c7fd8a31 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -15,6 +15,8 @@ extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weights, int weight_max, - int *scratch); + void *cwin, int *scratch); + +void crush_init_workspace(const struct crush_map *map, void *v); #endif diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 80d7c3a97cb8..5bf94c04f645 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) { kfree(b->item_weights); kfree(b->sum_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b->node_weights); kfree(b); @@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) { kfree(b->straws); kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) { kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 130ab407c5ec..9e75be5ec716 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size return -1; } - /* * bucket choose methods * @@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size * Since this is expensive, we optimize for the r=0 case, which * captures the vast majority of calls. */ -static int bucket_perm_choose(struct crush_bucket *bucket, +static int bucket_perm_choose(const struct crush_bucket *bucket, + struct crush_work_bucket *work, int x, int r) { unsigned int pr = r % bucket->size; unsigned int i, s; /* start a new permutation if @x has changed */ - if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { + if (work->perm_x != (__u32)x || work->perm_n == 0) { dprintk("bucket %d new x=%d\n", bucket->id, x); - bucket->perm_x = x; + work->perm_x = x; /* optimize common r=0 case */ if (pr == 0) { s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % bucket->size; - bucket->perm[0] = s; - bucket->perm_n = 0xffff; /* magic value, see below */ + work->perm[0] = s; + work->perm_n = 0xffff; /* magic value, see below */ goto out; } for (i = 0; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm_n = 0; - } else if (bucket->perm_n == 0xffff) { + work->perm[i] = i; + work->perm_n = 0; + } else if (work->perm_n == 0xffff) { /* clean up after the r=0 case above */ for (i = 1; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm[bucket->perm[0]] = 0; - bucket->perm_n = 1; + work->perm[i] = i; + work->perm[work->perm[0]] = 0; + work->perm_n = 1; } /* calculate permutation up to pr */ - for (i = 0; i < bucket->perm_n; i++) + for (i = 0; i < work->perm_n; i++) dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); - while (bucket->perm_n <= pr) { - unsigned int p = bucket->perm_n; + while (work->perm_n <= pr) { + unsigned int p = work->perm_n; /* no point in swapping the final entry */ if (p < bucket->size - 1) { i = crush_hash32_3(bucket->hash, x, bucket->id, p) % (bucket->size - p); if (i) { - unsigned int t = bucket->perm[p + i]; - bucket->perm[p + i] = bucket->perm[p]; - bucket->perm[p] = t; + unsigned int t = work->perm[p + i]; + work->perm[p + i] = work->perm[p]; + work->perm[p] = t; } dprintk(" perm_choose swap %d with %d\n", p, p+i); } - bucket->perm_n++; + work->perm_n++; } for (i = 0; i < bucket->size; i++) dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); - s = bucket->perm[pr]; + s = work->perm[pr]; out: dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, bucket->size, x, r, pr, s); @@ -132,14 +132,14 @@ out: } /* uniform */ -static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, - int x, int r) +static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket, + struct crush_work_bucket *work, int x, int r) { - return bucket_perm_choose(&bucket->h, x, r); + return bucket_perm_choose(&bucket->h, work, x, r); } /* list */ -static int bucket_list_choose(struct crush_bucket_list *bucket, +static int bucket_list_choose(const struct crush_bucket_list *bucket, int x, int r) { int i; @@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, w *= bucket->sum_weights[i]; w = w >> 16; /*dprintk(" scaled %llx\n", w);*/ - if (w < bucket->item_weights[i]) + if (w < bucket->item_weights[i]) { return bucket->h.items[i]; + } } dprintk("bad list sums for bucket %d\n", bucket->h.id); @@ -192,7 +193,7 @@ static int terminal(int x) return x & 1; } -static int bucket_tree_choose(struct crush_bucket_tree *bucket, +static int bucket_tree_choose(const struct crush_bucket_tree *bucket, int x, int r) { int n; @@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, /* straw */ -static int bucket_straw_choose(struct crush_bucket_straw *bucket, +static int bucket_straw_choose(const struct crush_bucket_straw *bucket, int x, int r) { __u32 i; @@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin) * */ -static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, +static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, int x, int r) { unsigned int i, high = 0; @@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, high_draw = draw; } } + return bucket->h.items[high]; } -static int crush_bucket_choose(struct crush_bucket *in, int x, int r) +static int crush_bucket_choose(const struct crush_bucket *in, + struct crush_work_bucket *work, + int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); BUG_ON(in->size == 0); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: - return bucket_uniform_choose((struct crush_bucket_uniform *)in, - x, r); + return bucket_uniform_choose( + (const struct crush_bucket_uniform *)in, + work, x, r); case CRUSH_BUCKET_LIST: - return bucket_list_choose((struct crush_bucket_list *)in, + return bucket_list_choose((const struct crush_bucket_list *)in, x, r); case CRUSH_BUCKET_TREE: - return bucket_tree_choose((struct crush_bucket_tree *)in, + return bucket_tree_choose((const struct crush_bucket_tree *)in, x, r); case CRUSH_BUCKET_STRAW: - return bucket_straw_choose((struct crush_bucket_straw *)in, - x, r); + return bucket_straw_choose( + (const struct crush_bucket_straw *)in, + x, r); case CRUSH_BUCKET_STRAW2: - return bucket_straw2_choose((struct crush_bucket_straw2 *)in, - x, r); + return bucket_straw2_choose( + (const struct crush_bucket_straw2 *)in, + x, r); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } - /* * true if device is marked "out" (failed, fully offloaded) * of the cluster @@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map, * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, @@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map, int rep; unsigned int ftotal, flocal; int retry_descent, retry_bucket, skip_rep; - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int r; int i; int item = 0; @@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map, if (local_fallback_retries > 0 && flocal >= (in->size>>1) && flocal > local_fallback_retries) - item = bucket_perm_choose(in, x, r); + item = bucket_perm_choose( + in, work->work[-1-in->id], + x, r); else - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); skip_rep = 1; @@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map, sub_r = r >> (vary_r-1); else sub_r = 0; - if (crush_choose_firstn(map, - map->buckets[-1-item], - weight, weight_max, - x, stable ? 1 : outpos+1, 0, - out2, outpos, count, - recurse_tries, 0, - local_retries, - local_fallback_retries, - 0, - vary_r, - stable, - NULL, - sub_r) <= outpos) + if (crush_choose_firstn( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, stable ? 1 : outpos+1, 0, + out2, outpos, count, + recurse_tries, 0, + local_retries, + local_fallback_retries, + 0, + vary_r, + stable, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -600,7 +613,8 @@ reject: * */ static void crush_choose_indep(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int left, int numrep, int type, int *out, int outpos, @@ -610,7 +624,7 @@ static void crush_choose_indep(const struct crush_map *map, int *out2, int parent_r) { - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int endpos = outpos + left; int rep; unsigned int ftotal; @@ -678,7 +692,9 @@ static void crush_choose_indep(const struct crush_map *map, break; } - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); out[rep] = CRUSH_ITEM_NONE; @@ -724,13 +740,15 @@ static void crush_choose_indep(const struct crush_map *map, if (recurse_to_leaf) { if (item < 0) { - crush_choose_indep(map, - map->buckets[-1-item], - weight, weight_max, - x, 1, numrep, 0, - out2, rep, - recurse_tries, 0, - 0, NULL, r); + crush_choose_indep( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_tries, 0, + 0, NULL, r); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -781,6 +799,53 @@ static void crush_choose_indep(const struct crush_map *map, #endif } + +/* + * This takes a chunk of memory and sets it up to be a shiny new + * working area for a CRUSH placement computation. It must be called + * on any newly allocated memory before passing it in to + * crush_do_rule. It may be used repeatedly after that, so long as the + * map has not changed. If the map /has/ changed, you must make sure + * the working size is no smaller than what was allocated and re-run + * crush_init_workspace. + * + * If you do retain the working space between calls to crush, make it + * thread-local. + */ +void crush_init_workspace(const struct crush_map *map, void *v) +{ + struct crush_work *w = v; + __s32 b; + + /* + * We work by moving through the available space and setting + * values and pointers as we go. + * + * It's a bit like Forth's use of the 'allot' word since we + * set the pointer first and then reserve the space for it to + * point to by incrementing the point. + */ + v += sizeof(struct crush_work *); + w->work = v; + v += map->max_buckets * sizeof(struct crush_work_bucket *); + for (b = 0; b < map->max_buckets; ++b) { + if (!map->buckets[b]) + continue; + + w->work[b] = v; + switch (map->buckets[b]->alg) { + default: + v += sizeof(struct crush_work_bucket); + break; + } + w->work[b]->perm_x = 0; + w->work[b]->perm_n = 0; + w->work[b]->perm = v; + v += map->buckets[b]->size * sizeof(__u32); + } + BUG_ON(v - (void *)w != map->working_size); +} + /** * crush_do_rule - calculate a mapping with the given input and rule * @map: the crush_map @@ -790,14 +855,16 @@ static void crush_choose_indep(const struct crush_map *map, * @result_max: maximum result size * @weight: weight vector (for map leaves) * @weight_max: size of weight vector + * @cwin: pointer to at least map->working_size bytes of memory * @scratch: scratch vector for private use; must be >= 3 * result_max */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - int *scratch) + void *cwin, int *scratch) { int result_len; + struct crush_work *cw = cwin; int *a = scratch; int *b = scratch + result_max; int *c = scratch + result_max*2; @@ -807,7 +874,7 @@ int crush_do_rule(const struct crush_map *map, int *o; int osize; int *tmp; - struct crush_rule *rule; + const struct crush_rule *rule; __u32 step; int i, j; int numrep; @@ -840,7 +907,7 @@ int crush_do_rule(const struct crush_map *map, for (step = 0; step < rule->len; step++) { int firstn = 0; - struct crush_rule_step *curstep = &rule->steps[step]; + const struct crush_rule_step *curstep = &rule->steps[step]; switch (curstep->op) { case CRUSH_RULE_TAKE: @@ -936,6 +1003,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, + cw, map->buckets[bno], weight, weight_max, x, numrep, @@ -956,6 +1024,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, + cw, map->buckets[bno], weight, weight_max, x, out_size, numrep, @@ -997,5 +1066,6 @@ int crush_do_rule(const struct crush_map *map, break; } } + return result_len; } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 47df075b31e5..3892e7fa7747 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -153,6 +153,32 @@ bad: return -EINVAL; } +static void crush_finalize(struct crush_map *c) +{ + __s32 b; + + /* Space for the array of pointers to per-bucket workspace */ + c->working_size = sizeof(struct crush_work) + + c->max_buckets * sizeof(struct crush_work_bucket *); + + for (b = 0; b < c->max_buckets; b++) { + if (!c->buckets[b]) + continue; + + switch (c->buckets[b]->alg) { + default: + /* + * The base case, permutation variables and + * the pointer to the permutation array. + */ + c->working_size += sizeof(struct crush_work_bucket); + break; + } + /* Every bucket has a permutation array. */ + c->working_size += c->buckets[b]->size * sizeof(__u32); + } +} + static struct crush_map *crush_decode(void *pbyval, void *end) { struct crush_map *c; @@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); if (b->items == NULL) goto badmem; - b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); - if (b->perm == NULL) - goto badmem; - b->perm_n = 0; ceph_decode_need(p, end, b->size*sizeof(u32), bad); for (j = 0; j < b->size; j++) @@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable chooseleaf_stable = %d\n", c->chooseleaf_stable); + crush_finalize(c); + done: dout("crush_decode success\n"); return c; @@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kfree(map->osd_weight); kfree(map->osd_addr); kfree(map->osd_primary_affinity); + kfree(map->crush_workspace); kfree(map); } @@ -810,12 +835,23 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) { + void *workspace; + if (IS_ERR(crush)) return PTR_ERR(crush); + workspace = kmalloc(crush->working_size, GFP_NOIO); + if (!workspace) { + crush_destroy(crush); + return -ENOMEM; + } + crush_init_workspace(crush, workspace); + if (map->crush) crush_destroy(map->crush); + kfree(map->crush_workspace); map->crush = crush; + map->crush_workspace = workspace; return 0; } @@ -1940,7 +1976,8 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, mutex_lock(&map->crush_scratch_mutex); r = crush_do_rule(map->crush, ruleno, x, result, result_max, - weight, weight_max, map->crush_scratch_ary); + weight, weight_max, map->crush_workspace, + map->crush_scratch_ary); mutex_unlock(&map->crush_scratch_mutex); return r; -- cgit v1.2.3 From 743efcffffc6620ab44ea9ec67c7e4e28dfa7742 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 31 Jan 2017 15:55:06 +0100 Subject: crush: merge working data and scratch Much like Arlo Guthrie, I decided that one big pile is better than two little piles. Reflects ceph.git commit 95c2df6c7e0b22d2ea9d91db500cf8b9441c73ba. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 3 +-- include/linux/crush/mapper.h | 14 +++++++++++++- net/ceph/crush/mapper.c | 17 +++++++---------- net/ceph/osdmap.c | 14 ++++++++------ 4 files changed, 29 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index cef1cab789b9..8cebdc4158c3 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -173,8 +173,7 @@ struct ceph_osdmap { * the list of osds that store+replicate them. */ struct crush_map *crush; - struct mutex crush_scratch_mutex; - int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; + struct mutex crush_workspace_mutex; void *crush_workspace; }; diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index 3303c7fd8a31..c95e19e1ff11 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -15,7 +15,19 @@ extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weights, int weight_max, - void *cwin, int *scratch); + void *cwin); + +/* + * Returns the exact amount of workspace that will need to be used + * for a given combination of crush_map and result_max. The caller can + * then allocate this much on its own, either on the stack, in a + * per-thread long-lived buffer, or however it likes. + */ +static inline size_t crush_work_size(const struct crush_map *map, + int result_max) +{ + return map->working_size + result_max * 3 * sizeof(__u32); +} void crush_init_workspace(const struct crush_map *map, void *v); diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 9e75be5ec716..2e31217ccae3 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -855,23 +855,22 @@ void crush_init_workspace(const struct crush_map *map, void *v) * @result_max: maximum result size * @weight: weight vector (for map leaves) * @weight_max: size of weight vector - * @cwin: pointer to at least map->working_size bytes of memory - * @scratch: scratch vector for private use; must be >= 3 * result_max + * @cwin: pointer to at least crush_work_size() bytes of memory */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - void *cwin, int *scratch) + void *cwin) { int result_len; struct crush_work *cw = cwin; - int *a = scratch; - int *b = scratch + result_max; - int *c = scratch + result_max*2; + int *a = cwin + map->working_size; + int *b = a + result_max; + int *c = b + result_max; + int *w = a; + int *o = b; int recurse_to_leaf; - int *w; int wsize = 0; - int *o; int osize; int *tmp; const struct crush_rule *rule; @@ -902,8 +901,6 @@ int crush_do_rule(const struct crush_map *map, rule = map->rules[ruleno]; result_len = 0; - w = a; - o = b; for (step = 0; step < rule->len; step++) { int firstn = 0; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 3892e7fa7747..2374956c4d40 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -743,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) map->pool_max = -1; map->pg_temp = RB_ROOT; map->primary_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); + mutex_init(&map->crush_workspace_mutex); return map; } @@ -836,11 +836,14 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) { void *workspace; + size_t work_size; if (IS_ERR(crush)) return PTR_ERR(crush); - workspace = kmalloc(crush->working_size, GFP_NOIO); + work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); + dout("%s work_size %zu bytes\n", __func__, work_size); + workspace = kmalloc(work_size, GFP_NOIO); if (!workspace) { crush_destroy(crush); return -ENOMEM; @@ -1974,11 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, BUG_ON(result_max > CEPH_PG_MAX_SIZE); - mutex_lock(&map->crush_scratch_mutex); + mutex_lock(&map->crush_workspace_mutex); r = crush_do_rule(map->crush, ruleno, x, result, result_max, - weight, weight_max, map->crush_workspace, - map->crush_scratch_ary); - mutex_unlock(&map->crush_scratch_mutex); + weight, weight_max, map->crush_workspace); + mutex_unlock(&map->crush_workspace_mutex); return r; } -- cgit v1.2.3 From 083a51fbc57ca848ab087692f3cc97898fd88b54 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Feb 2017 16:14:52 +0100 Subject: libceph: bump CEPH_PG_MAX_SIZE to 32 ... to accommodate potentially very wide EC pools. This increases the size of a typical rbd ceph_osd_request by ~12% (from 1040 to 1168 bytes), but I'd rather go future proof here. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/rados.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 5c0da61cb763..5d0018782d50 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -50,7 +50,7 @@ struct ceph_timespec { #define CEPH_PG_LAYOUT_LINEAR 2 #define CEPH_PG_LAYOUT_HYBRID 3 -#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ +#define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */ /* * placement group. -- cgit v1.2.3 From 6c696d8560e74cd42458931375875d62ae88c6ae Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 25 Jan 2017 18:16:23 +0100 Subject: rbd: kill obj_request->object_name and rbd_segment_name_cache Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 79 ++++----------------------------------------- include/linux/ceph/osdmap.h | 7 ---- 2 files changed, 7 insertions(+), 79 deletions(-) (limited to 'include') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6c094b580eae..b6e269b4d534 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -231,7 +231,6 @@ enum obj_req_flags { }; struct rbd_obj_request { - const char *object_name; u64 object_no; u64 offset; /* object start byte */ u64 length; /* bytes from offset */ @@ -440,7 +439,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; -static struct kmem_cache *rbd_segment_name_cache; static int rbd_major; static DEFINE_IDA(rbd_dev_id_ida); @@ -1249,37 +1247,6 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.features = 0; } -static void rbd_segment_name_free(const char *name) -{ - /* The explicit cast here is needed to drop the const qualifier */ - - kmem_cache_free(rbd_segment_name_cache, (void *)name); -} - -static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) -{ - const char *name_format = rbd_dev->image_format == 1 ? - RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; - char *name; - u64 segment; - int ret; - - name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); - if (!name) - return NULL; - segment = offset >> rbd_dev->header.obj_order; - ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, - rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { - pr_err("error formatting segment name for #%llu (%d)\n", - segment, ret); - rbd_segment_name_free(name); - name = NULL; - } - - return name; -} - static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) { u64 segment_size = rbd_obj_bytes(&rbd_dev->header); @@ -2050,29 +2017,17 @@ static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) ceph_osdc_put_request(osd_req); } -/* object_name is assumed to be a non-null pointer and NUL-terminated */ - -static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, - enum obj_request_type type) +static struct rbd_obj_request * +rbd_obj_request_create(enum obj_request_type type) { struct rbd_obj_request *obj_request; - size_t size; - char *name; rbd_assert(obj_request_type_valid(type)); - size = strlen(object_name) + 1; - name = kmalloc(size, GFP_NOIO); - if (!name) - return NULL; - obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); - if (!obj_request) { - kfree(name); + if (!obj_request) return NULL; - } - obj_request->object_name = memcpy(name, object_name, size); obj_request->which = BAD_WHICH; obj_request->type = type; INIT_LIST_HEAD(&obj_request->links); @@ -2114,8 +2069,6 @@ static void rbd_obj_request_destroy(struct kref *kref) break; } - kfree(obj_request->object_name); - obj_request->object_name = NULL; kmem_cache_free(rbd_obj_request_cache, obj_request); } @@ -2490,17 +2443,11 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, while (resid) { struct ceph_osd_request *osd_req; - const char *object_name; u64 object_no = img_offset >> rbd_dev->header.obj_order; u64 offset = rbd_segment_offset(rbd_dev, img_offset); u64 length = rbd_segment_length(rbd_dev, img_offset, resid); - object_name = rbd_segment_name(rbd_dev, img_offset); - if (!object_name) - goto out_unwind; - obj_request = rbd_obj_request_create(object_name, type); - /* object request has its own copy of the object name */ - rbd_segment_name_free(object_name); + obj_request = rbd_obj_request_create(type); if (!obj_request) goto out_unwind; @@ -2846,8 +2793,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) size_t size; int ret; - stat_request = rbd_obj_request_create(obj_request->object_name, - OBJ_REQUEST_PAGES); + stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES); if (!stat_request) return -ENOMEM; @@ -6389,27 +6335,16 @@ static int rbd_slab_init(void) if (!rbd_obj_request_cache) goto out_err; - rbd_assert(!rbd_segment_name_cache); - rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", - CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); - if (rbd_segment_name_cache) - return 0; -out_err: - kmem_cache_destroy(rbd_obj_request_cache); - rbd_obj_request_cache = NULL; + return 0; +out_err: kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; - return -ENOMEM; } static void rbd_slab_exit(void) { - rbd_assert(rbd_segment_name_cache); - kmem_cache_destroy(rbd_segment_name_cache); - rbd_segment_name_cache = NULL; - rbd_assert(rbd_obj_request_cache); kmem_cache_destroy(rbd_obj_request_cache); rbd_obj_request_cache = NULL; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 8cebdc4158c3..938656f70807 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -81,13 +81,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest, const struct ceph_object_locator *src); void ceph_oloc_destroy(struct ceph_object_locator *oloc); -/* - * Maximum supported by kernel client object name length - * - * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100) - */ -#define CEPH_MAX_OID_NAME_LEN 100 - /* * 51-char inline_name is long enough for all cephfs and all but one * rbd requests: in ".rbd"/"rbd_id." can be -- cgit v1.2.3 From b18b9550e4059ceea0393c518eb323b95243f92f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 11 Feb 2017 18:46:08 +0100 Subject: libceph: get rid of ack vs commit - CEPH_OSD_FLAG_ACK shouldn't be set anymore, so assert on it - remove support for handling ack replies (OSDs will send ack replies only if clients request them) - drop the "do lingering callbacks under osd->lock" logic from handle_reply() -- lreq->lock is sufficient in all three cases Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 6 +-- net/ceph/osd_client.c | 113 +++++++++------------------------------- 2 files changed, 27 insertions(+), 92 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 03a6653d329a..2ea0c282f3dc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -22,7 +22,6 @@ struct ceph_osd_client; * completion callback for async writepages */ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); -typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); #define CEPH_HOMELESS_OSD -1 @@ -170,15 +169,12 @@ struct ceph_osd_request { unsigned int r_num_ops; int r_result; - bool r_got_reply; struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; - struct completion r_completion; - struct completion r_done_completion; /* fsync waiter */ + struct completion r_completion; /* private to osd_client.c */ ceph_osdc_callback_t r_callback; - ceph_osdc_unsafe_callback_t r_unsafe_callback; struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ac4753421d0c..e1c6c2b4a295 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req) kref_init(&req->r_kref); init_completion(&req->r_completion); - init_completion(&req->r_done_completion); RB_CLEAR_NODE(&req->r_node); RB_CLEAR_NODE(&req->r_mc_node); INIT_LIST_HEAD(&req->r_unsafe_item); @@ -1637,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) bool need_send = false; bool promoted = false; - WARN_ON(req->r_tid || req->r_got_reply); + WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: @@ -1705,17 +1704,10 @@ promote: static void account_request(struct ceph_osd_request *req) { - unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + WARN_ON(req->r_flags & CEPH_OSD_FLAG_ACK); + WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); - if (req->r_flags & CEPH_OSD_FLAG_READ) { - WARN_ON(req->r_flags & mask); - req->r_flags |= CEPH_OSD_FLAG_ACK; - } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) - WARN_ON(!(req->r_flags & mask)); - else - WARN_ON(1); - - WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); + req->r_flags |= CEPH_OSD_FLAG_ONDISK; atomic_inc(&req->r_osdc->num_requests); } @@ -1750,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req) { - if (req->r_callback) + if (req->r_callback) { + dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, + req->r_tid, req->r_callback, req->r_result); req->r_callback(req); - else - complete_all(&req->r_completion); + } } /* - * Note that this is open-coded in handle_reply(), which has to deal - * with ack vs commit, dup acks, etc. + * This is open-coded in handle_reply(). */ static void complete_request(struct ceph_osd_request *req, int err) { @@ -1767,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err) req->r_result = err; finish_request(req); __complete_request(req); - complete_all(&req->r_done_completion); + complete_all(&req->r_completion); ceph_osdc_put_request(req); } @@ -1793,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req) cancel_map_check(req); finish_request(req); - complete_all(&req->r_done_completion); + complete_all(&req->r_completion); ceph_osdc_put_request(req); } @@ -2170,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req) mutex_lock(&lreq->lock); dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, lreq->linger_id, req->r_result); - WARN_ON(!__linger_registered(lreq)); linger_reg_commit_complete(lreq, req->r_result); lreq->committed = true; @@ -2786,31 +2777,8 @@ e_inval: } /* - * We are done with @req if - * - @m is a safe reply, or - * - @m is an unsafe reply and we didn't want a safe one - */ -static bool done_request(const struct ceph_osd_request *req, - const struct MOSDOpReply *m) -{ - return (m->result < 0 || - (m->flags & CEPH_OSD_FLAG_ONDISK) || - !(req->r_flags & CEPH_OSD_FLAG_ONDISK)); -} - -/* - * handle osd op reply. either call the callback if it is specified, - * or do the completion to wake up the waiting thread. - * - * ->r_unsafe_callback is set? yes no - * - * first reply is OK (needed r_cb/r_completion, r_cb/r_completion, - * any or needed/got safe) r_done_completion r_done_completion - * - * first reply is unsafe r_unsafe_cb(true) (nothing) - * - * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, - * r_done_completion r_done_completion + * Handle MOSDOpReply. Set ->r_result and call the callback if it is + * specified. */ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) { @@ -2819,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) struct MOSDOpReply m; u64 tid = le64_to_cpu(msg->hdr.tid); u32 data_len = 0; - bool already_acked; int ret; int i; @@ -2898,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) le32_to_cpu(msg->hdr.data_len), req->r_tid); goto fail_request; } - dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, - req, req->r_tid, req->r_got_reply, m.result, data_len); - - already_acked = req->r_got_reply; - if (!already_acked) { - req->r_result = m.result ?: data_len; - req->r_replay_version = m.replay_version; /* struct */ - req->r_got_reply = true; - } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { - dout("req %p tid %llu dup ack\n", req, req->r_tid); - goto out_unlock_session; - } - - if (done_request(req, &m)) { - finish_request(req); - if (req->r_linger) { - WARN_ON(req->r_unsafe_callback); - dout("req %p tid %llu cb (locked)\n", req, req->r_tid); - __complete_request(req); - } - } + dout("%s req %p tid %llu result %d data_len %u\n", __func__, + req, req->r_tid, m.result, data_len); + /* + * Since we only ever request ONDISK, we should only ever get + * one (type of) reply back. + */ + WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); + req->r_result = m.result ?: data_len; + finish_request(req); mutex_unlock(&osd->lock); up_read(&osdc->lock); - if (done_request(req, &m)) { - if (already_acked && req->r_unsafe_callback) { - dout("req %p tid %llu safe-cb\n", req, req->r_tid); - req->r_unsafe_callback(req, false); - } else if (!req->r_linger) { - dout("req %p tid %llu cb\n", req, req->r_tid); - __complete_request(req); - } - complete_all(&req->r_done_completion); - ceph_osdc_put_request(req); - } else { - if (req->r_unsafe_callback) { - dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); - req->r_unsafe_callback(req, true); - } else { - WARN_ON(1); - } - } - + __complete_request(req); + complete_all(&req->r_completion); + ceph_osdc_put_request(req); return; fail_request: @@ -3541,7 +3480,7 @@ again: up_read(&osdc->lock); dout("%s waiting on req %p tid %llu last_tid %llu\n", __func__, req, req->r_tid, last_tid); - wait_for_completion(&req->r_done_completion); + wait_for_completion(&req->r_completion); ceph_osdc_put_request(req); goto again; } -- cgit v1.2.3