summaryrefslogtreecommitdiffstats
path: root/net/ceph/crush/mapper.c
diff options
context:
space:
mode:
authorIlya Dryomov <idryomov@gmail.com>2015-04-14 16:54:52 +0300
committerIlya Dryomov <idryomov@gmail.com>2015-04-22 18:33:43 +0300
commit958a27658d94cf212caeb0ffb04ee0b0bc89cc40 (patch)
tree2aff94f8b656b5526c8cfef1a8994edbb0c960fb /net/ceph/crush/mapper.c
parent45002267e8d2699bf9b022315bee3dd13b044843 (diff)
downloadlinux-958a27658d94cf212caeb0ffb04ee0b0bc89cc40.tar.bz2
crush: straw2 bucket type with an efficient 64-bit crush_ln()
This is an improved straw bucket that correctly avoids any data movement between items A and B when neither A nor B's weights are changed. Said differently, if we adjust the weight of item C (including adding it anew or removing it completely), we will only see inputs move to or from C, never between other items in the bucket. Notably, there is not intermediate scaling factor that needs to be calculated. The mapping function is a simple function of the item weights. The below commits were squashed together into this one (mostly to avoid adding and then yanking a ~6000 lines worth of crush_ln_table): - crush: add a straw2 bucket type - crush: add crush_ln to calculate nature log efficently - crush: improve straw2 adjustment slightly - crush: change crush_ln to provide 32 more digits - crush: fix crush_get_bucket_item_weight and bucket destroy for straw2 - crush/mapper: fix divide-by-0 in straw2 (with div64_s64() for draw = ln / w and INT64_MIN -> S64_MIN - need to create a proper compat.h in ceph.git) Reflects ceph.git commits 242293c908e923d474910f2b8203fa3b41eb5a53, 32a1ead92efcd351822d22a5fc37d159c65c1338, 6289912418c4a3597a11778bcf29ed5415117ad9, 35fcb04e2945717cf5cfe150b9fa89cb3d2303a1, 6445d9ee7290938de1e4ee9563912a6ab6d8ee5f, b5921d55d16796e12d66ad2c4add7305f9ce2353. Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'net/ceph/crush/mapper.c')
-rw-r--r--net/ceph/crush/mapper.c101
1 files changed, 101 insertions, 0 deletions
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 91c41fe83113..5b47736d27d9 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -20,6 +20,7 @@
#include <linux/crush/crush.h>
#include <linux/crush/hash.h>
+#include "crush_ln_table.h"
/*
* Implement the core CRUSH mapping algorithm.
@@ -237,6 +238,102 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
return bucket->h.items[high];
}
+// compute 2^44*log2(input+1)
+uint64_t crush_ln(unsigned xin)
+{
+ unsigned x=xin, x1;
+ int iexpon, index1, index2;
+ uint64_t RH, LH, LL, xl64, result;
+
+ x++;
+
+ // normalize input
+ iexpon = 15;
+ while(!(x&0x18000)) { x<<=1; iexpon--; }
+
+ index1 = (x>>8)<<1;
+ // RH ~ 2^56/index1
+ RH = __RH_LH_tbl[index1 - 256];
+ // LH ~ 2^48 * log2(index1/256)
+ LH = __RH_LH_tbl[index1 + 1 - 256];
+
+ // RH*x ~ 2^48 * (2^15 + xf), xf<2^8
+ xl64 = (int64_t)x * RH;
+ xl64 >>= 48;
+ x1 = xl64;
+
+ result = iexpon;
+ result <<= (12 + 32);
+
+ index2 = x1 & 0xff;
+ // LL ~ 2^48*log2(1.0+index2/2^15)
+ LL = __LL_tbl[index2];
+
+ LH = LH + LL;
+
+ LH >>= (48-12 - 32);
+ result += LH;
+
+ return result;
+}
+
+
+/*
+ * straw2
+ *
+ * for reference, see:
+ *
+ * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ *
+ */
+
+static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+ int x, int r)
+{
+ unsigned i, high = 0;
+ unsigned u;
+ unsigned w;
+ __s64 ln, draw, high_draw = 0;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ w = bucket->item_weights[i];
+ if (w) {
+ u = crush_hash32_3(bucket->h.hash, x,
+ bucket->h.items[i], r);
+ u &= 0xffff;
+
+ /*
+ * for some reason slightly less than 0x10000 produces
+ * a slightly more accurate distribution... probably a
+ * rounding effect.
+ *
+ * the natural log lookup table maps [0,0xffff]
+ * (corresponding to real numbers [1/0x10000, 1] to
+ * [0, 0xffffffffffff] (corresponding to real numbers
+ * [-11.090355,0]).
+ */
+ ln = crush_ln(u) - 0x1000000000000ll;
+
+ /*
+ * divide by 16.16 fixed-point weight. note
+ * that the ln value is negative, so a larger
+ * weight means a larger (less negative) value
+ * for draw.
+ */
+ draw = div64_s64(ln, w);
+ } else {
+ draw = S64_MIN;
+ }
+
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+ return bucket->h.items[high];
+}
+
+
static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
{
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
@@ -254,12 +351,16 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
case CRUSH_BUCKET_STRAW:
return bucket_straw_choose((struct crush_bucket_straw *)in,
x, r);
+ case CRUSH_BUCKET_STRAW2:
+ return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
+ x, r);
default:
dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0];
}
}
+
/*
* true if device is marked "out" (failed, fully offloaded)
* of the cluster