+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
#ifdef __KERNEL__
# include <linux/string.h>
# include <linux/slab.h>
# include <linux/bug.h>
# include <linux/kernel.h>
-# ifndef dprintk
-# define dprintk(args...)
-# endif
+# include <linux/crush/crush.h>
+# include <linux/crush/hash.h>
#else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
+# include "crush_compat.h"
+# include "crush.h"
+# include "hash.h"
#endif
+#include "crush_ln_table.h"
-#include <linux/crush/crush.h>
-#include <linux/crush/hash.h>
-#include <linux/crush/mapper.h>
+#define dprintk(args...) /* printf(args) */
/*
* Implement the core CRUSH mapping algorithm.
int i;
for (i = bucket->h.size-1; i >= 0; i--) {
- __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+ __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
r, bucket->h.id);
w &= 0xffff;
dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
return bucket->h.items[high];
}
+/* compute 2^44*log2(input+1) */
+static __u64 crush_ln(unsigned int xin)
+{
+ unsigned int x = xin, x1;
+ int iexpon, index1, index2;
+ __u64 RH, LH, LL, xl64, result;
+
+ x++;
+
+ /* normalize input */
+ iexpon = 15;
+ while (!(x & 0x18000)) {
+ x <<= 1;
+ iexpon--;
+ }
+
+ index1 = (x >> 8) << 1;
+ /* RH ~ 2^56/index1 */
+ RH = __RH_LH_tbl[index1 - 256];
+ /* LH ~ 2^48 * log2(index1/256) */
+ LH = __RH_LH_tbl[index1 + 1 - 256];
+
+ /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
+ xl64 = (__s64)x * RH;
+ xl64 >>= 48;
+ x1 = xl64;
+
+ result = iexpon;
+ result <<= (12 + 32);
+
+ index2 = x1 & 0xff;
+ /* LL ~ 2^48*log2(1.0+index2/2^15) */
+ LL = __LL_tbl[index2];
+
+ LH = LH + LL;
+
+ LH >>= (48 - 12 - 32);
+ result += LH;
+
+ return result;
+}
+
+
+/*
+ * straw2
+ *
+ * for reference, see:
+ *
+ * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ *
+ */
+
+static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+ int x, int r)
+{
+ unsigned int i, high = 0;
+ unsigned int u;
+ unsigned int w;
+ __s64 ln, draw, high_draw = 0;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ w = bucket->item_weights[i];
+ if (w) {
+ u = crush_hash32_3(bucket->h.hash, x,
+ bucket->h.items[i], r);
+ u &= 0xffff;
+
+ /*
+ * for some reason slightly less than 0x10000 produces
+ * a slightly more accurate distribution... probably a
+ * rounding effect.
+ *
+ * the natural log lookup table maps [0,0xffff]
+ * (corresponding to real numbers [1/0x10000, 1] to
+ * [0, 0xffffffffffff] (corresponding to real numbers
+ * [-11.090355,0]).
+ */
+ ln = crush_ln(u) - 0x1000000000000ll;
+
+ /*
+ * divide by 16.16 fixed-point weight. note
+ * that the ln value is negative, so a larger
+ * weight means a larger (less negative) value
+ * for draw.
+ */
+ draw = div64_s64(ln, w);
+ } else {
+ draw = S64_MIN;
+ }
+
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+ return bucket->h.items[high];
+}
+
+
static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
{
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
case CRUSH_BUCKET_STRAW:
return bucket_straw_choose((struct crush_bucket_straw *)in,
x, r);
+ case CRUSH_BUCKET_STRAW2:
+ return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
+ x, r);
default:
dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0];
}
}
+
/*
* true if device is marked "out" (failed, fully offloaded)
* of the cluster
* @type: the type of item to choose
* @out: pointer to output vector
* @outpos: our position in that vector
+ * @out_size: size of the out vector
* @tries: number of attempts to make
* @recurse_tries: number of attempts to have recursive chooseleaf make
* @local_retries: localized retries
const __u32 *weight, int weight_max,
int x, int numrep, int type,
int *out, int outpos,
+ int out_size,
unsigned int tries,
unsigned int recurse_tries,
unsigned int local_retries,
int item = 0;
int itemtype;
int collide, reject;
+ int count = out_size;
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
recurse_to_leaf ? "_LEAF" : "",
tries, recurse_tries, local_retries, local_fallback_retries,
parent_r);
- for (rep = outpos; rep < numrep; rep++) {
+ for (rep = outpos; rep < numrep && count > 0 ; rep++) {
/* keep trying until we get a non-out, non-colliding item */
ftotal = 0;
skip_rep = 0;
map->buckets[-1-item],
weight, weight_max,
x, outpos+1, 0,
- out2, outpos,
+ out2, outpos, count,
recurse_tries, 0,
local_retries,
local_fallback_retries,
dprintk("CHOOSE got %d\n", item);
out[outpos] = item;
outpos++;
+ count--;
+#ifndef __KERNEL__
+ if (map->choose_tries && ftotal <= map->choose_total_tries)
+ map->choose_tries[ftotal]++;
+#endif
}
dprintk("CHOOSE returns %d\n", outpos);
}
for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+#ifdef DEBUG_INDEP
+ if (out2 && ftotal) {
+ dprintk("%u %d a: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out[rep]);
+ }
+ dprintk("\n");
+ dprintk("%u %d b: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out2[rep]);
+ }
+ dprintk("\n");
+ }
+#endif
for (rep = outpos; rep < endpos; rep++) {
if (out[rep] != CRUSH_ITEM_UNDEF)
continue;
out2[rep] = CRUSH_ITEM_NONE;
}
}
+#ifndef __KERNEL__
+ if (map->choose_tries && ftotal <= map->choose_total_tries)
+ map->choose_tries[ftotal]++;
+#endif
+#ifdef DEBUG_INDEP
+ if (out2) {
+ dprintk("%u %d a: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out[rep]);
+ }
+ dprintk("\n");
+ dprintk("%u %d b: ", ftotal, left);
+ for (rep = outpos; rep < endpos; rep++) {
+ dprintk(" %d", out2[rep]);
+ }
+ dprintk("\n");
+ }
+#endif
}
/**
__u32 step;
int i, j;
int numrep;
+ int out_size;
/*
* the original choose_total_tries value was off by one (it
* counted "retries" and not "tries"). add one.
switch (curstep->op) {
case CRUSH_RULE_TAKE:
- w[0] = curstep->arg1;
- wsize = 1;
+ if ((curstep->arg1 >= 0 &&
+ curstep->arg1 < map->max_devices) ||
+ (-1-curstep->arg1 < map->max_buckets &&
+ map->buckets[-1-curstep->arg1])) {
+ w[0] = curstep->arg1;
+ wsize = 1;
+ } else {
+ dprintk(" bad take value %d\n", curstep->arg1);
+ }
break;
case CRUSH_RULE_SET_CHOOSE_TRIES:
x, numrep,
curstep->arg2,
o+osize, j,
+ result_max-osize,
choose_tries,
recurse_tries,
choose_local_retries,
c+osize,
0);
} else {
+ out_size = ((numrep < (result_max-osize)) ?
+ numrep : (result_max-osize));
crush_choose_indep(
map,
map->buckets[-1-w[i]],
weight, weight_max,
- x, numrep, numrep,
+ x, out_size, numrep,
curstep->arg2,
o+osize, j,
choose_tries,
recurse_to_leaf,
c+osize,
0);
- osize += numrep;
+ osize += out_size;
}
}
}
return result_len;
}
-
-