crush: add chooseleaf_stable tunable
authorIlya Dryomov <idryomov@gmail.com>
Sun, 31 Jan 2016 13:36:07 +0000 (14:36 +0100)
committerIlya Dryomov <idryomov@gmail.com>
Thu, 4 Feb 2016 17:25:55 +0000 (18:25 +0100)
Add a tunable to fix the bug that chooseleaf may cause unnecessary pg
migrations when some device fails.

Reflects ceph.git commit fdb3f664448e80d984470f32f04e2e6f03ab52ec.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
include/linux/crush/crush.h
net/ceph/crush/mapper.c

index 48b4930..be8f12b 100644 (file)
@@ -59,7 +59,8 @@ enum {
        CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
        CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
        CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
-       CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12
+       CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
+       CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
 };
 
 /*
@@ -205,6 +206,11 @@ struct crush_map {
         * mappings line up a bit better with previous mappings. */
        __u8 chooseleaf_vary_r;
 
+       /* if true, it makes chooseleaf firstn to return stable results (if
+        * no local retry) so that data migrations would be optimal when some
+        * device fails. */
+       __u8 chooseleaf_stable;
+
 #ifndef __KERNEL__
        /*
         * version 0 (original) of straw_calc has various flaws.  version 1
index abb7006..5fcfb98 100644 (file)
@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map,
  * @local_retries: localized retries
  * @local_fallback_retries: localized fallback retries
  * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @stable: stable mode starts rep=0 in the recursive call for all replicas
  * @vary_r: pass r to recursive calls
  * @out2: second output vector for leaf items (if @recurse_to_leaf)
  * @parent_r: r value passed from the parent
@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map,
                               unsigned int local_fallback_retries,
                               int recurse_to_leaf,
                               unsigned int vary_r,
+                              unsigned int stable,
                               int *out2,
                               int parent_r)
 {
@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map,
        int collide, reject;
        int count = out_size;
 
-       dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+       dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
                recurse_to_leaf ? "_LEAF" : "",
                bucket->id, x, outpos, numrep,
                tries, recurse_tries, local_retries, local_fallback_retries,
-               parent_r);
+               parent_r, stable);
 
-       for (rep = outpos; rep < numrep && count > 0 ; rep++) {
+       for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
                /* keep trying until we get a non-out, non-colliding item */
                ftotal = 0;
                skip_rep = 0;
@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map,
                                                if (crush_choose_firstn(map,
                                                         map->buckets[-1-item],
                                                         weight, weight_max,
-                                                        x, outpos+1, 0,
+                                                        x, stable ? 1 : outpos+1, 0,
                                                         out2, outpos, count,
                                                         recurse_tries, 0,
                                                         local_retries,
                                                         local_fallback_retries,
                                                         0,
                                                         vary_r,
+                                                        stable,
                                                         NULL,
                                                         sub_r) <= outpos)
                                                        /* didn't get leaf */
@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map,
        int choose_local_fallback_retries = map->choose_local_fallback_tries;
 
        int vary_r = map->chooseleaf_vary_r;
+       int stable = map->chooseleaf_stable;
 
        if ((__u32)ruleno >= map->max_rules) {
                dprintk(" bad ruleno %d\n", ruleno);
@@ -870,6 +874,11 @@ int crush_do_rule(const struct crush_map *map,
                                vary_r = curstep->arg1;
                        break;
 
+               case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
+                       if (curstep->arg1 >= 0)
+                               stable = curstep->arg1;
+                       break;
+
                case CRUSH_RULE_CHOOSELEAF_FIRSTN:
                case CRUSH_RULE_CHOOSE_FIRSTN:
                        firstn = 1;
@@ -932,6 +941,7 @@ int crush_do_rule(const struct crush_map *map,
                                                choose_local_fallback_retries,
                                                recurse_to_leaf,
                                                vary_r,
+                                               stable,
                                                c+osize,
                                                0);
                                } else {