Merge branch 'for-linus' of git://git.kernel.dk/linux-block
block/blk-cgroup.c
index 9cc48d1..55512dd 100644
@@ -24,6 +24,7 @@
 #include <linux/genhd.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
 #include "blk.h"
 
@@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg)
                return;
 
        for (i = 0; i < BLKCG_MAX_POLS; i++)
-               kfree(blkg->pd[i]);
+               if (blkg->pd[i])
+                       blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 
-       blk_exit_rl(&blkg->rl);
+       if (blkg->blkcg != &blkcg_root)
+               blk_exit_rl(&blkg->rl);
+
+       blkg_rwstat_exit(&blkg->stat_ios);
+       blkg_rwstat_exit(&blkg->stat_bytes);
        kfree(blkg);
 }
 
@@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
        if (!blkg)
                return NULL;
 
+       if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
+           blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+               goto err_free;
+
        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
@@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                        continue;
 
                /* alloc per-policy data and attach it to blkg */
-               pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+               pd = pol->pd_alloc_fn(gfp_mask, q->node);
                if (!pd)
                        goto err_free;
 
@@ -129,26 +139,11 @@ err_free:
        return NULL;
 }
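
The open-coded kzalloc()/kfree() of per-policy data is replaced by methods the policy itself supplies.  A minimal sketch of a converted policy, inferred from the pd_alloc_fn()/pd_free_fn() call sites in the hunks above (all hypo_* names are hypothetical):

	/* per-blkg policy data embedding the generic blkg_policy_data */
	struct hypo_group_data {
		struct blkg_policy_data pd;
		u64 weight;
	};

	static struct blkg_policy_data *hypo_pd_alloc(gfp_t gfp, int node)
	{
		struct hypo_group_data *hgd;

		hgd = kzalloc_node(sizeof(*hgd), gfp, node);
		return hgd ? &hgd->pd : NULL;
	}

	static void hypo_pd_free(struct blkg_policy_data *pd)
	{
		kfree(container_of(pd, struct hypo_group_data, pd));
	}

	static struct blkcg_policy blkcg_policy_hypo = {
		.pd_alloc_fn	= hypo_pd_alloc,
		.pd_free_fn	= hypo_pd_free,
	};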
 
-/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state.  If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint)
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+                                     struct request_queue *q, bool update_hint)
 {
        struct blkcg_gq *blkg;
 
-       blkg = rcu_dereference(blkcg->blkg_hint);
-       if (blkg && blkg->q == q)
-               return blkg;
-
        /*
         * Hint didn't match.  Look up from the radix tree.  Note that the
         * hint can only be updated under queue_lock as otherwise @blkg
@@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
 
        return NULL;
 }
-
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair.  This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
- */
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
-{
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       if (unlikely(blk_queue_bypass(q)))
-               return NULL;
-       return __blkg_lookup(blkcg, q, false);
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
+EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
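
With the slowpath exported, the fast path can move into the header.  A sketch of what the inline looks like, reconstructed from the hint check deleted above and the removed blkg_lookup() (an assumption, not the literal header code):

	static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
						   struct request_queue *q)
	{
		struct blkcg_gq *blkg;

		WARN_ON_ONCE(!rcu_read_lock_held());

		if (unlikely(blk_queue_bypass(q)))
			return NULL;

		/* try the hint under RCU, fall back to the slowpath */
		blkg = rcu_dereference(blkcg->blkg_hint);
		if (blkg && blkg->q == q)
			return blkg;

		return blkg_lookup_slowpath(blkcg, q, false);
	}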
 
 /*
  * If @new_blkg is %NULL, this function tries to allocate a new one as
- * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
+ * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
  */
 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                    struct request_queue *q,
@@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 
        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
-               ret = -EINVAL;
+               ret = -ENODEV;
                goto err_free_blkg;
        }
 
        wb_congested = wb_congested_get_create(&q->backing_dev_info,
-                                              blkcg->css.id, GFP_ATOMIC);
+                                              blkcg->css.id, GFP_NOWAIT);
        if (!wb_congested) {
                ret = -ENOMEM;
                goto err_put_css;
@@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 
        /* allocate */
        if (!new_blkg) {
-               new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
+               new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto err_put_congested;
@@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
-                       ret = -EINVAL;
+                       ret = -ENODEV;
                        goto err_put_congested;
                }
                blkg_get(blkg->parent);
@@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                struct blkcg_policy *pol = blkcg_policy[i];
 
                if (blkg->pd[i] && pol->pd_init_fn)
-                       pol->pd_init_fn(blkg);
+                       pol->pd_init_fn(blkg->pd[i]);
        }
 
        /* insert */
@@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                        struct blkcg_policy *pol = blkcg_policy[i];
 
                        if (blkg->pd[i] && pol->pd_online_fn)
-                               pol->pd_online_fn(blkg);
+                               pol->pd_online_fn(blkg->pd[i]);
                }
        }
        blkg->online = true;
@@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
-               return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
+               return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
 
        blkg = __blkg_lookup(blkcg, q, true);
        if (blkg)
@@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                        return blkg;
        }
 }
-EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
        struct blkcg *blkcg = blkg->blkcg;
+       struct blkcg_gq *parent = blkg->parent;
        int i;
 
        lockdep_assert_held(blkg->q->queue_lock);
@@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
                struct blkcg_policy *pol = blkcg_policy[i];
 
                if (blkg->pd[i] && pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
+                       pol->pd_offline_fn(blkg->pd[i]);
+       }
+
+       if (parent) {
+               blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
+               blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
        }
+
        blkg->online = false;
 
        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -403,15 +386,6 @@ static void blkg_destroy_all(struct request_queue *q)
 void __blkg_release_rcu(struct rcu_head *rcu_head)
 {
        struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
-       int i;
-
-       /* tell policies that this one is being freed */
-       for (i = 0; i < BLKCG_MAX_POLS; i++) {
-               struct blkcg_policy *pol = blkcg_policy[i];
-
-               if (blkg->pd[i] && pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-       }
 
        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
@@ -475,12 +449,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+               blkg_rwstat_reset(&blkg->stat_bytes);
+               blkg_rwstat_reset(&blkg->stat_ios);
+
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];
 
-                       if (blkcg_policy_enabled(blkg->q, pol) &&
-                           pol->pd_reset_stats_fn)
-                               pol->pd_reset_stats_fn(blkg);
+                       if (blkg->pd[i] && pol->pd_reset_stats_fn)
+                               pol->pd_reset_stats_fn(blkg->pd[i]);
                }
        }
 
@@ -489,13 +465,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
        return 0;
 }
 
-static const char *blkg_dev_name(struct blkcg_gq *blkg)
+const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info.dev)
                return dev_name(blkg->q->backing_dev_info.dev);
        return NULL;
 }
+EXPORT_SYMBOL_GPL(blkg_dev_name);
 
 /**
  * blkcg_print_blkgs - helper for printing per-blkg data
@@ -584,9 +561,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 
        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
-                          (unsigned long long)rwstat->cnt[i]);
+                          (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
 
-       v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+       v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
 }
@@ -623,31 +601,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 }
 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
+static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
+                                   struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_bytes.
+ * cftype->private must be set to the blkcg_policy.
+ */
+int blkg_print_stat_bytes(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
+
+/**
+ * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
+ * must be set to the blkcg_policy.
+ */
+int blkg_print_stat_ios(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
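
As the docbook comments note, cftype->private carries the blkcg_policy.  A sketch of wiring these seq_show helpers into a policy's file table (blkcg_policy_hypo as in the earlier sketch):

	static struct cftype hypo_legacy_files[] = {
		{
			.name = "io_service_bytes",
			.private = (unsigned long)&blkcg_policy_hypo,
			.seq_show = blkg_print_stat_bytes,
		},
		{
			.name = "io_serviced",
			.private = (unsigned long)&blkcg_policy_hypo,
			.seq_show = blkg_print_stat_ios,
		},
		{ }	/* terminate */
	};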
+
+static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
+                                             struct blkg_policy_data *pd,
+                                             int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
+                                                             NULL, off);
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
+
+/**
+ * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
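
A policy that embeds its own blkg_rwstat in its policy data can call the recursive-sum helper below directly from a prfill callback; a sketch under the same hypothetical names:

	static u64 hypo_prfill_rwstat_recursive(struct seq_file *sf,
						struct blkg_policy_data *pd,
						int off)
	{
		struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd->blkg,
						&blkcg_policy_hypo, off);

		return __blkg_prfill_rwstat(sf, pd, &sum);
	}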
+
 /**
  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
  *
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+                           struct blkcg_policy *pol, int off)
 {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        u64 sum = 0;
 
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
 
        rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_stat *stat = (void *)pos_pd + off;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_stat *stat;
+
+               if (!pos_blkg->online)
+                       continue;
+
+               if (pol)
+                       stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       stat = (void *)blkg + off;
 
-               if (pos_blkg->online)
-                       sum += blkg_stat_read(stat);
+               sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
        }
        rcu_read_unlock();
 
@@ -657,37 +726,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
 
 /**
  * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
  *
- * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off)
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+                                            struct blkcg_policy *pol, int off)
 {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        struct blkg_rwstat sum = { };
        int i;
 
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
 
        rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_rwstat *rwstat = (void *)pos_pd + off;
-               struct blkg_rwstat tmp;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_rwstat *rwstat;
 
                if (!pos_blkg->online)
                        continue;
 
-               tmp = blkg_rwstat_read(rwstat);
+               if (pol)
+                       rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       rwstat = (void *)pos_blkg + off;
 
                for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       sum.cnt[i] += tmp.cnt[i];
+                       atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
+                               percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
+                               &sum.aux_cnt[i]);
        }
        rcu_read_unlock();
 
@@ -703,29 +778,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
  * @ctx: blkg_conf_ctx to be filled
  *
  * Parse per-blkg config update from @input and initialize @ctx with the
- * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
- * value.  This function returns with RCU read lock and queue lock held and
- * must be paired with blkg_conf_finish().
+ * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
+ * part of @input following MAJ:MIN.  This function returns with RCU read
+ * lock and queue lock held and must be paired with blkg_conf_finish().
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-                  const char *input, struct blkg_conf_ctx *ctx)
+                  char *input, struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
 {
        struct gendisk *disk;
        struct blkcg_gq *blkg;
        unsigned int major, minor;
-       unsigned long long v;
-       int part, ret;
+       int key_len, part, ret;
+       char *body;
 
-       if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
+       if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                return -EINVAL;
 
+       body = input + key_len;
+       if (!isspace(*body))
+               return -EINVAL;
+       body = skip_spaces(body);
+
        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk)
-               return -EINVAL;
+               return -ENODEV;
        if (part) {
                put_disk(disk);
-               return -EINVAL;
+               return -ENODEV;
        }
 
        rcu_read_lock();
@@ -734,7 +814,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
        if (blkcg_policy_enabled(disk->queue, pol))
                blkg = blkg_lookup_create(blkcg, disk->queue);
        else
-               blkg = ERR_PTR(-EINVAL);
+               blkg = ERR_PTR(-EOPNOTSUPP);
 
        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
@@ -756,7 +836,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 
        ctx->disk = disk;
        ctx->blkg = blkg;
-       ctx->v = v;
+       ctx->body = body;
        return 0;
 }
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
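
With @input now split at MAJ:MIN, a policy's write handler parses the remainder from ctx->body itself.  A sketch, assuming the usual kernfs write callback signature and the hypothetical names from the earlier sketches:

	static ssize_t hypo_set_weight(struct kernfs_open_file *of,
				       char *buf, size_t nbytes, loff_t off)
	{
		struct blkcg *blkcg = css_to_blkcg(of_css(of));
		struct blkg_conf_ctx ctx;
		u64 v;
		int ret;

		ret = blkg_conf_prep(blkcg, &blkcg_policy_hypo, buf, &ctx);
		if (ret)
			return ret;

		ret = -EINVAL;
		if (sscanf(ctx.body, "%llu", &v) == 1) {
			struct hypo_group_data *hgd = container_of(
				blkg_to_pd(ctx.blkg, &blkcg_policy_hypo),
				struct hypo_group_data, pd);

			hgd->weight = v;
			ret = 0;
		}

		blkg_conf_finish(&ctx);
		return ret ?: nbytes;
	}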
@@ -777,7 +857,54 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+       struct blkcg_gq *blkg;
+
+       rcu_read_lock();
+
+       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+               const char *dname;
+               struct blkg_rwstat rwstat;
+               u64 rbytes, wbytes, rios, wios;
+
+               dname = blkg_dev_name(blkg);
+               if (!dname)
+                       continue;
+
+               spin_lock_irq(blkg->q->queue_lock);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_bytes));
+               rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_ios));
+               rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               spin_unlock_irq(blkg->q->queue_lock);
+
+               if (rbytes || wbytes || rios || wios)
+                       seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
+                                  dname, rbytes, wbytes, rios, wios);
+       }
+
+       rcu_read_unlock();
+       return 0;
+}
+
 struct cftype blkcg_files[] = {
+       {
+               .name = "stat",
+               .seq_show = blkcg_print_stat,
+       },
+       { }     /* terminate */
+};
+
+struct cftype blkcg_legacy_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
@@ -825,18 +952,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
 static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
        struct blkcg *blkcg = css_to_blkcg(css);
+       int i;
 
        mutex_lock(&blkcg_pol_mutex);
+
        list_del(&blkcg->all_blkcgs_node);
-       mutex_unlock(&blkcg_pol_mutex);
 
-       if (blkcg != &blkcg_root) {
-               int i;
+       for (i = 0; i < BLKCG_MAX_POLS; i++)
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 
-               for (i = 0; i < BLKCG_MAX_POLS; i++)
-                       kfree(blkcg->pd[i]);
-               kfree(blkcg);
-       }
+       mutex_unlock(&blkcg_pol_mutex);
+
+       kfree(blkcg);
 }
 
 static struct cgroup_subsys_state *
@@ -850,13 +978,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 
        if (!parent_css) {
                blkcg = &blkcg_root;
-               goto done;
-       }
-
-       blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-       if (!blkcg) {
-               ret = ERR_PTR(-ENOMEM);
-               goto free_blkcg;
+       } else {
+               blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+               if (!blkcg) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto free_blkcg;
+               }
        }
 
        for (i = 0; i < BLKCG_MAX_POLS ; i++) {
@@ -869,23 +996,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                 * check if the policy requires any specific per-cgroup
                 * data: if it does, allocate and initialize it.
                 */
-               if (!pol || !pol->cpd_size)
+               if (!pol || !pol->cpd_alloc_fn)
                        continue;
 
-               BUG_ON(blkcg->pd[i]);
-               cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+               cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                if (!cpd) {
                        ret = ERR_PTR(-ENOMEM);
                        goto free_pd_blkcg;
                }
-               blkcg->pd[i] = cpd;
+               blkcg->cpd[i] = cpd;
+               cpd->blkcg = blkcg;
                cpd->plid = i;
-               pol->cpd_init_fn(blkcg);
+               if (pol->cpd_init_fn)
+                       pol->cpd_init_fn(cpd);
        }
 
-done:
        spin_lock_init(&blkcg->lock);
-       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
+       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
 #ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -897,7 +1024,8 @@ done:
 
 free_pd_blkcg:
        for (i--; i >= 0; i--)
-               kfree(blkcg->pd[i]);
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 free_blkcg:
        kfree(blkcg);
        mutex_unlock(&blkcg_pol_mutex);
@@ -941,7 +1069,7 @@ int blkcg_init_queue(struct request_queue *q)
                radix_tree_preload_end();
 
        if (IS_ERR(blkg)) {
-               kfree(new_blkg);
+               blkg_free(new_blkg);
                return PTR_ERR(blkg);
        }
 
@@ -1018,12 +1146,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
        return ret;
 }
 
-struct cgroup_subsys blkio_cgrp_subsys = {
+static void blkcg_bind(struct cgroup_subsys_state *root_css)
+{
+       int i;
+
+       mutex_lock(&blkcg_pol_mutex);
+
+       for (i = 0; i < BLKCG_MAX_POLS; i++) {
+               struct blkcg_policy *pol = blkcg_policy[i];
+               struct blkcg *blkcg;
+
+               if (!pol || !pol->cpd_bind_fn)
+                       continue;
+
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
+                       if (blkcg->cpd[pol->plid])
+                               pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
+       }
+       mutex_unlock(&blkcg_pol_mutex);
+}
+
+struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
-       .legacy_cftypes = blkcg_files,
+       .bind = blkcg_bind,
+       .dfl_cftypes = blkcg_files,
+       .legacy_cftypes = blkcg_legacy_files,
+       .legacy_name = "blkio",
 #ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
@@ -1033,7 +1184,7 @@ struct cgroup_subsys blkio_cgrp_subsys = {
        .depends_on = 1 << memory_cgrp_id,
 #endif
 };
-EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
+EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
 /**
  * blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1054,65 +1205,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
 int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol)
 {
-       LIST_HEAD(pds);
+       struct blkg_policy_data *pd_prealloc = NULL;
        struct blkcg_gq *blkg;
-       struct blkg_policy_data *pd, *nd;
-       int cnt = 0, ret;
+       int ret;
 
        if (blkcg_policy_enabled(q, pol))
                return 0;
 
-       /* count and allocate policy_data for all existing blkgs */
        blk_queue_bypass_start(q);
-       spin_lock_irq(q->queue_lock);
-       list_for_each_entry(blkg, &q->blkg_list, q_node)
-               cnt++;
-       spin_unlock_irq(q->queue_lock);
-
-       /* allocate per-blkg policy data for all existing blkgs */
-       while (cnt--) {
-               pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
-               if (!pd) {
+pd_prealloc:
+       if (!pd_prealloc) {
+               pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
+               if (!pd_prealloc) {
                        ret = -ENOMEM;
-                       goto out_free;
+                       goto out_bypass_end;
                }
-               list_add_tail(&pd->alloc_node, &pds);
        }
 
-       /*
-        * Install the allocated pds and cpds. With @q bypassing, no new blkg
-        * should have been created while the queue lock was dropped.
-        */
        spin_lock_irq(q->queue_lock);
 
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
-               if (WARN_ON(list_empty(&pds))) {
-                       /* umm... this shouldn't happen, just abort */
-                       ret = -ENOMEM;
-                       goto out_unlock;
-               }
-               pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
-               list_del_init(&pd->alloc_node);
+               struct blkg_policy_data *pd;
 
-               /* grab blkcg lock too while installing @pd on @blkg */
-               spin_lock(&blkg->blkcg->lock);
+               if (blkg->pd[pol->plid])
+                       continue;
+
+               pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+               if (!pd)
+                       swap(pd, pd_prealloc);
+               if (!pd) {
+                       spin_unlock_irq(q->queue_lock);
+                       goto pd_prealloc;
+               }
 
                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pd->plid = pol->plid;
-               pol->pd_init_fn(blkg);
-
-               spin_unlock(&blkg->blkcg->lock);
+               if (pol->pd_init_fn)
+                       pol->pd_init_fn(pd);
        }
 
        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;
-out_unlock:
+
        spin_unlock_irq(q->queue_lock);
-out_free:
+out_bypass_end:
        blk_queue_bypass_end(q);
-       list_for_each_entry_safe(pd, nd, &pds, alloc_node)
-               kfree(pd);
+       if (pd_prealloc)
+               pol->pd_free_fn(pd_prealloc);
        return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
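
Callers are unaffected by the preallocation rework; a policy still enables itself per queue and mirrors that with blkcg_deactivate_policy() (next hunk) on teardown, e.g. (sketch):

	static int hypo_init_queue(struct request_queue *q)
	{
		return blkcg_activate_policy(q, &blkcg_policy_hypo);
	}

	static void hypo_exit_queue(struct request_queue *q)
	{
		blkcg_deactivate_policy(q, &blkcg_policy_hypo);
	}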
@@ -1142,13 +1282,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);
 
-               if (pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
-               if (pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-
-               kfree(blkg->pd[pol->plid]);
-               blkg->pd[pol->plid] = NULL;
+               if (blkg->pd[pol->plid]) {
+                       if (pol->pd_offline_fn)
+                               pol->pd_offline_fn(blkg->pd[pol->plid]);
+                       pol->pd_free_fn(blkg->pd[pol->plid]);
+                       blkg->pd[pol->plid] = NULL;
+               }
 
                spin_unlock(&blkg->blkcg->lock);
        }
@@ -1170,9 +1309,6 @@ int blkcg_policy_register(struct blkcg_policy *pol)
        struct blkcg *blkcg;
        int i, ret;
 
-       if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
-               return -EINVAL;
-
        mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);
 
@@ -1189,36 +1325,42 @@ int blkcg_policy_register(struct blkcg_policy *pol)
        blkcg_policy[pol->plid] = pol;
 
        /* allocate and install cpd's */
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                        struct blkcg_policy_data *cpd;
 
-                       cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+                       cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                        if (!cpd) {
                                mutex_unlock(&blkcg_pol_mutex);
                                goto err_free_cpds;
                        }
 
-                       blkcg->pd[pol->plid] = cpd;
+                       blkcg->cpd[pol->plid] = cpd;
+                       cpd->blkcg = blkcg;
                        cpd->plid = pol->plid;
-                       pol->cpd_init_fn(blkcg);
+                       pol->cpd_init_fn(cpd);
                }
        }
 
        mutex_unlock(&blkcg_pol_mutex);
 
        /* everything is in place, add intf files for the new policy */
-       if (pol->cftypes)
-               WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
-                                                 pol->cftypes));
+       if (pol->dfl_cftypes)
+               WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
+                                              pol->dfl_cftypes));
+       if (pol->legacy_cftypes)
+               WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
+                                                 pol->legacy_cftypes));
        mutex_unlock(&blkcg_pol_register_mutex);
        return 0;
 
 err_free_cpds:
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-                       kfree(blkcg->pd[pol->plid]);
-                       blkcg->pd[pol->plid] = NULL;
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
                }
        }
        blkcg_policy[pol->plid] = NULL;
@@ -1245,16 +1387,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
                goto out_unlock;
 
        /* kill the intf files first */
-       if (pol->cftypes)
-               cgroup_rm_cftypes(pol->cftypes);
+       if (pol->dfl_cftypes)
+               cgroup_rm_cftypes(pol->dfl_cftypes);
+       if (pol->legacy_cftypes)
+               cgroup_rm_cftypes(pol->legacy_cftypes);
 
        /* remove cpds and unregister */
        mutex_lock(&blkcg_pol_mutex);
 
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-                       kfree(blkcg->pd[pol->plid]);
-                       blkcg->pd[pol->plid] = NULL;
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
                }
        }
        blkcg_policy[pol->plid] = NULL;