blkcg: make blkg_[rw]stat_recursive_sum() to be able to index into blkcg_gq

[cascardo/linux.git] / block / blk-cgroup.c
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c

index 9f97da5..b263207 100644 (file)
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -29,6 +29,14 @@
  
  #define MAX_KEY_LEN 100
  
+/*
+ * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
+ * blkcg_pol_register_mutex nests outside of it and synchronizes entire
+ * policy [un]register operations including cgroup file additions /
+ * removals.  Putting cgroup file registration outside blkcg_pol_mutex
+ * allows grabbing it from cgroup callbacks.
+ */
+static DEFINE_MUTEX(blkcg_pol_register_mutex);
  static DEFINE_MUTEX(blkcg_pol_mutex);
  
  struct blkcg blkcg_root;
@@ -38,6 +46,8 @@ struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
  
  static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  
+static LIST_HEAD(all_blkcgs);          /* protected by blkcg_pol_mutex */
+
  static bool blkcg_policy_enabled(struct request_queue *q,
                                  const struct blkcg_policy *pol)
  {
@@ -58,9 +68,11 @@ static void blkg_free(struct blkcg_gq *blkg)
                 return;
  
         for (i = 0; i < BLKCG_MAX_POLS; i++)
-               kfree(blkg->pd[i]);
+               if (blkg->pd[i])
+                       blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
  
-       blk_exit_rl(&blkg->rl);
+       if (blkg->blkcg != &blkcg_root)
+               blk_exit_rl(&blkg->rl);
         kfree(blkg);
  }
  
@@ -103,7 +115,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                         continue;
  
                 /* alloc per-policy data and attach it to blkg */
-               pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+               pd = pol->pd_alloc_fn(gfp_mask, q->node);
                 if (!pd)
                         goto err_free;
  
@@ -119,26 +131,11 @@ err_free:
         return NULL;
  }
  
-/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state.  If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint)
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+                                     struct request_queue *q, bool update_hint)
  {
         struct blkcg_gq *blkg;
  
-       blkg = rcu_dereference(blkcg->blkg_hint);
-       if (blkg && blkg->q == q)
-               return blkg;
-
         /*
          * Hint didn't match.  Look up from the radix tree.  Note that the
          * hint can only be updated under queue_lock as otherwise @blkg
@@ -156,29 +153,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
  
         return NULL;
  }
-
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair.  This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
- */
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
-{
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       if (unlikely(blk_queue_bypass(q)))
-               return NULL;
-       return __blkg_lookup(blkcg, q, false);
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
+EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
  
  /*
   * If @new_blkg is %NULL, this function tries to allocate a new one as
- * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
+ * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
   */
  static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                     struct request_queue *q,
@@ -198,7 +177,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
         }
  
         wb_congested = wb_congested_get_create(&q->backing_dev_info,
-                                              blkcg->css.id, GFP_ATOMIC);
+                                              blkcg->css.id, GFP_NOWAIT);
         if (!wb_congested) {
                 ret = -ENOMEM;
                 goto err_put_css;
@@ -206,7 +185,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
  
         /* allocate */
         if (!new_blkg) {
-               new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
+               new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
                 if (unlikely(!new_blkg)) {
                         ret = -ENOMEM;
                         goto err_put_congested;
@@ -230,7 +209,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                 struct blkcg_policy *pol = blkcg_policy[i];
  
                 if (blkg->pd[i] && pol->pd_init_fn)
-                       pol->pd_init_fn(blkg);
+                       pol->pd_init_fn(blkg->pd[i]);
         }
  
         /* insert */
@@ -244,7 +223,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                         struct blkcg_policy *pol = blkcg_policy[i];
  
                         if (blkg->pd[i] && pol->pd_online_fn)
-                               pol->pd_online_fn(blkg);
+                               pol->pd_online_fn(blkg->pd[i]);
                 }
         }
         blkg->online = true;
@@ -317,7 +296,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                         return blkg;
         }
  }
-EXPORT_SYMBOL_GPL(blkg_lookup_create);
  
  static void blkg_destroy(struct blkcg_gq *blkg)
  {
@@ -335,7 +313,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
                 struct blkcg_policy *pol = blkcg_policy[i];
  
                 if (blkg->pd[i] && pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
+                       pol->pd_offline_fn(blkg->pd[i]);
         }
         blkg->online = false;
  
@@ -390,15 +368,6 @@ static void blkg_destroy_all(struct request_queue *q)
  void __blkg_release_rcu(struct rcu_head *rcu_head)
  {
         struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
-       int i;
-
-       /* tell policies that this one is being freed */
-       for (i = 0; i < BLKCG_MAX_POLS; i++) {
-               struct blkcg_policy *pol = blkcg_policy[i];
-
-               if (blkg->pd[i] && pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-       }
  
         /* release the blkcg and parent blkg refs this blkg has been holding */
         css_put(&blkg->blkcg->css);
@@ -453,20 +422,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
         struct blkcg_gq *blkg;
         int i;
  
-       /*
-        * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex
-        * which ends up putting cgroup's internal cgroup_tree_mutex under
-        * it; however, cgroup_tree_mutex is nested above cgroup file
-        * active protection and grabbing blkcg_pol_mutex from a cgroup
-        * file operation creates a possible circular dependency.  cgroup
-        * internal locking is planned to go through further simplification
-        * and this issue should go away soon.  For now, let's trylock
-        * blkcg_pol_mutex and restart the write on failure.
-        *
-        * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com
-        */
-       if (!mutex_trylock(&blkcg_pol_mutex))
-               return restart_syscall();
+       mutex_lock(&blkcg_pol_mutex);
         spin_lock_irq(&blkcg->lock);
  
         /*
@@ -478,9 +434,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
                 for (i = 0; i < BLKCG_MAX_POLS; i++) {
                         struct blkcg_policy *pol = blkcg_policy[i];
  
-                       if (blkcg_policy_enabled(blkg->q, pol) &&
-                           pol->pd_reset_stats_fn)
-                               pol->pd_reset_stats_fn(blkg);
+                       if (blkg->pd[i] && pol->pd_reset_stats_fn)
+                               pol->pd_reset_stats_fn(blkg->pd[i]);
                 }
         }
  
@@ -584,9 +539,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
  
         for (i = 0; i < BLKG_RWSTAT_NR; i++)
                 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
-                          (unsigned long long)rwstat->cnt[i]);
+                          (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
  
-       v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+       v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
         seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
         return v;
  }
@@ -625,29 +581,39 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
  
  /**
   * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
   *
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
   */
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+                           struct blkcg_policy *pol, int off)
  {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
         struct blkcg_gq *pos_blkg;
         struct cgroup_subsys_state *pos_css;
         u64 sum = 0;
  
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
  
         rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_stat *stat = (void *)pos_pd + off;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_stat *stat;
+
+               if (!pos_blkg->online)
+                       continue;
  
-               if (pos_blkg->online)
-                       sum += blkg_stat_read(stat);
+               if (pol)
+                       stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       stat = (void *)pos_blkg + off;
+
+               sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
         }
         rcu_read_unlock();
  
@@ -657,37 +623,45 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
  
  /**
   * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
   *
- * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
   */
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off)
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+                                            struct blkcg_policy *pol, int off)
  {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
         struct blkcg_gq *pos_blkg;
         struct cgroup_subsys_state *pos_css;
         struct blkg_rwstat sum = { };
         int i;
  
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
  
         rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_rwstat *rwstat = (void *)pos_pd + off;
-               struct blkg_rwstat tmp;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_rwstat *rwstat, tmp;
  
                 if (!pos_blkg->online)
                         continue;
  
+               if (pol)
+                       rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       rwstat = (void *)pos_blkg + off;
+
                 tmp = blkg_rwstat_read(rwstat);
  
                 for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       sum.cnt[i] += tmp.cnt[i];
+                       atomic64_add(atomic64_read(&tmp.aux_cnt[i]) +
+                                    atomic64_read(&rwstat->aux_cnt[i]),
+                                    &sum.aux_cnt[i]);
         }
         rcu_read_unlock();
  
@@ -721,8 +695,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                 return -EINVAL;
  
         disk = get_gendisk(MKDEV(major, minor), &part);
-       if (!disk || part)
+       if (!disk)
                 return -EINVAL;
+       if (part) {
+               put_disk(disk);
+               return -EINVAL;
+       }
  
         rcu_read_lock();
         spin_lock_irq(disk->queue->queue_lock);
@@ -821,9 +799,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
  static void blkcg_css_free(struct cgroup_subsys_state *css)
  {
         struct blkcg *blkcg = css_to_blkcg(css);
+       int i;
  
-       if (blkcg != &blkcg_root)
-               kfree(blkcg);
+       mutex_lock(&blkcg_pol_mutex);
+
+       list_del(&blkcg->all_blkcgs_node);
+
+       for (i = 0; i < BLKCG_MAX_POLS; i++)
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
+
+       mutex_unlock(&blkcg_pol_mutex);
+
+       kfree(blkcg);
  }
  
  static struct cgroup_subsys_state *
@@ -833,15 +821,16 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
         struct cgroup_subsys_state *ret;
         int i;
  
+       mutex_lock(&blkcg_pol_mutex);
+
         if (!parent_css) {
                 blkcg = &blkcg_root;
-               goto done;
-       }
-
-       blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-       if (!blkcg) {
-               ret = ERR_PTR(-ENOMEM);
-               goto free_blkcg;
+       } else {
+               blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+               if (!blkcg) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto free_blkcg;
+               }
         }
  
         for (i = 0; i < BLKCG_MAX_POLS ; i++) {
@@ -854,35 +843,39 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                  * check if the policy requires any specific per-cgroup
                  * data: if it does, allocate and initialize it.
                  */
-               if (!pol || !pol->cpd_size)
+               if (!pol || !pol->cpd_alloc_fn)
                         continue;
  
-               BUG_ON(blkcg->pd[i]);
-               cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+               cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                 if (!cpd) {
                         ret = ERR_PTR(-ENOMEM);
                         goto free_pd_blkcg;
                 }
-               blkcg->pd[i] = cpd;
+               blkcg->cpd[i] = cpd;
+               cpd->blkcg = blkcg;
                 cpd->plid = i;
-               pol->cpd_init_fn(blkcg);
+               if (pol->cpd_init_fn)
+                       pol->cpd_init_fn(cpd);
         }
  
-done:
         spin_lock_init(&blkcg->lock);
-       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
+       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
         INIT_HLIST_HEAD(&blkcg->blkg_list);
  #ifdef CONFIG_CGROUP_WRITEBACK
         INIT_LIST_HEAD(&blkcg->cgwb_list);
  #endif
+       list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
+
+       mutex_unlock(&blkcg_pol_mutex);
         return &blkcg->css;
  
  free_pd_blkcg:
         for (i--; i >= 0; i--)
-               kfree(blkcg->pd[i]);
-
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
  free_blkcg:
         kfree(blkcg);
+       mutex_unlock(&blkcg_pol_mutex);
         return ret;
  }
  
@@ -923,7 +916,7 @@ int blkcg_init_queue(struct request_queue *q)
                 radix_tree_preload_end();
  
         if (IS_ERR(blkg)) {
-               kfree(new_blkg);
+               blkg_free(new_blkg);
                 return PTR_ERR(blkg);
         }
  
@@ -1036,96 +1029,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
  int blkcg_activate_policy(struct request_queue *q,
                           const struct blkcg_policy *pol)
  {
-       LIST_HEAD(pds);
-       LIST_HEAD(cpds);
+       struct blkg_policy_data *pd_prealloc = NULL;
         struct blkcg_gq *blkg;
-       struct blkg_policy_data *pd, *nd;
-       struct blkcg_policy_data *cpd, *cnd;
-       int cnt = 0, ret;
+       int ret;
  
         if (blkcg_policy_enabled(q, pol))
                 return 0;
  
-       /* count and allocate policy_data for all existing blkgs */
         blk_queue_bypass_start(q);
-       spin_lock_irq(q->queue_lock);
-       list_for_each_entry(blkg, &q->blkg_list, q_node)
-               cnt++;
-       spin_unlock_irq(q->queue_lock);
-
-       /*
-        * Allocate per-blkg and per-blkcg policy data
-        * for all existing blkgs.
-        */
-       while (cnt--) {
-               pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
-               if (!pd) {
-                       ret = -ENOMEM;
-                       goto out_free;
-               }
-               list_add_tail(&pd->alloc_node, &pds);
-
-               if (!pol->cpd_size)
-                       continue;
-               cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
-               if (!cpd) {
+pd_prealloc:
+       if (!pd_prealloc) {
+               pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
+               if (!pd_prealloc) {
                         ret = -ENOMEM;
-                       goto out_free;
+                       goto out_bypass_end;
                 }
-               list_add_tail(&cpd->alloc_node, &cpds);
         }
  
-       /*
-        * Install the allocated pds and cpds. With @q bypassing, no new blkg
-        * should have been created while the queue lock was dropped.
-        */
         spin_lock_irq(q->queue_lock);
  
         list_for_each_entry(blkg, &q->blkg_list, q_node) {
-               if (WARN_ON(list_empty(&pds)) ||
-                   WARN_ON(pol->cpd_size && list_empty(&cpds))) {
-                       /* umm... this shouldn't happen, just abort */
-                       ret = -ENOMEM;
-                       goto out_unlock;
-               }
-               cpd = list_first_entry(&cpds, struct blkcg_policy_data,
-                                      alloc_node);
-               list_del_init(&cpd->alloc_node);
-               pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
-               list_del_init(&pd->alloc_node);
+               struct blkg_policy_data *pd;
  
-               /* grab blkcg lock too while installing @pd on @blkg */
-               spin_lock(&blkg->blkcg->lock);
+               if (blkg->pd[pol->plid])
+                       continue;
  
-               if (!pol->cpd_size)
-                       goto no_cpd;
-               if (!blkg->blkcg->pd[pol->plid]) {
-                       /* Per-policy per-blkcg data */
-                       blkg->blkcg->pd[pol->plid] = cpd;
-                       cpd->plid = pol->plid;
-                       pol->cpd_init_fn(blkg->blkcg);
-               } else { /* must free it as it has already been extracted */
-                       kfree(cpd);
+               pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+               if (!pd)
+                       swap(pd, pd_prealloc);
+               if (!pd) {
+                       spin_unlock_irq(q->queue_lock);
+                       goto pd_prealloc;
                 }
-no_cpd:
+
                 blkg->pd[pol->plid] = pd;
                 pd->blkg = blkg;
                 pd->plid = pol->plid;
-               pol->pd_init_fn(blkg);
-
-               spin_unlock(&blkg->blkcg->lock);
+               if (pol->pd_init_fn)
+                       pol->pd_init_fn(pd);
         }
  
         __set_bit(pol->plid, q->blkcg_pols);
         ret = 0;
-out_unlock:
+
         spin_unlock_irq(q->queue_lock);
-out_free:
+out_bypass_end:
         blk_queue_bypass_end(q);
-       list_for_each_entry_safe(pd, nd, &pds, alloc_node)
-               kfree(pd);
-       list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
-               kfree(cpd);
+       if (pd_prealloc)
+               pol->pd_free_fn(pd_prealloc);
         return ret;
  }
  EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1155,15 +1106,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
                 /* grab blkcg lock too while removing @pd from @blkg */
                 spin_lock(&blkg->blkcg->lock);
  
-               if (pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
-               if (pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-
-               kfree(blkg->pd[pol->plid]);
-               blkg->pd[pol->plid] = NULL;
-               kfree(blkg->blkcg->pd[pol->plid]);
-               blkg->blkcg->pd[pol->plid] = NULL;
+               if (blkg->pd[pol->plid]) {
+                       if (pol->pd_offline_fn)
+                               pol->pd_offline_fn(blkg->pd[pol->plid]);
+                       pol->pd_free_fn(blkg->pd[pol->plid]);
+                       blkg->pd[pol->plid] = NULL;
+               }
  
                 spin_unlock(&blkg->blkcg->lock);
         }
@@ -1182,11 +1130,10 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
   */
  int blkcg_policy_register(struct blkcg_policy *pol)
  {
+       struct blkcg *blkcg;
         int i, ret;
  
-       if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
-               return -EINVAL;
-
+       mutex_lock(&blkcg_pol_register_mutex);
         mutex_lock(&blkcg_pol_mutex);
  
         /* find an empty slot */
@@ -1195,19 +1142,52 @@ int blkcg_policy_register(struct blkcg_policy *pol)
                 if (!blkcg_policy[i])
                         break;
         if (i >= BLKCG_MAX_POLS)
-               goto out_unlock;
+               goto err_unlock;
  
-       /* register and update blkgs */
+       /* register @pol */
         pol->plid = i;
-       blkcg_policy[i] = pol;
+       blkcg_policy[pol->plid] = pol;
+
+       /* allocate and install cpd's */
+       if (pol->cpd_alloc_fn) {
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+                       struct blkcg_policy_data *cpd;
+
+                       cpd = pol->cpd_alloc_fn(GFP_KERNEL);
+                       if (!cpd) {
+                               ret = -ENOMEM;
+                               goto err_free_cpds;
+                       }
+
+                       blkcg->cpd[pol->plid] = cpd;
+                       cpd->blkcg = blkcg;
+                       cpd->plid = pol->plid;
+                       pol->cpd_init_fn(cpd);
+               }
+       }
+
+       mutex_unlock(&blkcg_pol_mutex);
  
         /* everything is in place, add intf files for the new policy */
         if (pol->cftypes)
                 WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
                                                   pol->cftypes));
-       ret = 0;
-out_unlock:
+       mutex_unlock(&blkcg_pol_register_mutex);
+       return 0;
+
+err_free_cpds:
+       if (pol->cpd_alloc_fn) {
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
+               }
+       }
+       blkcg_policy[pol->plid] = NULL;
+err_unlock:
         mutex_unlock(&blkcg_pol_mutex);
+       mutex_unlock(&blkcg_pol_register_mutex);
         return ret;
  }
  EXPORT_SYMBOL_GPL(blkcg_policy_register);
@@ -1220,7 +1200,9 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
   */
  void blkcg_policy_unregister(struct blkcg_policy *pol)
  {
-       mutex_lock(&blkcg_pol_mutex);
+       struct blkcg *blkcg;
+
+       mutex_lock(&blkcg_pol_register_mutex);
  
         if (WARN_ON(blkcg_policy[pol->plid] != pol))
                 goto out_unlock;
@@ -1229,9 +1211,21 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
         if (pol->cftypes)
                 cgroup_rm_cftypes(pol->cftypes);
  
-       /* unregister and update blkgs */
+       /* remove cpds and unregister */
+       mutex_lock(&blkcg_pol_mutex);
+
+       if (pol->cpd_alloc_fn) {
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
+               }
+       }
         blkcg_policy[pol->plid] = NULL;
-out_unlock:
+
         mutex_unlock(&blkcg_pol_mutex);
+out_unlock:
+       mutex_unlock(&blkcg_pol_register_mutex);
  }
  EXPORT_SYMBOL_GPL(blkcg_policy_unregister);