Merge branch 'for-4.3/blkcg' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 11 Sep 2015 01:56:14 +0000 (18:56 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 11 Sep 2015 01:56:14 +0000 (18:56 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 11 Sep 2015 01:56:14 +0000 (18:56 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 11 Sep 2015 01:56:14 +0000 (18:56 -0700)
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt

index 68b6a6a..12686be 100644 (file)
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -201,7 +201,7 @@ Proportional weight policy files
           specifies the number of bytes.
  
  - blkio.io_serviced
-       - Number of IOs completed to/from the disk by the group. These
+       - Number of IOs (bio) issued to the disk by the group. These
           are further divided by the type of operation - read or write, sync
           or async. First two fields specify the major and minor number of the
           device, third field specifies the operation type and the fourth field
@@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
        subjected to both the constraints.
  
  - blkio.throttle.io_serviced
-       - Number of IOs (bio) completed to/from the disk by the group (as
-         seen by throttling policy). These are further divided by the type
-         of operation - read or write, sync or async. First two fields specify
-         the major and minor number of the device, third field specifies the
-         operation type and the fourth field specifies the number of IOs.
-
-         blkio.io_serviced does accounting as seen by CFQ and counts are in
-         number of requests (struct request). On the other hand,
-         blkio.throttle.io_serviced counts number of IO in terms of number
-         of bios as seen by throttling policy.  These bios can later be
-         merged by elevator and total number of requests completed can be
-         lesser.
+       - Number of IOs (bio) issued to the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of IOs.
  
  - blkio.throttle.io_service_bytes
         - Number of bytes transferred to/from the disk by the group. These
@@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
           device, third field specifies the operation type and the fourth field
           specifies the number of bytes.
  
-         These numbers should roughly be same as blkio.io_service_bytes as
-         updated by CFQ. The difference between two is that
-         blkio.io_service_bytes will not be updated if CFQ is not operating
-         on request queue.
-
  Common files among various policies
  -----------------------------------
  - blkio.reset_stats
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt

index 1ee9caf..e0975c2 100644 (file)
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -27,7 +27,7 @@ CONTENTS
      5-3-1. Format
      5-3-2. Control Knobs
    5-4. Per-Controller Changes
-    5-4-1. blkio
+    5-4-1. io
      5-4-2. cpuset
      5-4-3. memory
  6. Planned Changes
@@ -203,7 +203,7 @@ other issues.  The mapping from nice level to weight isn't obvious or
  universal, and there are various other knobs which simply aren't
  available for tasks.
  
-The blkio controller implicitly creates a hidden leaf node for each
+The io controller implicitly creates a hidden leaf node for each
  cgroup to host the tasks.  The hidden leaf has its own copies of all
  the knobs with "leaf_" prefixed.  While this allows equivalent control
  over internal tasks, it's with serious drawbacks.  It always adds an
@@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified.
  
  5-4. Per-Controller Changes
  
-5-4-1. blkio
+5-4-1. io
  
-- blk-throttle becomes properly hierarchical.
+- blkio is renamed to io.  The interface is overhauled anyway.  The
+  new name is more in line with the other two major controllers, cpu
+  and memory, and better suited given that it may be used for cgroup
+  writeback without involving block layer.
+
+- Everything including stat is always hierarchical making separate
+  recursive stat files pointless and, as no internal node can have
+  tasks, leaf weights are meaningless.  The operation model is
+  simplified and the interface is overhauled accordingly.
+
+  io.stat
+
+       The stat file.  The reported stats are from the point where
+       bio's are issued to request_queue.  The stats are counted
+       independent of which policies are enabled.  Each line in the
+       file follows the following format.  More fields may later be
+       added at the end.
+
+         $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS
+
+  io.weight
+
+       The weight setting, currently only available and effective if
+       cfq-iosched is in use for the target device.  The weight is
+       between 1 and 10000 and defaults to 100.  The first line
+       always contains the default weight in the following format to
+       use when per-device setting is missing.
+
+         default $WEIGHT
+
+       Subsequent lines list per-device weights of the following
+       format.
+
+         $MAJ:$MIN $WEIGHT
+
+       Writing "$WEIGHT" or "default $WEIGHT" changes the default
+       setting.  Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
+       while "$MAJ:$MIN default" clears it.
+
+       This file is available only on non-root cgroups.
+
+  io.max
+
+       The maximum bandwidth and/or iops setting, only available if
+       blk-throttle is enabled.  The file is of the following format.
+
+         $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
+
+       ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
+       read/write IOs per second.  "max" indicates no limit.  Writing
+       to the file follows the same format but the individual
+       settings may be omitted or specified in any order.
+
+       This file is available only on non-root cgroups.
  
  
  5-4-2. cpuset
diff --git a/block/bio.c b/block/bio.c

index 515b543..ad3f276 100644 (file)
--- a/block/bio.c
+++ b/block/bio.c
@@ -1990,7 +1990,7 @@ int bio_associate_current(struct bio *bio)
  
         get_io_context_active(ioc);
         bio->bi_ioc = ioc;
-       bio->bi_css = task_get_css(current, blkio_cgrp_id);
+       bio->bi_css = task_get_css(current, io_cgrp_id);
         return 0;
  }
  EXPORT_SYMBOL_GPL(bio_associate_current);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c

index d6283b3..ac8370c 100644 (file)
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -24,6 +24,7 @@
  #include <linux/genhd.h>
  #include <linux/delay.h>
  #include <linux/atomic.h>
+#include <linux/ctype.h>
  #include <linux/blk-cgroup.h>
  #include "blk.h"
  
@@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg)
                 return;
  
         for (i = 0; i < BLKCG_MAX_POLS; i++)
-               kfree(blkg->pd[i]);
+               if (blkg->pd[i])
+                       blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
  
-       blk_exit_rl(&blkg->rl);
+       if (blkg->blkcg != &blkcg_root)
+               blk_exit_rl(&blkg->rl);
+
+       blkg_rwstat_exit(&blkg->stat_ios);
+       blkg_rwstat_exit(&blkg->stat_bytes);
         kfree(blkg);
  }
  
@@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
         if (!blkg)
                 return NULL;
  
+       if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
+           blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+               goto err_free;
+
         blkg->q = q;
         INIT_LIST_HEAD(&blkg->q_node);
         blkg->blkcg = blkcg;
@@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                         continue;
  
                 /* alloc per-policy data and attach it to blkg */
-               pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+               pd = pol->pd_alloc_fn(gfp_mask, q->node);
                 if (!pd)
                         goto err_free;
  
@@ -129,26 +139,11 @@ err_free:
         return NULL;
  }
  
-/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state.  If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint)
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+                                     struct request_queue *q, bool update_hint)
  {
         struct blkcg_gq *blkg;
  
-       blkg = rcu_dereference(blkcg->blkg_hint);
-       if (blkg && blkg->q == q)
-               return blkg;
-
         /*
          * Hint didn't match.  Look up from the radix tree.  Note that the
          * hint can only be updated under queue_lock as otherwise @blkg
@@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
  
         return NULL;
  }
-
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair.  This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
- */
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
-{
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       if (unlikely(blk_queue_bypass(q)))
-               return NULL;
-       return __blkg_lookup(blkcg, q, false);
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
+EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
  
  /*
   * If @new_blkg is %NULL, this function tries to allocate a new one as
- * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
+ * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
   */
  static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                     struct request_queue *q,
@@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
  
         /* blkg holds a reference to blkcg */
         if (!css_tryget_online(&blkcg->css)) {
-               ret = -EINVAL;
+               ret = -ENODEV;
                 goto err_free_blkg;
         }
  
         wb_congested = wb_congested_get_create(&q->backing_dev_info,
-                                              blkcg->css.id, GFP_ATOMIC);
+                                              blkcg->css.id, GFP_NOWAIT);
         if (!wb_congested) {
                 ret = -ENOMEM;
                 goto err_put_css;
@@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
  
         /* allocate */
         if (!new_blkg) {
-               new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
+               new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
                 if (unlikely(!new_blkg)) {
                         ret = -ENOMEM;
                         goto err_put_congested;
@@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
         if (blkcg_parent(blkcg)) {
                 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                 if (WARN_ON_ONCE(!blkg->parent)) {
-                       ret = -EINVAL;
+                       ret = -ENODEV;
                         goto err_put_congested;
                 }
                 blkg_get(blkg->parent);
@@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                 struct blkcg_policy *pol = blkcg_policy[i];
  
                 if (blkg->pd[i] && pol->pd_init_fn)
-                       pol->pd_init_fn(blkg);
+                       pol->pd_init_fn(blkg->pd[i]);
         }
  
         /* insert */
@@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                         struct blkcg_policy *pol = blkcg_policy[i];
  
                         if (blkg->pd[i] && pol->pd_online_fn)
-                               pol->pd_online_fn(blkg);
+                               pol->pd_online_fn(blkg->pd[i]);
                 }
         }
         blkg->online = true;
@@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
          * we shouldn't allow anything to go through for a bypassing queue.
          */
         if (unlikely(blk_queue_bypass(q)))
-               return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
+               return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
  
         blkg = __blkg_lookup(blkcg, q, true);
         if (blkg)
@@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                         return blkg;
         }
  }
-EXPORT_SYMBOL_GPL(blkg_lookup_create);
  
  static void blkg_destroy(struct blkcg_gq *blkg)
  {
         struct blkcg *blkcg = blkg->blkcg;
+       struct blkcg_gq *parent = blkg->parent;
         int i;
  
         lockdep_assert_held(blkg->q->queue_lock);
@@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
                 struct blkcg_policy *pol = blkcg_policy[i];
  
                 if (blkg->pd[i] && pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
+                       pol->pd_offline_fn(blkg->pd[i]);
+       }
+
+       if (parent) {
+               blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
+               blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
         }
+
         blkg->online = false;
  
         radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -400,15 +383,6 @@ static void blkg_destroy_all(struct request_queue *q)
  void __blkg_release_rcu(struct rcu_head *rcu_head)
  {
         struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
-       int i;
-
-       /* tell policies that this one is being freed */
-       for (i = 0; i < BLKCG_MAX_POLS; i++) {
-               struct blkcg_policy *pol = blkcg_policy[i];
-
-               if (blkg->pd[i] && pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-       }
  
         /* release the blkcg and parent blkg refs this blkg has been holding */
         css_put(&blkg->blkcg->css);
@@ -472,12 +446,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
          * anyway.  If you get hit by a race, retry.
          */
         hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+               blkg_rwstat_reset(&blkg->stat_bytes);
+               blkg_rwstat_reset(&blkg->stat_ios);
+
                 for (i = 0; i < BLKCG_MAX_POLS; i++) {
                         struct blkcg_policy *pol = blkcg_policy[i];
  
-                       if (blkcg_policy_enabled(blkg->q, pol) &&
-                           pol->pd_reset_stats_fn)
-                               pol->pd_reset_stats_fn(blkg);
+                       if (blkg->pd[i] && pol->pd_reset_stats_fn)
+                               pol->pd_reset_stats_fn(blkg->pd[i]);
                 }
         }
  
@@ -486,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
         return 0;
  }
  
-static const char *blkg_dev_name(struct blkcg_gq *blkg)
+const char *blkg_dev_name(struct blkcg_gq *blkg)
  {
         /* some drivers (floppy) instantiate a queue w/o disk registered */
         if (blkg->q->backing_dev_info.dev)
                 return dev_name(blkg->q->backing_dev_info.dev);
         return NULL;
  }
+EXPORT_SYMBOL_GPL(blkg_dev_name);
  
  /**
   * blkcg_print_blkgs - helper for printing per-blkg data
@@ -581,9 +558,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
  
         for (i = 0; i < BLKG_RWSTAT_NR; i++)
                 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
-                          (unsigned long long)rwstat->cnt[i]);
+                          (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
  
-       v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+       v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
         seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
         return v;
  }
@@ -620,31 +598,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
  }
  EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
  
+static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
+                                   struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_bytes.
+ * cftype->private must be set to the blkcg_policy.
+ */
+int blkg_print_stat_bytes(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
+
+/**
+ * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
+ * must be set to the blkcg_policy.
+ */
+int blkg_print_stat_ios(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
+
+static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
+                                             struct blkg_policy_data *pd,
+                                             int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
+                                                             NULL, off);
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
+
+/**
+ * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
+
  /**
   * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
   *
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
   */
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+                           struct blkcg_policy *pol, int off)
  {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
         struct blkcg_gq *pos_blkg;
         struct cgroup_subsys_state *pos_css;
         u64 sum = 0;
  
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
  
         rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_stat *stat = (void *)pos_pd + off;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_stat *stat;
+
+               if (!pos_blkg->online)
+                       continue;
+
+               if (pol)
+                       stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       stat = (void *)blkg + off;
  
-               if (pos_blkg->online)
-                       sum += blkg_stat_read(stat);
+               sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
         }
         rcu_read_unlock();
  
@@ -654,37 +723,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
  
  /**
   * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
   *
- * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
   */
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off)
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+                                            struct blkcg_policy *pol, int off)
  {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
         struct blkcg_gq *pos_blkg;
         struct cgroup_subsys_state *pos_css;
         struct blkg_rwstat sum = { };
         int i;
  
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
  
         rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_rwstat *rwstat = (void *)pos_pd + off;
-               struct blkg_rwstat tmp;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_rwstat *rwstat;
  
                 if (!pos_blkg->online)
                         continue;
  
-               tmp = blkg_rwstat_read(rwstat);
+               if (pol)
+                       rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       rwstat = (void *)pos_blkg + off;
  
                 for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       sum.cnt[i] += tmp.cnt[i];
+                       atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
+                               percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
+                               &sum.aux_cnt[i]);
         }
         rcu_read_unlock();
  
@@ -700,29 +775,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
   * @ctx: blkg_conf_ctx to be filled
   *
   * Parse per-blkg config update from @input and initialize @ctx with the
- * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
- * value.  This function returns with RCU read lock and queue lock held and
- * must be paired with blkg_conf_finish().
+ * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
+ * part of @input following MAJ:MIN.  This function returns with RCU read
+ * lock and queue lock held and must be paired with blkg_conf_finish().
   */
  int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-                  const char *input, struct blkg_conf_ctx *ctx)
+                  char *input, struct blkg_conf_ctx *ctx)
         __acquires(rcu) __acquires(disk->queue->queue_lock)
  {
         struct gendisk *disk;
         struct blkcg_gq *blkg;
         unsigned int major, minor;
-       unsigned long long v;
-       int part, ret;
+       int key_len, part, ret;
+       char *body;
  
-       if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
+       if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                 return -EINVAL;
  
+       body = input + key_len;
+       if (!isspace(*body))
+               return -EINVAL;
+       body = skip_spaces(body);
+
         disk = get_gendisk(MKDEV(major, minor), &part);
         if (!disk)
-               return -EINVAL;
+               return -ENODEV;
         if (part) {
                 put_disk(disk);
-               return -EINVAL;
+               return -ENODEV;
         }
  
         rcu_read_lock();
@@ -731,7 +811,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
         if (blkcg_policy_enabled(disk->queue, pol))
                 blkg = blkg_lookup_create(blkcg, disk->queue);
         else
-               blkg = ERR_PTR(-EINVAL);
+               blkg = ERR_PTR(-EOPNOTSUPP);
  
         if (IS_ERR(blkg)) {
                 ret = PTR_ERR(blkg);
@@ -753,7 +833,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
  
         ctx->disk = disk;
         ctx->blkg = blkg;
-       ctx->v = v;
+       ctx->body = body;
         return 0;
  }
  EXPORT_SYMBOL_GPL(blkg_conf_prep);
@@ -774,7 +854,54 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
  }
  EXPORT_SYMBOL_GPL(blkg_conf_finish);
  
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+       struct blkcg_gq *blkg;
+
+       rcu_read_lock();
+
+       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+               const char *dname;
+               struct blkg_rwstat rwstat;
+               u64 rbytes, wbytes, rios, wios;
+
+               dname = blkg_dev_name(blkg);
+               if (!dname)
+                       continue;
+
+               spin_lock_irq(blkg->q->queue_lock);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_bytes));
+               rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_ios));
+               rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               spin_unlock_irq(blkg->q->queue_lock);
+
+               if (rbytes || wbytes || rios || wios)
+                       seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
+                                  dname, rbytes, wbytes, rios, wios);
+       }
+
+       rcu_read_unlock();
+       return 0;
+}
+
  struct cftype blkcg_files[] = {
+       {
+               .name = "stat",
+               .seq_show = blkcg_print_stat,
+       },
+       { }     /* terminate */
+};
+
+struct cftype blkcg_legacy_files[] = {
         {
                 .name = "reset_stats",
                 .write_u64 = blkcg_reset_stats,
@@ -822,18 +949,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
  static void blkcg_css_free(struct cgroup_subsys_state *css)
  {
         struct blkcg *blkcg = css_to_blkcg(css);
+       int i;
  
         mutex_lock(&blkcg_pol_mutex);
+
         list_del(&blkcg->all_blkcgs_node);
-       mutex_unlock(&blkcg_pol_mutex);
  
-       if (blkcg != &blkcg_root) {
-               int i;
+       for (i = 0; i < BLKCG_MAX_POLS; i++)
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
  
-               for (i = 0; i < BLKCG_MAX_POLS; i++)
-                       kfree(blkcg->pd[i]);
-               kfree(blkcg);
-       }
+       mutex_unlock(&blkcg_pol_mutex);
+
+       kfree(blkcg);
  }
  
  static struct cgroup_subsys_state *
@@ -847,13 +975,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
  
         if (!parent_css) {
                 blkcg = &blkcg_root;
-               goto done;
-       }
-
-       blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-       if (!blkcg) {
-               ret = ERR_PTR(-ENOMEM);
-               goto free_blkcg;
+       } else {
+               blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+               if (!blkcg) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto free_blkcg;
+               }
         }
  
         for (i = 0; i < BLKCG_MAX_POLS ; i++) {
@@ -866,23 +993,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                  * check if the policy requires any specific per-cgroup
                  * data: if it does, allocate and initialize it.
                  */
-               if (!pol || !pol->cpd_size)
+               if (!pol || !pol->cpd_alloc_fn)
                         continue;
  
-               BUG_ON(blkcg->pd[i]);
-               cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+               cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                 if (!cpd) {
                         ret = ERR_PTR(-ENOMEM);
                         goto free_pd_blkcg;
                 }
-               blkcg->pd[i] = cpd;
+               blkcg->cpd[i] = cpd;
+               cpd->blkcg = blkcg;
                 cpd->plid = i;
-               pol->cpd_init_fn(blkcg);
+               if (pol->cpd_init_fn)
+                       pol->cpd_init_fn(cpd);
         }
  
-done:
         spin_lock_init(&blkcg->lock);
-       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
+       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
         INIT_HLIST_HEAD(&blkcg->blkg_list);
  #ifdef CONFIG_CGROUP_WRITEBACK
         INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -894,7 +1021,8 @@ done:
  
  free_pd_blkcg:
         for (i--; i >= 0; i--)
-               kfree(blkcg->pd[i]);
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
  free_blkcg:
         kfree(blkcg);
         mutex_unlock(&blkcg_pol_mutex);
@@ -938,7 +1066,7 @@ int blkcg_init_queue(struct request_queue *q)
                 radix_tree_preload_end();
  
         if (IS_ERR(blkg)) {
-               kfree(new_blkg);
+               blkg_free(new_blkg);
                 return PTR_ERR(blkg);
         }
  
@@ -1015,12 +1143,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
         return ret;
  }
  
-struct cgroup_subsys blkio_cgrp_subsys = {
+static void blkcg_bind(struct cgroup_subsys_state *root_css)
+{
+       int i;
+
+       mutex_lock(&blkcg_pol_mutex);
+
+       for (i = 0; i < BLKCG_MAX_POLS; i++) {
+               struct blkcg_policy *pol = blkcg_policy[i];
+               struct blkcg *blkcg;
+
+               if (!pol || !pol->cpd_bind_fn)
+                       continue;
+
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
+                       if (blkcg->cpd[pol->plid])
+                               pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
+       }
+       mutex_unlock(&blkcg_pol_mutex);
+}
+
+struct cgroup_subsys io_cgrp_subsys = {
         .css_alloc = blkcg_css_alloc,
         .css_offline = blkcg_css_offline,
         .css_free = blkcg_css_free,
         .can_attach = blkcg_can_attach,
-       .legacy_cftypes = blkcg_files,
+       .bind = blkcg_bind,
+       .dfl_cftypes = blkcg_files,
+       .legacy_cftypes = blkcg_legacy_files,
+       .legacy_name = "blkio",
  #ifdef CONFIG_MEMCG
         /*
          * This ensures that, if available, memcg is automatically enabled
@@ -1030,7 +1181,7 @@ struct cgroup_subsys blkio_cgrp_subsys = {
         .depends_on = 1 << memory_cgrp_id,
  #endif
  };
-EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
+EXPORT_SYMBOL_GPL(io_cgrp_subsys);
  
  /**
   * blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1051,65 +1202,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
  int blkcg_activate_policy(struct request_queue *q,
                           const struct blkcg_policy *pol)
  {
-       LIST_HEAD(pds);
+       struct blkg_policy_data *pd_prealloc = NULL;
         struct blkcg_gq *blkg;
-       struct blkg_policy_data *pd, *nd;
-       int cnt = 0, ret;
+       int ret;
  
         if (blkcg_policy_enabled(q, pol))
                 return 0;
  
-       /* count and allocate policy_data for all existing blkgs */
         blk_queue_bypass_start(q);
-       spin_lock_irq(q->queue_lock);
-       list_for_each_entry(blkg, &q->blkg_list, q_node)
-               cnt++;
-       spin_unlock_irq(q->queue_lock);
-
-       /* allocate per-blkg policy data for all existing blkgs */
-       while (cnt--) {
-               pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
-               if (!pd) {
+pd_prealloc:
+       if (!pd_prealloc) {
+               pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
+               if (!pd_prealloc) {
                         ret = -ENOMEM;
-                       goto out_free;
+                       goto out_bypass_end;
                 }
-               list_add_tail(&pd->alloc_node, &pds);
         }
  
-       /*
-        * Install the allocated pds and cpds. With @q bypassing, no new blkg
-        * should have been created while the queue lock was dropped.
-        */
         spin_lock_irq(q->queue_lock);
  
         list_for_each_entry(blkg, &q->blkg_list, q_node) {
-               if (WARN_ON(list_empty(&pds))) {
-                       /* umm... this shouldn't happen, just abort */
-                       ret = -ENOMEM;
-                       goto out_unlock;
-               }
-               pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
-               list_del_init(&pd->alloc_node);
+               struct blkg_policy_data *pd;
  
-               /* grab blkcg lock too while installing @pd on @blkg */
-               spin_lock(&blkg->blkcg->lock);
+               if (blkg->pd[pol->plid])
+                       continue;
+
+               pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+               if (!pd)
+                       swap(pd, pd_prealloc);
+               if (!pd) {
+                       spin_unlock_irq(q->queue_lock);
+                       goto pd_prealloc;
+               }
  
                 blkg->pd[pol->plid] = pd;
                 pd->blkg = blkg;
                 pd->plid = pol->plid;
-               pol->pd_init_fn(blkg);
-
-               spin_unlock(&blkg->blkcg->lock);
+               if (pol->pd_init_fn)
+                       pol->pd_init_fn(pd);
         }
  
         __set_bit(pol->plid, q->blkcg_pols);
         ret = 0;
-out_unlock:
+
         spin_unlock_irq(q->queue_lock);
-out_free:
+out_bypass_end:
         blk_queue_bypass_end(q);
-       list_for_each_entry_safe(pd, nd, &pds, alloc_node)
-               kfree(pd);
+       if (pd_prealloc)
+               pol->pd_free_fn(pd_prealloc);
         return ret;
  }
  EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1139,13 +1279,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
                 /* grab blkcg lock too while removing @pd from @blkg */
                 spin_lock(&blkg->blkcg->lock);
  
-               if (pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
-               if (pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-
-               kfree(blkg->pd[pol->plid]);
-               blkg->pd[pol->plid] = NULL;
+               if (blkg->pd[pol->plid]) {
+                       if (pol->pd_offline_fn)
+                               pol->pd_offline_fn(blkg->pd[pol->plid]);
+                       pol->pd_free_fn(blkg->pd[pol->plid]);
+                       blkg->pd[pol->plid] = NULL;
+               }
  
                 spin_unlock(&blkg->blkcg->lock);
         }
@@ -1167,9 +1306,6 @@ int blkcg_policy_register(struct blkcg_policy *pol)
         struct blkcg *blkcg;
         int i, ret;
  
-       if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
-               return -EINVAL;
-
         mutex_lock(&blkcg_pol_register_mutex);
         mutex_lock(&blkcg_pol_mutex);
  
@@ -1186,36 +1322,42 @@ int blkcg_policy_register(struct blkcg_policy *pol)
         blkcg_policy[pol->plid] = pol;
  
         /* allocate and install cpd's */
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                         struct blkcg_policy_data *cpd;
  
-                       cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+                       cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                         if (!cpd) {
                                 mutex_unlock(&blkcg_pol_mutex);
                                 goto err_free_cpds;
                         }
  
-                       blkcg->pd[pol->plid] = cpd;
+                       blkcg->cpd[pol->plid] = cpd;
+                       cpd->blkcg = blkcg;
                         cpd->plid = pol->plid;
-                       pol->cpd_init_fn(blkcg);
+                       pol->cpd_init_fn(cpd);
                 }
         }
  
         mutex_unlock(&blkcg_pol_mutex);
  
         /* everything is in place, add intf files for the new policy */
-       if (pol->cftypes)
-               WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
-                                                 pol->cftypes));
+       if (pol->dfl_cftypes)
+               WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
+                                              pol->dfl_cftypes));
+       if (pol->legacy_cftypes)
+               WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
+                                                 pol->legacy_cftypes));
         mutex_unlock(&blkcg_pol_register_mutex);
         return 0;
  
  err_free_cpds:
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-                       kfree(blkcg->pd[pol->plid]);
-                       blkcg->pd[pol->plid] = NULL;
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
                 }
         }
         blkcg_policy[pol->plid] = NULL;
@@ -1242,16 +1384,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
                 goto out_unlock;
  
         /* kill the intf files first */
-       if (pol->cftypes)
-               cgroup_rm_cftypes(pol->cftypes);
+       if (pol->dfl_cftypes)
+               cgroup_rm_cftypes(pol->dfl_cftypes);
+       if (pol->legacy_cftypes)
+               cgroup_rm_cftypes(pol->legacy_cftypes);
  
         /* remove cpds and unregister */
         mutex_lock(&blkcg_pol_mutex);
  
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-                       kfree(blkcg->pd[pol->plid]);
-                       blkcg->pd[pol->plid] = NULL;
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
                 }
         }
         blkcg_policy[pol->plid] = NULL;
diff --git a/block/blk-core.c b/block/blk-core.c

index 60912e9..2eb722d 100644 (file)
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1888,8 +1888,8 @@ generic_make_request_checks(struct bio *bio)
          */
         create_io_context(GFP_ATOMIC, q->node);
  
-       if (blk_throtl_bio(q, bio))
-               return false;   /* throttled, will be resubmitted later */
+       if (!blkcg_bio_issue_check(q, bio))
+               return false;
  
         trace_block_bio_queue(q, bio);
         return true;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c

index b231935..c75a263 100644 (file)
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -83,14 +83,6 @@ enum tg_state_flags {
  
  #define rb_entry_tg(node)      rb_entry((node), struct throtl_grp, rb_node)
  
-/* Per-cpu group stats */
-struct tg_stats_cpu {
-       /* total bytes transferred */
-       struct blkg_rwstat              service_bytes;
-       /* total IOs serviced, post merge */
-       struct blkg_rwstat              serviced;
-};
-
  struct throtl_grp {
         /* must be the first member */
         struct blkg_policy_data pd;
@@ -141,12 +133,6 @@ struct throtl_grp {
         /* When did we start a new slice */
         unsigned long slice_start[2];
         unsigned long slice_end[2];
-
-       /* Per cpu stats pointer */
-       struct tg_stats_cpu __percpu *stats_cpu;
-
-       /* List of tgs waiting for per cpu stats memory to be allocated */
-       struct list_head stats_alloc_node;
  };
  
  struct throtl_data
@@ -168,13 +154,6 @@ struct throtl_data
         struct work_struct dispatch_work;
  };
  
-/* list and work item to allocate percpu group stats */
-static DEFINE_SPINLOCK(tg_stats_alloc_lock);
-static LIST_HEAD(tg_stats_alloc_list);
-
-static void tg_stats_alloc_fn(struct work_struct *);
-static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
-
  static void throtl_pending_timer_fn(unsigned long arg);
  
  static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
         return pd_to_blkg(&tg->pd);
  }
  
-static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
-{
-       return blkg_to_tg(td->queue->root_blkg);
-}
-
  /**
   * sq_to_tg - return the throl_grp the specified service queue belongs to
   * @sq: the throtl_service_queue of interest
@@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
         }                                                               \
  } while (0)
  
-static void tg_stats_init(struct tg_stats_cpu *tg_stats)
-{
-       blkg_rwstat_init(&tg_stats->service_bytes);
-       blkg_rwstat_init(&tg_stats->serviced);
-}
-
-/*
- * Worker for allocating per cpu stat for tgs. This is scheduled on the
- * system_wq once there are some groups on the alloc_list waiting for
- * allocation.
- */
-static void tg_stats_alloc_fn(struct work_struct *work)
-{
-       static struct tg_stats_cpu *stats_cpu;  /* this fn is non-reentrant */
-       struct delayed_work *dwork = to_delayed_work(work);
-       bool empty = false;
-
-alloc_stats:
-       if (!stats_cpu) {
-               int cpu;
-
-               stats_cpu = alloc_percpu(struct tg_stats_cpu);
-               if (!stats_cpu) {
-                       /* allocation failed, try again after some time */
-                       schedule_delayed_work(dwork, msecs_to_jiffies(10));
-                       return;
-               }
-               for_each_possible_cpu(cpu)
-                       tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
-       }
-
-       spin_lock_irq(&tg_stats_alloc_lock);
-
-       if (!list_empty(&tg_stats_alloc_list)) {
-               struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
-                                                        struct throtl_grp,
-                                                        stats_alloc_node);
-               swap(tg->stats_cpu, stats_cpu);
-               list_del_init(&tg->stats_alloc_node);
-       }
-
-       empty = list_empty(&tg_stats_alloc_list);
-       spin_unlock_irq(&tg_stats_alloc_lock);
-       if (!empty)
-               goto alloc_stats;
-}
-
  static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
  {
         INIT_LIST_HEAD(&qn->node);
@@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
  }
  
  /* init a service_queue, assumes the caller zeroed it */
-static void throtl_service_queue_init(struct throtl_service_queue *sq,
-                                     struct throtl_service_queue *parent_sq)
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
  {
         INIT_LIST_HEAD(&sq->queued[0]);
         INIT_LIST_HEAD(&sq->queued[1]);
         sq->pending_tree = RB_ROOT;
-       sq->parent_sq = parent_sq;
         setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
                     (unsigned long)sq);
  }
  
-static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
  {
-       del_timer_sync(&sq->pending_timer);
+       struct throtl_grp *tg;
+       int rw;
+
+       tg = kzalloc_node(sizeof(*tg), gfp, node);
+       if (!tg)
+               return NULL;
+
+       throtl_service_queue_init(&tg->service_queue);
+
+       for (rw = READ; rw <= WRITE; rw++) {
+               throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+               throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+       }
+
+       RB_CLEAR_NODE(&tg->rb_node);
+       tg->bps[READ] = -1;
+       tg->bps[WRITE] = -1;
+       tg->iops[READ] = -1;
+       tg->iops[WRITE] = -1;
+
+       return &tg->pd;
  }
  
-static void throtl_pd_init(struct blkcg_gq *blkg)
+static void throtl_pd_init(struct blkg_policy_data *pd)
  {
-       struct throtl_grp *tg = blkg_to_tg(blkg);
+       struct throtl_grp *tg = pd_to_tg(pd);
+       struct blkcg_gq *blkg = tg_to_blkg(tg);
         struct throtl_data *td = blkg->q->td;
-       struct throtl_service_queue *parent_sq;
-       unsigned long flags;
-       int rw;
+       struct throtl_service_queue *sq = &tg->service_queue;
  
         /*
          * If on the default hierarchy, we switch to properly hierarchical
@@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
          * Limits of a group don't interact with limits of other groups
          * regardless of the position of the group in the hierarchy.
          */
-       parent_sq = &td->service_queue;
-
+       sq->parent_sq = &td->service_queue;
         if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
-               parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
-
-       throtl_service_queue_init(&tg->service_queue, parent_sq);
-
-       for (rw = READ; rw <= WRITE; rw++) {
-               throtl_qnode_init(&tg->qnode_on_self[rw], tg);
-               throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
-       }
-
-       RB_CLEAR_NODE(&tg->rb_node);
+               sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
         tg->td = td;
-
-       tg->bps[READ] = -1;
-       tg->bps[WRITE] = -1;
-       tg->iops[READ] = -1;
-       tg->iops[WRITE] = -1;
-
-       /*
-        * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
-        * but percpu allocator can't be called from IO path.  Queue tg on
-        * tg_stats_alloc_list and allocate from work item.
-        */
-       spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-       list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
-       schedule_delayed_work(&tg_stats_alloc_work, 0);
-       spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
  }
  
  /*
@@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg)
                                     (tg->bps[rw] != -1 || tg->iops[rw] != -1);
  }
  
-static void throtl_pd_online(struct blkcg_gq *blkg)
+static void throtl_pd_online(struct blkg_policy_data *pd)
  {
         /*
          * We don't want new groups to escape the limits of its ancestors.
          * Update has_rules[] after a new group is brought online.
          */
-       tg_update_has_rules(blkg_to_tg(blkg));
-}
-
-static void throtl_pd_exit(struct blkcg_gq *blkg)
-{
-       struct throtl_grp *tg = blkg_to_tg(blkg);
-       unsigned long flags;
-
-       spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-       list_del_init(&tg->stats_alloc_node);
-       spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
-
-       free_percpu(tg->stats_cpu);
-
-       throtl_service_queue_exit(&tg->service_queue);
-}
-
-static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
-{
-       struct throtl_grp *tg = blkg_to_tg(blkg);
-       int cpu;
-
-       if (tg->stats_cpu == NULL)
-               return;
-
-       for_each_possible_cpu(cpu) {
-               struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-               blkg_rwstat_reset(&sc->service_bytes);
-               blkg_rwstat_reset(&sc->serviced);
-       }
-}
-
-static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
-                                          struct blkcg *blkcg)
-{
-       /*
-        * This is the common case when there are no blkcgs.  Avoid lookup
-        * in this case
-        */
-       if (blkcg == &blkcg_root)
-               return td_root_tg(td);
-
-       return blkg_to_tg(blkg_lookup(blkcg, td->queue));
+       tg_update_has_rules(pd_to_tg(pd));
  }
  
-static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
-                                                 struct blkcg *blkcg)
+static void throtl_pd_free(struct blkg_policy_data *pd)
  {
-       struct request_queue *q = td->queue;
-       struct throtl_grp *tg = NULL;
-
-       /*
-        * This is the common case when there are no blkcgs.  Avoid lookup
-        * in this case
-        */
-       if (blkcg == &blkcg_root) {
-               tg = td_root_tg(td);
-       } else {
-               struct blkcg_gq *blkg;
-
-               blkg = blkg_lookup_create(blkcg, q);
-
-               /* if %NULL and @q is alive, fall back to root_tg */
-               if (!IS_ERR(blkg))
-                       tg = blkg_to_tg(blkg);
-               else if (!blk_queue_dying(q))
-                       tg = td_root_tg(td);
-       }
+       struct throtl_grp *tg = pd_to_tg(pd);
  
-       return tg;
+       del_timer_sync(&tg->service_queue.pending_timer);
+       kfree(tg);
  }
  
  static struct throtl_grp *
@@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
         return 0;
  }
  
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
-                                        int rw)
-{
-       struct throtl_grp *tg = blkg_to_tg(blkg);
-       struct tg_stats_cpu *stats_cpu;
-       unsigned long flags;
-
-       /* If per cpu stats are not allocated yet, don't do any accounting. */
-       if (tg->stats_cpu == NULL)
-               return;
-
-       /*
-        * Disabling interrupts to provide mutual exclusion between two
-        * writes on same cpu. It probably is not needed for 64bit. Not
-        * optimizing that case yet.
-        */
-       local_irq_save(flags);
-
-       stats_cpu = this_cpu_ptr(tg->stats_cpu);
-
-       blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
-       blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
-
-       local_irq_restore(flags);
-}
-
  static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
  {
         bool rw = bio_data_dir(bio);
@@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
          * more than once as a throttled bio will go through blk-throtl the
          * second time when it eventually gets issued.  Set it when a bio
          * is being charged to a tg.
-        *
-        * Dispatch stats aren't recursive and each @bio should only be
-        * accounted by the @tg it was originally associated with.  Let's
-        * update the stats when setting REQ_THROTTLED for the first time
-        * which is guaranteed to be for the @bio's original tg.
          */
-       if (!(bio->bi_rw & REQ_THROTTLED)) {
+       if (!(bio->bi_rw & REQ_THROTTLED))
                 bio->bi_rw |= REQ_THROTTLED;
-               throtl_update_dispatch_stats(tg_to_blkg(tg),
-                                            bio->bi_iter.bi_size, bio->bi_rw);
-       }
  }
  
  /**
@@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
         }
  }
  
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
-                               struct blkg_policy_data *pd, int off)
-{
-       struct throtl_grp *tg = pd_to_tg(pd);
-       struct blkg_rwstat rwstat = { }, tmp;
-       int i, cpu;
-
-       if (tg->stats_cpu == NULL)
-               return 0;
-
-       for_each_possible_cpu(cpu) {
-               struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-               tmp = blkg_rwstat_read((void *)sc + off);
-               for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       rwstat.cnt[i] += tmp.cnt[i];
-       }
-
-       return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
-{
-       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
-                         &blkcg_policy_throtl, seq_cft(sf)->private, true);
-       return 0;
-}
-
  static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
                               int off)
  {
@@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
         return 0;
  }
  
-static ssize_t tg_set_conf(struct kernfs_open_file *of,
-                          char *buf, size_t nbytes, loff_t off, bool is_u64)
+static void tg_conf_updated(struct throtl_grp *tg)
  {
-       struct blkcg *blkcg = css_to_blkcg(of_css(of));
-       struct blkg_conf_ctx ctx;
-       struct throtl_grp *tg;
-       struct throtl_service_queue *sq;
-       struct blkcg_gq *blkg;
+       struct throtl_service_queue *sq = &tg->service_queue;
         struct cgroup_subsys_state *pos_css;
-       int ret;
-
-       ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
-       if (ret)
-               return ret;
-
-       tg = blkg_to_tg(ctx.blkg);
-       sq = &tg->service_queue;
-
-       if (!ctx.v)
-               ctx.v = -1;
-
-       if (is_u64)
-               *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
-       else
-               *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
+       struct blkcg_gq *blkg;
  
         throtl_log(&tg->service_queue,
                    "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
          * restrictions in the whole hierarchy and allows them to bypass
          * blk-throttle.
          */
-       blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
+       blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
                 tg_update_has_rules(blkg_to_tg(blkg));
  
         /*
@@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
                 tg_update_disptime(tg);
                 throtl_schedule_next_dispatch(sq->parent_sq, true);
         }
+}
+
+static ssize_t tg_set_conf(struct kernfs_open_file *of,
+                          char *buf, size_t nbytes, loff_t off, bool is_u64)
+{
+       struct blkcg *blkcg = css_to_blkcg(of_css(of));
+       struct blkg_conf_ctx ctx;
+       struct throtl_grp *tg;
+       int ret;
+       u64 v;
  
+       ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+       if (ret)
+               return ret;
+
+       ret = -EINVAL;
+       if (sscanf(ctx.body, "%llu", &v) != 1)
+               goto out_finish;
+       if (!v)
+               v = -1;
+
+       tg = blkg_to_tg(ctx.blkg);
+
+       if (is_u64)
+               *(u64 *)((void *)tg + of_cft(of)->private) = v;
+       else
+               *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
+
+       tg_conf_updated(tg);
+       ret = 0;
+out_finish:
         blkg_conf_finish(&ctx);
-       return nbytes;
+       return ret ?: nbytes;
  }
  
  static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
@@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
         return tg_set_conf(of, buf, nbytes, off, false);
  }
  
-static struct cftype throtl_files[] = {
+static struct cftype throtl_legacy_files[] = {
         {
                 .name = "throttle.read_bps_device",
                 .private = offsetof(struct throtl_grp, bps[READ]),
@@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = {
         },
         {
                 .name = "throttle.io_service_bytes",
-               .private = offsetof(struct tg_stats_cpu, service_bytes),
-               .seq_show = tg_print_cpu_rwstat,
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_bytes,
         },
         {
                 .name = "throttle.io_serviced",
-               .private = offsetof(struct tg_stats_cpu, serviced),
-               .seq_show = tg_print_cpu_rwstat,
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_ios,
+       },
+       { }     /* terminate */
+};
+
+static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+                        int off)
+{
+       struct throtl_grp *tg = pd_to_tg(pd);
+       const char *dname = blkg_dev_name(pd->blkg);
+       char bufs[4][21] = { "max", "max", "max", "max" };
+
+       if (!dname)
+               return 0;
+       if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
+           tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+               return 0;
+
+       if (tg->bps[READ] != -1)
+               snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+       if (tg->bps[WRITE] != -1)
+               snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+       if (tg->iops[READ] != -1)
+               snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+       if (tg->iops[WRITE] != -1)
+               snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+       seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+                  dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+       return 0;
+}
+
+static int tg_print_max(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+                         &blkcg_policy_throtl, seq_cft(sf)->private, false);
+       return 0;
+}
+
+static ssize_t tg_set_max(struct kernfs_open_file *of,
+                         char *buf, size_t nbytes, loff_t off)
+{
+       struct blkcg *blkcg = css_to_blkcg(of_css(of));
+       struct blkg_conf_ctx ctx;
+       struct throtl_grp *tg;
+       u64 v[4];
+       int ret;
+
+       ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+       if (ret)
+               return ret;
+
+       tg = blkg_to_tg(ctx.blkg);
+
+       v[0] = tg->bps[READ];
+       v[1] = tg->bps[WRITE];
+       v[2] = tg->iops[READ];
+       v[3] = tg->iops[WRITE];
+
+       while (true) {
+               char tok[27];   /* wiops=18446744073709551616 */
+               char *p;
+               u64 val = -1;
+               int len;
+
+               if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
+                       break;
+               if (tok[0] == '\0')
+                       break;
+               ctx.body += len;
+
+               ret = -EINVAL;
+               p = tok;
+               strsep(&p, "=");
+               if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+                       goto out_finish;
+
+               ret = -ERANGE;
+               if (!val)
+                       goto out_finish;
+
+               ret = -EINVAL;
+               if (!strcmp(tok, "rbps"))
+                       v[0] = val;
+               else if (!strcmp(tok, "wbps"))
+                       v[1] = val;
+               else if (!strcmp(tok, "riops"))
+                       v[2] = min_t(u64, val, UINT_MAX);
+               else if (!strcmp(tok, "wiops"))
+                       v[3] = min_t(u64, val, UINT_MAX);
+               else
+                       goto out_finish;
+       }
+
+       tg->bps[READ] = v[0];
+       tg->bps[WRITE] = v[1];
+       tg->iops[READ] = v[2];
+       tg->iops[WRITE] = v[3];
+
+       tg_conf_updated(tg);
+       ret = 0;
+out_finish:
+       blkg_conf_finish(&ctx);
+       return ret ?: nbytes;
+}
+
+static struct cftype throtl_files[] = {
+       {
+               .name = "max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = tg_print_max,
+               .write = tg_set_max,
         },
         { }     /* terminate */
  };
@@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q)
  }
  
  static struct blkcg_policy blkcg_policy_throtl = {
-       .pd_size                = sizeof(struct throtl_grp),
-       .cftypes                = throtl_files,
+       .dfl_cftypes            = throtl_files,
+       .legacy_cftypes         = throtl_legacy_files,
  
+       .pd_alloc_fn            = throtl_pd_alloc,
         .pd_init_fn             = throtl_pd_init,
         .pd_online_fn           = throtl_pd_online,
-       .pd_exit_fn             = throtl_pd_exit,
-       .pd_reset_stats_fn      = throtl_pd_reset_stats,
+       .pd_free_fn             = throtl_pd_free,
  };
  
-bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
+bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+                   struct bio *bio)
  {
-       struct throtl_data *td = q->td;
         struct throtl_qnode *qn = NULL;
-       struct throtl_grp *tg;
+       struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
         struct throtl_service_queue *sq;
         bool rw = bio_data_dir(bio);
-       struct blkcg *blkcg;
         bool throttled = false;
  
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
         /* see throtl_charge_bio() */
-       if (bio->bi_rw & REQ_THROTTLED)
+       if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
                 goto out;
  
-       /*
-        * A throtl_grp pointer retrieved under rcu can be used to access
-        * basic fields like stats and io rates. If a group has no rules,
-        * just update the dispatch stats in lockless manner and return.
-        */
-       rcu_read_lock();
-       blkcg = bio_blkcg(bio);
-       tg = throtl_lookup_tg(td, blkcg);
-       if (tg) {
-               if (!tg->has_rules[rw]) {
-                       throtl_update_dispatch_stats(tg_to_blkg(tg),
-                                       bio->bi_iter.bi_size, bio->bi_rw);
-                       goto out_unlock_rcu;
-               }
-       }
-
-       /*
-        * Either group has not been allocated yet or it is not an unlimited
-        * IO group
-        */
         spin_lock_irq(q->queue_lock);
-       tg = throtl_lookup_create_tg(td, blkcg);
-       if (unlikely(!tg))
+
+       if (unlikely(blk_queue_bypass(q)))
                 goto out_unlock;
  
         sq = &tg->service_queue;
@@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
  
  out_unlock:
         spin_unlock_irq(q->queue_lock);
-out_unlock_rcu:
-       rcu_read_unlock();
  out:
         /*
          * As multiple blk-throtls may stack in the same issue path, we
@@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q)
                 return -ENOMEM;
  
         INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
-       throtl_service_queue_init(&td->service_queue, NULL);
+       throtl_service_queue_init(&td->service_queue);
  
         q->td = td;
         td->queue = q;
diff --git a/block/blk.h b/block/blk.h

index 838188b..98614ad 100644 (file)
--- a/block/blk.h
+++ b/block/blk.h
@@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
   * Internal throttling interface
   */
  #ifdef CONFIG_BLK_DEV_THROTTLING
-extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
  extern void blk_throtl_drain(struct request_queue *q);
  extern int blk_throtl_init(struct request_queue *q);
  extern void blk_throtl_exit(struct request_queue *q);
  #else /* CONFIG_BLK_DEV_THROTTLING */
-static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
-{
-       return false;
-}
  static inline void blk_throtl_drain(struct request_queue *q) { }
  static inline int blk_throtl_init(struct request_queue *q) { return 0; }
  static inline void blk_throtl_exit(struct request_queue *q) { }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c

index c62bb2e..04de884 100644 (file)
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool;
  #define rb_entry_cfqg(node)    rb_entry((node), struct cfq_group, rb_node)
  
  /* blkio-related constants */
-#define CFQ_WEIGHT_MIN          10
-#define CFQ_WEIGHT_MAX          1000
-#define CFQ_WEIGHT_DEFAULT      500
+#define CFQ_WEIGHT_LEGACY_MIN  10
+#define CFQ_WEIGHT_LEGACY_DFL  500
+#define CFQ_WEIGHT_LEGACY_MAX  1000
  
  struct cfq_ttime {
         unsigned long last_end_request;
@@ -177,10 +177,6 @@ enum wl_type_t {
  
  struct cfqg_stats {
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
-       /* total bytes transferred */
-       struct blkg_rwstat              service_bytes;
-       /* total IOs serviced, post merge */
-       struct blkg_rwstat              serviced;
         /* number of ios merged */
         struct blkg_rwstat              merged;
         /* total time spent on device in ns, may not be accurate w/ queueing */
@@ -189,8 +185,6 @@ struct cfqg_stats {
         struct blkg_rwstat              wait_time;
         /* number of IOs queued up */
         struct blkg_rwstat              queued;
-       /* total sectors transferred */
-       struct blkg_stat                sectors;
         /* total disk time and nr sectors dispatched by this group */
         struct blkg_stat                time;
  #ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -220,7 +214,7 @@ struct cfqg_stats {
  /* Per-cgroup data */
  struct cfq_group_data {
         /* must be the first member */
-       struct blkcg_policy_data pd;
+       struct blkcg_policy_data cpd;
  
         unsigned int weight;
         unsigned int leaf_weight;
@@ -304,7 +298,11 @@ struct cfq_group {
         int dispatched;
         struct cfq_ttime ttime;
         struct cfqg_stats stats;        /* stats for this cfqg */
-       struct cfqg_stats dead_stats;   /* stats pushed from dead children */
+
+       /* async queue for each priority case */
+       struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
+       struct cfq_queue *async_idle_cfqq;
+
  };
  
  struct cfq_io_cq {
@@ -370,12 +368,6 @@ struct cfq_data {
         struct cfq_queue *active_queue;
         struct cfq_io_cq *active_cic;
  
-       /*
-        * async queue for each priority case
-        */
-       struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
-       struct cfq_queue *async_idle_cfqq;
-
         sector_t last_position;
  
         /*
@@ -401,6 +393,7 @@ struct cfq_data {
  };
  
  static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
+static void cfq_put_queue(struct cfq_queue *cfqq);
  
  static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
                                             enum wl_class_t class,
@@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
  static struct cfq_group_data
  *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
  {
-       return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+       return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
  }
  
  static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
@@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
         blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
  }
  
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
-                                             uint64_t bytes, int rw)
-{
-       blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
-       blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
-       blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
-}
-
  static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
                         uint64_t start_time, uint64_t io_start_time, int rw)
  {
@@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
  static void cfqg_stats_reset(struct cfqg_stats *stats)
  {
         /* queued stats shouldn't be cleared */
-       blkg_rwstat_reset(&stats->service_bytes);
-       blkg_rwstat_reset(&stats->serviced);
         blkg_rwstat_reset(&stats->merged);
         blkg_rwstat_reset(&stats->service_time);
         blkg_rwstat_reset(&stats->wait_time);
@@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats)
  }
  
  /* @to += @from */
-static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
  {
         /* queued stats shouldn't be cleared */
-       blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
-       blkg_rwstat_merge(&to->serviced, &from->serviced);
-       blkg_rwstat_merge(&to->merged, &from->merged);
-       blkg_rwstat_merge(&to->service_time, &from->service_time);
-       blkg_rwstat_merge(&to->wait_time, &from->wait_time);
-       blkg_stat_merge(&from->time, &from->time);
+       blkg_rwstat_add_aux(&to->merged, &from->merged);
+       blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+       blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+       blkg_stat_add_aux(&from->time, &from->time);
  #ifdef CONFIG_DEBUG_BLK_CGROUP
-       blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
-       blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
-       blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
-       blkg_stat_merge(&to->dequeue, &from->dequeue);
-       blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
-       blkg_stat_merge(&to->idle_time, &from->idle_time);
-       blkg_stat_merge(&to->empty_time, &from->empty_time);
+       blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+       blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+       blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+       blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+       blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+       blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+       blkg_stat_add_aux(&to->empty_time, &from->empty_time);
  #endif
  }
  
  /*
- * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
+ * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
   * recursive stats can still account for the amount used by this cfqg after
   * it's gone.
   */
@@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
         if (unlikely(!parent))
                 return;
  
-       cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
-       cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
+       cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
         cfqg_stats_reset(&cfqg->stats);
-       cfqg_stats_reset(&cfqg->dead_stats);
  }
  
  #else  /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
                         unsigned long time, unsigned long unaccounted_time) { }
  static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
  static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
-                                             uint64_t bytes, int rw) { }
  static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
                         uint64_t start_time, uint64_t io_start_time, int rw) { }
  
@@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
  
  static void cfq_dispatch_insert(struct request_queue *, struct request *);
  static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
-                                      struct cfq_io_cq *cic, struct bio *bio,
-                                      gfp_t gfp_mask);
+                                      struct cfq_io_cq *cic, struct bio *bio);
  
  static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
  {
@@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
  }
  
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfqg_stats_init(struct cfqg_stats *stats)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+                           bool on_dfl, bool reset_dev, bool is_leaf_weight);
+
+static void cfqg_stats_exit(struct cfqg_stats *stats)
  {
-       blkg_rwstat_init(&stats->service_bytes);
-       blkg_rwstat_init(&stats->serviced);
-       blkg_rwstat_init(&stats->merged);
-       blkg_rwstat_init(&stats->service_time);
-       blkg_rwstat_init(&stats->wait_time);
-       blkg_rwstat_init(&stats->queued);
+       blkg_rwstat_exit(&stats->merged);
+       blkg_rwstat_exit(&stats->service_time);
+       blkg_rwstat_exit(&stats->wait_time);
+       blkg_rwstat_exit(&stats->queued);
+       blkg_stat_exit(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       blkg_stat_exit(&stats->unaccounted_time);
+       blkg_stat_exit(&stats->avg_queue_size_sum);
+       blkg_stat_exit(&stats->avg_queue_size_samples);
+       blkg_stat_exit(&stats->dequeue);
+       blkg_stat_exit(&stats->group_wait_time);
+       blkg_stat_exit(&stats->idle_time);
+       blkg_stat_exit(&stats->empty_time);
+#endif
+}
  
-       blkg_stat_init(&stats->sectors);
-       blkg_stat_init(&stats->time);
+static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
+{
+       if (blkg_rwstat_init(&stats->merged, gfp) ||
+           blkg_rwstat_init(&stats->service_time, gfp) ||
+           blkg_rwstat_init(&stats->wait_time, gfp) ||
+           blkg_rwstat_init(&stats->queued, gfp) ||
+           blkg_stat_init(&stats->time, gfp))
+               goto err;
  
  #ifdef CONFIG_DEBUG_BLK_CGROUP
-       blkg_stat_init(&stats->unaccounted_time);
-       blkg_stat_init(&stats->avg_queue_size_sum);
-       blkg_stat_init(&stats->avg_queue_size_samples);
-       blkg_stat_init(&stats->dequeue);
-       blkg_stat_init(&stats->group_wait_time);
-       blkg_stat_init(&stats->idle_time);
-       blkg_stat_init(&stats->empty_time);
+       if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
+           blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+           blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+           blkg_stat_init(&stats->dequeue, gfp) ||
+           blkg_stat_init(&stats->group_wait_time, gfp) ||
+           blkg_stat_init(&stats->idle_time, gfp) ||
+           blkg_stat_init(&stats->empty_time, gfp))
+               goto err;
  #endif
+       return 0;
+err:
+       cfqg_stats_exit(stats);
+       return -ENOMEM;
  }
  
-static void cfq_cpd_init(const struct blkcg *blkcg)
+static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
  {
-       struct cfq_group_data *cgd =
-               cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
+       struct cfq_group_data *cgd;
  
-       if (blkcg == &blkcg_root) {
-               cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
-               cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
-       } else {
-               cgd->weight = CFQ_WEIGHT_DEFAULT;
-               cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
-       }
+       cgd = kzalloc(sizeof(*cgd), gfp);       /* use the caller's gfp mask */
+       if (!cgd)
+               return NULL;
+       return &cgd->cpd;
+}
+
+static void cfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+       struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
+       unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ?
+                             CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+       if (cpd_to_blkcg(cpd) == &blkcg_root)
+               weight *= 2;
+
+       cgd->weight = weight;
+       cgd->leaf_weight = weight;
  }
  
-static void cfq_pd_init(struct blkcg_gq *blkg)
+static void cfq_cpd_free(struct blkcg_policy_data *cpd)
  {
-       struct cfq_group *cfqg = blkg_to_cfqg(blkg);
-       struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
+       kfree(cpd_to_cfqgd(cpd));
+}
+
+static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
+{
+       struct blkcg *blkcg = cpd_to_blkcg(cpd);
+       bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup);
+       unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+       if (blkcg == &blkcg_root)
+               weight *= 2;
+
+       WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
+       WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
+}
+
+static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
+{
+       struct cfq_group *cfqg;
+
+       cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
+       if (!cfqg)
+               return NULL;
  
         cfq_init_cfqg_base(cfqg);
+       if (cfqg_stats_init(&cfqg->stats, gfp)) {
+               kfree(cfqg);
+               return NULL;
+       }
+
+       return &cfqg->pd;
+}
+
+static void cfq_pd_init(struct blkg_policy_data *pd)
+{
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
+       struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
+
         cfqg->weight = cgd->weight;
         cfqg->leaf_weight = cgd->leaf_weight;
-       cfqg_stats_init(&cfqg->stats);
-       cfqg_stats_init(&cfqg->dead_stats);
  }
  
-static void cfq_pd_offline(struct blkcg_gq *blkg)
+static void cfq_pd_offline(struct blkg_policy_data *pd)
  {
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
+       int i;
+
+       for (i = 0; i < IOPRIO_BE_NR; i++) {
+               if (cfqg->async_cfqq[0][i])
+                       cfq_put_queue(cfqg->async_cfqq[0][i]);
+               if (cfqg->async_cfqq[1][i])
+                       cfq_put_queue(cfqg->async_cfqq[1][i]);
+       }
+
+       if (cfqg->async_idle_cfqq)
+               cfq_put_queue(cfqg->async_idle_cfqq);
+
         /*
          * @blkg is going offline and will be ignored by
          * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
          * that they don't get lost.  If IOs complete after this point, the
          * stats for them will be lost.  Oh well...
          */
-       cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
+       cfqg_stats_xfer_dead(cfqg);
  }
  
-/* offset delta from cfqg->stats to cfqg->dead_stats */
-static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
-                                       offsetof(struct cfq_group, stats);
-
-/* to be used by recursive prfill, sums live and dead stats recursively */
-static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+static void cfq_pd_free(struct blkg_policy_data *pd)
  {
-       u64 sum = 0;
-
-       sum += blkg_stat_recursive_sum(pd, off);
-       sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
-       return sum;
-}
-
-/* to be used by recursive prfill, sums live and dead rwstats recursively */
-static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
-                                                      int off)
-{
-       struct blkg_rwstat a, b;
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
  
-       a = blkg_rwstat_recursive_sum(pd, off);
-       b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
-       blkg_rwstat_merge(&a, &b);
-       return a;
+       cfqg_stats_exit(&cfqg->stats);  /* stats first, then the group */
+       kfree(cfqg);
  }
  
-static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
  {
-       struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
  
         cfqg_stats_reset(&cfqg->stats);
-       cfqg_stats_reset(&cfqg->dead_stats);
  }
  
-/*
- * Search for the cfq group current task belongs to. request_queue lock must
- * be held.
- */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-                                               struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+                                        struct blkcg *blkcg)
  {
-       struct request_queue *q = cfqd->queue;
-       struct cfq_group *cfqg = NULL;
-
-       /* avoid lookup for the common case where there's no blkcg */
-       if (blkcg == &blkcg_root) {
-               cfqg = cfqd->root_group;
-       } else {
-               struct blkcg_gq *blkg;
-
-               blkg = blkg_lookup_create(blkcg, q);
-               if (!IS_ERR(blkg))
-                       cfqg = blkg_to_cfqg(blkg);
-       }
+       struct blkcg_gq *blkg;
  
-       return cfqg;
+       blkg = blkg_lookup(blkcg, cfqd->queue);
+       if (likely(blkg))
+               return blkg_to_cfqg(blkg);
+       return NULL;
  }
  
  static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
  {
-       /* Currently, all async queues are mapped to root group */
-       if (!cfq_cfqq_sync(cfqq))
-               cfqg = cfqq->cfqd->root_group;
-
         cfqq->cfqg = cfqg;
         /* cfqq reference on cfqg */
         cfqg_get(cfqg);
@@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
  
  static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
                                         char *buf, size_t nbytes, loff_t off,
-                                       bool is_leaf_weight)
+                                       bool on_dfl, bool is_leaf_weight)
  {
+       unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+       unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
         struct blkcg *blkcg = css_to_blkcg(of_css(of));
         struct blkg_conf_ctx ctx;
         struct cfq_group *cfqg;
         struct cfq_group_data *cfqgd;
         int ret;
+       u64 v;
  
         ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
         if (ret)
                 return ret;
  
-       ret = -EINVAL;
+       if (sscanf(ctx.body, "%llu", &v) == 1) {
+               /* require "default" on dfl */
+               ret = -ERANGE;
+               if (!v && on_dfl)
+                       goto out_finish;
+       } else if (!strcmp(strim(ctx.body), "default")) {
+               v = 0;
+       } else {
+               ret = -EINVAL;
+               goto out_finish;
+       }
+
         cfqg = blkg_to_cfqg(ctx.blkg);
         cfqgd = blkcg_to_cfqgd(blkcg);
-       if (!cfqg || !cfqgd)
-               goto err;
  
-       if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
+       ret = -ERANGE;
+       if (!v || (v >= min && v <= max)) {
                 if (!is_leaf_weight) {
-                       cfqg->dev_weight = ctx.v;
-                       cfqg->new_weight = ctx.v ?: cfqgd->weight;
+                       cfqg->dev_weight = v;
+                       cfqg->new_weight = v ?: cfqgd->weight;
                 } else {
-                       cfqg->dev_leaf_weight = ctx.v;
-                       cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
+                       cfqg->dev_leaf_weight = v;
+                       cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
                 }
                 ret = 0;
         }
-
-err:
+out_finish:
         blkg_conf_finish(&ctx);
         return ret ?: nbytes;
  }
@@ -1776,25 +1805,27 @@ err:
  static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
                                       char *buf, size_t nbytes, loff_t off)
  {
-       return __cfqg_set_weight_device(of, buf, nbytes, off, false);
+       return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
  }
  
  static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes, loff_t off)
  {
-       return __cfqg_set_weight_device(of, buf, nbytes, off, true);
+       return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
  }
  
-static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
-                           u64 val, bool is_leaf_weight)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+                           bool on_dfl, bool reset_dev, bool is_leaf_weight)
  {
+       unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+       unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
         struct blkcg *blkcg = css_to_blkcg(css);
         struct blkcg_gq *blkg;
         struct cfq_group_data *cfqgd;
         int ret = 0;
  
-       if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
-               return -EINVAL;
+       if (val < min || val > max)
+               return -ERANGE;
  
         spin_lock_irq(&blkcg->lock);
         cfqgd = blkcg_to_cfqgd(blkcg);
@@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
                         continue;
  
                 if (!is_leaf_weight) {
+                       if (reset_dev)
+                               cfqg->dev_weight = 0;
                         if (!cfqg->dev_weight)
                                 cfqg->new_weight = cfqgd->weight;
                 } else {
+                       if (reset_dev)
+                               cfqg->dev_leaf_weight = 0;
                         if (!cfqg->dev_leaf_weight)
                                 cfqg->new_leaf_weight = cfqgd->leaf_weight;
                 }
@@ -1831,13 +1866,13 @@ out:
  static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
                           u64 val)
  {
-       return __cfq_set_weight(css, cft, val, false);
+       return __cfq_set_weight(css, val, false, false, false);
  }
  
  static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
                                struct cftype *cft, u64 val)
  {
-       return __cfq_set_weight(css, cft, val, true);
+       return __cfq_set_weight(css, val, false, false, true);
  }
  
  static int cfqg_print_stat(struct seq_file *sf, void *v)
@@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v)
  static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
                                       struct blkg_policy_data *pd, int off)
  {
-       u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
-
+       u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+                                         &blkcg_policy_cfq, off);
         return __blkg_prfill_u64(sf, pd, sum);
  }
  
  static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
                                         struct blkg_policy_data *pd, int off)
  {
-       struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
-
+       struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+                                                       &blkcg_policy_cfq, off);
         return __blkg_prfill_rwstat(sf, pd, &sum);
  }
  
@@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
         return 0;
  }
  
+static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+                              int off)
+{
+       u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+       return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
+       return 0;
+}
+
+static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
+                                        struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_bytes));
+       u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+       return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
+                         false);
+       return 0;
+}
+
  #ifdef CONFIG_DEBUG_BLK_CGROUP
  static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
                                       struct blkg_policy_data *pd, int off)
@@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
  }
  #endif /* CONFIG_DEBUG_BLK_CGROUP */
  
-static struct cftype cfq_blkcg_files[] = {
+static struct cftype cfq_blkcg_legacy_files[] = {
         /* on root, weight is mapped to leaf_weight */
         {
                 .name = "weight_device",
@@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = {
         },
         {
                 .name = "sectors",
-               .private = offsetof(struct cfq_group, stats.sectors),
-               .seq_show = cfqg_print_stat,
+               .seq_show = cfqg_print_stat_sectors,
         },
         {
                 .name = "io_service_bytes",
-               .private = offsetof(struct cfq_group, stats.service_bytes),
-               .seq_show = cfqg_print_rwstat,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_bytes,
         },
         {
                 .name = "io_serviced",
-               .private = offsetof(struct cfq_group, stats.serviced),
-               .seq_show = cfqg_print_rwstat,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_ios,
         },
         {
                 .name = "io_service_time",
@@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = {
         },
         {
                 .name = "sectors_recursive",
-               .private = offsetof(struct cfq_group, stats.sectors),
-               .seq_show = cfqg_print_stat_recursive,
+               .seq_show = cfqg_print_stat_sectors_recursive,
         },
         {
                 .name = "io_service_bytes_recursive",
-               .private = offsetof(struct cfq_group, stats.service_bytes),
-               .seq_show = cfqg_print_rwstat_recursive,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_bytes_recursive,
         },
         {
                 .name = "io_serviced_recursive",
-               .private = offsetof(struct cfq_group, stats.serviced),
-               .seq_show = cfqg_print_rwstat_recursive,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_ios_recursive,
         },
         {
                 .name = "io_service_time_recursive",
@@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = {
  #endif /* CONFIG_DEBUG_BLK_CGROUP */
         { }     /* terminate */
  };
+
+static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+       struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+
+       seq_printf(sf, "default %u\n", cgd->weight);
+       blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
+                         &blkcg_policy_cfq, 0, false);
+       return 0;
+}
+
+static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
+                                    char *buf, size_t nbytes, loff_t off)
+{
+       char *endp;
+       int ret;
+       u64 v;
+
+       buf = strim(buf);
+
+       /* "WEIGHT" or "default WEIGHT" sets the default weight */
+       v = simple_strtoull(buf, &endp, 0);
+       if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
+               ret = __cfq_set_weight(of_css(of), v, true, false, false);
+               return ret ?: nbytes;
+       }
+
+       /* "MAJ:MIN WEIGHT" */
+       return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
+}
+
+static struct cftype cfq_blkcg_files[] = {
+       {
+               .name = "weight",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cfq_print_weight_on_dfl,
+               .write = cfq_set_weight_on_dfl,
+       },
+       { }     /* terminate */
+};
+
  #else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-                                               struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+                                        struct blkcg *blkcg)
  {
         return cfqd->root_group;
  }
@@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
  
         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
         cfqq->nr_sectors += blk_rq_sectors(rq);
-       cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
  }
  
  /*
@@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq)
         struct cfq_io_cq *cic = icq_to_cic(icq);
         struct cfq_data *cfqd = cic_to_cfqd(cic);
  
-       if (cic->cfqq[BLK_RW_ASYNC]) {
-               cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
-               cic->cfqq[BLK_RW_ASYNC] = NULL;
+       if (cic_to_cfqq(cic, false)) {
+               cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
+               cic_set_cfqq(cic, NULL, false);
         }
  
-       if (cic->cfqq[BLK_RW_SYNC]) {
-               cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
-               cic->cfqq[BLK_RW_SYNC] = NULL;
+       if (cic_to_cfqq(cic, true)) {
+               cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
+               cic_set_cfqq(cic, NULL, true);
         }
  }
  
@@ -3572,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
         if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
                 return;
  
-       cfqq = cic->cfqq[BLK_RW_ASYNC];
+       cfqq = cic_to_cfqq(cic, false);
         if (cfqq) {
-               struct cfq_queue *new_cfqq;
-               new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
-                                        GFP_ATOMIC);
-               if (new_cfqq) {
-                       cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
-                       cfq_put_queue(cfqq);
-               }
+               cfq_put_queue(cfqq);
+               cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
+               cic_set_cfqq(cic, cfqq, false);
         }
  
-       cfqq = cic->cfqq[BLK_RW_SYNC];
+       cfqq = cic_to_cfqq(cic, true);
         if (cfqq)
                 cfq_mark_cfqq_prio_changed(cfqq);
  
@@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
  static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
  {
         struct cfq_data *cfqd = cic_to_cfqd(cic);
-       struct cfq_queue *sync_cfqq;
+       struct cfq_queue *cfqq;
         uint64_t serial_nr;
  
         rcu_read_lock();
@@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
         if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
                 return;
  
-       sync_cfqq = cic_to_cfqq(cic, 1);
-       if (sync_cfqq) {
-               /*
-                * Drop reference to sync queue. A new sync queue will be
-                * assigned in new group upon arrival of a fresh request.
-                */
-               cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
-               cic_set_cfqq(cic, NULL, 1);
-               cfq_put_queue(sync_cfqq);
+       /*
+        * Drop reference to queues.  New queues will be assigned in new
+        * group upon arrival of fresh requests.
+        */
+       cfqq = cic_to_cfqq(cic, false);
+       if (cfqq) {
+               cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+               cic_set_cfqq(cic, NULL, false);
+               cfq_put_queue(cfqq);
+       }
+
+       cfqq = cic_to_cfqq(cic, true);
+       if (cfqq) {
+               cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+               cic_set_cfqq(cic, NULL, true);
+               cfq_put_queue(cfqq);
         }
  
         cic->blkcg_serial_nr = serial_nr;
@@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
  static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
  #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
  
-static struct cfq_queue *
-cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-                    struct bio *bio, gfp_t gfp_mask)
-{
-       struct blkcg *blkcg;
-       struct cfq_queue *cfqq, *new_cfqq = NULL;
-       struct cfq_group *cfqg;
-
-retry:
-       rcu_read_lock();
-
-       blkcg = bio_blkcg(bio);
-       cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
-       if (!cfqg) {
-               cfqq = &cfqd->oom_cfqq;
-               goto out;
-       }
-
-       cfqq = cic_to_cfqq(cic, is_sync);
-
-       /*
-        * Always try a new alloc if we fell back to the OOM cfqq
-        * originally, since it should just be a temporary situation.
-        */
-       if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-               cfqq = NULL;
-               if (new_cfqq) {
-                       cfqq = new_cfqq;
-                       new_cfqq = NULL;
-               } else if (gfp_mask & __GFP_WAIT) {
-                       rcu_read_unlock();
-                       spin_unlock_irq(cfqd->queue->queue_lock);
-                       new_cfqq = kmem_cache_alloc_node(cfq_pool,
-                                       gfp_mask | __GFP_ZERO,
-                                       cfqd->queue->node);
-                       spin_lock_irq(cfqd->queue->queue_lock);
-                       if (new_cfqq)
-                               goto retry;
-                       else
-                               return &cfqd->oom_cfqq;
-               } else {
-                       cfqq = kmem_cache_alloc_node(cfq_pool,
-                                       gfp_mask | __GFP_ZERO,
-                                       cfqd->queue->node);
-               }
-
-               if (cfqq) {
-                       cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
-                       cfq_init_prio_data(cfqq, cic);
-                       cfq_link_cfqq_cfqg(cfqq, cfqg);
-                       cfq_log_cfqq(cfqd, cfqq, "alloced");
-               } else
-                       cfqq = &cfqd->oom_cfqq;
-       }
-out:
-       if (new_cfqq)
-               kmem_cache_free(cfq_pool, new_cfqq);
-
-       rcu_read_unlock();
-       return cfqq;
-}
-
  static struct cfq_queue **
-cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
+cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
  {
         switch (ioprio_class) {
         case IOPRIO_CLASS_RT:
-               return &cfqd->async_cfqq[0][ioprio];
+               return &cfqg->async_cfqq[0][ioprio];
         case IOPRIO_CLASS_NONE:
                 ioprio = IOPRIO_NORM;
                 /* fall through */
         case IOPRIO_CLASS_BE:
-               return &cfqd->async_cfqq[1][ioprio];
+               return &cfqg->async_cfqq[1][ioprio];
         case IOPRIO_CLASS_IDLE:
-               return &cfqd->async_idle_cfqq;
+               return &cfqg->async_idle_cfqq;
         default:
                 BUG();
         }
@@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
  
  static struct cfq_queue *
  cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-             struct bio *bio, gfp_t gfp_mask)
+             struct bio *bio)
  {
         int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
         int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
         struct cfq_queue **async_cfqq = NULL;
-       struct cfq_queue *cfqq = NULL;
+       struct cfq_queue *cfqq;
+       struct cfq_group *cfqg;
+
+       rcu_read_lock();
+       cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
+       if (!cfqg) {
+               cfqq = &cfqd->oom_cfqq;
+               goto out;
+       }
  
         if (!is_sync) {
                 if (!ioprio_valid(cic->ioprio)) {
@@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
                         ioprio = task_nice_ioprio(tsk);
                         ioprio_class = task_nice_ioclass(tsk);
                 }
-               async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
+               async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
                 cfqq = *async_cfqq;
+               if (cfqq)
+                       goto out;
         }
  
-       if (!cfqq)
-               cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
+       cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
+                                    cfqd->queue->node);
+       if (!cfqq) {
+               cfqq = &cfqd->oom_cfqq;
+               goto out;
+       }
  
-       /*
-        * pin the queue now that it's allocated, scheduler exit will prune it
-        */
-       if (!is_sync && !(*async_cfqq)) {
+       cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
+       cfq_init_prio_data(cfqq, cic);
+       cfq_link_cfqq_cfqg(cfqq, cfqg);
+       cfq_log_cfqq(cfqd, cfqq, "alloced");
+
+       if (async_cfqq) {
+               /* a new async queue is created, pin and remember */
                 cfqq->ref++;
                 *async_cfqq = cfqq;
         }
-
+out:
         cfqq->ref++;
+       rcu_read_unlock();
         return cfqq;
  }
  
@@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
         const bool is_sync = rq_is_sync(rq);
         struct cfq_queue *cfqq;
  
-       might_sleep_if(gfp_mask & __GFP_WAIT);
-
         spin_lock_irq(q->queue_lock);
  
         check_ioprio_changed(cic, bio);
@@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
  new_queue:
         cfqq = cic_to_cfqq(cic, is_sync);
         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-               cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
+               if (cfqq)
+                       cfq_put_queue(cfqq);
+               cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
                 cic_set_cfqq(cic, cfqq, is_sync);
         } else {
                 /*
@@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
         cancel_work_sync(&cfqd->unplug_work);
  }
  
-static void cfq_put_async_queues(struct cfq_data *cfqd)
-{
-       int i;
-
-       for (i = 0; i < IOPRIO_BE_NR; i++) {
-               if (cfqd->async_cfqq[0][i])
-                       cfq_put_queue(cfqd->async_cfqq[0][i]);
-               if (cfqd->async_cfqq[1][i])
-                       cfq_put_queue(cfqd->async_cfqq[1][i]);
-       }
-
-       if (cfqd->async_idle_cfqq)
-               cfq_put_queue(cfqd->async_idle_cfqq);
-}
-
  static void cfq_exit_queue(struct elevator_queue *e)
  {
         struct cfq_data *cfqd = e->elevator_data;
@@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
         if (cfqd->active_queue)
                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
  
-       cfq_put_async_queues(cfqd);
-
         spin_unlock_irq(q->queue_lock);
  
         cfq_shutdown_timer_wq(cfqd);
@@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
                 goto out_free;
  
         cfq_init_cfqg_base(cfqd->root_group);
+       cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
+       cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
  #endif
-       cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
-       cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
  
         /*
          * Not strictly needed (since RB_ROOT just clears the node and we
@@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
                 cfqd->prio_trees[i] = RB_ROOT;
  
         /*
-        * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
+        * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
          * Grab a permanent reference to it, so that the normal code flow
          * will not attempt to free it.  oom_cfqq is linked to root_group
          * but shouldn't hold a reference as it'll never be unlinked.  Lose
@@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = {
  
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
  static struct blkcg_policy blkcg_policy_cfq = {
-       .pd_size                = sizeof(struct cfq_group),
-       .cpd_size               = sizeof(struct cfq_group_data),
-       .cftypes                = cfq_blkcg_files,
+       .dfl_cftypes            = cfq_blkcg_files,
+       .legacy_cftypes         = cfq_blkcg_legacy_files,
  
+       .cpd_alloc_fn           = cfq_cpd_alloc,
         .cpd_init_fn            = cfq_cpd_init,
+       .cpd_free_fn            = cfq_cpd_free,
+       .cpd_bind_fn            = cfq_cpd_bind,
+
+       .pd_alloc_fn            = cfq_pd_alloc,
         .pd_init_fn             = cfq_pd_init,
         .pd_offline_fn          = cfq_pd_offline,
+       .pd_free_fn             = cfq_pd_free,
         .pd_reset_stats_fn      = cfq_pd_reset_stats,
  };
  #endif
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c

index ae0f438..2448912 100644 (file)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,8 +53,6 @@ struct wb_writeback_work {
         unsigned int for_background:1;
         unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
         unsigned int auto_free:1;       /* free on completion */
-       unsigned int single_wait:1;
-       unsigned int single_done:1;
         enum wb_reason reason;          /* why was writeback initiated? */
  
         struct list_head list;          /* pending work list */
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb)
  static void wb_queue_work(struct bdi_writeback *wb,
                           struct wb_writeback_work *work)
  {
-       trace_writeback_queue(wb->bdi, work);
+       trace_writeback_queue(wb, work);
  
         spin_lock_bh(&wb->work_lock);
-       if (!test_bit(WB_registered, &wb->state)) {
-               if (work->single_wait)
-                       work->single_done = 1;
+       if (!test_bit(WB_registered, &wb->state))
                 goto out_unlock;
-       }
         if (work->done)
                 atomic_inc(&work->done->cnt);
         list_add_tail(&work->list, &wb->work_list);
@@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
  
  /**
   * inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion
+ * @inode: inode to test for congestion (may be NULL)
   * @cong_bits: mask of WB_[a]sync_congested bits to test
   *
   * Tests whether @inode is congested.  @cong_bits is the mask of congestion
@@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
   * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
   * associated with @inode is congested; otherwise, the root wb's congestion
   * state is used.
+ *
+ * @inode is allowed to be NULL as this function is often called on
+ * mapping->host which is NULL for the swapper space.
   */
  int inode_congested(struct inode *inode, int cong_bits)
  {
@@ -737,32 +735,6 @@ int inode_congested(struct inode *inode, int cong_bits)
  }
  EXPORT_SYMBOL_GPL(inode_congested);
  
-/**
- * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
- * @bdi: bdi the work item was issued to
- * @work: work item to wait for
- *
- * Wait for the completion of @work which was issued to one of @bdi's
- * bdi_writeback's.  The caller must have set @work->single_wait before
- * issuing it.  This wait operates independently fo
- * wb_wait_for_completion() and also disables automatic freeing of @work.
- */
-static void wb_wait_for_single_work(struct backing_dev_info *bdi,
-                                   struct wb_writeback_work *work)
-{
-       if (WARN_ON_ONCE(!work->single_wait))
-               return;
-
-       wait_event(bdi->wb_waitq, work->single_done);
-
-       /*
-        * Paired with smp_wmb() in wb_do_writeback() and ensures that all
-        * modifications to @work prior to assertion of ->single_done is
-        * visible to the caller once this function returns.
-        */
-       smp_rmb();
-}
-
  /**
   * wb_split_bdi_pages - split nr_pages to write according to bandwidth
   * @wb: target bdi_writeback to split @nr_pages to
@@ -791,38 +763,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
                 return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
  }
  
-/**
- * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
- * @wb: target bdi_writeback
- * @base_work: source wb_writeback_work
- *
- * Try to make a clone of @base_work and issue it to @wb.  If cloning
- * succeeds, %true is returned; otherwise, @base_work is issued directly
- * and %false is returned.  In the latter case, the caller is required to
- * wait for @base_work's completion using wb_wait_for_single_work().
- *
- * A clone is auto-freed on completion.  @base_work never is.
- */
-static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
-                                   struct wb_writeback_work *base_work)
-{
-       struct wb_writeback_work *work;
-
-       work = kmalloc(sizeof(*work), GFP_ATOMIC);
-       if (work) {
-               *work = *base_work;
-               work->auto_free = 1;
-               work->single_wait = 0;
-       } else {
-               work = base_work;
-               work->auto_free = 0;
-               work->single_wait = 1;
-       }
-       work->single_done = 0;
-       wb_queue_work(wb, work);
-       return work != base_work;
-}
-
  /**
   * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
   * @bdi: target backing_dev_info
@@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                   struct wb_writeback_work *base_work,
                                   bool skip_if_busy)
  {
-       long nr_pages = base_work->nr_pages;
-       int next_blkcg_id = 0;
+       int next_memcg_id = 0;
         struct bdi_writeback *wb;
         struct wb_iter iter;
  
         might_sleep();
  restart:
         rcu_read_lock();
-       bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+       bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+               DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+               struct wb_writeback_work fallback_work;
+               struct wb_writeback_work *work;
+               long nr_pages;
+
                 /* SYNC_ALL writes out I_DIRTY_TIME too */
                 if (!wb_has_dirty_io(wb) &&
                     (base_work->sync_mode == WB_SYNC_NONE ||
@@ -855,13 +799,30 @@ restart:
                 if (skip_if_busy && writeback_in_progress(wb))
                         continue;
  
-               base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
-               if (!wb_clone_and_queue_work(wb, base_work)) {
-                       next_blkcg_id = wb->blkcg_css->id + 1;
-                       rcu_read_unlock();
-                       wb_wait_for_single_work(bdi, base_work);
-                       goto restart;
+               nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
+
+               work = kmalloc(sizeof(*work), GFP_ATOMIC);
+               if (work) {
+                       *work = *base_work;
+                       work->nr_pages = nr_pages;
+                       work->auto_free = 1;
+                       wb_queue_work(wb, work);
+                       continue;
                 }
+
+               /* alloc failed, execute synchronously using on-stack fallback */
+               work = &fallback_work;
+               *work = *base_work;
+               work->nr_pages = nr_pages;
+               work->auto_free = 0;
+               work->done = &fallback_work_done;
+
+               wb_queue_work(wb, work);
+
+               next_memcg_id = wb->memcg_css->id + 1;
+               rcu_read_unlock();
+               wb_wait_for_completion(bdi, &fallback_work_done);
+               goto restart;
         }
         rcu_read_unlock();
  }
@@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
  
         if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
                 base_work->auto_free = 0;
-               base_work->single_wait = 0;
-               base_work->single_done = 0;
                 wb_queue_work(&bdi->wb, base_work);
         }
  }
@@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
          */
         work = kzalloc(sizeof(*work), GFP_ATOMIC);
         if (!work) {
-               trace_writeback_nowork(wb->bdi);
+               trace_writeback_nowork(wb);
                 wb_wakeup(wb);
                 return;
         }
@@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
          * We just wake up the flusher thread. It will perform background
          * writeback as soon as there is no other work to do.
          */
-       trace_writeback_wake_background(wb->bdi);
+       trace_writeback_wake_background(wb);
         wb_wakeup(wb);
  }
  
@@ -1660,14 +1619,14 @@ static long wb_writeback(struct bdi_writeback *wb,
                 } else if (work->for_background)
                         oldest_jif = jiffies;
  
-               trace_writeback_start(wb->bdi, work);
+               trace_writeback_start(wb, work);
                 if (list_empty(&wb->b_io))
                         queue_io(wb, work);
                 if (work->sb)
                         progress = writeback_sb_inodes(work->sb, wb, work);
                 else
                         progress = __writeback_inodes_wb(wb, work);
-               trace_writeback_written(wb->bdi, work);
+               trace_writeback_written(wb, work);
  
                 wb_update_bandwidth(wb, wb_start);
  
@@ -1692,7 +1651,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                  * we'll just busyloop.
                  */
                 if (!list_empty(&wb->b_more_io))  {
-                       trace_writeback_wait(wb->bdi, work);
+                       trace_writeback_wait(wb, work);
                         inode = wb_inode(wb->b_more_io.prev);
                         spin_lock(&inode->i_lock);
                         spin_unlock(&wb->list_lock);
@@ -1797,26 +1756,14 @@ static long wb_do_writeback(struct bdi_writeback *wb)
         set_bit(WB_writeback_running, &wb->state);
         while ((work = get_next_work_item(wb)) != NULL) {
                 struct wb_completion *done = work->done;
-               bool need_wake_up = false;
  
-               trace_writeback_exec(wb->bdi, work);
+               trace_writeback_exec(wb, work);
  
                 wrote += wb_writeback(wb, work);
  
-               if (work->single_wait) {
-                       WARN_ON_ONCE(work->auto_free);
-                       /* paired w/ rmb in wb_wait_for_single_work() */
-                       smp_wmb();
-                       work->single_done = 1;
-                       need_wake_up = true;
-               } else if (work->auto_free) {
+               if (work->auto_free)
                         kfree(work);
-               }
-
                 if (done && atomic_dec_and_test(&done->cnt))
-                       need_wake_up = true;
-
-               if (need_wake_up)
                         wake_up_all(&wb->bdi->wb_waitq);
         }
  
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c

index 2d48d28..91e0045 100644 (file)
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -91,6 +91,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
         return ret;
  }
  
+/**
+ * kernfs_path_len - determine the length of the full path of a given node
+ * @kn: kernfs_node of interest
+ *
+ * The returned length doesn't include the space for the terminating '\0'.
+ */
+size_t kernfs_path_len(struct kernfs_node *kn)
+{
+       size_t len = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+       do {
+               len += strlen(kn->name) + 1;
+               kn = kn->parent;
+       } while (kn && kn->parent);
+
+       spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+       return len;
+}
+
  /**
   * kernfs_path - build full path of a given node
   * @kn: kernfs_node of interest
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h

index 0fe9df9..5a5d79e 100644 (file)
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi
          * %current's blkcg equals the effective blkcg of its memcg.  No
          * need to use the relatively expensive cgroup_get_e_css().
          */
-       if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+       if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
                 return wb;
         return NULL;
  }
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
  }
  
  struct wb_iter {
-       int                     start_blkcg_id;
+       int                     start_memcg_id;
         struct radix_tree_iter  tree_iter;
         void                    **slot;
  };
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
  
         WARN_ON_ONCE(!rcu_read_lock_held());
  
-       if (iter->start_blkcg_id >= 0) {
-               iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
-               iter->start_blkcg_id = -1;
+       if (iter->start_memcg_id >= 0) {
+               iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
+               iter->start_memcg_id = -1;
         } else {
                 iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
         }
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
  
  static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
                                                    struct backing_dev_info *bdi,
-                                                  int start_blkcg_id)
+                                                  int start_memcg_id)
  {
-       iter->start_blkcg_id = start_blkcg_id;
+       iter->start_memcg_id = start_memcg_id;
  
-       if (start_blkcg_id)
+       if (start_memcg_id)
                 return __wb_iter_next(iter, bdi);
         else
                 return &bdi->wb;
  }
  
  /**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
   * @wb_cur: cursor struct bdi_writeback pointer
   * @bdi: bdi to walk wb's of
   * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_blkcg_id: blkcg ID to start iteration from
+ * @start_memcg_id: memcg ID to start iteration from
   *
   * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+ * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
   * to be used as temp storage during iteration.  rcu_read_lock() must be
   * held throughout iteration.
   */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)             \
-       for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);      \
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)             \
+       for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);      \
              (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
  
  #else  /* CONFIG_CGROUP_WRITEBACK */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h

index a4cd164..0a5cc7a 100644 (file)
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -14,12 +14,15 @@
   */
  
  #include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
+#include <linux/percpu_counter.h>
  #include <linux/seq_file.h>
  #include <linux/radix-tree.h>
  #include <linux/blkdev.h>
  #include <linux/atomic.h>
  
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH    (INT_MAX / 2)
+
  /* Max limits for throttle policy */
  #define THROTL_IOPS_MAX                UINT_MAX
  
@@ -45,7 +48,7 @@ struct blkcg {
         struct blkcg_gq                 *blkg_hint;
         struct hlist_head               blkg_list;
  
-       struct blkcg_policy_data        *pd[BLKCG_MAX_POLS];
+       struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];
  
         struct list_head                all_blkcgs_node;
  #ifdef CONFIG_CGROUP_WRITEBACK
@@ -53,14 +56,19 @@ struct blkcg {
  #endif
  };
  
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive.  Used to carry stats of dead children, and, for blkg_rwstat,
+ * to carry result values from read and sum operations.
+ */
  struct blkg_stat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt;
+       struct percpu_counter           cpu_cnt;
+       atomic64_t                      aux_cnt;
  };
  
  struct blkg_rwstat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt[BLKG_RWSTAT_NR];
+       struct percpu_counter           cpu_cnt[BLKG_RWSTAT_NR];
+       atomic64_t                      aux_cnt[BLKG_RWSTAT_NR];
  };
  
  /*
@@ -68,32 +76,28 @@ struct blkg_rwstat {
   * request_queue (q).  This is used by blkcg policies which need to track
   * information per blkcg - q pair.
   *
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods.  A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
   */
  struct blkg_policy_data {
         /* the blkg and policy id this per-policy data belongs to */
         struct blkcg_gq                 *blkg;
         int                             plid;
-
-       /* used during policy activation */
-       struct list_head                alloc_node;
  };
  
  /*
- * Policies that need to keep per-blkcg data which is independent
- * from any request_queue associated to it must specify its size
- * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it.  cpd_init() is invoked to let
- * each policy handle per-blkcg data.
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods.  A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
   */
  struct blkcg_policy_data {
-       /* the policy id this per-policy data belongs to */
+       /* the blkcg and policy id this per-policy data belongs to */
+       struct blkcg                    *blkcg;
         int                             plid;
  };
  
@@ -123,40 +127,50 @@ struct blkcg_gq {
         /* is this blkg online? protected by both blkcg and q locks */
         bool                            online;
  
+       struct blkg_rwstat              stat_bytes;
+       struct blkg_rwstat              stat_ios;
+
         struct blkg_policy_data         *pd[BLKCG_MAX_POLS];
  
         struct rcu_head                 rcu_head;
  };
  
-typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
  
  struct blkcg_policy {
         int                             plid;
-       /* policy specific private data size */
-       size_t                          pd_size;
-       /* policy specific per-blkcg data size */
-       size_t                          cpd_size;
         /* cgroup files for the policy */
-       struct cftype                   *cftypes;
+       struct cftype                   *dfl_cftypes;
+       struct cftype                   *legacy_cftypes;
  
         /* operations */
+       blkcg_pol_alloc_cpd_fn          *cpd_alloc_fn;
         blkcg_pol_init_cpd_fn           *cpd_init_fn;
+       blkcg_pol_free_cpd_fn           *cpd_free_fn;
+       blkcg_pol_bind_cpd_fn           *cpd_bind_fn;
+
+       blkcg_pol_alloc_pd_fn           *pd_alloc_fn;
         blkcg_pol_init_pd_fn            *pd_init_fn;
         blkcg_pol_online_pd_fn          *pd_online_fn;
         blkcg_pol_offline_pd_fn         *pd_offline_fn;
-       blkcg_pol_exit_pd_fn            *pd_exit_fn;
+       blkcg_pol_free_pd_fn            *pd_free_fn;
         blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
  };
  
  extern struct blkcg blkcg_root;
  extern struct cgroup_subsys_state * const blkcg_root_css;
  
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+                                     struct request_queue *q, bool update_hint);
  struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                     struct request_queue *q);
  int blkcg_init_queue(struct request_queue *q);
@@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q,
  void blkcg_deactivate_policy(struct request_queue *q,
                              const struct blkcg_policy *pol);
  
+const char *blkg_dev_name(struct blkcg_gq *blkg);
  void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                        u64 (*prfill)(struct seq_file *,
                                      struct blkg_policy_data *, int),
@@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
  u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
  u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                        int off);
+int blkg_print_stat_bytes(struct seq_file *sf, void *v);
+int blkg_print_stat_ios(struct seq_file *sf, void *v);
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
  
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off);
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+                           struct blkcg_policy *pol, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+                                            struct blkcg_policy *pol, int off);
  
  struct blkg_conf_ctx {
         struct gendisk                  *disk;
         struct blkcg_gq                 *blkg;
-       u64                             v;
+       char                            *body;
  };
  
  int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-                  const char *input, struct blkg_conf_ctx *ctx);
+                  char *input, struct blkg_conf_ctx *ctx);
  void blkg_conf_finish(struct blkg_conf_ctx *ctx);
  
  
@@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
  
  static inline struct blkcg *task_blkcg(struct task_struct *tsk)
  {
-       return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+       return css_to_blkcg(task_css(tsk, io_cgrp_id));
  }
  
  static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
  static inline struct cgroup_subsys_state *
  task_get_blkcg_css(struct task_struct *task)
  {
-       return task_get_css(task, blkio_cgrp_id);
+       return task_get_css(task, io_cgrp_id);
  }
  
  /**
@@ -232,6 +252,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
         return css_to_blkcg(blkcg->css.parent);
  }
  
+/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * Internal helper which policy implementations shouldn't call directly.
+ * Returns @q->root_blkg when @blkcg is the root blkcg, the blkg stored in
+ * @blkcg->blkg_hint when that blkg belongs to @q, and otherwise the result
+ * of blkg_lookup_slowpath().  @q's bypass state is not consulted.  If
+ * @update_hint is %true, the caller should be holding @q->queue_lock and
+ * the lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+                                            struct request_queue *q,
+                                            bool update_hint)
+{
+       struct blkcg_gq *hint;
+
+       /* the root blkcg's blkg is reachable straight from the queue */
+       if (blkcg == &blkcg_root)
+               return q->root_blkg;
+
+       hint = rcu_dereference(blkcg->blkg_hint);
+       if (!hint || hint->q != q)
+               return blkg_lookup_slowpath(blkcg, q, update_hint);
+
+       return hint;
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Look up the blkg for the @blkcg - @q pair.  Must be called under RCU
+ * read lock (a one-time warning fires otherwise).  Returns %NULL whenever
+ * @q is bypassing - see blk_queue_bypass_start() for details - and the
+ * result of __blkg_lookup() without hint update in all other cases.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+                                          struct request_queue *q)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       return unlikely(blk_queue_bypass(q)) ?
+               NULL : __blkg_lookup(blkcg, q, false);
+}
+
  /**
   * blkg_to_pdata - get policy private data
   * @blkg: blkg of interest
@@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
  static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
                                                      struct blkcg_policy *pol)
  {
-       return blkcg ? blkcg->pd[pol->plid] : NULL;
+       return blkcg ? blkcg->cpd[pol->plid] : NULL;
  }
  
  /**
@@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
         return pd ? pd->blkg : NULL;
  }
  
+/* Map per-blkcg policy data back to its blkcg; a NULL @cpd maps to NULL. */
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+       if (!cpd)
+               return NULL;
+
+       return cpd->blkcg;
+}
+
  /**
   * blkg_path - format cgroup path of blkg
   * @blkg: blkg of interest
@@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
                 call_rcu(&blkg->rcu_head, __blkg_release_rcu);
  }
  
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint);
-
  /**
   * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
   * @d_blkg: loop cursor pointing to the current descendant
@@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
          * or if either the blkcg or queue is going away.  Fall back to
          * root_rl in such cases.
          */
-       blkg = blkg_lookup_create(blkcg, q);
-       if (IS_ERR(blkg))
+       blkg = blkg_lookup(blkcg, q);
+       if (unlikely(!blkg))
                 goto root_rl;
  
         blkg_get(blkg);
@@ -394,8 +462,7 @@ root_rl:
   */
  static inline void blk_put_rl(struct request_list *rl)
  {
-       /* root_rl may not have blkg set */
-       if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+       if (rl->blkg->blkcg != &blkcg_root)
                 blkg_put(rl->blkg);
  }
  
@@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
  #define blk_queue_for_each_rl(rl, q)   \
         for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
  
-static inline void blkg_stat_init(struct blkg_stat *stat)
+static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
  {
-       u64_stats_init(&stat->syncp);
+       int ret;
+
+       ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+       if (ret)
+               return ret;
+
+       atomic64_set(&stat->aux_cnt, 0);
+       return 0;
+}
+
+static inline void blkg_stat_exit(struct blkg_stat *stat)
+{
+       percpu_counter_destroy(&stat->cpu_cnt);
  }
  
  /**
@@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
   * @stat: target blkg_stat
   * @val: value to add
   *
- * Add @val to @stat.  The caller is responsible for synchronizing calls to
- * this function.
+ * Add @val to @stat.  The caller must ensure that an IRQ on the same CPU
+ * doesn't re-enter this function for the same counter.
   */
  static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
  {
-       u64_stats_update_begin(&stat->syncp);
-       stat->cnt += val;
-       u64_stats_update_end(&stat->syncp);
+       __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
  }
  
  /**
   * blkg_stat_read - read the current value of a blkg_stat
   * @stat: blkg_stat to read
- *
- * Read the current value of @stat.  This function can be called without
- * synchroniztion and takes care of u64 atomicity.
   */
  static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
  {
-       unsigned int start;
-       uint64_t v;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&stat->syncp);
-               v = stat->cnt;
-       } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
-       return v;
+       return percpu_counter_sum_positive(&stat->cpu_cnt);
  }
  
  /**
@@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
   */
  static inline void blkg_stat_reset(struct blkg_stat *stat)
  {
-       stat->cnt = 0;
+       percpu_counter_set(&stat->cpu_cnt, 0);
+       atomic64_set(&stat->aux_cnt, 0);
  }
  
  /**
- * blkg_stat_merge - merge a blkg_stat into another
+ * blkg_stat_add_aux - add a blkg_stat into another's aux count
   * @to: the destination blkg_stat
   * @from: the source
   *
- * Add @from's count to @to.
+ * Add @from's count including the aux one to @to's aux count.
   */
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+static inline void blkg_stat_add_aux(struct blkg_stat *to,
+                                    struct blkg_stat *from)
  {
-       blkg_stat_add(to, blkg_stat_read(from));
+       atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
+                    &to->aux_cnt);
  }
  
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
  {
-       u64_stats_init(&rwstat->syncp);
+       int i, ret;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+               ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+               if (ret) {
+                       while (--i >= 0)
+                               percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+                       return ret;
+               }
+               atomic64_set(&rwstat->aux_cnt[i], 0);
+       }
+       return 0;
+}
+
+static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+       int i;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               percpu_counter_destroy(&rwstat->cpu_cnt[i]);
  }
  
  /**
@@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
  static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
                                    int rw, uint64_t val)
  {
-       u64_stats_update_begin(&rwstat->syncp);
+       struct percpu_counter *cnt;
  
         if (rw & REQ_WRITE)
-               rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
         else
-               rwstat->cnt[BLKG_RWSTAT_READ] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+       __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+
         if (rw & REQ_SYNC)
-               rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
         else
-               rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
  
-       u64_stats_update_end(&rwstat->syncp);
+       __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
  }
  
  /**
   * blkg_rwstat_read - read the current values of a blkg_rwstat
   * @rwstat: blkg_rwstat to read
   *
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
+ * Read the current snapshot of @rwstat and return it in the aux counts of
+ * the returned blkg_rwstat; its cpu_cnt[] counters are left uninitialized.
   */
  static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
  {
-       unsigned int start;
-       struct blkg_rwstat tmp;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&rwstat->syncp);
-               tmp = *rwstat;
-       } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+       struct blkg_rwstat result;
+       int i;
  
-       return tmp;
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               atomic64_set(&result.aux_cnt[i],
+                            percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
+       return result;
  }
  
  /**
@@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
  {
         struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
  
-       return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+       return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
  }
  
  /**
@@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
   */
  static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
  {
-       memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+       int i;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+               percpu_counter_set(&rwstat->cpu_cnt[i], 0);
+               atomic64_set(&rwstat->aux_cnt[i], 0);
+       }
  }
  
  /**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
   * @to: the destination blkg_rwstat
   * @from: the source
   *
- * Add @from's counts to @to.
+ * Add @from's count including the aux one to @to's aux count.
   */
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
-                                    struct blkg_rwstat *from)
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+                                      struct blkg_rwstat *from)
  {
         struct blkg_rwstat v = blkg_rwstat_read(from);
         int i;
  
-       u64_stats_update_begin(&to->syncp);
         for (i = 0; i < BLKG_RWSTAT_NR; i++)
-               to->cnt[i] += v.cnt[i];
-       u64_stats_update_end(&to->syncp);
+               atomic64_add(atomic64_read(&v.aux_cnt[i]) +
+                            atomic64_read(&from->aux_cnt[i]),
+                            &to->aux_cnt[i]);
+}
+
+/*
+ * blk_throtl_bio() is provided externally when CONFIG_BLK_DEV_THROTTLING is
+ * set; without it the stub below always returns false.
+ */
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+                          struct bio *bio);
+#else
+static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+                                 struct bio *bio)
+{
+       return false;
+}
+#endif
+
+/**
+ * blkcg_bio_issue_check - throttle and account a bio that is being issued
+ * @q: request_queue @bio is being issued to
+ * @bio: bio being issued
+ *
+ * Looks up the blkg for @bio's blkcg on @q under RCU read lock; if
+ * blkg_lookup() returns %NULL, blkg_lookup_create() is tried under
+ * @q->queue_lock and a failure there leaves the blkg %NULL.  The bio is
+ * then passed to blk_throtl_bio().  Unless that returns %true, the bio's
+ * size and one IO are added to the blkg's stat_bytes and stat_ios, using
+ * @q->root_blkg when no blkg was obtained.
+ *
+ * Returns %false if blk_throtl_bio() returned %true, %true otherwise.
+ */
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+                                        struct bio *bio)
+{
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+       bool throtl = false;
+
+       rcu_read_lock();
+       blkcg = bio_blkcg(bio);
+
+       blkg = blkg_lookup(blkcg, q);
+       if (unlikely(!blkg)) {
+               spin_lock_irq(q->queue_lock);
+               blkg = blkg_lookup_create(blkcg, q);
+               if (IS_ERR(blkg))
+                       blkg = NULL;
+               spin_unlock_irq(q->queue_lock);
+       }
+
+       throtl = blk_throtl_bio(q, blkg, bio);
+
+       if (!throtl) {
+               blkg = blkg ?: q->root_blkg;
+               /*
+                * blkg_rwstat_add() tests REQ_WRITE / REQ_SYNC, which are
+                * carried in bio->bi_rw; bio->bi_flags holds BIO_* state
+                * bits and would misclassify the IO.
+                */
+               blkg_rwstat_add(&blkg->stat_bytes, bio->bi_rw,
+                               bio->bi_iter.bi_size);
+               blkg_rwstat_add(&blkg->stat_ios, bio->bi_rw, 1);
+       }
+
+       rcu_read_unlock();
+       return !throtl;
  }
  
  #else  /* CONFIG_BLK_CGROUP */
@@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { }
  static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
  static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
  
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+                                        struct bio *bio) { return true; }
+
  #define blk_queue_for_each_rl(rl, q)   \
         for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
  
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h

index 1f36945..1a96fda 100644 (file)
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -27,7 +27,7 @@ SUBSYS(cpuacct)
  #endif
  
  #if IS_ENABLED(CONFIG_BLK_CGROUP)
-SUBSYS(blkio)
+SUBSYS(io)
  #endif
  
  #if IS_ENABLED(CONFIG_MEMCG)
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h

index 123be25..5d4e9c4 100644 (file)
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
  }
  
  int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+size_t kernfs_path_len(struct kernfs_node *kn);
  char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
                                 size_t buflen);
  void pr_cont_kernfs_name(struct kernfs_node *kn);
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
  static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
  { return -ENOSYS; }
  
+static inline size_t kernfs_path_len(struct kernfs_node *kn)
+{ return 0; }
+
  static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
                                               size_t buflen)
  { return NULL; }
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h

index a7aa607..fff846b 100644 (file)
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
         TP_ARGS(inode, flags)
  );
  
+#ifdef CREATE_TRACE_POINTS
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/*
+ * Buffer size for @wb's memcg cgroup path: kernfs_path_len() of the cgroup's
+ * kernfs node plus one (presumably for the terminating NUL).
+ */
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+       struct cgroup *cgrp = wb->memcg_css->cgroup;
+
+       return 1 + kernfs_path_len(cgrp->kn);
+}
+
+/*
+ * Write the path of @wb's memcg cgroup into @buf with cgroup_path() and warn
+ * once if the pointer it returns is not @buf itself.
+ *
+ * NOTE(review): the buffer length is recomputed here from kernfs_path_len()
+ * instead of being passed in, so this relies on it still matching the size
+ * @buf was allocated with - confirm against the callers.
+ */
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+       struct cgroup *memcg_cgrp = wb->memcg_css->cgroup;
+       size_t buflen = kernfs_path_len(memcg_cgrp->kn) + 1;
+       char *ret;
+
+       ret = cgroup_path(memcg_cgrp, buf, buflen);
+       WARN_ON_ONCE(ret != buf);
+}
+
+/* Cgroup path buffer size for @wbc; 2 when @wbc has no wb attached. */
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+       if (!wbc->wb)
+               return 2;
+
+       return __trace_wb_cgroup_size(wbc->wb);
+}
+
+/* Fill @buf with the cgroup path of @wbc->wb, or "/" when @wbc has no wb. */
+static inline void __trace_wbc_assign_cgroup(char *buf,
+                                            struct writeback_control *wbc)
+{
+       if (!wbc->wb) {
+               strcpy(buf, "/");
+               return;
+       }
+
+       __trace_wb_assign_cgroup(buf, wbc->wb);
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+/* Without cgroup writeback the reported path is always "/": 2 bytes. */
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+       return sizeof("/");
+}
+
+/* Without cgroup writeback every wb is reported as cgroup "/". */
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+       buf[0] = '/';
+       buf[1] = '\0';
+}
+
+/* Without cgroup writeback the reported path is always "/": 2 bytes. */
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+       return sizeof("/");
+}
+
+/* Without cgroup writeback every wbc is reported as cgroup "/". */
+static inline void __trace_wbc_assign_cgroup(char *buf,
+                                            struct writeback_control *wbc)
+{
+       buf[0] = '/';
+       buf[1] = '\0';
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+#endif /* CREATE_TRACE_POINTS */
+
  DECLARE_EVENT_CLASS(writeback_write_inode_template,
  
         TP_PROTO(struct inode *inode, struct writeback_control *wbc),
@@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
                 __array(char, name, 32)
                 __field(unsigned long, ino)
                 __field(int, sync_mode)
+               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
         ),
  
         TP_fast_assign(
@@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
                         dev_name(inode_to_bdi(inode)->dev), 32);
                 __entry->ino            = inode->i_ino;
                 __entry->sync_mode      = wbc->sync_mode;
+               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
         ),
  
-       TP_printk("bdi %s: ino=%lu sync_mode=%d",
+       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
                 __entry->name,
                 __entry->ino,
-               __entry->sync_mode
+               __entry->sync_mode,
+               __get_str(cgroup)
         )
  );
  
@@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
  );
  
  DECLARE_EVENT_CLASS(writeback_work_class,
-       TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
-       TP_ARGS(bdi, work),
+       TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
+       TP_ARGS(wb, work),
         TP_STRUCT__entry(
                 __array(char, name, 32)
                 __field(long, nr_pages)
@@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class,
                 __field(int, range_cyclic)
                 __field(int, for_background)
                 __field(int, reason)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
         ),
         TP_fast_assign(
                 strncpy(__entry->name,
-                       bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+                       wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
                 __entry->nr_pages = work->nr_pages;
                 __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
                 __entry->sync_mode = work->sync_mode;
@@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
                 __entry->range_cyclic = work->range_cyclic;
                 __entry->for_background = work->for_background;
                 __entry->reason = work->reason;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
         ),
         TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
-                 "kupdate=%d range_cyclic=%d background=%d reason=%s",
+                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
                   __entry->name,
                   MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
                   __entry->nr_pages,
@@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class,
                   __entry->for_kupdate,
                   __entry->range_cyclic,
                   __entry->for_background,
-                 __print_symbolic(__entry->reason, WB_WORK_REASON)
+                 __print_symbolic(__entry->reason, WB_WORK_REASON),
+                 __get_str(cgroup)
         )
  );
  #define DEFINE_WRITEBACK_WORK_EVENT(name) \
  DEFINE_EVENT(writeback_work_class, name, \
-       TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
-       TP_ARGS(bdi, work))
+       TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
+       TP_ARGS(wb, work))
  DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
  DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
  DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written,
  );
  
  DECLARE_EVENT_CLASS(writeback_class,
-       TP_PROTO(struct backing_dev_info *bdi),
-       TP_ARGS(bdi),
+       TP_PROTO(struct bdi_writeback *wb),
+       TP_ARGS(wb),
         TP_STRUCT__entry(
                 __array(char, name, 32)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
         ),
         TP_fast_assign(
-               strncpy(__entry->name, dev_name(bdi->dev), 32);
+               strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
         ),
-       TP_printk("bdi %s",
-                 __entry->name
+       TP_printk("bdi %s: cgroup=%s",
+                 __entry->name,
+                 __get_str(cgroup)
         )
  );
  #define DEFINE_WRITEBACK_EVENT(name) \
  DEFINE_EVENT(writeback_class, name, \
-       TP_PROTO(struct backing_dev_info *bdi), \
-       TP_ARGS(bdi))
+       TP_PROTO(struct bdi_writeback *wb), \
+       TP_ARGS(wb))
  
  DEFINE_WRITEBACK_EVENT(writeback_nowork);
  DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
+
+TRACE_EVENT(writeback_bdi_register,
+       TP_PROTO(struct backing_dev_info *bdi),
+       TP_ARGS(bdi),
+       TP_STRUCT__entry(
+               __array(char, name, 32)
+       ),
+       TP_fast_assign(
+               strncpy(__entry->name, dev_name(bdi->dev), 32);
+       ),
+       TP_printk("bdi %s",
+               __entry->name
+       )
+);
  
  DECLARE_EVENT_CLASS(wbc_class,
         TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
                 __field(int, range_cyclic)
                 __field(long, range_start)
                 __field(long, range_end)
+               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
         ),
  
         TP_fast_assign(
@@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class,
                 __entry->range_cyclic   = wbc->range_cyclic;
                 __entry->range_start    = (long)wbc->range_start;
                 __entry->range_end      = (long)wbc->range_end;
+               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
         ),
  
         TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
                 "bgrd=%d reclm=%d cyclic=%d "
-               "start=0x%lx end=0x%lx",
+               "start=0x%lx end=0x%lx cgroup=%s",
                 __entry->name,
                 __entry->nr_to_write,
                 __entry->pages_skipped,
@@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class,
                 __entry->for_reclaim,
                 __entry->range_cyclic,
                 __entry->range_start,
-               __entry->range_end)
+               __entry->range_end,
+               __get_str(cgroup)
+       )
  )
  
  #define DEFINE_WBC_EVENT(name) \
@@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io,
                 __field(long,           age)
                 __field(int,            moved)
                 __field(int,            reason)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
         ),
         TP_fast_assign(
                 unsigned long *older_than_this = work->older_than_this;
@@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io,
                                   (jiffies - *older_than_this) * 1000 / HZ : -1;
                 __entry->moved  = moved;
                 __entry->reason = work->reason;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
         ),
-       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
+       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
                 __entry->name,
                 __entry->older, /* older_than_this in jiffies */
                 __entry->age,   /* older_than_this in relative milliseconds */
                 __entry->moved,
-               __print_symbolic(__entry->reason, WB_WORK_REASON)
+               __print_symbolic(__entry->reason, WB_WORK_REASON),
+               __get_str(cgroup)
         )
  );
  
@@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state,
  
  TRACE_EVENT(bdi_dirty_ratelimit,
  
-       TP_PROTO(struct backing_dev_info *bdi,
+       TP_PROTO(struct bdi_writeback *wb,
                  unsigned long dirty_rate,
                  unsigned long task_ratelimit),
  
-       TP_ARGS(bdi, dirty_rate, task_ratelimit),
+       TP_ARGS(wb, dirty_rate, task_ratelimit),
  
         TP_STRUCT__entry(
                 __array(char,           bdi, 32)
@@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit,
                 __field(unsigned long,  dirty_ratelimit)
                 __field(unsigned long,  task_ratelimit)
                 __field(unsigned long,  balanced_dirty_ratelimit)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
         ),
  
         TP_fast_assign(
-               strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-               __entry->write_bw       = KBps(bdi->wb.write_bandwidth);
-               __entry->avg_write_bw   = KBps(bdi->wb.avg_write_bandwidth);
+               strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+               __entry->write_bw       = KBps(wb->write_bandwidth);
+               __entry->avg_write_bw   = KBps(wb->avg_write_bandwidth);
                 __entry->dirty_rate     = KBps(dirty_rate);
-               __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
+               __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
                 __entry->task_ratelimit = KBps(task_ratelimit);
                 __entry->balanced_dirty_ratelimit =
-                                       KBps(bdi->wb.balanced_dirty_ratelimit);
+                                       KBps(wb->balanced_dirty_ratelimit);
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
         ),
  
         TP_printk("bdi %s: "
                   "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
                   "dirty_ratelimit=%lu task_ratelimit=%lu "
-                 "balanced_dirty_ratelimit=%lu",
+                 "balanced_dirty_ratelimit=%lu cgroup=%s",
                   __entry->bdi,
                   __entry->write_bw,            /* write bandwidth */
                   __entry->avg_write_bw,        /* avg write bandwidth */
                   __entry->dirty_rate,          /* bdi dirty rate */
                   __entry->dirty_ratelimit,     /* base ratelimit */
                   __entry->task_ratelimit, /* ratelimit with position control */
-                 __entry->balanced_dirty_ratelimit /* the balanced ratelimit */
+                 __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
+                 __get_str(cgroup)
         )
  );
  
  TRACE_EVENT(balance_dirty_pages,
  
-       TP_PROTO(struct backing_dev_info *bdi,
+       TP_PROTO(struct bdi_writeback *wb,
                  unsigned long thresh,
                  unsigned long bg_thresh,
                  unsigned long dirty,
@@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages,
                  long pause,
                  unsigned long start_time),
  
-       TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
+       TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
                 dirty_ratelimit, task_ratelimit,
                 dirtied, period, pause, start_time),
  
@@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages,
                 __field(         long,  pause)
                 __field(unsigned long,  period)
                 __field(         long,  think)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
         ),
  
         TP_fast_assign(
                 unsigned long freerun = (thresh + bg_thresh) / 2;
-               strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
+               strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
  
                 __entry->limit          = global_wb_domain.dirty_limit;
                 __entry->setpoint       = (global_wb_domain.dirty_limit +
@@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages,
                 __entry->period         = period * 1000 / HZ;
                 __entry->pause          = pause * 1000 / HZ;
                 __entry->paused         = (jiffies - start_time) * 1000 / HZ;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
         ),
  
  
@@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages,
                   "bdi_setpoint=%lu bdi_dirty=%lu "
                   "dirty_ratelimit=%lu task_ratelimit=%lu "
                   "dirtied=%u dirtied_pause=%u "
-                 "paused=%lu pause=%ld period=%lu think=%ld",
+                 "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
                   __entry->bdi,
                   __entry->limit,
                   __entry->setpoint,
@@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages,
                   __entry->paused,      /* ms */
                   __entry->pause,       /* ms */
                   __entry->period,      /* ms */
-                 __entry->think        /* ms */
+                 __entry->think,       /* ms */
+                 __get_str(cgroup)
           )
  );
  
@@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
                 __field(unsigned long, ino)
                 __field(unsigned long, state)
                 __field(unsigned long, dirtied_when)
+               __dynamic_array(char, cgroup,
+                               __trace_wb_cgroup_size(inode_to_wb(inode)))
         ),
  
         TP_fast_assign(
@@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
                 __entry->ino            = inode->i_ino;
                 __entry->state          = inode->i_state;
                 __entry->dirtied_when   = inode->dirtied_when;
+               __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
         ),
  
-       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu",
+       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
                   __entry->name,
                   __entry->ino,
                   show_inode_state(__entry->state),
                   __entry->dirtied_when,
-                 (jiffies - __entry->dirtied_when) / HZ
+                 (jiffies - __entry->dirtied_when) / HZ,
+                 __get_str(cgroup)
         )
  );
  
@@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
                 __field(unsigned long, writeback_index)
                 __field(long, nr_to_write)
                 __field(unsigned long, wrote)
+               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
         ),
  
         TP_fast_assign(
@@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
                 __entry->writeback_index = inode->i_mapping->writeback_index;
                 __entry->nr_to_write    = nr_to_write;
                 __entry->wrote          = nr_to_write - wbc->nr_to_write;
+               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
         ),
  
         TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
-                 "index=%lu to_write=%ld wrote=%lu",
+                 "index=%lu to_write=%ld wrote=%lu cgroup=%s",
                   __entry->name,
                   __entry->ino,
                   show_inode_state(__entry->state),
@@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
                   (jiffies - __entry->dirtied_when) / HZ,
                   __entry->writeback_index,
                   __entry->nr_to_write,
-                 __entry->wrote
+                 __entry->wrote,
+                 __get_str(cgroup)
         )
  );
  
diff --git a/mm/backing-dev.c b/mm/backing-dev.c

index ee8d7fd..2df8ddc 100644 (file)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
         int ret = 0;
  
         memcg = mem_cgroup_from_css(memcg_css);
-       blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+       blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
         blkcg = css_to_blkcg(blkcg_css);
         memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
         blkcg_cgwb_list = &blkcg->cgwb_list;
@@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
  
                         /* see whether the blkcg association has changed */
                         blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
-                                                    &blkio_cgrp_subsys);
+                                                    &io_cgrp_subsys);
                         if (unlikely(wb->blkcg_css != blkcg_css ||
                                      !wb_tryget(wb)))
                                 wb = NULL;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index 5cccc12..0a931cd 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1289,7 +1289,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
         wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
         wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
  
-       trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
+       trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
  }
  
  static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
@@ -1683,7 +1683,7 @@ static void balance_dirty_pages(struct address_space *mapping,
                  * do a reset, as it may be a light dirtier.
                  */
                 if (pause < min_pause) {
-                       trace_balance_dirty_pages(bdi,
+                       trace_balance_dirty_pages(wb,
                                                   sdtc->thresh,
                                                   sdtc->bg_thresh,
                                                   sdtc->dirty,
@@ -1712,7 +1712,7 @@ static void balance_dirty_pages(struct address_space *mapping,
                 }
  
  pause:
-               trace_balance_dirty_pages(bdi,
+               trace_balance_dirty_pages(wb,
                                           sdtc->thresh,
                                           sdtc->bg_thresh,
                                           sdtc->dirty,
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 11 Sep 2015 01:56:14 +0000 (18:56 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 11 Sep 2015 01:56:14 +0000 (18:56 -0700)
Documentation/cgroups/blkio-controller.txt		patch \| blob \| history
Documentation/cgroups/unified-hierarchy.txt		patch \| blob \| history
block/bio.c		patch \| blob \| history
block/blk-cgroup.c		patch \| blob \| history
block/blk-core.c		patch \| blob \| history
block/blk-throttle.c		patch \| blob \| history
block/blk.h		patch \| blob \| history
block/cfq-iosched.c		patch \| blob \| history
fs/fs-writeback.c		patch \| blob \| history
fs/kernfs/dir.c		patch \| blob \| history
include/linux/backing-dev.h		patch \| blob \| history
include/linux/blk-cgroup.h		patch \| blob \| history
include/linux/cgroup_subsys.h		patch \| blob \| history
include/linux/kernfs.h		patch \| blob \| history
include/trace/events/writeback.h		patch \| blob \| history
mm/backing-dev.c		patch \| blob \| history
mm/page-writeback.c		patch \| blob \| history