blkcg: make blkcg_[rw]stat per-cpu
author Tejun Heo <tj@kernel.org>
Tue, 18 Aug 2015 21:55:22 +0000 (14:55 -0700)
committer Jens Axboe <axboe@fb.com>
Tue, 18 Aug 2015 22:49:17 +0000 (15:49 -0700)
blkg_[rw]stat are used as stat counters for blkcg policies.  They aren't
per-cpu by themselves and blk-throttle makes them per-cpu by wrapping
around them.  This patch makes blkg_[rw]stat per-cpu and drops the ad-hoc
per-cpu wrapping in blk-throttle.

* blkg_[rw]stat->cnt is replaced with cpu_cnt which is struct
  percpu_counter.  This makes syncp unnecessary as remote accesses are
  handled by percpu_counter itself.

* blkg_[rw]stat_init() can now fail due to percpu allocation failure
  and thus are updated to return int.

* percpu_counters need explicit freeing.  blkg_[rw]stat_exit() added.

* As blkg_rwstat->cpu_cnt[] can't be read directly anymore, the results
  of read and sum operations are stored in ->aux_cnt[] instead.

* Custom per-cpu stat implementation in blk-throttle is removed.

This makes all blkcg stat counters per-cpu without complicating policy
implementations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
block/blk-cgroup.c
block/blk-throttle.c
block/cfq-iosched.c
include/linux/blk-cgroup.h

index ff79b52..02a2d02 100644 (file)
@@ -539,9 +539,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 
        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
-                          (unsigned long long)rwstat->cnt[i]);
+                          (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
 
-       v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+       v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
 }
@@ -643,8 +644,9 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
                tmp = blkg_rwstat_read(rwstat);
 
                for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       sum.cnt[i] += tmp.cnt[i] +
-                               atomic64_read(&rwstat->aux_cnt[i]);
+                       atomic64_add(atomic64_read(&tmp.aux_cnt[i]) +
+                                    atomic64_read(&rwstat->aux_cnt[i]),
+                                    &sum.aux_cnt[i]);
        }
        rcu_read_unlock();
 
index 29c22ed..c0b2263 100644 (file)
@@ -83,14 +83,6 @@ enum tg_state_flags {
 
 #define rb_entry_tg(node)      rb_entry((node), struct throtl_grp, rb_node)
 
-/* Per-cpu group stats */
-struct tg_stats_cpu {
-       /* total bytes transferred */
-       struct blkg_rwstat              service_bytes;
-       /* total IOs serviced, post merge */
-       struct blkg_rwstat              serviced;
-};
-
 struct throtl_grp {
        /* must be the first member */
        struct blkg_policy_data pd;
@@ -142,8 +134,10 @@ struct throtl_grp {
        unsigned long slice_start[2];
        unsigned long slice_end[2];
 
-       /* Per cpu stats pointer */
-       struct tg_stats_cpu __percpu *stats_cpu;
+       /* total bytes transferred */
+       struct blkg_rwstat              service_bytes;
+       /* total IOs serviced, post merge */
+       struct blkg_rwstat              serviced;
 };
 
 struct throtl_data
@@ -337,17 +331,15 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 {
        struct throtl_grp *tg;
-       int rw, cpu;
+       int rw;
 
        tg = kzalloc_node(sizeof(*tg), gfp, node);
        if (!tg)
-               return NULL;
+               goto err;
 
-       tg->stats_cpu = alloc_percpu_gfp(struct tg_stats_cpu, gfp);
-       if (!tg->stats_cpu) {
-               kfree(tg);
-               return NULL;
-       }
+       if (blkg_rwstat_init(&tg->service_bytes, gfp) ||
+           blkg_rwstat_init(&tg->serviced, gfp))
+               goto err_free_tg;
 
        throtl_service_queue_init(&tg->service_queue);
 
@@ -362,14 +354,14 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
        tg->iops[READ] = -1;
        tg->iops[WRITE] = -1;
 
-       for_each_possible_cpu(cpu) {
-               struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu);
-
-               blkg_rwstat_init(&stats_cpu->service_bytes);
-               blkg_rwstat_init(&stats_cpu->serviced);
-       }
-
        return &tg->pd;
+
+err_free_tg:
+       blkg_rwstat_exit(&tg->serviced);
+       blkg_rwstat_exit(&tg->service_bytes);
+       kfree(tg);
+err:
+       return NULL;
 }
 
 static void throtl_pd_init(struct blkg_policy_data *pd)
@@ -427,21 +419,17 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
        struct throtl_grp *tg = pd_to_tg(pd);
 
        del_timer_sync(&tg->service_queue.pending_timer);
-       free_percpu(tg->stats_cpu);
+       blkg_rwstat_exit(&tg->serviced);
+       blkg_rwstat_exit(&tg->service_bytes);
        kfree(tg);
 }
 
 static void throtl_pd_reset_stats(struct blkg_policy_data *pd)
 {
        struct throtl_grp *tg = pd_to_tg(pd);
-       int cpu;
 
-       for_each_possible_cpu(cpu) {
-               struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-               blkg_rwstat_reset(&sc->service_bytes);
-               blkg_rwstat_reset(&sc->serviced);
-       }
+       blkg_rwstat_reset(&tg->service_bytes);
+       blkg_rwstat_reset(&tg->serviced);
 }
 
 static struct throtl_grp *
@@ -855,7 +843,6 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
                                         int rw)
 {
        struct throtl_grp *tg = blkg_to_tg(blkg);
-       struct tg_stats_cpu *stats_cpu;
        unsigned long flags;
 
        /*
@@ -865,10 +852,8 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
         */
        local_irq_save(flags);
 
-       stats_cpu = this_cpu_ptr(tg->stats_cpu);
-
-       blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
-       blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
+       blkg_rwstat_add(&tg->serviced, rw, 1);
+       blkg_rwstat_add(&tg->service_bytes, rw, bytes);
 
        local_irq_restore(flags);
 }
@@ -1176,27 +1161,9 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
        }
 }
 
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
-                               struct blkg_policy_data *pd, int off)
-{
-       struct throtl_grp *tg = pd_to_tg(pd);
-       struct blkg_rwstat rwstat = { }, tmp;
-       int i, cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-               tmp = blkg_rwstat_read((void *)sc + off);
-               for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       rwstat.cnt[i] += tmp.cnt[i];
-       }
-
-       return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
+static int tg_print_rwstat(struct seq_file *sf, void *v)
 {
-       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
                          &blkcg_policy_throtl, seq_cft(sf)->private, true);
        return 0;
 }
@@ -1337,13 +1304,13 @@ static struct cftype throtl_files[] = {
        },
        {
                .name = "throttle.io_service_bytes",
-               .private = offsetof(struct tg_stats_cpu, service_bytes),
-               .seq_show = tg_print_cpu_rwstat,
+               .private = offsetof(struct throtl_grp, service_bytes),
+               .seq_show = tg_print_rwstat,
        },
        {
                .name = "throttle.io_serviced",
-               .private = offsetof(struct tg_stats_cpu, serviced),
-               .seq_show = tg_print_cpu_rwstat,
+               .private = offsetof(struct throtl_grp, serviced),
+               .seq_show = tg_print_rwstat,
        },
        { }     /* terminate */
 };
index b272cff..71e55c9 100644 (file)
@@ -1542,27 +1542,55 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfqg_stats_init(struct cfqg_stats *stats)
+static void cfqg_stats_exit(struct cfqg_stats *stats)
 {
-       blkg_rwstat_init(&stats->service_bytes);
-       blkg_rwstat_init(&stats->serviced);
-       blkg_rwstat_init(&stats->merged);
-       blkg_rwstat_init(&stats->service_time);
-       blkg_rwstat_init(&stats->wait_time);
-       blkg_rwstat_init(&stats->queued);
+       blkg_rwstat_exit(&stats->service_bytes);
+       blkg_rwstat_exit(&stats->serviced);
+       blkg_rwstat_exit(&stats->merged);
+       blkg_rwstat_exit(&stats->service_time);
+       blkg_rwstat_exit(&stats->wait_time);
+       blkg_rwstat_exit(&stats->queued);
 
-       blkg_stat_init(&stats->sectors);
-       blkg_stat_init(&stats->time);
+       blkg_stat_exit(&stats->sectors);
+       blkg_stat_exit(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       blkg_stat_exit(&stats->unaccounted_time);
+       blkg_stat_exit(&stats->avg_queue_size_sum);
+       blkg_stat_exit(&stats->avg_queue_size_samples);
+       blkg_stat_exit(&stats->dequeue);
+       blkg_stat_exit(&stats->group_wait_time);
+       blkg_stat_exit(&stats->idle_time);
+       blkg_stat_exit(&stats->empty_time);
+#endif
+}
+
+static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
+{
+       if (blkg_rwstat_init(&stats->service_bytes, gfp) ||
+           blkg_rwstat_init(&stats->serviced, gfp) ||
+           blkg_rwstat_init(&stats->merged, gfp) ||
+           blkg_rwstat_init(&stats->service_time, gfp) ||
+           blkg_rwstat_init(&stats->wait_time, gfp) ||
+           blkg_rwstat_init(&stats->queued, gfp) ||
+
+           blkg_stat_init(&stats->sectors, gfp) ||
+           blkg_stat_init(&stats->time, gfp))
+               goto err;
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-       blkg_stat_init(&stats->unaccounted_time);
-       blkg_stat_init(&stats->avg_queue_size_sum);
-       blkg_stat_init(&stats->avg_queue_size_samples);
-       blkg_stat_init(&stats->dequeue);
-       blkg_stat_init(&stats->group_wait_time);
-       blkg_stat_init(&stats->idle_time);
-       blkg_stat_init(&stats->empty_time);
+       if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
+           blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+           blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+           blkg_stat_init(&stats->dequeue, gfp) ||
+           blkg_stat_init(&stats->group_wait_time, gfp) ||
+           blkg_stat_init(&stats->idle_time, gfp) ||
+           blkg_stat_init(&stats->empty_time, gfp))
+               goto err;
 #endif
+       return 0;
+err:
+       cfqg_stats_exit(stats);
+       return -ENOMEM;
 }
 
 static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
@@ -1602,7 +1630,10 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
                return NULL;
 
        cfq_init_cfqg_base(cfqg);
-       cfqg_stats_init(&cfqg->stats);
+       if (cfqg_stats_init(&cfqg->stats, gfp)) {
+               kfree(cfqg);
+               return NULL;
+       }
 
        return &cfqg->pd;
 }
@@ -1642,7 +1673,10 @@ static void cfq_pd_offline(struct blkg_policy_data *pd)
 
 static void cfq_pd_free(struct blkg_policy_data *pd)
 {
-       return kfree(pd);
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
+
+       cfqg_stats_exit(&cfqg->stats);
+       return kfree(cfqg);
 }
 
 static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
index e809227..fdc7ac0 100644 (file)
  */
 
 #include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
+#include <linux/percpu_counter.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
 #include <linux/atomic.h>
 
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH    (INT_MAX / 2)
+
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX                UINT_MAX
 
@@ -55,17 +58,16 @@ struct blkcg {
 
 /*
  * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
- * recursive.  Used to carry stats of dead children.
+ * recursive.  Used to carry stats of dead children, and, for blkg_rwstat,
+ * to carry result values from read and sum operations.
  */
 struct blkg_stat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt;
+       struct percpu_counter           cpu_cnt;
        atomic64_t                      aux_cnt;
 };
 
 struct blkg_rwstat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt[BLKG_RWSTAT_NR];
+       struct percpu_counter           cpu_cnt[BLKG_RWSTAT_NR];
        atomic64_t                      aux_cnt[BLKG_RWSTAT_NR];
 };
 
@@ -486,10 +488,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
 #define blk_queue_for_each_rl(rl, q)   \
        for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
 
-static inline void blkg_stat_init(struct blkg_stat *stat)
+static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
 {
-       u64_stats_init(&stat->syncp);
+       int ret;
+
+       ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+       if (ret)
+               return ret;
+
        atomic64_set(&stat->aux_cnt, 0);
+       return 0;
+}
+
+static inline void blkg_stat_exit(struct blkg_stat *stat)
+{
+       percpu_counter_destroy(&stat->cpu_cnt);
 }
 
 /**
@@ -497,35 +510,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
  * @stat: target blkg_stat
  * @val: value to add
  *
- * Add @val to @stat.  The caller is responsible for synchronizing calls to
- * this function.
+ * Add @val to @stat.  The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
  */
 static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
 {
-       u64_stats_update_begin(&stat->syncp);
-       stat->cnt += val;
-       u64_stats_update_end(&stat->syncp);
+       __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_stat_read - read the current value of a blkg_stat
  * @stat: blkg_stat to read
- *
- * Read the current value of @stat.  The returned value doesn't include the
- * aux count.  This function can be called without synchroniztion and takes
- * care of u64 atomicity.
  */
 static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
 {
-       unsigned int start;
-       uint64_t v;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&stat->syncp);
-               v = stat->cnt;
-       } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
-       return v;
+       return percpu_counter_sum_positive(&stat->cpu_cnt);
 }
 
 /**
@@ -534,7 +533,7 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
  */
 static inline void blkg_stat_reset(struct blkg_stat *stat)
 {
-       stat->cnt = 0;
+       percpu_counter_set(&stat->cpu_cnt, 0);
        atomic64_set(&stat->aux_cnt, 0);
 }
 
@@ -552,14 +551,28 @@ static inline void blkg_stat_add_aux(struct blkg_stat *to,
                     &to->aux_cnt);
 }
 
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
 {
-       int i;
+       int i, ret;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+               ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+               if (ret) {
+                       while (--i >= 0)
+                               percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+                       return ret;
+               }
+               atomic64_set(&rwstat->aux_cnt[i], 0);
+       }
+       return 0;
+}
 
-       u64_stats_init(&rwstat->syncp);
+static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+       int i;
 
        for (i = 0; i < BLKG_RWSTAT_NR; i++)
-               atomic64_set(&rwstat->aux_cnt[i], 0);
+               percpu_counter_destroy(&rwstat->cpu_cnt[i]);
 }
 
 /**
@@ -574,39 +587,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
 static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
                                   int rw, uint64_t val)
 {
-       u64_stats_update_begin(&rwstat->syncp);
+       struct percpu_counter *cnt;
 
        if (rw & REQ_WRITE)
-               rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
        else
-               rwstat->cnt[BLKG_RWSTAT_READ] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+       __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+
        if (rw & REQ_SYNC)
-               rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
        else
-               rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
 
-       u64_stats_update_end(&rwstat->syncp);
+       __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_rwstat_read - read the current values of a blkg_rwstat
  * @rwstat: blkg_rwstat to read
  *
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
+ * Read the current snapshot of @rwstat and return it in the aux counts.
  */
 static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
 {
-       unsigned int start;
-       struct blkg_rwstat tmp;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&rwstat->syncp);
-               tmp = *rwstat;
-       } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+       struct blkg_rwstat result;
+       int i;
 
-       return tmp;
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               atomic64_set(&result.aux_cnt[i],
+                            percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
+       return result;
 }
 
 /**
@@ -621,7 +633,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
        struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
 
-       return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+       return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
 }
 
 /**
@@ -632,10 +645,10 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 {
        int i;
 
-       memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
-
-       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+       for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+               percpu_counter_set(&rwstat->cpu_cnt[i], 0);
                atomic64_set(&rwstat->aux_cnt[i], 0);
+       }
 }
 
 /**
@@ -652,7 +665,8 @@ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
        int i;
 
        for (i = 0; i < BLKG_RWSTAT_NR; i++)
-               atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]),
+               atomic64_add(atomic64_read(&v.aux_cnt[i]) +
+                            atomic64_read(&from->aux_cnt[i]),
                             &to->aux_cnt[i]);
 }