block/cfq-iosched.c

   1 /*
   2  *  CFQ, or complete fairness queueing, disk scheduler.
   3  *
   4  *  Based on ideas from a previously unfinished io
   5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
   6  *
   7  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   8  */
   9 #include <linux/module.h>
  10 #include <linux/slab.h>
  11 #include <linux/blkdev.h>
  12 #include <linux/elevator.h>
  13 #include <linux/jiffies.h>
  14 #include <linux/rbtree.h>
  15 #include <linux/ioprio.h>
  16 #include <linux/blktrace_api.h>
  17 #include "blk.h"
  18 #include "blk-cgroup.h"
  19
  20 /*
  21  * tunables
  22  */
  23 /* max queue in one round of service */
  24 static const int cfq_quantum = 8;
  25 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
  26 /* maximum backwards seek, in KiB */
  27 static const int cfq_back_max = 16 * 1024;
  28 /* penalty of a backwards seek */
  29 static const int cfq_back_penalty = 2;
  30 static const int cfq_slice_sync = HZ / 10;
  31 static int cfq_slice_async = HZ / 25;
  32 static const int cfq_slice_async_rq = 2;
  33 static int cfq_slice_idle = HZ / 125;
  34 static int cfq_group_idle = HZ / 125;
  35 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
  36 static const int cfq_hist_divisor = 4;
  37
  38 /*
  39  * offset from end of service tree
  40  */
  41 #define CFQ_IDLE_DELAY          (HZ / 5)
  42
  43 /*
  44  * below this threshold, we consider thinktime immediate
  45  */
  46 #define CFQ_MIN_TT              (2)
  47
  48 #define CFQ_SLICE_SCALE         (5)
  49 #define CFQ_HW_QUEUE_MIN        (5)
  50 #define CFQ_SERVICE_SHIFT       12
  51
  52 #define CFQQ_SEEK_THR           (sector_t)(8 * 100)
  53 #define CFQQ_CLOSE_THR          (sector_t)(8 * 1024)
  54 #define CFQQ_SECT_THR_NONROT    (sector_t)(2 * 32)
  55 #define CFQQ_SEEKY(cfqq)        (hweight32(cfqq->seek_history) > 32/8)
  56
  57 #define RQ_CIC(rq)              icq_to_cic((rq)->elv.icq)
  58 #define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elv.priv[0])
  59 #define RQ_CFQG(rq)             (struct cfq_group *) ((rq)->elv.priv[1])
  60
  61 static struct kmem_cache *cfq_pool;
  62
  63 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
  64 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
  65 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
  66
  67 #define sample_valid(samples)   ((samples) > 80)
  68 #define rb_entry_cfqg(node)     rb_entry((node), struct cfq_group, rb_node)
  69
  70 /* blkio-related constants */
  71 #define CFQ_WEIGHT_MIN          10
  72 #define CFQ_WEIGHT_MAX          1000
  73 #define CFQ_WEIGHT_DEFAULT      500
  74
  75 struct cfq_ttime {
  76         unsigned long last_end_request;
  77
  78         unsigned long ttime_total;
  79         unsigned long ttime_samples;
  80         unsigned long ttime_mean;
  81 };
  82
  83 /*
  84  * Most of our rbtree usage is for sorting with min extraction, so
  85  * if we cache the leftmost node we don't have to walk down the tree
  86  * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
  87  * move this into the elevator for the rq sorting as well.
  88  */
  89 struct cfq_rb_root {
  90         struct rb_root rb;
  91         struct rb_node *left;
  92         unsigned count;
  93         u64 min_vdisktime;
  94         struct cfq_ttime ttime;
  95 };
  96 #define CFQ_RB_ROOT     (struct cfq_rb_root) { .rb = RB_ROOT, \
  97                         .ttime = {.last_end_request = jiffies,},}
  98
  99 /*
 100  * Per process-grouping structure
 101  */
 102 struct cfq_queue {
 103         /* reference count */
 104         int ref;
 105         /* various state flags, see below */
 106         unsigned int flags;
 107         /* parent cfq_data */
 108         struct cfq_data *cfqd;
 109         /* service_tree member */
 110         struct rb_node rb_node;
 111         /* service_tree key */
 112         unsigned long rb_key;
 113         /* prio tree member */
 114         struct rb_node p_node;
 115         /* prio tree root we belong to, if any */
 116         struct rb_root *p_root;
 117         /* sorted list of pending requests */
 118         struct rb_root sort_list;
 119         /* if fifo isn't expired, next request to serve */
 120         struct request *next_rq;
 121         /* requests queued in sort_list */
 122         int queued[2];
 123         /* currently allocated requests */
 124         int allocated[2];
 125         /* fifo list of requests in sort_list */
 126         struct list_head fifo;
 127
 128         /* time when queue got scheduled in to dispatch first request. */
 129         unsigned long dispatch_start;
 130         unsigned int allocated_slice;
 131         unsigned int slice_dispatch;
 132         /* time when first request from queue completed and slice started. */
 133         unsigned long slice_start;
 134         unsigned long slice_end;
 135         long slice_resid;
 136
 137         /* pending priority requests */
 138         int prio_pending;
 139         /* number of requests that are on the dispatch list or inside driver */
 140         int dispatched;
 141
 142         /* io prio of this group */
 143         unsigned short ioprio, org_ioprio;
 144         unsigned short ioprio_class;
 145
 146         pid_t pid;
 147
 148         u32 seek_history;
 149         sector_t last_request_pos;
 150
 151         struct cfq_rb_root *service_tree;
 152         struct cfq_queue *new_cfqq;
 153         struct cfq_group *cfqg;
 154         /* Number of sectors dispatched from queue in single dispatch round */
 155         unsigned long nr_sectors;
 156 };
 157
 158 /*
 159  * First index in the service_trees.
 160  * IDLE is handled separately, so it has negative index
 161  */
 162 enum wl_class_t {
 163         BE_WORKLOAD = 0,
 164         RT_WORKLOAD = 1,
 165         IDLE_WORKLOAD = 2,
 166         CFQ_PRIO_NR,
 167 };
 168
 169 /*
 170  * Second index in the service_trees.
 171  */
 172 enum wl_type_t {
 173         ASYNC_WORKLOAD = 0,
 174         SYNC_NOIDLE_WORKLOAD = 1,
 175         SYNC_WORKLOAD = 2
 176 };
 177
 178 struct cfqg_stats {
 179 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 180         /* total bytes transferred */
 181         struct blkg_rwstat              service_bytes;
 182         /* total IOs serviced, post merge */
 183         struct blkg_rwstat              serviced;
 184         /* number of ios merged */
 185         struct blkg_rwstat              merged;
 186         /* total time spent on device in ns, may not be accurate w/ queueing */
 187         struct blkg_rwstat              service_time;
 188         /* total time spent waiting in scheduler queue in ns */
 189         struct blkg_rwstat              wait_time;
 190         /* number of IOs queued up */
 191         struct blkg_rwstat              queued;
 192         /* total sectors transferred */
 193         struct blkg_stat                sectors;
 194         /* total disk time and nr sectors dispatched by this group */
 195         struct blkg_stat                time;
 196 #ifdef CONFIG_DEBUG_BLK_CGROUP
 197         /* time not charged to this cgroup */
 198         struct blkg_stat                unaccounted_time;
 199         /* sum of number of ios queued across all samples */
 200         struct blkg_stat                avg_queue_size_sum;
 201         /* count of samples taken for average */
 202         struct blkg_stat                avg_queue_size_samples;
 203         /* how many times this group has been removed from service tree */
 204         struct blkg_stat                dequeue;
 205         /* total time spent waiting for it to be assigned a timeslice. */
 206         struct blkg_stat                group_wait_time;
 207         /* time spent idling for this blkcg_gq */
 208         struct blkg_stat                idle_time;
 209         /* total time with empty current active q with other requests queued */
 210         struct blkg_stat                empty_time;
 211         /* fields after this shouldn't be cleared on stat reset */
 212         uint64_t                        start_group_wait_time;
 213         uint64_t                        start_idle_time;
 214         uint64_t                        start_empty_time;
 215         uint16_t                        flags;
 216 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
 217 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 218 };
 219
 220 /* Per-cgroup data */
 221 struct cfq_group_data {
 222         /* must be the first member */
 223         struct blkcg_policy_data pd;
 224
 225         unsigned int weight;
 226         unsigned int leaf_weight;
 227 };
 228
 229 /* This is per cgroup per device grouping structure */
 230 struct cfq_group {
 231         /* must be the first member */
 232         struct blkg_policy_data pd;
 233
 234         /* group service_tree member */
 235         struct rb_node rb_node;
 236
 237         /* group service_tree key */
 238         u64 vdisktime;
 239
 240         /*
 241          * The number of active cfqgs and sum of their weights under this
 242          * cfqg.  This covers this cfqg's leaf_weight and all children's
 243          * weights, but does not cover weights of further descendants.
 244          *
 245          * If a cfqg is on the service tree, it's active.  An active cfqg
 246          * also activates its parent and contributes to the children_weight
 247          * of the parent.
 248          */
 249         int nr_active;
 250         unsigned int children_weight;
 251
 252         /*
 253          * vfraction is the fraction of vdisktime that the tasks in this
 254          * cfqg are entitled to.  This is determined by compounding the
 255          * ratios walking up from this cfqg to the root.
 256          *
 257          * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
 258          * vfractions on a service tree is approximately 1.  The sum may
 259          * deviate a bit due to rounding errors and fluctuations caused by
 260          * cfqgs entering and leaving the service tree.
 261          */
 262         unsigned int vfraction;
 263
 264         /*
 265          * There are two weights - (internal) weight is the weight of this
 266          * cfqg against the sibling cfqgs.  leaf_weight is the wight of
 267          * this cfqg against the child cfqgs.  For the root cfqg, both
 268          * weights are kept in sync for backward compatibility.
 269          */
 270         unsigned int weight;
 271         unsigned int new_weight;
 272         unsigned int dev_weight;
 273
 274         unsigned int leaf_weight;
 275         unsigned int new_leaf_weight;
 276         unsigned int dev_leaf_weight;
 277
 278         /* number of cfqq currently on this group */
 279         int nr_cfqq;
 280
 281         /*
 282          * Per group busy queues average. Useful for workload slice calc. We
 283          * create the array for each prio class but at run time it is used
 284          * only for RT and BE class and slot for IDLE class remains unused.
 285          * This is primarily done to avoid confusion and a gcc warning.
 286          */
 287         unsigned int busy_queues_avg[CFQ_PRIO_NR];
 288         /*
 289          * rr lists of queues with requests. We maintain service trees for
 290          * RT and BE classes. These trees are subdivided in subclasses
 291          * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
 292          * class there is no subclassification and all the cfq queues go on
 293          * a single tree service_tree_idle.
 294          * Counts are embedded in the cfq_rb_root
 295          */
 296         struct cfq_rb_root service_trees[2][3];
 297         struct cfq_rb_root service_tree_idle;
 298
 299         unsigned long saved_wl_slice;
 300         enum wl_type_t saved_wl_type;
 301         enum wl_class_t saved_wl_class;
 302
 303         /* number of requests that are on the dispatch list or inside driver */
 304         int dispatched;
 305         struct cfq_ttime ttime;
 306         struct cfqg_stats stats;        /* stats for this cfqg */
 307         struct cfqg_stats dead_stats;   /* stats pushed from dead children */
 308 };
 309
 310 struct cfq_io_cq {
 311         struct io_cq            icq;            /* must be the first member */
 312         struct cfq_queue        *cfqq[2];
 313         struct cfq_ttime        ttime;
 314         int                     ioprio;         /* the current ioprio */
 315 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 316         uint64_t                blkcg_serial_nr; /* the current blkcg serial */
 317 #endif
 318 };
 319
 320 /*
 321  * Per block device queue structure
 322  */
 323 struct cfq_data {
 324         struct request_queue *queue;
 325         /* Root service tree for cfq_groups */
 326         struct cfq_rb_root grp_service_tree;
 327         struct cfq_group *root_group;
 328
 329         /*
 330          * The priority currently being served
 331          */
 332         enum wl_class_t serving_wl_class;
 333         enum wl_type_t serving_wl_type;
 334         unsigned long workload_expires;
 335         struct cfq_group *serving_group;
 336
 337         /*
 338          * Each priority tree is sorted by next_request position.  These
 339          * trees are used when determining if two or more queues are
 340          * interleaving requests (see cfq_close_cooperator).
 341          */
 342         struct rb_root prio_trees[CFQ_PRIO_LISTS];
 343
 344         unsigned int busy_queues;
 345         unsigned int busy_sync_queues;
 346
 347         int rq_in_driver;
 348         int rq_in_flight[2];
 349
 350         /*
 351          * queue-depth detection
 352          */
 353         int rq_queued;
 354         int hw_tag;
 355         /*
 356          * hw_tag can be
 357          * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
 358          *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
 359          *  0 => no NCQ
 360          */
 361         int hw_tag_est_depth;
 362         unsigned int hw_tag_samples;
 363
 364         /*
 365          * idle window management
 366          */
 367         struct timer_list idle_slice_timer;
 368         struct work_struct unplug_work;
 369
 370         struct cfq_queue *active_queue;
 371         struct cfq_io_cq *active_cic;
 372
 373         /*
 374          * async queue for each priority case
 375          */
 376         struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
 377         struct cfq_queue *async_idle_cfqq;
 378
 379         sector_t last_position;
 380
 381         /*
 382          * tunables, see top of file
 383          */
 384         unsigned int cfq_quantum;
 385         unsigned int cfq_fifo_expire[2];
 386         unsigned int cfq_back_penalty;
 387         unsigned int cfq_back_max;
 388         unsigned int cfq_slice[2];
 389         unsigned int cfq_slice_async_rq;
 390         unsigned int cfq_slice_idle;
 391         unsigned int cfq_group_idle;
 392         unsigned int cfq_latency;
 393         unsigned int cfq_target_latency;
 394
 395         /*
 396          * Fallback dummy cfqq for extreme OOM conditions
 397          */
 398         struct cfq_queue oom_cfqq;
 399
 400         unsigned long last_delayed_sync;
 401 };
 402
 403 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 404
 405 static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
 406                                             enum wl_class_t class,
 407                                             enum wl_type_t type)
 408 {
 409         if (!cfqg)
 410                 return NULL;
 411
 412         if (class == IDLE_WORKLOAD)
 413                 return &cfqg->service_tree_idle;
 414
 415         return &cfqg->service_trees[class][type];
 416 }
 417
 418 enum cfqq_state_flags {
 419         CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
 420         CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
 421         CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
 422         CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
 423         CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
 424         CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
 425         CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
 426         CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
 427         CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
 428         CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
 429         CFQ_CFQQ_FLAG_split_coop,       /* shared cfqq will be splitted */
 430         CFQ_CFQQ_FLAG_deep,             /* sync cfqq experienced large depth */
 431         CFQ_CFQQ_FLAG_wait_busy,        /* Waiting for next request */
 432 };
 433
 434 #define CFQ_CFQQ_FNS(name)                                              \
 435 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
 436 {                                                                       \
 437         (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
 438 }                                                                       \
 439 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
 440 {                                                                       \
 441         (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
 442 }                                                                       \
 443 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
 444 {                                                                       \
 445         return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
 446 }
 447
 448 CFQ_CFQQ_FNS(on_rr);
 449 CFQ_CFQQ_FNS(wait_request);
 450 CFQ_CFQQ_FNS(must_dispatch);
 451 CFQ_CFQQ_FNS(must_alloc_slice);
 452 CFQ_CFQQ_FNS(fifo_expire);
 453 CFQ_CFQQ_FNS(idle_window);
 454 CFQ_CFQQ_FNS(prio_changed);
 455 CFQ_CFQQ_FNS(slice_new);
 456 CFQ_CFQQ_FNS(sync);
 457 CFQ_CFQQ_FNS(coop);
 458 CFQ_CFQQ_FNS(split_coop);
 459 CFQ_CFQQ_FNS(deep);
 460 CFQ_CFQQ_FNS(wait_busy);
 461 #undef CFQ_CFQQ_FNS
 462
 463 static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 464 {
 465         return pd ? container_of(pd, struct cfq_group, pd) : NULL;
 466 }
 467
 468 static struct cfq_group_data
 469 *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
 470 {
 471         return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
 472 }
 473
 474 static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 475 {
 476         return pd_to_blkg(&cfqg->pd);
 477 }
 478
 479 #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 480
 481 /* cfqg stats flags */
 482 enum cfqg_stats_flags {
 483         CFQG_stats_waiting = 0,
 484         CFQG_stats_idling,
 485         CFQG_stats_empty,
 486 };
 487
 488 #define CFQG_FLAG_FNS(name)                                             \
 489 static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)     \
 490 {                                                                       \
 491         stats->flags |= (1 << CFQG_stats_##name);                       \
 492 }                                                                       \
 493 static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)    \
 494 {                                                                       \
 495         stats->flags &= ~(1 << CFQG_stats_##name);                      \
 496 }                                                                       \
 497 static inline int cfqg_stats_##name(struct cfqg_stats *stats)           \
 498 {                                                                       \
 499         return (stats->flags & (1 << CFQG_stats_##name)) != 0;          \
 500 }                                                                       \
 501
 502 CFQG_FLAG_FNS(waiting)
 503 CFQG_FLAG_FNS(idling)
 504 CFQG_FLAG_FNS(empty)
 505 #undef CFQG_FLAG_FNS
 506
 507 /* This should be called with the queue_lock held. */
 508 static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
 509 {
 510         unsigned long long now;
 511
 512         if (!cfqg_stats_waiting(stats))
 513                 return;
 514
 515         now = sched_clock();
 516         if (time_after64(now, stats->start_group_wait_time))
 517                 blkg_stat_add(&stats->group_wait_time,
 518                               now - stats->start_group_wait_time);
 519         cfqg_stats_clear_waiting(stats);
 520 }
 521
 522 /* This should be called with the queue_lock held. */
 523 static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
 524                                                  struct cfq_group *curr_cfqg)
 525 {
 526         struct cfqg_stats *stats = &cfqg->stats;
 527
 528         if (cfqg_stats_waiting(stats))
 529                 return;
 530         if (cfqg == curr_cfqg)
 531                 return;
 532         stats->start_group_wait_time = sched_clock();
 533         cfqg_stats_mark_waiting(stats);
 534 }
 535
 536 /* This should be called with the queue_lock held. */
 537 static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
 538 {
 539         unsigned long long now;
 540
 541         if (!cfqg_stats_empty(stats))
 542                 return;
 543
 544         now = sched_clock();
 545         if (time_after64(now, stats->start_empty_time))
 546                 blkg_stat_add(&stats->empty_time,
 547                               now - stats->start_empty_time);
 548         cfqg_stats_clear_empty(stats);
 549 }
 550
 551 static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
 552 {
 553         blkg_stat_add(&cfqg->stats.dequeue, 1);
 554 }
 555
 556 static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 557 {
 558         struct cfqg_stats *stats = &cfqg->stats;
 559
 560         if (blkg_rwstat_total(&stats->queued))
 561                 return;
 562
 563         /*
 564          * group is already marked empty. This can happen if cfqq got new
 565          * request in parent group and moved to this group while being added
 566          * to service tree. Just ignore the event and move on.
 567          */
 568         if (cfqg_stats_empty(stats))
 569                 return;
 570
 571         stats->start_empty_time = sched_clock();
 572         cfqg_stats_mark_empty(stats);
 573 }
 574
 575 static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
 576 {
 577         struct cfqg_stats *stats = &cfqg->stats;
 578
 579         if (cfqg_stats_idling(stats)) {
 580                 unsigned long long now = sched_clock();
 581
 582                 if (time_after64(now, stats->start_idle_time))
 583                         blkg_stat_add(&stats->idle_time,
 584                                       now - stats->start_idle_time);
 585                 cfqg_stats_clear_idling(stats);
 586         }
 587 }
 588
 589 static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
 590 {
 591         struct cfqg_stats *stats = &cfqg->stats;
 592
 593         BUG_ON(cfqg_stats_idling(stats));
 594
 595         stats->start_idle_time = sched_clock();
 596         cfqg_stats_mark_idling(stats);
 597 }
 598
 599 static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
 600 {
 601         struct cfqg_stats *stats = &cfqg->stats;
 602
 603         blkg_stat_add(&stats->avg_queue_size_sum,
 604                       blkg_rwstat_total(&stats->queued));
 605         blkg_stat_add(&stats->avg_queue_size_samples, 1);
 606         cfqg_stats_update_group_wait_time(stats);
 607 }
 608
 609 #else   /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 610
 611 static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
 612 static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
 613 static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
 614 static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
 615 static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
 616 static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
 617 static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 618
 619 #endif  /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 620
 621 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 622
 623 static struct blkcg_policy blkcg_policy_cfq;
 624
 625 static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 626 {
 627         return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
 628 }
 629
 630 static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
 631 {
 632         return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
 633 }
 634
 635 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
 636 {
 637         struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
 638
 639         return pblkg ? blkg_to_cfqg(pblkg) : NULL;
 640 }
 641
 642 static inline void cfqg_get(struct cfq_group *cfqg)
 643 {
 644         return blkg_get(cfqg_to_blkg(cfqg));
 645 }
 646
 647 static inline void cfqg_put(struct cfq_group *cfqg)
 648 {
 649         return blkg_put(cfqg_to_blkg(cfqg));
 650 }
 651
 652 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  do {                    \
 653         char __pbuf[128];                                               \
 654                                                                         \
 655         blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));  \
 656         blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
 657                         cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
 658                         cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
 659                           __pbuf, ##args);                              \
 660 } while (0)
 661
 662 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)  do {                    \
 663         char __pbuf[128];                                               \
 664                                                                         \
 665         blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));          \
 666         blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);    \
 667 } while (0)
 668
 669 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 670                                             struct cfq_group *curr_cfqg, int rw)
 671 {
 672         blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
 673         cfqg_stats_end_empty_time(&cfqg->stats);
 674         cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
 675 }
 676
 677 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 678                         unsigned long time, unsigned long unaccounted_time)
 679 {
 680         blkg_stat_add(&cfqg->stats.time, time);
 681 #ifdef CONFIG_DEBUG_BLK_CGROUP
 682         blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
 683 #endif
 684 }
 685
 686 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
 687 {
 688         blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
 689 }
 690
 691 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
 692 {
 693         blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
 694 }
 695
 696 static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
 697                                               uint64_t bytes, int rw)
 698 {
 699         blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
 700         blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
 701         blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
 702 }
 703
 704 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 705                         uint64_t start_time, uint64_t io_start_time, int rw)
 706 {
 707         struct cfqg_stats *stats = &cfqg->stats;
 708         unsigned long long now = sched_clock();
 709
 710         if (time_after64(now, io_start_time))
 711                 blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
 712         if (time_after64(io_start_time, start_time))
 713                 blkg_rwstat_add(&stats->wait_time, rw,
 714                                 io_start_time - start_time);
 715 }
 716
 717 /* @stats = 0 */
 718 static void cfqg_stats_reset(struct cfqg_stats *stats)
 719 {
 720         /* queued stats shouldn't be cleared */
 721         blkg_rwstat_reset(&stats->service_bytes);
 722         blkg_rwstat_reset(&stats->serviced);
 723         blkg_rwstat_reset(&stats->merged);
 724         blkg_rwstat_reset(&stats->service_time);
 725         blkg_rwstat_reset(&stats->wait_time);
 726         blkg_stat_reset(&stats->time);
 727 #ifdef CONFIG_DEBUG_BLK_CGROUP
 728         blkg_stat_reset(&stats->unaccounted_time);
 729         blkg_stat_reset(&stats->avg_queue_size_sum);
 730         blkg_stat_reset(&stats->avg_queue_size_samples);
 731         blkg_stat_reset(&stats->dequeue);
 732         blkg_stat_reset(&stats->group_wait_time);
 733         blkg_stat_reset(&stats->idle_time);
 734         blkg_stat_reset(&stats->empty_time);
 735 #endif
 736 }
 737
 738 /* @to += @from */
 739 static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
 740 {
 741         /* queued stats shouldn't be cleared */
 742         blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
 743         blkg_rwstat_merge(&to->serviced, &from->serviced);
 744         blkg_rwstat_merge(&to->merged, &from->merged);
 745         blkg_rwstat_merge(&to->service_time, &from->service_time);
 746         blkg_rwstat_merge(&to->wait_time, &from->wait_time);
 747         blkg_stat_merge(&from->time, &from->time);
 748 #ifdef CONFIG_DEBUG_BLK_CGROUP
 749         blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
 750         blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
 751         blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
 752         blkg_stat_merge(&to->dequeue, &from->dequeue);
 753         blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
 754         blkg_stat_merge(&to->idle_time, &from->idle_time);
 755         blkg_stat_merge(&to->empty_time, &from->empty_time);
 756 #endif
 757 }
 758
 759 /*
 760  * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
 761  * recursive stats can still account for the amount used by this cfqg after
 762  * it's gone.
 763  */
 764 static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
 765 {
 766         struct cfq_group *parent = cfqg_parent(cfqg);
 767
 768         lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
 769
 770         if (unlikely(!parent))
 771                 return;
 772
 773         cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
 774         cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
 775         cfqg_stats_reset(&cfqg->stats);
 776         cfqg_stats_reset(&cfqg->dead_stats);
 777 }
 778
 779 #else   /* CONFIG_CFQ_GROUP_IOSCHED */
 780
 781 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
 782 static inline void cfqg_get(struct cfq_group *cfqg) { }
 783 static inline void cfqg_put(struct cfq_group *cfqg) { }
 784
 785 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
 786         blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
 787                         cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
 788                         cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
 789                                 ##args)
 790 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)          do {} while (0)
 791
 792 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 793                         struct cfq_group *curr_cfqg, int rw) { }
 794 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 795                         unsigned long time, unsigned long unaccounted_time) { }
 796 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
 797 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
 798 static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
 799                                               uint64_t bytes, int rw) { }
 800 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 801                         uint64_t start_time, uint64_t io_start_time, int rw) { }
 802
 803 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 804
 805 #define cfq_log(cfqd, fmt, args...)     \
 806         blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 807
 808 /* Traverses through cfq group service trees */
 809 #define for_each_cfqg_st(cfqg, i, j, st) \
 810         for (i = 0; i <= IDLE_WORKLOAD; i++) \
 811                 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
 812                         : &cfqg->service_tree_idle; \
 813                         (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
 814                         (i == IDLE_WORKLOAD && j == 0); \
 815                         j++, st = i < IDLE_WORKLOAD ? \
 816                         &cfqg->service_trees[i][j]: NULL) \
 817
 818 static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
 819         struct cfq_ttime *ttime, bool group_idle)
 820 {
 821         unsigned long slice;
 822         if (!sample_valid(ttime->ttime_samples))
 823                 return false;
 824         if (group_idle)
 825                 slice = cfqd->cfq_group_idle;
 826         else
 827                 slice = cfqd->cfq_slice_idle;
 828         return ttime->ttime_mean > slice;
 829 }
 830
 831 static inline bool iops_mode(struct cfq_data *cfqd)
 832 {
 833         /*
 834          * If we are not idling on queues and it is a NCQ drive, parallel
 835          * execution of requests is on and measuring time is not possible
 836          * in most of the cases until and unless we drive shallower queue
 837          * depths and that becomes a performance bottleneck. In such cases
 838          * switch to start providing fairness in terms of number of IOs.
 839          */
 840         if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
 841                 return true;
 842         else
 843                 return false;
 844 }
 845
 846 static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
 847 {
 848         if (cfq_class_idle(cfqq))
 849                 return IDLE_WORKLOAD;
 850         if (cfq_class_rt(cfqq))
 851                 return RT_WORKLOAD;
 852         return BE_WORKLOAD;
 853 }
 854
 855
 856 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
 857 {
 858         if (!cfq_cfqq_sync(cfqq))
 859                 return ASYNC_WORKLOAD;
 860         if (!cfq_cfqq_idle_window(cfqq))
 861                 return SYNC_NOIDLE_WORKLOAD;
 862         return SYNC_WORKLOAD;
 863 }
 864
 865 static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
 866                                         struct cfq_data *cfqd,
 867                                         struct cfq_group *cfqg)
 868 {
 869         if (wl_class == IDLE_WORKLOAD)
 870                 return cfqg->service_tree_idle.count;
 871
 872         return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
 873                 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
 874                 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
 875 }
 876
 877 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 878                                         struct cfq_group *cfqg)
 879 {
 880         return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
 881                 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
 882 }
 883
 884 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 885 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
 886                                        struct cfq_io_cq *cic, struct bio *bio,
 887                                        gfp_t gfp_mask);
 888
 889 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 890 {
 891         /* cic->icq is the first member, %NULL will convert to %NULL */
 892         return container_of(icq, struct cfq_io_cq, icq);
 893 }
 894
 895 static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
 896                                                struct io_context *ioc)
 897 {
 898         if (ioc)
 899                 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
 900         return NULL;
 901 }
 902
 903 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
 904 {
 905         return cic->cfqq[is_sync];
 906 }
 907
 908 static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
 909                                 bool is_sync)
 910 {
 911         cic->cfqq[is_sync] = cfqq;
 912 }
 913
 914 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
 915 {
 916         return cic->icq.q->elevator->elevator_data;
 917 }
 918
 919 /*
 920  * We regard a request as SYNC, if it's either a read or has the SYNC bit
 921  * set (in which case it could also be direct WRITE).
 922  */
 923 static inline bool cfq_bio_sync(struct bio *bio)
 924 {
 925         return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
 926 }
 927
 928 /*
 929  * scheduler run of queue, if there are requests pending and no one in the
 930  * driver that will restart queueing
 931  */
 932 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 933 {
 934         if (cfqd->busy_queues) {
 935                 cfq_log(cfqd, "schedule dispatch");
 936                 kblockd_schedule_work(&cfqd->unplug_work);
 937         }
 938 }
 939
 940 /*
 941  * Scale schedule slice based on io priority. Use the sync time slice only
 942  * if a queue is marked sync and has sync io queued. A sync queue with async
 943  * io only, should not get full sync slice length.
 944  */
 945 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
 946                                  unsigned short prio)
 947 {
 948         const int base_slice = cfqd->cfq_slice[sync];
 949
 950         WARN_ON(prio >= IOPRIO_BE_NR);
 951
 952         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
 953 }
 954
 955 static inline int
 956 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 957 {
 958         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 959 }
 960
 961 /**
 962  * cfqg_scale_charge - scale disk time charge according to cfqg weight
 963  * @charge: disk time being charged
 964  * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
 965  *
 966  * Scale @charge according to @vfraction, which is in range (0, 1].  The
 967  * scaling is inversely proportional.
 968  *
 969  * scaled = charge / vfraction
 970  *
 971  * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
 972  */
 973 static inline u64 cfqg_scale_charge(unsigned long charge,
 974                                     unsigned int vfraction)
 975 {
 976         u64 c = charge << CFQ_SERVICE_SHIFT;    /* make it fixed point */
 977
 978         /* charge / vfraction */
 979         c <<= CFQ_SERVICE_SHIFT;
 980         do_div(c, vfraction);
 981         return c;
 982 }
 983
 984 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
 985 {
 986         s64 delta = (s64)(vdisktime - min_vdisktime);
 987         if (delta > 0)
 988                 min_vdisktime = vdisktime;
 989
 990         return min_vdisktime;
 991 }
 992
 993 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
 994 {
 995         s64 delta = (s64)(vdisktime - min_vdisktime);
 996         if (delta < 0)
 997                 min_vdisktime = vdisktime;
 998
 999         return min_vdisktime;
1000 }
1001
1002 static void update_min_vdisktime(struct cfq_rb_root *st)
1003 {
1004         struct cfq_group *cfqg;
1005
1006         if (st->left) {
1007                 cfqg = rb_entry_cfqg(st->left);
1008                 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
1009                                                   cfqg->vdisktime);
1010         }
1011 }
1012
1013 /*
1014  * get averaged number of queues of RT/BE priority.
1015  * average is updated, with a formula that gives more weight to higher numbers,
1016  * to quickly follows sudden increases and decrease slowly
1017  */
1018
1019 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
1020                                         struct cfq_group *cfqg, bool rt)
1021 {
1022         unsigned min_q, max_q;
1023         unsigned mult  = cfq_hist_divisor - 1;
1024         unsigned round = cfq_hist_divisor / 2;
1025         unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
1026
1027         min_q = min(cfqg->busy_queues_avg[rt], busy);
1028         max_q = max(cfqg->busy_queues_avg[rt], busy);
1029         cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
1030                 cfq_hist_divisor;
1031         return cfqg->busy_queues_avg[rt];
1032 }
1033
1034 static inline unsigned
1035 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
1036 {
1037         return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
1038 }
1039
1040 static inline unsigned
1041 cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1042 {
1043         unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
1044         if (cfqd->cfq_latency) {
1045                 /*
1046                  * interested queues (we consider only the ones with the same
1047                  * priority class in the cfq group)
1048                  */
1049                 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
1050                                                 cfq_class_rt(cfqq));
1051                 unsigned sync_slice = cfqd->cfq_slice[1];
1052                 unsigned expect_latency = sync_slice * iq;
1053                 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1054
1055                 if (expect_latency > group_slice) {
1056                         unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
1057                         /* scale low_slice according to IO priority
1058                          * and sync vs async */
1059                         unsigned low_slice =
1060                                 min(slice, base_low_slice * slice / sync_slice);
1061                         /* the adapted slice value is scaled to fit all iqs
1062                          * into the target latency */
1063                         slice = max(slice * group_slice / expect_latency,
1064                                     low_slice);
1065                 }
1066         }
1067         return slice;
1068 }
1069
1070 static inline void
1071 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1072 {
1073         unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1074
1075         cfqq->slice_start = jiffies;
1076         cfqq->slice_end = jiffies + slice;
1077         cfqq->allocated_slice = slice;
1078         cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
1079 }
1080
1081 /*
1082  * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1083  * isn't valid until the first request from the dispatch is activated
1084  * and the slice time set.
1085  */
1086 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1087 {
1088         if (cfq_cfqq_slice_new(cfqq))
1089                 return false;
1090         if (time_before(jiffies, cfqq->slice_end))
1091                 return false;
1092
1093         return true;
1094 }
1095
1096 /*
1097  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1098  * We choose the request that is closest to the head right now. Distance
1099  * behind the head is penalized and only allowed to a certain extent.
1100  */
1101 static struct request *
1102 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1103 {
1104         sector_t s1, s2, d1 = 0, d2 = 0;
1105         unsigned long back_max;
1106 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
1107 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
1108         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1109
1110         if (rq1 == NULL || rq1 == rq2)
1111                 return rq2;
1112         if (rq2 == NULL)
1113                 return rq1;
1114
1115         if (rq_is_sync(rq1) != rq_is_sync(rq2))
1116                 return rq_is_sync(rq1) ? rq1 : rq2;
1117
1118         if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1119                 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1120
1121         s1 = blk_rq_pos(rq1);
1122         s2 = blk_rq_pos(rq2);
1123
1124         /*
1125          * by definition, 1KiB is 2 sectors
1126          */
1127         back_max = cfqd->cfq_back_max * 2;
1128
1129         /*
1130          * Strict one way elevator _except_ in the case where we allow
1131          * short backward seeks which are biased as twice the cost of a
1132          * similar forward seek.
1133          */
1134         if (s1 >= last)
1135                 d1 = s1 - last;
1136         else if (s1 + back_max >= last)
1137                 d1 = (last - s1) * cfqd->cfq_back_penalty;
1138         else
1139                 wrap |= CFQ_RQ1_WRAP;
1140
1141         if (s2 >= last)
1142                 d2 = s2 - last;
1143         else if (s2 + back_max >= last)
1144                 d2 = (last - s2) * cfqd->cfq_back_penalty;
1145         else
1146                 wrap |= CFQ_RQ2_WRAP;
1147
1148         /* Found required data */
1149
1150         /*
1151          * By doing switch() on the bit mask "wrap" we avoid having to
1152          * check two variables for all permutations: --> faster!
1153          */
1154         switch (wrap) {
1155         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1156                 if (d1 < d2)
1157                         return rq1;
1158                 else if (d2 < d1)
1159                         return rq2;
1160                 else {
1161                         if (s1 >= s2)
1162                                 return rq1;
1163                         else
1164                                 return rq2;
1165                 }
1166
1167         case CFQ_RQ2_WRAP:
1168                 return rq1;
1169         case CFQ_RQ1_WRAP:
1170                 return rq2;
1171         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1172         default:
1173                 /*
1174                  * Since both rqs are wrapped,
1175                  * start with the one that's further behind head
1176                  * (--> only *one* back seek required),
1177                  * since back seek takes more time than forward.
1178                  */
1179                 if (s1 <= s2)
1180                         return rq1;
1181                 else
1182                         return rq2;
1183         }
1184 }
1185
1186 /*
1187  * The below is leftmost cache rbtree addon
1188  */
1189 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1190 {
1191         /* Service tree is empty */
1192         if (!root->count)
1193                 return NULL;
1194
1195         if (!root->left)
1196                 root->left = rb_first(&root->rb);
1197
1198         if (root->left)
1199                 return rb_entry(root->left, struct cfq_queue, rb_node);
1200
1201         return NULL;
1202 }
1203
1204 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1205 {
1206         if (!root->left)
1207                 root->left = rb_first(&root->rb);
1208
1209         if (root->left)
1210                 return rb_entry_cfqg(root->left);
1211
1212         return NULL;
1213 }
1214
1215 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
1216 {
1217         rb_erase(n, root);
1218         RB_CLEAR_NODE(n);
1219 }
1220
1221 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1222 {
1223         if (root->left == n)
1224                 root->left = NULL;
1225         rb_erase_init(n, &root->rb);
1226         --root->count;
1227 }
1228
1229 /*
1230  * would be nice to take fifo expire time into account as well
1231  */
1232 static struct request *
1233 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1234                   struct request *last)
1235 {
1236         struct rb_node *rbnext = rb_next(&last->rb_node);
1237         struct rb_node *rbprev = rb_prev(&last->rb_node);
1238         struct request *next = NULL, *prev = NULL;
1239
1240         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1241
1242         if (rbprev)
1243                 prev = rb_entry_rq(rbprev);
1244
1245         if (rbnext)
1246                 next = rb_entry_rq(rbnext);
1247         else {
1248                 rbnext = rb_first(&cfqq->sort_list);
1249                 if (rbnext && rbnext != &last->rb_node)
1250                         next = rb_entry_rq(rbnext);
1251         }
1252
1253         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1254 }
1255
1256 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
1257                                       struct cfq_queue *cfqq)
1258 {
1259         /*
1260          * just an approximation, should be ok.
1261          */
1262         return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1263                        cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1264 }
1265
1266 static inline s64
1267 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1268 {
1269         return cfqg->vdisktime - st->min_vdisktime;
1270 }
1271
1272 static void
1273 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1274 {
1275         struct rb_node **node = &st->rb.rb_node;
1276         struct rb_node *parent = NULL;
1277         struct cfq_group *__cfqg;
1278         s64 key = cfqg_key(st, cfqg);
1279         int left = 1;
1280
1281         while (*node != NULL) {
1282                 parent = *node;
1283                 __cfqg = rb_entry_cfqg(parent);
1284
1285                 if (key < cfqg_key(st, __cfqg))
1286                         node = &parent->rb_left;
1287                 else {
1288                         node = &parent->rb_right;
1289                         left = 0;
1290                 }
1291         }
1292
1293         if (left)
1294                 st->left = &cfqg->rb_node;
1295
1296         rb_link_node(&cfqg->rb_node, parent, node);
1297         rb_insert_color(&cfqg->rb_node, &st->rb);
1298 }
1299
1300 /*
1301  * This has to be called only on activation of cfqg
1302  */
1303 static void
1304 cfq_update_group_weight(struct cfq_group *cfqg)
1305 {
1306         if (cfqg->new_weight) {
1307                 cfqg->weight = cfqg->new_weight;
1308                 cfqg->new_weight = 0;
1309         }
1310 }
1311
1312 static void
1313 cfq_update_group_leaf_weight(struct cfq_group *cfqg)
1314 {
1315         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1316
1317         if (cfqg->new_leaf_weight) {
1318                 cfqg->leaf_weight = cfqg->new_leaf_weight;
1319                 cfqg->new_leaf_weight = 0;
1320         }
1321 }
1322
1323 static void
1324 cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1325 {
1326         unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;      /* start with 1 */
1327         struct cfq_group *pos = cfqg;
1328         struct cfq_group *parent;
1329         bool propagate;
1330
1331         /* add to the service tree */
1332         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1333
1334         /*
1335          * Update leaf_weight.  We cannot update weight at this point
1336          * because cfqg might already have been activated and is
1337          * contributing its current weight to the parent's child_weight.
1338          */
1339         cfq_update_group_leaf_weight(cfqg);
1340         __cfq_group_service_tree_add(st, cfqg);
1341
1342         /*
1343          * Activate @cfqg and calculate the portion of vfraction @cfqg is
1344          * entitled to.  vfraction is calculated by walking the tree
1345          * towards the root calculating the fraction it has at each level.
1346          * The compounded ratio is how much vfraction @cfqg owns.
1347          *
1348          * Start with the proportion tasks in this cfqg has against active
1349          * children cfqgs - its leaf_weight against children_weight.
1350          */
1351         propagate = !pos->nr_active++;
1352         pos->children_weight += pos->leaf_weight;
1353         vfr = vfr * pos->leaf_weight / pos->children_weight;
1354
1355         /*
1356          * Compound ->weight walking up the tree.  Both activation and
1357          * vfraction calculation are done in the same loop.  Propagation
1358          * stops once an already activated node is met.  vfraction
1359          * calculation should always continue to the root.
1360          */
1361         while ((parent = cfqg_parent(pos))) {
1362                 if (propagate) {
1363                         cfq_update_group_weight(pos);
1364                         propagate = !parent->nr_active++;
1365                         parent->children_weight += pos->weight;
1366                 }
1367                 vfr = vfr * pos->weight / parent->children_weight;
1368                 pos = parent;
1369         }
1370
1371         cfqg->vfraction = max_t(unsigned, vfr, 1);
1372 }
1373
1374 static void
1375 cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1376 {
1377         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1378         struct cfq_group *__cfqg;
1379         struct rb_node *n;
1380
1381         cfqg->nr_cfqq++;
1382         if (!RB_EMPTY_NODE(&cfqg->rb_node))
1383                 return;
1384
1385         /*
1386          * Currently put the group at the end. Later implement something
1387          * so that groups get lesser vtime based on their weights, so that
1388          * if group does not loose all if it was not continuously backlogged.
1389          */
1390         n = rb_last(&st->rb);
1391         if (n) {
1392                 __cfqg = rb_entry_cfqg(n);
1393                 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
1394         } else
1395                 cfqg->vdisktime = st->min_vdisktime;
1396         cfq_group_service_tree_add(st, cfqg);
1397 }
1398
1399 static void
1400 cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1401 {
1402         struct cfq_group *pos = cfqg;
1403         bool propagate;
1404
1405         /*
1406          * Undo activation from cfq_group_service_tree_add().  Deactivate
1407          * @cfqg and propagate deactivation upwards.
1408          */
1409         propagate = !--pos->nr_active;
1410         pos->children_weight -= pos->leaf_weight;
1411
1412         while (propagate) {
1413                 struct cfq_group *parent = cfqg_parent(pos);
1414
1415                 /* @pos has 0 nr_active at this point */
1416                 WARN_ON_ONCE(pos->children_weight);
1417                 pos->vfraction = 0;
1418
1419                 if (!parent)
1420                         break;
1421
1422                 propagate = !--parent->nr_active;
1423                 parent->children_weight -= pos->weight;
1424                 pos = parent;
1425         }
1426
1427         /* remove from the service tree */
1428         if (!RB_EMPTY_NODE(&cfqg->rb_node))
1429                 cfq_rb_erase(&cfqg->rb_node, st);
1430 }
1431
1432 static void
1433 cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1434 {
1435         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1436
1437         BUG_ON(cfqg->nr_cfqq < 1);
1438         cfqg->nr_cfqq--;
1439
1440         /* If there are other cfq queues under this group, don't delete it */
1441         if (cfqg->nr_cfqq)
1442                 return;
1443
1444         cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1445         cfq_group_service_tree_del(st, cfqg);
1446         cfqg->saved_wl_slice = 0;
1447         cfqg_stats_update_dequeue(cfqg);
1448 }
1449
1450 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1451                                                 unsigned int *unaccounted_time)
1452 {
1453         unsigned int slice_used;
1454
1455         /*
1456          * Queue got expired before even a single request completed or
1457          * got expired immediately after first request completion.
1458          */
1459         if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
1460                 /*
1461                  * Also charge the seek time incurred to the group, otherwise
1462                  * if there are mutiple queues in the group, each can dispatch
1463                  * a single request on seeky media and cause lots of seek time
1464                  * and group will never know it.
1465                  */
1466                 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
1467                                         1);
1468         } else {
1469                 slice_used = jiffies - cfqq->slice_start;
1470                 if (slice_used > cfqq->allocated_slice) {
1471                         *unaccounted_time = slice_used - cfqq->allocated_slice;
1472                         slice_used = cfqq->allocated_slice;
1473                 }
1474                 if (time_after(cfqq->slice_start, cfqq->dispatch_start))
1475                         *unaccounted_time += cfqq->slice_start -
1476                                         cfqq->dispatch_start;
1477         }
1478
1479         return slice_used;
1480 }
1481
1482 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1483                                 struct cfq_queue *cfqq)
1484 {
1485         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1486         unsigned int used_sl, charge, unaccounted_sl = 0;
1487         int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1488                         - cfqg->service_tree_idle.count;
1489         unsigned int vfr;
1490
1491         BUG_ON(nr_sync < 0);
1492         used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1493
1494         if (iops_mode(cfqd))
1495                 charge = cfqq->slice_dispatch;
1496         else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1497                 charge = cfqq->allocated_slice;
1498
1499         /*
1500          * Can't update vdisktime while on service tree and cfqg->vfraction
1501          * is valid only while on it.  Cache vfr, leave the service tree,
1502          * update vdisktime and go back on.  The re-addition to the tree
1503          * will also update the weights as necessary.
1504          */
1505         vfr = cfqg->vfraction;
1506         cfq_group_service_tree_del(st, cfqg);
1507         cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1508         cfq_group_service_tree_add(st, cfqg);
1509
1510         /* This group is being expired. Save the context */
1511         if (time_after(cfqd->workload_expires, jiffies)) {
1512                 cfqg->saved_wl_slice = cfqd->workload_expires
1513                                                 - jiffies;
1514                 cfqg->saved_wl_type = cfqd->serving_wl_type;
1515                 cfqg->saved_wl_class = cfqd->serving_wl_class;
1516         } else
1517                 cfqg->saved_wl_slice = 0;
1518
1519         cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1520                                         st->min_vdisktime);
1521         cfq_log_cfqq(cfqq->cfqd, cfqq,
1522                      "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1523                      used_sl, cfqq->slice_dispatch, charge,
1524                      iops_mode(cfqd), cfqq->nr_sectors);
1525         cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1526         cfqg_stats_set_start_empty_time(cfqg);
1527 }
1528
1529 /**
1530  * cfq_init_cfqg_base - initialize base part of a cfq_group
1531  * @cfqg: cfq_group to initialize
1532  *
1533  * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1534  * is enabled or not.
1535  */
1536 static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1537 {
1538         struct cfq_rb_root *st;
1539         int i, j;
1540
1541         for_each_cfqg_st(cfqg, i, j, st)
1542                 *st = CFQ_RB_ROOT;
1543         RB_CLEAR_NODE(&cfqg->rb_node);
1544
1545         cfqg->ttime.last_end_request = jiffies;
1546 }
1547
1548 #ifdef CONFIG_CFQ_GROUP_IOSCHED
1549 static void cfqg_stats_init(struct cfqg_stats *stats)
1550 {
1551         blkg_rwstat_init(&stats->service_bytes);
1552         blkg_rwstat_init(&stats->serviced);
1553         blkg_rwstat_init(&stats->merged);
1554         blkg_rwstat_init(&stats->service_time);
1555         blkg_rwstat_init(&stats->wait_time);
1556         blkg_rwstat_init(&stats->queued);
1557
1558         blkg_stat_init(&stats->sectors);
1559         blkg_stat_init(&stats->time);
1560
1561 #ifdef CONFIG_DEBUG_BLK_CGROUP
1562         blkg_stat_init(&stats->unaccounted_time);
1563         blkg_stat_init(&stats->avg_queue_size_sum);
1564         blkg_stat_init(&stats->avg_queue_size_samples);
1565         blkg_stat_init(&stats->dequeue);
1566         blkg_stat_init(&stats->group_wait_time);
1567         blkg_stat_init(&stats->idle_time);
1568         blkg_stat_init(&stats->empty_time);
1569 #endif
1570 }
1571
1572 static void cfq_cpd_init(const struct blkcg *blkcg)
1573 {
1574         struct cfq_group_data *cgd =
1575                 cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
1576
1577         if (blkcg == &blkcg_root) {
1578                 cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
1579                 cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
1580         } else {
1581                 cgd->weight = CFQ_WEIGHT_DEFAULT;
1582                 cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
1583         }
1584 }
1585
1586 static void cfq_pd_init(struct blkcg_gq *blkg)
1587 {
1588         struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1589         struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
1590
1591         cfq_init_cfqg_base(cfqg);
1592         cfqg->weight = cgd->weight;
1593         cfqg->leaf_weight = cgd->leaf_weight;
1594         cfqg_stats_init(&cfqg->stats);
1595         cfqg_stats_init(&cfqg->dead_stats);
1596 }
1597
1598 static void cfq_pd_offline(struct blkcg_gq *blkg)
1599 {
1600         /*
1601          * @blkg is going offline and will be ignored by
1602          * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
1603          * that they don't get lost.  If IOs complete after this point, the
1604          * stats for them will be lost.  Oh well...
1605          */
1606         cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
1607 }
1608
1609 /* offset delta from cfqg->stats to cfqg->dead_stats */
1610 static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
1611                                         offsetof(struct cfq_group, stats);
1612
1613 /* to be used by recursive prfill, sums live and dead stats recursively */
1614 static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
1615 {
1616         u64 sum = 0;
1617
1618         sum += blkg_stat_recursive_sum(pd, off);
1619         sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
1620         return sum;
1621 }
1622
1623 /* to be used by recursive prfill, sums live and dead rwstats recursively */
1624 static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
1625                                                        int off)
1626 {
1627         struct blkg_rwstat a, b;
1628
1629         a = blkg_rwstat_recursive_sum(pd, off);
1630         b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
1631         blkg_rwstat_merge(&a, &b);
1632         return a;
1633 }
1634
1635 static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
1636 {
1637         struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1638
1639         cfqg_stats_reset(&cfqg->stats);
1640         cfqg_stats_reset(&cfqg->dead_stats);
1641 }
1642
1643 /*
1644  * Search for the cfq group current task belongs to. request_queue lock must
1645  * be held.
1646  */
1647 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1648                                                 struct blkcg *blkcg)
1649 {
1650         struct request_queue *q = cfqd->queue;
1651         struct cfq_group *cfqg = NULL;
1652
1653         /* avoid lookup for the common case where there's no blkcg */
1654         if (blkcg == &blkcg_root) {
1655                 cfqg = cfqd->root_group;
1656         } else {
1657                 struct blkcg_gq *blkg;
1658
1659                 blkg = blkg_lookup_create(blkcg, q);
1660                 if (!IS_ERR(blkg))
1661                         cfqg = blkg_to_cfqg(blkg);
1662         }
1663
1664         return cfqg;
1665 }
1666
1667 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1668 {
1669         /* Currently, all async queues are mapped to root group */
1670         if (!cfq_cfqq_sync(cfqq))
1671                 cfqg = cfqq->cfqd->root_group;
1672
1673         cfqq->cfqg = cfqg;
1674         /* cfqq reference on cfqg */
1675         cfqg_get(cfqg);
1676 }
1677
1678 static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1679                                      struct blkg_policy_data *pd, int off)
1680 {
1681         struct cfq_group *cfqg = pd_to_cfqg(pd);
1682
1683         if (!cfqg->dev_weight)
1684                 return 0;
1685         return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1686 }
1687
1688 static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1689 {
1690         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1691                           cfqg_prfill_weight_device, &blkcg_policy_cfq,
1692                           0, false);
1693         return 0;
1694 }
1695
1696 static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1697                                           struct blkg_policy_data *pd, int off)
1698 {
1699         struct cfq_group *cfqg = pd_to_cfqg(pd);
1700
1701         if (!cfqg->dev_leaf_weight)
1702                 return 0;
1703         return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1704 }
1705
1706 static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1707 {
1708         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1709                           cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1710                           0, false);
1711         return 0;
1712 }
1713
1714 static int cfq_print_weight(struct seq_file *sf, void *v)
1715 {
1716         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1717
1718         seq_printf(sf, "%u\n", blkcg_to_cfqgd(blkcg)->weight);
1719         return 0;
1720 }
1721
1722 static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1723 {
1724         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1725
1726         seq_printf(sf, "%u\n", blkcg_to_cfqgd(blkcg)->leaf_weight);
1727         return 0;
1728 }
1729
1730 static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1731                                         char *buf, size_t nbytes, loff_t off,
1732                                         bool is_leaf_weight)
1733 {
1734         struct blkcg *blkcg = css_to_blkcg(of_css(of));
1735         struct blkg_conf_ctx ctx;
1736         struct cfq_group *cfqg;
1737         struct cfq_group_data *cfqgd;
1738         int ret;
1739
1740         ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1741         if (ret)
1742                 return ret;
1743
1744         ret = -EINVAL;
1745         cfqg = blkg_to_cfqg(ctx.blkg);
1746         cfqgd = blkcg_to_cfqgd(blkcg);
1747         if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1748                 if (!is_leaf_weight) {
1749                         cfqg->dev_weight = ctx.v;
1750                         cfqg->new_weight = ctx.v ?: cfqgd->weight;
1751                 } else {
1752                         cfqg->dev_leaf_weight = ctx.v;
1753                         cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
1754                 }
1755                 ret = 0;
1756         }
1757
1758         blkg_conf_finish(&ctx);
1759         return ret ?: nbytes;
1760 }
1761
1762 static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1763                                       char *buf, size_t nbytes, loff_t off)
1764 {
1765         return __cfqg_set_weight_device(of, buf, nbytes, off, false);
1766 }
1767
1768 static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1769                                            char *buf, size_t nbytes, loff_t off)
1770 {
1771         return __cfqg_set_weight_device(of, buf, nbytes, off, true);
1772 }
1773
1774 static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1775                             u64 val, bool is_leaf_weight)
1776 {
1777         struct blkcg *blkcg = css_to_blkcg(css);
1778         struct blkcg_gq *blkg;
1779         struct cfq_group_data *cfqgd;
1780
1781         if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1782                 return -EINVAL;
1783
1784         spin_lock_irq(&blkcg->lock);
1785         cfqgd = blkcg_to_cfqgd(blkcg);
1786
1787         if (!is_leaf_weight)
1788                 cfqgd->weight = val;
1789         else
1790                 cfqgd->leaf_weight = val;
1791
1792         hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1793                 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1794
1795                 if (!cfqg)
1796                         continue;
1797
1798                 if (!is_leaf_weight) {
1799                         if (!cfqg->dev_weight)
1800                                 cfqg->new_weight = cfqgd->weight;
1801                 } else {
1802                         if (!cfqg->dev_leaf_weight)
1803                                 cfqg->new_leaf_weight = cfqgd->leaf_weight;
1804                 }
1805         }
1806
1807         spin_unlock_irq(&blkcg->lock);
1808         return 0;
1809 }
1810
1811 static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1812                           u64 val)
1813 {
1814         return __cfq_set_weight(css, cft, val, false);
1815 }
1816
1817 static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1818                                struct cftype *cft, u64 val)
1819 {
1820         return __cfq_set_weight(css, cft, val, true);
1821 }
1822
1823 static int cfqg_print_stat(struct seq_file *sf, void *v)
1824 {
1825         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1826                           &blkcg_policy_cfq, seq_cft(sf)->private, false);
1827         return 0;
1828 }
1829
1830 static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1831 {
1832         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1833                           &blkcg_policy_cfq, seq_cft(sf)->private, true);
1834         return 0;
1835 }
1836
1837 static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1838                                       struct blkg_policy_data *pd, int off)
1839 {
1840         u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
1841
1842         return __blkg_prfill_u64(sf, pd, sum);
1843 }
1844
1845 static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1846                                         struct blkg_policy_data *pd, int off)
1847 {
1848         struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
1849
1850         return __blkg_prfill_rwstat(sf, pd, &sum);
1851 }
1852
1853 static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1854 {
1855         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1856                           cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1857                           seq_cft(sf)->private, false);
1858         return 0;
1859 }
1860
1861 static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1862 {
1863         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1864                           cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1865                           seq_cft(sf)->private, true);
1866         return 0;
1867 }
1868
1869 #ifdef CONFIG_DEBUG_BLK_CGROUP
1870 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1871                                       struct blkg_policy_data *pd, int off)
1872 {
1873         struct cfq_group *cfqg = pd_to_cfqg(pd);
1874         u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1875         u64 v = 0;
1876
1877         if (samples) {
1878                 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1879                 v = div64_u64(v, samples);
1880         }
1881         __blkg_prfill_u64(sf, pd, v);
1882         return 0;
1883 }
1884
1885 /* print avg_queue_size */
1886 static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1887 {
1888         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1889                           cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1890                           0, false);
1891         return 0;
1892 }
1893 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
1894
1895 static struct cftype cfq_blkcg_files[] = {
1896         /* on root, weight is mapped to leaf_weight */
1897         {
1898                 .name = "weight_device",
1899                 .flags = CFTYPE_ONLY_ON_ROOT,
1900                 .seq_show = cfqg_print_leaf_weight_device,
1901                 .write = cfqg_set_leaf_weight_device,
1902         },
1903         {
1904                 .name = "weight",
1905                 .flags = CFTYPE_ONLY_ON_ROOT,
1906                 .seq_show = cfq_print_leaf_weight,
1907                 .write_u64 = cfq_set_leaf_weight,
1908         },
1909
1910         /* no such mapping necessary for !roots */
1911         {
1912                 .name = "weight_device",
1913                 .flags = CFTYPE_NOT_ON_ROOT,
1914                 .seq_show = cfqg_print_weight_device,
1915                 .write = cfqg_set_weight_device,
1916         },
1917         {
1918                 .name = "weight",
1919                 .flags = CFTYPE_NOT_ON_ROOT,
1920                 .seq_show = cfq_print_weight,
1921                 .write_u64 = cfq_set_weight,
1922         },
1923
1924         {
1925                 .name = "leaf_weight_device",
1926                 .seq_show = cfqg_print_leaf_weight_device,
1927                 .write = cfqg_set_leaf_weight_device,
1928         },
1929         {
1930                 .name = "leaf_weight",
1931                 .seq_show = cfq_print_leaf_weight,
1932                 .write_u64 = cfq_set_leaf_weight,
1933         },
1934
1935         /* statistics, covers only the tasks in the cfqg */
1936         {
1937                 .name = "time",
1938                 .private = offsetof(struct cfq_group, stats.time),
1939                 .seq_show = cfqg_print_stat,
1940         },
1941         {
1942                 .name = "sectors",
1943                 .private = offsetof(struct cfq_group, stats.sectors),
1944                 .seq_show = cfqg_print_stat,
1945         },
1946         {
1947                 .name = "io_service_bytes",
1948                 .private = offsetof(struct cfq_group, stats.service_bytes),
1949                 .seq_show = cfqg_print_rwstat,
1950         },
1951         {
1952                 .name = "io_serviced",
1953                 .private = offsetof(struct cfq_group, stats.serviced),
1954                 .seq_show = cfqg_print_rwstat,
1955         },
1956         {
1957                 .name = "io_service_time",
1958                 .private = offsetof(struct cfq_group, stats.service_time),
1959                 .seq_show = cfqg_print_rwstat,
1960         },
1961         {
1962                 .name = "io_wait_time",
1963                 .private = offsetof(struct cfq_group, stats.wait_time),
1964                 .seq_show = cfqg_print_rwstat,
1965         },
1966         {
1967                 .name = "io_merged",
1968                 .private = offsetof(struct cfq_group, stats.merged),
1969                 .seq_show = cfqg_print_rwstat,
1970         },
1971         {
1972                 .name = "io_queued",
1973                 .private = offsetof(struct cfq_group, stats.queued),
1974                 .seq_show = cfqg_print_rwstat,
1975         },
1976
1977         /* the same statictics which cover the cfqg and its descendants */
1978         {
1979                 .name = "time_recursive",
1980                 .private = offsetof(struct cfq_group, stats.time),
1981                 .seq_show = cfqg_print_stat_recursive,
1982         },
1983         {
1984                 .name = "sectors_recursive",
1985                 .private = offsetof(struct cfq_group, stats.sectors),
1986                 .seq_show = cfqg_print_stat_recursive,
1987         },
1988         {
1989                 .name = "io_service_bytes_recursive",
1990                 .private = offsetof(struct cfq_group, stats.service_bytes),
1991                 .seq_show = cfqg_print_rwstat_recursive,
1992         },
1993         {
1994                 .name = "io_serviced_recursive",
1995                 .private = offsetof(struct cfq_group, stats.serviced),
1996                 .seq_show = cfqg_print_rwstat_recursive,
1997         },
1998         {
1999                 .name = "io_service_time_recursive",
2000                 .private = offsetof(struct cfq_group, stats.service_time),
2001                 .seq_show = cfqg_print_rwstat_recursive,
2002         },
2003         {
2004                 .name = "io_wait_time_recursive",
2005                 .private = offsetof(struct cfq_group, stats.wait_time),
2006                 .seq_show = cfqg_print_rwstat_recursive,
2007         },
2008         {
2009                 .name = "io_merged_recursive",
2010                 .private = offsetof(struct cfq_group, stats.merged),
2011                 .seq_show = cfqg_print_rwstat_recursive,
2012         },
2013         {
2014                 .name = "io_queued_recursive",
2015                 .private = offsetof(struct cfq_group, stats.queued),
2016                 .seq_show = cfqg_print_rwstat_recursive,
2017         },
2018 #ifdef CONFIG_DEBUG_BLK_CGROUP
2019         {
2020                 .name = "avg_queue_size",
2021                 .seq_show = cfqg_print_avg_queue_size,
2022         },
2023         {
2024                 .name = "group_wait_time",
2025                 .private = offsetof(struct cfq_group, stats.group_wait_time),
2026                 .seq_show = cfqg_print_stat,
2027         },
2028         {
2029                 .name = "idle_time",
2030                 .private = offsetof(struct cfq_group, stats.idle_time),
2031                 .seq_show = cfqg_print_stat,
2032         },
2033         {
2034                 .name = "empty_time",
2035                 .private = offsetof(struct cfq_group, stats.empty_time),
2036                 .seq_show = cfqg_print_stat,
2037         },
2038         {
2039                 .name = "dequeue",
2040                 .private = offsetof(struct cfq_group, stats.dequeue),
2041                 .seq_show = cfqg_print_stat,
2042         },
2043         {
2044                 .name = "unaccounted_time",
2045                 .private = offsetof(struct cfq_group, stats.unaccounted_time),
2046                 .seq_show = cfqg_print_stat,
2047         },
2048 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
2049         { }     /* terminate */
2050 };
2051 #else /* GROUP_IOSCHED */
2052 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
2053                                                 struct blkcg *blkcg)
2054 {
2055         return cfqd->root_group;
2056 }
2057
2058 static inline void
2059 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
2060         cfqq->cfqg = cfqg;
2061 }
2062
2063 #endif /* GROUP_IOSCHED */
2064
2065 /*
2066  * The cfqd->service_trees holds all pending cfq_queue's that have
2067  * requests waiting to be processed. It is sorted in the order that
2068  * we will service the queues.
2069  */
2070 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2071                                  bool add_front)
2072 {
2073         struct rb_node **p, *parent;
2074         struct cfq_queue *__cfqq;
2075         unsigned long rb_key;
2076         struct cfq_rb_root *st;
2077         int left;
2078         int new_cfqq = 1;
2079
2080         st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
2081         if (cfq_class_idle(cfqq)) {
2082                 rb_key = CFQ_IDLE_DELAY;
2083                 parent = rb_last(&st->rb);
2084                 if (parent && parent != &cfqq->rb_node) {
2085                         __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2086                         rb_key += __cfqq->rb_key;
2087                 } else
2088                         rb_key += jiffies;
2089         } else if (!add_front) {
2090                 /*
2091                  * Get our rb key offset. Subtract any residual slice
2092                  * value carried from last service. A negative resid
2093                  * count indicates slice overrun, and this should position
2094                  * the next service time further away in the tree.
2095                  */
2096                 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
2097                 rb_key -= cfqq->slice_resid;
2098                 cfqq->slice_resid = 0;
2099         } else {
2100                 rb_key = -HZ;
2101                 __cfqq = cfq_rb_first(st);
2102                 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
2103         }
2104
2105         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2106                 new_cfqq = 0;
2107                 /*
2108                  * same position, nothing more to do
2109                  */
2110                 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
2111                         return;
2112
2113                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2114                 cfqq->service_tree = NULL;
2115         }
2116
2117         left = 1;
2118         parent = NULL;
2119         cfqq->service_tree = st;
2120         p = &st->rb.rb_node;
2121         while (*p) {
2122                 parent = *p;
2123                 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2124
2125                 /*
2126                  * sort by key, that represents service time.
2127                  */
2128                 if (time_before(rb_key, __cfqq->rb_key))
2129                         p = &parent->rb_left;
2130                 else {
2131                         p = &parent->rb_right;
2132                         left = 0;
2133                 }
2134         }
2135
2136         if (left)
2137                 st->left = &cfqq->rb_node;
2138
2139         cfqq->rb_key = rb_key;
2140         rb_link_node(&cfqq->rb_node, parent, p);
2141         rb_insert_color(&cfqq->rb_node, &st->rb);
2142         st->count++;
2143         if (add_front || !new_cfqq)
2144                 return;
2145         cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
2146 }
2147
2148 static struct cfq_queue *
2149 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
2150                      sector_t sector, struct rb_node **ret_parent,
2151                      struct rb_node ***rb_link)
2152 {
2153         struct rb_node **p, *parent;
2154         struct cfq_queue *cfqq = NULL;
2155
2156         parent = NULL;
2157         p = &root->rb_node;
2158         while (*p) {
2159                 struct rb_node **n;
2160
2161                 parent = *p;
2162                 cfqq = rb_entry(parent, struct cfq_queue, p_node);
2163
2164                 /*
2165                  * Sort strictly based on sector.  Smallest to the left,
2166                  * largest to the right.
2167                  */
2168                 if (sector > blk_rq_pos(cfqq->next_rq))
2169                         n = &(*p)->rb_right;
2170                 else if (sector < blk_rq_pos(cfqq->next_rq))
2171                         n = &(*p)->rb_left;
2172                 else
2173                         break;
2174                 p = n;
2175                 cfqq = NULL;
2176         }
2177
2178         *ret_parent = parent;
2179         if (rb_link)
2180                 *rb_link = p;
2181         return cfqq;
2182 }
2183
2184 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2185 {
2186         struct rb_node **p, *parent;
2187         struct cfq_queue *__cfqq;
2188
2189         if (cfqq->p_root) {
2190                 rb_erase(&cfqq->p_node, cfqq->p_root);
2191                 cfqq->p_root = NULL;
2192         }
2193
2194         if (cfq_class_idle(cfqq))
2195                 return;
2196         if (!cfqq->next_rq)
2197                 return;
2198
2199         cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
2200         __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
2201                                       blk_rq_pos(cfqq->next_rq), &parent, &p);
2202         if (!__cfqq) {
2203                 rb_link_node(&cfqq->p_node, parent, p);
2204                 rb_insert_color(&cfqq->p_node, cfqq->p_root);
2205         } else
2206                 cfqq->p_root = NULL;
2207 }
2208
2209 /*
2210  * Update cfqq's position in the service tree.
2211  */
2212 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2213 {
2214         /*
2215          * Resorting requires the cfqq to be on the RR list already.
2216          */
2217         if (cfq_cfqq_on_rr(cfqq)) {
2218                 cfq_service_tree_add(cfqd, cfqq, 0);
2219                 cfq_prio_tree_add(cfqd, cfqq);
2220         }
2221 }
2222
2223 /*
2224  * add to busy list of queues for service, trying to be fair in ordering
2225  * the pending list according to last request service
2226  */
2227 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2228 {
2229         cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2230         BUG_ON(cfq_cfqq_on_rr(cfqq));
2231         cfq_mark_cfqq_on_rr(cfqq);
2232         cfqd->busy_queues++;
2233         if (cfq_cfqq_sync(cfqq))
2234                 cfqd->busy_sync_queues++;
2235
2236         cfq_resort_rr_list(cfqd, cfqq);
2237 }
2238
2239 /*
2240  * Called when the cfqq no longer has requests pending, remove it from
2241  * the service tree.
2242  */
2243 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2244 {
2245         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2246         BUG_ON(!cfq_cfqq_on_rr(cfqq));
2247         cfq_clear_cfqq_on_rr(cfqq);
2248
2249         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2250                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2251                 cfqq->service_tree = NULL;
2252         }
2253         if (cfqq->p_root) {
2254                 rb_erase(&cfqq->p_node, cfqq->p_root);
2255                 cfqq->p_root = NULL;
2256         }
2257
2258         cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2259         BUG_ON(!cfqd->busy_queues);
2260         cfqd->busy_queues--;
2261         if (cfq_cfqq_sync(cfqq))
2262                 cfqd->busy_sync_queues--;
2263 }
2264
2265 /*
2266  * rb tree support functions
2267  */
2268 static void cfq_del_rq_rb(struct request *rq)
2269 {
2270         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2271         const int sync = rq_is_sync(rq);
2272
2273         BUG_ON(!cfqq->queued[sync]);
2274         cfqq->queued[sync]--;
2275
2276         elv_rb_del(&cfqq->sort_list, rq);
2277
2278         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2279                 /*
2280                  * Queue will be deleted from service tree when we actually
2281                  * expire it later. Right now just remove it from prio tree
2282                  * as it is empty.
2283                  */
2284                 if (cfqq->p_root) {
2285                         rb_erase(&cfqq->p_node, cfqq->p_root);
2286                         cfqq->p_root = NULL;
2287                 }
2288         }
2289 }
2290
2291 static void cfq_add_rq_rb(struct request *rq)
2292 {
2293         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2294         struct cfq_data *cfqd = cfqq->cfqd;
2295         struct request *prev;
2296
2297         cfqq->queued[rq_is_sync(rq)]++;
2298
2299         elv_rb_add(&cfqq->sort_list, rq);
2300
2301         if (!cfq_cfqq_on_rr(cfqq))
2302                 cfq_add_cfqq_rr(cfqd, cfqq);
2303
2304         /*
2305          * check if this request is a better next-serve candidate
2306          */
2307         prev = cfqq->next_rq;
2308         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2309
2310         /*
2311          * adjust priority tree position, if ->next_rq changes
2312          */
2313         if (prev != cfqq->next_rq)
2314                 cfq_prio_tree_add(cfqd, cfqq);
2315
2316         BUG_ON(!cfqq->next_rq);
2317 }
2318
2319 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2320 {
2321         elv_rb_del(&cfqq->sort_list, rq);
2322         cfqq->queued[rq_is_sync(rq)]--;
2323         cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2324         cfq_add_rq_rb(rq);
2325         cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2326                                  rq->cmd_flags);
2327 }
2328
2329 static struct request *
2330 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2331 {
2332         struct task_struct *tsk = current;
2333         struct cfq_io_cq *cic;
2334         struct cfq_queue *cfqq;
2335
2336         cic = cfq_cic_lookup(cfqd, tsk->io_context);
2337         if (!cic)
2338                 return NULL;
2339
2340         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2341         if (cfqq)
2342                 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
2343
2344         return NULL;
2345 }
2346
2347 static void cfq_activate_request(struct request_queue *q, struct request *rq)
2348 {
2349         struct cfq_data *cfqd = q->elevator->elevator_data;
2350
2351         cfqd->rq_in_driver++;
2352         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2353                                                 cfqd->rq_in_driver);
2354
2355         cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2356 }
2357
2358 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2359 {
2360         struct cfq_data *cfqd = q->elevator->elevator_data;
2361
2362         WARN_ON(!cfqd->rq_in_driver);
2363         cfqd->rq_in_driver--;
2364         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2365                                                 cfqd->rq_in_driver);
2366 }
2367
2368 static void cfq_remove_request(struct request *rq)
2369 {
2370         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2371
2372         if (cfqq->next_rq == rq)
2373                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2374
2375         list_del_init(&rq->queuelist);
2376         cfq_del_rq_rb(rq);
2377
2378         cfqq->cfqd->rq_queued--;
2379         cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2380         if (rq->cmd_flags & REQ_PRIO) {
2381                 WARN_ON(!cfqq->prio_pending);
2382                 cfqq->prio_pending--;
2383         }
2384 }
2385
2386 static int cfq_merge(struct request_queue *q, struct request **req,
2387                      struct bio *bio)
2388 {
2389         struct cfq_data *cfqd = q->elevator->elevator_data;
2390         struct request *__rq;
2391
2392         __rq = cfq_find_rq_fmerge(cfqd, bio);
2393         if (__rq && elv_rq_merge_ok(__rq, bio)) {
2394                 *req = __rq;
2395                 return ELEVATOR_FRONT_MERGE;
2396         }
2397
2398         return ELEVATOR_NO_MERGE;
2399 }
2400
2401 static void cfq_merged_request(struct request_queue *q, struct request *req,
2402                                int type)
2403 {
2404         if (type == ELEVATOR_FRONT_MERGE) {
2405                 struct cfq_queue *cfqq = RQ_CFQQ(req);
2406
2407                 cfq_reposition_rq_rb(cfqq, req);
2408         }
2409 }
2410
2411 static void cfq_bio_merged(struct request_queue *q, struct request *req,
2412                                 struct bio *bio)
2413 {
2414         cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
2415 }
2416
2417 static void
2418 cfq_merged_requests(struct request_queue *q, struct request *rq,
2419                     struct request *next)
2420 {
2421         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2422         struct cfq_data *cfqd = q->elevator->elevator_data;
2423
2424         /*
2425          * reposition in fifo if next is older than rq
2426          */
2427         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2428             time_before(next->fifo_time, rq->fifo_time) &&
2429             cfqq == RQ_CFQQ(next)) {
2430                 list_move(&rq->queuelist, &next->queuelist);
2431                 rq->fifo_time = next->fifo_time;
2432         }
2433
2434         if (cfqq->next_rq == next)
2435                 cfqq->next_rq = rq;
2436         cfq_remove_request(next);
2437         cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2438
2439         cfqq = RQ_CFQQ(next);
2440         /*
2441          * all requests of this queue are merged to other queues, delete it
2442          * from the service tree. If it's the active_queue,
2443          * cfq_dispatch_requests() will choose to expire it or do idle
2444          */
2445         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2446             cfqq != cfqd->active_queue)
2447                 cfq_del_cfqq_rr(cfqd, cfqq);
2448 }
2449
2450 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
2451                            struct bio *bio)
2452 {
2453         struct cfq_data *cfqd = q->elevator->elevator_data;
2454         struct cfq_io_cq *cic;
2455         struct cfq_queue *cfqq;
2456
2457         /*
2458          * Disallow merge of a sync bio into an async request.
2459          */
2460         if (cfq_bio_sync(bio) && !rq_is_sync(rq))
2461                 return false;
2462
2463         /*
2464          * Lookup the cfqq that this bio will be queued with and allow
2465          * merge only if rq is queued there.
2466          */
2467         cic = cfq_cic_lookup(cfqd, current->io_context);
2468         if (!cic)
2469                 return false;
2470
2471         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2472         return cfqq == RQ_CFQQ(rq);
2473 }
2474
2475 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2476 {
2477         del_timer(&cfqd->idle_slice_timer);
2478         cfqg_stats_update_idle_time(cfqq->cfqg);
2479 }
2480
2481 static void __cfq_set_active_queue(struct cfq_data *cfqd,
2482                                    struct cfq_queue *cfqq)
2483 {
2484         if (cfqq) {
2485                 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2486                                 cfqd->serving_wl_class, cfqd->serving_wl_type);
2487                 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2488                 cfqq->slice_start = 0;
2489                 cfqq->dispatch_start = jiffies;
2490                 cfqq->allocated_slice = 0;
2491                 cfqq->slice_end = 0;
2492                 cfqq->slice_dispatch = 0;
2493                 cfqq->nr_sectors = 0;
2494
2495                 cfq_clear_cfqq_wait_request(cfqq);
2496                 cfq_clear_cfqq_must_dispatch(cfqq);
2497                 cfq_clear_cfqq_must_alloc_slice(cfqq);
2498                 cfq_clear_cfqq_fifo_expire(cfqq);
2499                 cfq_mark_cfqq_slice_new(cfqq);
2500
2501                 cfq_del_timer(cfqd, cfqq);
2502         }
2503
2504         cfqd->active_queue = cfqq;
2505 }
2506
2507 /*
2508  * current cfqq expired its slice (or was too idle), select new one
2509  */
2510 static void
2511 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2512                     bool timed_out)
2513 {
2514         cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2515
2516         if (cfq_cfqq_wait_request(cfqq))
2517                 cfq_del_timer(cfqd, cfqq);
2518
2519         cfq_clear_cfqq_wait_request(cfqq);
2520         cfq_clear_cfqq_wait_busy(cfqq);
2521
2522         /*
2523          * If this cfqq is shared between multiple processes, check to
2524          * make sure that those processes are still issuing I/Os within
2525          * the mean seek distance.  If not, it may be time to break the
2526          * queues apart again.
2527          */
2528         if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2529                 cfq_mark_cfqq_split_coop(cfqq);
2530
2531         /*
2532          * store what was left of this slice, if the queue idled/timed out
2533          */
2534         if (timed_out) {
2535                 if (cfq_cfqq_slice_new(cfqq))
2536                         cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2537                 else
2538                         cfqq->slice_resid = cfqq->slice_end - jiffies;
2539                 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
2540         }
2541
2542         cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2543
2544         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2545                 cfq_del_cfqq_rr(cfqd, cfqq);
2546
2547         cfq_resort_rr_list(cfqd, cfqq);
2548
2549         if (cfqq == cfqd->active_queue)
2550                 cfqd->active_queue = NULL;
2551
2552         if (cfqd->active_cic) {
2553                 put_io_context(cfqd->active_cic->icq.ioc);
2554                 cfqd->active_cic = NULL;
2555         }
2556 }
2557
2558 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2559 {
2560         struct cfq_queue *cfqq = cfqd->active_queue;
2561
2562         if (cfqq)
2563                 __cfq_slice_expired(cfqd, cfqq, timed_out);
2564 }
2565
2566 /*
2567  * Get next queue for service. Unless we have a queue preemption,
2568  * we'll simply select the first cfqq in the service tree.
2569  */
2570 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2571 {
2572         struct cfq_rb_root *st = st_for(cfqd->serving_group,
2573                         cfqd->serving_wl_class, cfqd->serving_wl_type);
2574
2575         if (!cfqd->rq_queued)
2576                 return NULL;
2577
2578         /* There is nothing to dispatch */
2579         if (!st)
2580                 return NULL;
2581         if (RB_EMPTY_ROOT(&st->rb))
2582                 return NULL;
2583         return cfq_rb_first(st);
2584 }
2585
2586 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2587 {
2588         struct cfq_group *cfqg;
2589         struct cfq_queue *cfqq;
2590         int i, j;
2591         struct cfq_rb_root *st;
2592
2593         if (!cfqd->rq_queued)
2594                 return NULL;
2595
2596         cfqg = cfq_get_next_cfqg(cfqd);
2597         if (!cfqg)
2598                 return NULL;
2599
2600         for_each_cfqg_st(cfqg, i, j, st)
2601                 if ((cfqq = cfq_rb_first(st)) != NULL)
2602                         return cfqq;
2603         return NULL;
2604 }
2605
2606 /*
2607  * Get and set a new active queue for service.
2608  */
2609 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2610                                               struct cfq_queue *cfqq)
2611 {
2612         if (!cfqq)
2613                 cfqq = cfq_get_next_queue(cfqd);
2614
2615         __cfq_set_active_queue(cfqd, cfqq);
2616         return cfqq;
2617 }
2618
2619 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2620                                           struct request *rq)
2621 {
2622         if (blk_rq_pos(rq) >= cfqd->last_position)
2623                 return blk_rq_pos(rq) - cfqd->last_position;
2624         else
2625                 return cfqd->last_position - blk_rq_pos(rq);
2626 }
2627
2628 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2629                                struct request *rq)
2630 {
2631         return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2632 }
2633
2634 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2635                                     struct cfq_queue *cur_cfqq)
2636 {
2637         struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2638         struct rb_node *parent, *node;
2639         struct cfq_queue *__cfqq;
2640         sector_t sector = cfqd->last_position;
2641
2642         if (RB_EMPTY_ROOT(root))
2643                 return NULL;
2644
2645         /*
2646          * First, if we find a request starting at the end of the last
2647          * request, choose it.
2648          */
2649         __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2650         if (__cfqq)
2651                 return __cfqq;
2652
2653         /*
2654          * If the exact sector wasn't found, the parent of the NULL leaf
2655          * will contain the closest sector.
2656          */
2657         __cfqq = rb_entry(parent, struct cfq_queue, p_node);
2658         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2659                 return __cfqq;
2660
2661         if (blk_rq_pos(__cfqq->next_rq) < sector)
2662                 node = rb_next(&__cfqq->p_node);
2663         else
2664                 node = rb_prev(&__cfqq->p_node);
2665         if (!node)
2666                 return NULL;
2667
2668         __cfqq = rb_entry(node, struct cfq_queue, p_node);
2669         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2670                 return __cfqq;
2671
2672         return NULL;
2673 }
2674
2675 /*
2676  * cfqd - obvious
2677  * cur_cfqq - passed in so that we don't decide that the current queue is
2678  *            closely cooperating with itself.
2679  *
2680  * So, basically we're assuming that that cur_cfqq has dispatched at least
2681  * one request, and that cfqd->last_position reflects a position on the disk
2682  * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
2683  * assumption.
2684  */
2685 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2686                                               struct cfq_queue *cur_cfqq)
2687 {
2688         struct cfq_queue *cfqq;
2689
2690         if (cfq_class_idle(cur_cfqq))
2691                 return NULL;
2692         if (!cfq_cfqq_sync(cur_cfqq))
2693                 return NULL;
2694         if (CFQQ_SEEKY(cur_cfqq))
2695                 return NULL;
2696
2697         /*
2698          * Don't search priority tree if it's the only queue in the group.
2699          */
2700         if (cur_cfqq->cfqg->nr_cfqq == 1)
2701                 return NULL;
2702
2703         /*
2704          * We should notice if some of the queues are cooperating, eg
2705          * working closely on the same area of the disk. In that case,
2706          * we can group them together and don't waste time idling.
2707          */
2708         cfqq = cfqq_close(cfqd, cur_cfqq);
2709         if (!cfqq)
2710                 return NULL;
2711
2712         /* If new queue belongs to different cfq_group, don't choose it */
2713         if (cur_cfqq->cfqg != cfqq->cfqg)
2714                 return NULL;
2715
2716         /*
2717          * It only makes sense to merge sync queues.
2718          */
2719         if (!cfq_cfqq_sync(cfqq))
2720                 return NULL;
2721         if (CFQQ_SEEKY(cfqq))
2722                 return NULL;
2723
2724         /*
2725          * Do not merge queues of different priority classes
2726          */
2727         if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2728                 return NULL;
2729
2730         return cfqq;
2731 }
2732
2733 /*
2734  * Determine whether we should enforce idle window for this queue.
2735  */
2736
2737 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2738 {
2739         enum wl_class_t wl_class = cfqq_class(cfqq);
2740         struct cfq_rb_root *st = cfqq->service_tree;
2741
2742         BUG_ON(!st);
2743         BUG_ON(!st->count);
2744
2745         if (!cfqd->cfq_slice_idle)
2746                 return false;
2747
2748         /* We never do for idle class queues. */
2749         if (wl_class == IDLE_WORKLOAD)
2750                 return false;
2751
2752         /* We do for queues that were marked with idle window flag. */
2753         if (cfq_cfqq_idle_window(cfqq) &&
2754            !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2755                 return true;
2756
2757         /*
2758          * Otherwise, we do only if they are the last ones
2759          * in their service tree.
2760          */
2761         if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2762            !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2763                 return true;
2764         cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2765         return false;
2766 }
2767
2768 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2769 {
2770         struct cfq_queue *cfqq = cfqd->active_queue;
2771         struct cfq_io_cq *cic;
2772         unsigned long sl, group_idle = 0;
2773
2774         /*
2775          * SSD device without seek penalty, disable idling. But only do so
2776          * for devices that support queuing, otherwise we still have a problem
2777          * with sync vs async workloads.
2778          */
2779         if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
2780                 return;
2781
2782         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2783         WARN_ON(cfq_cfqq_slice_new(cfqq));
2784
2785         /*
2786          * idle is disabled, either manually or by past process history
2787          */
2788         if (!cfq_should_idle(cfqd, cfqq)) {
2789                 /* no queue idling. Check for group idling */
2790                 if (cfqd->cfq_group_idle)
2791                         group_idle = cfqd->cfq_group_idle;
2792                 else
2793                         return;
2794         }
2795
2796         /*
2797          * still active requests from this queue, don't idle
2798          */
2799         if (cfqq->dispatched)
2800                 return;
2801
2802         /*
2803          * task has exited, don't wait
2804          */
2805         cic = cfqd->active_cic;
2806         if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2807                 return;
2808
2809         /*
2810          * If our average think time is larger than the remaining time
2811          * slice, then don't idle. This avoids overrunning the allotted
2812          * time slice.
2813          */
2814         if (sample_valid(cic->ttime.ttime_samples) &&
2815             (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
2816                 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
2817                              cic->ttime.ttime_mean);
2818                 return;
2819         }
2820
2821         /* There are other queues in the group, don't do group idle */
2822         if (group_idle && cfqq->cfqg->nr_cfqq > 1)
2823                 return;
2824
2825         cfq_mark_cfqq_wait_request(cfqq);
2826
2827         if (group_idle)
2828                 sl = cfqd->cfq_group_idle;
2829         else
2830                 sl = cfqd->cfq_slice_idle;
2831
2832         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2833         cfqg_stats_set_start_idle_time(cfqq->cfqg);
2834         cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2835                         group_idle ? 1 : 0);
2836 }
2837
2838 /*
2839  * Move request from internal lists to the request queue dispatch list.
2840  */
2841 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2842 {
2843         struct cfq_data *cfqd = q->elevator->elevator_data;
2844         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2845
2846         cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2847
2848         cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2849         cfq_remove_request(rq);
2850         cfqq->dispatched++;
2851         (RQ_CFQG(rq))->dispatched++;
2852         elv_dispatch_sort(q, rq);
2853
2854         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2855         cfqq->nr_sectors += blk_rq_sectors(rq);
2856         cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
2857 }
2858
2859 /*
2860  * return expired entry, or NULL to just start from scratch in rbtree
2861  */
2862 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
2863 {
2864         struct request *rq = NULL;
2865
2866         if (cfq_cfqq_fifo_expire(cfqq))
2867                 return NULL;
2868
2869         cfq_mark_cfqq_fifo_expire(cfqq);
2870
2871         if (list_empty(&cfqq->fifo))
2872                 return NULL;
2873
2874         rq = rq_entry_fifo(cfqq->fifo.next);
2875         if (time_before(jiffies, rq->fifo_time))
2876                 rq = NULL;
2877
2878         cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
2879         return rq;
2880 }
2881
2882 static inline int
2883 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2884 {
2885         const int base_rq = cfqd->cfq_slice_async_rq;
2886
2887         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2888
2889         return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2890 }
2891
2892 /*
2893  * Must be called with the queue_lock held.
2894  */
2895 static int cfqq_process_refs(struct cfq_queue *cfqq)
2896 {
2897         int process_refs, io_refs;
2898
2899         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2900         process_refs = cfqq->ref - io_refs;
2901         BUG_ON(process_refs < 0);
2902         return process_refs;
2903 }
2904
2905 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2906 {
2907         int process_refs, new_process_refs;
2908         struct cfq_queue *__cfqq;
2909
2910         /*
2911          * If there are no process references on the new_cfqq, then it is
2912          * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2913          * chain may have dropped their last reference (not just their
2914          * last process reference).
2915          */
2916         if (!cfqq_process_refs(new_cfqq))
2917                 return;
2918
2919         /* Avoid a circular list and skip interim queue merges */
2920         while ((__cfqq = new_cfqq->new_cfqq)) {
2921                 if (__cfqq == cfqq)
2922                         return;
2923                 new_cfqq = __cfqq;
2924         }
2925
2926         process_refs = cfqq_process_refs(cfqq);
2927         new_process_refs = cfqq_process_refs(new_cfqq);
2928         /*
2929          * If the process for the cfqq has gone away, there is no
2930          * sense in merging the queues.
2931          */
2932         if (process_refs == 0 || new_process_refs == 0)
2933                 return;
2934
2935         /*
2936          * Merge in the direction of the lesser amount of work.
2937          */
2938         if (new_process_refs >= process_refs) {
2939                 cfqq->new_cfqq = new_cfqq;
2940                 new_cfqq->ref += process_refs;
2941         } else {
2942                 new_cfqq->new_cfqq = cfqq;
2943                 cfqq->ref += new_process_refs;
2944         }
2945 }
2946
2947 static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
2948                         struct cfq_group *cfqg, enum wl_class_t wl_class)
2949 {
2950         struct cfq_queue *queue;
2951         int i;
2952         bool key_valid = false;
2953         unsigned long lowest_key = 0;
2954         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2955
2956         for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2957                 /* select the one with lowest rb_key */
2958                 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
2959                 if (queue &&
2960                     (!key_valid || time_before(queue->rb_key, lowest_key))) {
2961                         lowest_key = queue->rb_key;
2962                         cur_best = i;
2963                         key_valid = true;
2964                 }
2965         }
2966
2967         return cur_best;
2968 }
2969
2970 static void
2971 choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
2972 {
2973         unsigned slice;
2974         unsigned count;
2975         struct cfq_rb_root *st;
2976         unsigned group_slice;
2977         enum wl_class_t original_class = cfqd->serving_wl_class;
2978
2979         /* Choose next priority. RT > BE > IDLE */
2980         if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2981                 cfqd->serving_wl_class = RT_WORKLOAD;
2982         else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2983                 cfqd->serving_wl_class = BE_WORKLOAD;
2984         else {
2985                 cfqd->serving_wl_class = IDLE_WORKLOAD;
2986                 cfqd->workload_expires = jiffies + 1;
2987                 return;
2988         }
2989
2990         if (original_class != cfqd->serving_wl_class)
2991                 goto new_workload;
2992
2993         /*
2994          * For RT and BE, we have to choose also the type
2995          * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2996          * expiration time
2997          */
2998         st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2999         count = st->count;
3000
3001         /*
3002          * check workload expiration, and that we still have other queues ready
3003          */
3004         if (count && !time_after(jiffies, cfqd->workload_expires))
3005                 return;
3006
3007 new_workload:
3008         /* otherwise select new workload type */
3009         cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
3010                                         cfqd->serving_wl_class);
3011         st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3012         count = st->count;
3013
3014         /*
3015          * the workload slice is computed as a fraction of target latency
3016          * proportional to the number of queues in that workload, over
3017          * all the queues in the same priority class
3018          */
3019         group_slice = cfq_group_slice(cfqd, cfqg);
3020
3021         slice = group_slice * count /
3022                 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
3023                       cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
3024                                         cfqg));
3025
3026         if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
3027                 unsigned int tmp;
3028
3029                 /*
3030                  * Async queues are currently system wide. Just taking
3031                  * proportion of queues with-in same group will lead to higher
3032                  * async ratio system wide as generally root group is going
3033                  * to have higher weight. A more accurate thing would be to
3034                  * calculate system wide asnc/sync ratio.
3035                  */
3036                 tmp = cfqd->cfq_target_latency *
3037                         cfqg_busy_async_queues(cfqd, cfqg);
3038                 tmp = tmp/cfqd->busy_queues;
3039                 slice = min_t(unsigned, slice, tmp);
3040
3041                 /* async workload slice is scaled down according to
3042                  * the sync/async slice ratio. */
3043                 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
3044         } else
3045                 /* sync workload slice is at least 2 * cfq_slice_idle */
3046                 slice = max(slice, 2 * cfqd->cfq_slice_idle);
3047
3048         slice = max_t(unsigned, slice, CFQ_MIN_TT);
3049         cfq_log(cfqd, "workload slice:%d", slice);
3050         cfqd->workload_expires = jiffies + slice;
3051 }
3052
3053 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
3054 {
3055         struct cfq_rb_root *st = &cfqd->grp_service_tree;
3056         struct cfq_group *cfqg;
3057
3058         if (RB_EMPTY_ROOT(&st->rb))
3059                 return NULL;
3060         cfqg = cfq_rb_first_group(st);
3061         update_min_vdisktime(st);
3062         return cfqg;
3063 }
3064
3065 static void cfq_choose_cfqg(struct cfq_data *cfqd)
3066 {
3067         struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
3068
3069         cfqd->serving_group = cfqg;
3070
3071         /* Restore the workload type data */
3072         if (cfqg->saved_wl_slice) {
3073                 cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
3074                 cfqd->serving_wl_type = cfqg->saved_wl_type;
3075                 cfqd->serving_wl_class = cfqg->saved_wl_class;
3076         } else
3077                 cfqd->workload_expires = jiffies - 1;
3078
3079         choose_wl_class_and_type(cfqd, cfqg);
3080 }
3081
3082 /*
3083  * Select a queue for service. If we have a current active queue,
3084  * check whether to continue servicing it, or retrieve and set a new one.
3085  */
3086 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
3087 {
3088         struct cfq_queue *cfqq, *new_cfqq = NULL;
3089
3090         cfqq = cfqd->active_queue;
3091         if (!cfqq)
3092                 goto new_queue;
3093
3094         if (!cfqd->rq_queued)
3095                 return NULL;
3096
3097         /*
3098          * We were waiting for group to get backlogged. Expire the queue
3099          */
3100         if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
3101                 goto expire;
3102
3103         /*
3104          * The active queue has run out of time, expire it and select new.
3105          */
3106         if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
3107                 /*
3108                  * If slice had not expired at the completion of last request
3109                  * we might not have turned on wait_busy flag. Don't expire
3110                  * the queue yet. Allow the group to get backlogged.
3111                  *
3112                  * The very fact that we have used the slice, that means we
3113                  * have been idling all along on this queue and it should be
3114                  * ok to wait for this request to complete.
3115                  */
3116                 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
3117                     && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3118                         cfqq = NULL;
3119                         goto keep_queue;
3120                 } else
3121                         goto check_group_idle;
3122         }
3123
3124         /*
3125          * The active queue has requests and isn't expired, allow it to
3126          * dispatch.
3127          */
3128         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3129                 goto keep_queue;
3130
3131         /*
3132          * If another queue has a request waiting within our mean seek
3133          * distance, let it run.  The expire code will check for close
3134          * cooperators and put the close queue at the front of the service
3135          * tree.  If possible, merge the expiring queue with the new cfqq.
3136          */
3137         new_cfqq = cfq_close_cooperator(cfqd, cfqq);
3138         if (new_cfqq) {
3139                 if (!cfqq->new_cfqq)
3140                         cfq_setup_merge(cfqq, new_cfqq);
3141                 goto expire;
3142         }
3143
3144         /*
3145          * No requests pending. If the active queue still has requests in
3146          * flight or is idling for a new request, allow either of these
3147          * conditions to happen (or time out) before selecting a new queue.
3148          */
3149         if (timer_pending(&cfqd->idle_slice_timer)) {
3150                 cfqq = NULL;
3151                 goto keep_queue;
3152         }
3153
3154         /*
3155          * This is a deep seek queue, but the device is much faster than
3156          * the queue can deliver, don't idle
3157          **/
3158         if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
3159             (cfq_cfqq_slice_new(cfqq) ||
3160             (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
3161                 cfq_clear_cfqq_deep(cfqq);
3162                 cfq_clear_cfqq_idle_window(cfqq);
3163         }
3164
3165         if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3166                 cfqq = NULL;
3167                 goto keep_queue;
3168         }
3169
3170         /*
3171          * If group idle is enabled and there are requests dispatched from
3172          * this group, wait for requests to complete.
3173          */
3174 check_group_idle:
3175         if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
3176             cfqq->cfqg->dispatched &&
3177             !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
3178                 cfqq = NULL;
3179                 goto keep_queue;
3180         }
3181
3182 expire:
3183         cfq_slice_expired(cfqd, 0);
3184 new_queue:
3185         /*
3186          * Current queue expired. Check if we have to switch to a new
3187          * service tree
3188          */
3189         if (!new_cfqq)
3190                 cfq_choose_cfqg(cfqd);
3191
3192         cfqq = cfq_set_active_queue(cfqd, new_cfqq);
3193 keep_queue:
3194         return cfqq;
3195 }
3196
3197 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
3198 {
3199         int dispatched = 0;
3200
3201         while (cfqq->next_rq) {
3202                 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
3203                 dispatched++;
3204         }
3205
3206         BUG_ON(!list_empty(&cfqq->fifo));
3207
3208         /* By default cfqq is not expired if it is empty. Do it explicitly */
3209         __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
3210         return dispatched;
3211 }
3212
3213 /*
3214  * Drain our current requests. Used for barriers and when switching
3215  * io schedulers on-the-fly.
3216  */
3217 static int cfq_forced_dispatch(struct cfq_data *cfqd)
3218 {
3219         struct cfq_queue *cfqq;
3220         int dispatched = 0;
3221
3222         /* Expire the timeslice of the current active queue first */
3223         cfq_slice_expired(cfqd, 0);
3224         while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3225                 __cfq_set_active_queue(cfqd, cfqq);
3226                 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3227         }
3228
3229         BUG_ON(cfqd->busy_queues);
3230
3231         cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3232         return dispatched;
3233 }
3234
3235 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3236         struct cfq_queue *cfqq)
3237 {
3238         /* the queue hasn't finished any request, can't estimate */
3239         if (cfq_cfqq_slice_new(cfqq))
3240                 return true;
3241         if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
3242                 cfqq->slice_end))
3243                 return true;
3244
3245         return false;
3246 }
3247
3248 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3249 {
3250         unsigned int max_dispatch;
3251
3252         /*
3253          * Drain async requests before we start sync IO
3254          */
3255         if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3256                 return false;
3257
3258         /*
3259          * If this is an async queue and we have sync IO in flight, let it wait
3260          */
3261         if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3262                 return false;
3263
3264         max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3265         if (cfq_class_idle(cfqq))
3266                 max_dispatch = 1;
3267
3268         /*
3269          * Does this cfqq already have too much IO in flight?
3270          */
3271         if (cfqq->dispatched >= max_dispatch) {
3272                 bool promote_sync = false;
3273                 /*
3274                  * idle queue must always only have a single IO in flight
3275                  */
3276                 if (cfq_class_idle(cfqq))
3277                         return false;
3278
3279                 /*
3280                  * If there is only one sync queue
3281                  * we can ignore async queue here and give the sync
3282                  * queue no dispatch limit. The reason is a sync queue can
3283                  * preempt async queue, limiting the sync queue doesn't make
3284                  * sense. This is useful for aiostress test.
3285                  */
3286                 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3287                         promote_sync = true;
3288
3289                 /*
3290                  * We have other queues, don't allow more IO from this one
3291                  */
3292                 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3293                                 !promote_sync)
3294                         return false;
3295
3296                 /*
3297                  * Sole queue user, no limit
3298                  */
3299                 if (cfqd->busy_queues == 1 || promote_sync)
3300                         max_dispatch = -1;
3301                 else
3302                         /*
3303                          * Normally we start throttling cfqq when cfq_quantum/2
3304                          * requests have been dispatched. But we can drive
3305                          * deeper queue depths at the beginning of slice
3306                          * subjected to upper limit of cfq_quantum.
3307                          * */
3308                         max_dispatch = cfqd->cfq_quantum;
3309         }
3310
3311         /*
3312          * Async queues must wait a bit before being allowed dispatch.
3313          * We also ramp up the dispatch depth gradually for async IO,
3314          * based on the last sync IO we serviced
3315          */
3316         if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3317                 unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
3318                 unsigned int depth;
3319
3320                 depth = last_sync / cfqd->cfq_slice[1];
3321                 if (!depth && !cfqq->dispatched)
3322                         depth = 1;
3323                 if (depth < max_dispatch)
3324                         max_dispatch = depth;
3325         }
3326
3327         /*
3328          * If we're below the current max, allow a dispatch
3329          */
3330         return cfqq->dispatched < max_dispatch;
3331 }
3332
3333 /*
3334  * Dispatch a request from cfqq, moving them to the request queue
3335  * dispatch list.
3336  */
3337 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3338 {
3339         struct request *rq;
3340
3341         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3342
3343         if (!cfq_may_dispatch(cfqd, cfqq))
3344                 return false;
3345
3346         /*
3347          * follow expired path, else get first next available
3348          */
3349         rq = cfq_check_fifo(cfqq);
3350         if (!rq)
3351                 rq = cfqq->next_rq;
3352
3353         /*
3354          * insert request into driver dispatch list
3355          */
3356         cfq_dispatch_insert(cfqd->queue, rq);
3357
3358         if (!cfqd->active_cic) {
3359                 struct cfq_io_cq *cic = RQ_CIC(rq);
3360
3361                 atomic_long_inc(&cic->icq.ioc->refcount);
3362                 cfqd->active_cic = cic;
3363         }
3364
3365         return true;
3366 }
3367
3368 /*
3369  * Find the cfqq that we need to service and move a request from that to the
3370  * dispatch list
3371  */
3372 static int cfq_dispatch_requests(struct request_queue *q, int force)
3373 {
3374         struct cfq_data *cfqd = q->elevator->elevator_data;
3375         struct cfq_queue *cfqq;
3376
3377         if (!cfqd->busy_queues)
3378                 return 0;
3379
3380         if (unlikely(force))
3381                 return cfq_forced_dispatch(cfqd);
3382
3383         cfqq = cfq_select_queue(cfqd);
3384         if (!cfqq)
3385                 return 0;
3386
3387         /*
3388          * Dispatch a request from this cfqq, if it is allowed
3389          */
3390         if (!cfq_dispatch_request(cfqd, cfqq))
3391                 return 0;
3392
3393         cfqq->slice_dispatch++;
3394         cfq_clear_cfqq_must_dispatch(cfqq);
3395
3396         /*
3397          * expire an async queue immediately if it has used up its slice. idle
3398          * queue always expire after 1 dispatch round.
3399          */
3400         if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3401             cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3402             cfq_class_idle(cfqq))) {
3403                 cfqq->slice_end = jiffies + 1;
3404                 cfq_slice_expired(cfqd, 0);
3405         }
3406
3407         cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3408         return 1;
3409 }
3410
3411 /*
3412  * task holds one reference to the queue, dropped when task exits. each rq
3413  * in-flight on this queue also holds a reference, dropped when rq is freed.
3414  *
3415  * Each cfq queue took a reference on the parent group. Drop it now.
3416  * queue lock must be held here.
3417  */
3418 static void cfq_put_queue(struct cfq_queue *cfqq)
3419 {
3420         struct cfq_data *cfqd = cfqq->cfqd;
3421         struct cfq_group *cfqg;
3422
3423         BUG_ON(cfqq->ref <= 0);
3424
3425         cfqq->ref--;
3426         if (cfqq->ref)
3427                 return;
3428
3429         cfq_log_cfqq(cfqd, cfqq, "put_queue");
3430         BUG_ON(rb_first(&cfqq->sort_list));
3431         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3432         cfqg = cfqq->cfqg;
3433
3434         if (unlikely(cfqd->active_queue == cfqq)) {
3435                 __cfq_slice_expired(cfqd, cfqq, 0);
3436                 cfq_schedule_dispatch(cfqd);
3437         }
3438
3439         BUG_ON(cfq_cfqq_on_rr(cfqq));
3440         kmem_cache_free(cfq_pool, cfqq);
3441         cfqg_put(cfqg);
3442 }
3443
3444 static void cfq_put_cooperator(struct cfq_queue *cfqq)
3445 {
3446         struct cfq_queue *__cfqq, *next;
3447
3448         /*
3449          * If this queue was scheduled to merge with another queue, be
3450          * sure to drop the reference taken on that queue (and others in
3451          * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
3452          */
3453         __cfqq = cfqq->new_cfqq;
3454         while (__cfqq) {
3455                 if (__cfqq == cfqq) {
3456                         WARN(1, "cfqq->new_cfqq loop detected\n");
3457                         break;
3458                 }
3459                 next = __cfqq->new_cfqq;
3460                 cfq_put_queue(__cfqq);
3461                 __cfqq = next;
3462         }
3463 }
3464
3465 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3466 {
3467         if (unlikely(cfqq == cfqd->active_queue)) {
3468                 __cfq_slice_expired(cfqd, cfqq, 0);
3469                 cfq_schedule_dispatch(cfqd);
3470         }
3471
3472         cfq_put_cooperator(cfqq);
3473
3474         cfq_put_queue(cfqq);
3475 }
3476
3477 static void cfq_init_icq(struct io_cq *icq)
3478 {
3479         struct cfq_io_cq *cic = icq_to_cic(icq);
3480
3481         cic->ttime.last_end_request = jiffies;
3482 }
3483
3484 static void cfq_exit_icq(struct io_cq *icq)
3485 {
3486         struct cfq_io_cq *cic = icq_to_cic(icq);
3487         struct cfq_data *cfqd = cic_to_cfqd(cic);
3488
3489         if (cic->cfqq[BLK_RW_ASYNC]) {
3490                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
3491                 cic->cfqq[BLK_RW_ASYNC] = NULL;
3492         }
3493
3494         if (cic->cfqq[BLK_RW_SYNC]) {
3495                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
3496                 cic->cfqq[BLK_RW_SYNC] = NULL;
3497         }
3498 }
3499
3500 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3501 {
3502         struct task_struct *tsk = current;
3503         int ioprio_class;
3504
3505         if (!cfq_cfqq_prio_changed(cfqq))
3506                 return;
3507
3508         ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3509         switch (ioprio_class) {
3510         default:
3511                 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3512         case IOPRIO_CLASS_NONE:
3513                 /*
3514                  * no prio set, inherit CPU scheduling settings
3515                  */
3516                 cfqq->ioprio = task_nice_ioprio(tsk);
3517                 cfqq->ioprio_class = task_nice_ioclass(tsk);
3518                 break;
3519         case IOPRIO_CLASS_RT:
3520                 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3521                 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3522                 break;
3523         case IOPRIO_CLASS_BE:
3524                 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3525                 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3526                 break;
3527         case IOPRIO_CLASS_IDLE:
3528                 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3529                 cfqq->ioprio = 7;
3530                 cfq_clear_cfqq_idle_window(cfqq);
3531                 break;
3532         }
3533
3534         /*
3535          * keep track of original prio settings in case we have to temporarily
3536          * elevate the priority of this queue
3537          */
3538         cfqq->org_ioprio = cfqq->ioprio;
3539         cfq_clear_cfqq_prio_changed(cfqq);
3540 }
3541
3542 static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3543 {
3544         int ioprio = cic->icq.ioc->ioprio;
3545         struct cfq_data *cfqd = cic_to_cfqd(cic);
3546         struct cfq_queue *cfqq;
3547
3548         /*
3549          * Check whether ioprio has changed.  The condition may trigger
3550          * spuriously on a newly created cic but there's no harm.
3551          */
3552         if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3553                 return;
3554
3555         cfqq = cic->cfqq[BLK_RW_ASYNC];
3556         if (cfqq) {
3557                 struct cfq_queue *new_cfqq;
3558                 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
3559                                          GFP_ATOMIC);
3560                 if (new_cfqq) {
3561                         cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
3562                         cfq_put_queue(cfqq);
3563                 }
3564         }
3565
3566         cfqq = cic->cfqq[BLK_RW_SYNC];
3567         if (cfqq)
3568                 cfq_mark_cfqq_prio_changed(cfqq);
3569
3570         cic->ioprio = ioprio;
3571 }
3572
3573 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3574                           pid_t pid, bool is_sync)
3575 {
3576         RB_CLEAR_NODE(&cfqq->rb_node);
3577         RB_CLEAR_NODE(&cfqq->p_node);
3578         INIT_LIST_HEAD(&cfqq->fifo);
3579
3580         cfqq->ref = 0;
3581         cfqq->cfqd = cfqd;
3582
3583         cfq_mark_cfqq_prio_changed(cfqq);
3584
3585         if (is_sync) {
3586                 if (!cfq_class_idle(cfqq))
3587                         cfq_mark_cfqq_idle_window(cfqq);
3588                 cfq_mark_cfqq_sync(cfqq);
3589         }
3590         cfqq->pid = pid;
3591 }
3592
3593 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3594 static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3595 {
3596         struct cfq_data *cfqd = cic_to_cfqd(cic);
3597         struct cfq_queue *sync_cfqq;
3598         uint64_t serial_nr;
3599
3600         rcu_read_lock();
3601         serial_nr = bio_blkcg(bio)->css.serial_nr;
3602         rcu_read_unlock();
3603
3604         /*
3605          * Check whether blkcg has changed.  The condition may trigger
3606          * spuriously on a newly created cic but there's no harm.
3607          */
3608         if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3609                 return;
3610
3611         sync_cfqq = cic_to_cfqq(cic, 1);
3612         if (sync_cfqq) {
3613                 /*
3614                  * Drop reference to sync queue. A new sync queue will be
3615                  * assigned in new group upon arrival of a fresh request.
3616                  */
3617                 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
3618                 cic_set_cfqq(cic, NULL, 1);
3619                 cfq_put_queue(sync_cfqq);
3620         }
3621
3622         cic->blkcg_serial_nr = serial_nr;
3623 }
3624 #else
3625 static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
3626 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
3627
3628 static struct cfq_queue *
3629 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3630                      struct bio *bio, gfp_t gfp_mask)
3631 {
3632         struct blkcg *blkcg;
3633         struct cfq_queue *cfqq, *new_cfqq = NULL;
3634         struct cfq_group *cfqg;
3635
3636 retry:
3637         rcu_read_lock();
3638
3639         blkcg = bio_blkcg(bio);
3640         cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
3641         if (!cfqg) {
3642                 cfqq = &cfqd->oom_cfqq;
3643                 goto out;
3644         }
3645
3646         cfqq = cic_to_cfqq(cic, is_sync);
3647
3648         /*
3649          * Always try a new alloc if we fell back to the OOM cfqq
3650          * originally, since it should just be a temporary situation.
3651          */
3652         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3653                 cfqq = NULL;
3654                 if (new_cfqq) {
3655                         cfqq = new_cfqq;
3656                         new_cfqq = NULL;
3657                 } else if (gfp_mask & __GFP_WAIT) {
3658                         rcu_read_unlock();
3659                         spin_unlock_irq(cfqd->queue->queue_lock);
3660                         new_cfqq = kmem_cache_alloc_node(cfq_pool,
3661                                         gfp_mask | __GFP_ZERO,
3662                                         cfqd->queue->node);
3663                         spin_lock_irq(cfqd->queue->queue_lock);
3664                         if (new_cfqq)
3665                                 goto retry;
3666                         else
3667                                 return &cfqd->oom_cfqq;
3668                 } else {
3669                         cfqq = kmem_cache_alloc_node(cfq_pool,
3670                                         gfp_mask | __GFP_ZERO,
3671                                         cfqd->queue->node);
3672                 }
3673
3674                 if (cfqq) {
3675                         cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3676                         cfq_init_prio_data(cfqq, cic);
3677                         cfq_link_cfqq_cfqg(cfqq, cfqg);
3678                         cfq_log_cfqq(cfqd, cfqq, "alloced");
3679                 } else
3680                         cfqq = &cfqd->oom_cfqq;
3681         }
3682 out:
3683         if (new_cfqq)
3684                 kmem_cache_free(cfq_pool, new_cfqq);
3685
3686         rcu_read_unlock();
3687         return cfqq;
3688 }
3689
3690 static struct cfq_queue **
3691 cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3692 {
3693         switch (ioprio_class) {
3694         case IOPRIO_CLASS_RT:
3695                 return &cfqd->async_cfqq[0][ioprio];
3696         case IOPRIO_CLASS_NONE:
3697                 ioprio = IOPRIO_NORM;
3698                 /* fall through */
3699         case IOPRIO_CLASS_BE:
3700                 return &cfqd->async_cfqq[1][ioprio];
3701         case IOPRIO_CLASS_IDLE:
3702                 return &cfqd->async_idle_cfqq;
3703         default:
3704                 BUG();
3705         }
3706 }
3707
3708 static struct cfq_queue *
3709 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3710               struct bio *bio, gfp_t gfp_mask)
3711 {
3712         int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3713         int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3714         struct cfq_queue **async_cfqq = NULL;
3715         struct cfq_queue *cfqq = NULL;
3716
3717         if (!is_sync) {
3718                 if (!ioprio_valid(cic->ioprio)) {
3719                         struct task_struct *tsk = current;
3720                         ioprio = task_nice_ioprio(tsk);
3721                         ioprio_class = task_nice_ioclass(tsk);
3722                 }
3723                 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
3724                 cfqq = *async_cfqq;
3725         }
3726
3727         if (!cfqq)
3728                 cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
3729
3730         /*
3731          * pin the queue now that it's allocated, scheduler exit will prune it
3732          */
3733         if (!is_sync && !(*async_cfqq)) {
3734                 cfqq->ref++;
3735                 *async_cfqq = cfqq;
3736         }
3737
3738         cfqq->ref++;
3739         return cfqq;
3740 }
3741
3742 static void
3743 __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3744 {
3745         unsigned long elapsed = jiffies - ttime->last_end_request;
3746         elapsed = min(elapsed, 2UL * slice_idle);
3747
3748         ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3749         ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
3750         ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
3751 }
3752
3753 static void
3754 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3755                         struct cfq_io_cq *cic)
3756 {
3757         if (cfq_cfqq_sync(cfqq)) {
3758                 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3759                 __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3760                         cfqd->cfq_slice_idle);
3761         }
3762 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3763         __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3764 #endif
3765 }
3766
3767 static void
3768 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3769                        struct request *rq)
3770 {
3771         sector_t sdist = 0;
3772         sector_t n_sec = blk_rq_sectors(rq);
3773         if (cfqq->last_request_pos) {
3774                 if (cfqq->last_request_pos < blk_rq_pos(rq))
3775                         sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3776                 else
3777                         sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3778         }
3779
3780         cfqq->seek_history <<= 1;
3781         if (blk_queue_nonrot(cfqd->queue))
3782                 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3783         else
3784                 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3785 }
3786
3787 /*
3788  * Disable idle window if the process thinks too long or seeks so much that
3789  * it doesn't matter
3790  */
3791 static void
3792 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3793                        struct cfq_io_cq *cic)
3794 {
3795         int old_idle, enable_idle;
3796
3797         /*
3798          * Don't idle for async or idle io prio class
3799          */
3800         if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3801                 return;
3802
3803         enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3804
3805         if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3806                 cfq_mark_cfqq_deep(cfqq);
3807
3808         if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3809                 enable_idle = 0;
3810         else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3811                  !cfqd->cfq_slice_idle ||
3812                  (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3813                 enable_idle = 0;
3814         else if (sample_valid(cic->ttime.ttime_samples)) {
3815                 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3816                         enable_idle = 0;
3817                 else
3818                         enable_idle = 1;
3819         }
3820
3821         if (old_idle != enable_idle) {
3822                 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3823                 if (enable_idle)
3824                         cfq_mark_cfqq_idle_window(cfqq);
3825                 else
3826                         cfq_clear_cfqq_idle_window(cfqq);
3827         }
3828 }
3829
3830 /*
3831  * Check if new_cfqq should preempt the currently active queue. Return 0 for
3832  * no or if we aren't sure, a 1 will cause a preempt.
3833  */
3834 static bool
3835 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3836                    struct request *rq)
3837 {
3838         struct cfq_queue *cfqq;
3839
3840         cfqq = cfqd->active_queue;
3841         if (!cfqq)
3842                 return false;
3843
3844         if (cfq_class_idle(new_cfqq))
3845                 return false;
3846
3847         if (cfq_class_idle(cfqq))
3848                 return true;
3849
3850         /*
3851          * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3852          */
3853         if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3854                 return false;
3855
3856         /*
3857          * if the new request is sync, but the currently running queue is
3858          * not, let the sync request have priority.
3859          */
3860         if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3861                 return true;
3862
3863         if (new_cfqq->cfqg != cfqq->cfqg)
3864                 return false;
3865
3866         if (cfq_slice_used(cfqq))
3867                 return true;
3868
3869         /* Allow preemption only if we are idling on sync-noidle tree */
3870         if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
3871             cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3872             new_cfqq->service_tree->count == 2 &&
3873             RB_EMPTY_ROOT(&cfqq->sort_list))
3874                 return true;
3875
3876         /*
3877          * So both queues are sync. Let the new request get disk time if
3878          * it's a metadata request and the current queue is doing regular IO.
3879          */
3880         if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
3881                 return true;
3882
3883         /*
3884          * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3885          */
3886         if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3887                 return true;
3888
3889         /* An idle queue should not be idle now for some reason */
3890         if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3891                 return true;
3892
3893         if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3894                 return false;
3895
3896         /*
3897          * if this request is as-good as one we would expect from the
3898          * current cfqq, let it preempt
3899          */
3900         if (cfq_rq_close(cfqd, cfqq, rq))
3901                 return true;
3902
3903         return false;
3904 }
3905
3906 /*
3907  * cfqq preempts the active queue. if we allowed preempt with no slice left,
3908  * let it have half of its nominal slice.
3909  */
3910 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3911 {
3912         enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
3913
3914         cfq_log_cfqq(cfqd, cfqq, "preempt");
3915         cfq_slice_expired(cfqd, 1);
3916
3917         /*
3918          * workload type is changed, don't save slice, otherwise preempt
3919          * doesn't happen
3920          */
3921         if (old_type != cfqq_type(cfqq))
3922                 cfqq->cfqg->saved_wl_slice = 0;
3923
3924         /*
3925          * Put the new queue at the front of the of the current list,
3926          * so we know that it will be selected next.
3927          */
3928         BUG_ON(!cfq_cfqq_on_rr(cfqq));
3929
3930         cfq_service_tree_add(cfqd, cfqq, 1);
3931
3932         cfqq->slice_end = 0;
3933         cfq_mark_cfqq_slice_new(cfqq);
3934 }
3935
3936 /*
3937  * Called when a new fs request (rq) is added (to cfqq). Check if there's
3938  * something we should do about it
3939  */
3940 static void
3941 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3942                 struct request *rq)
3943 {
3944         struct cfq_io_cq *cic = RQ_CIC(rq);
3945
3946         cfqd->rq_queued++;
3947         if (rq->cmd_flags & REQ_PRIO)
3948                 cfqq->prio_pending++;
3949
3950         cfq_update_io_thinktime(cfqd, cfqq, cic);
3951         cfq_update_io_seektime(cfqd, cfqq, rq);
3952         cfq_update_idle_window(cfqd, cfqq, cic);
3953
3954         cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3955
3956         if (cfqq == cfqd->active_queue) {
3957                 /*
3958                  * Remember that we saw a request from this process, but
3959                  * don't start queuing just yet. Otherwise we risk seeing lots
3960                  * of tiny requests, because we disrupt the normal plugging
3961                  * and merging. If the request is already larger than a single
3962                  * page, let it rip immediately. For that case we assume that
3963                  * merging is already done. Ditto for a busy system that
3964                  * has other work pending, don't risk delaying until the
3965                  * idle timer unplug to continue working.
3966                  */
3967                 if (cfq_cfqq_wait_request(cfqq)) {
3968                         if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3969                             cfqd->busy_queues > 1) {
3970                                 cfq_del_timer(cfqd, cfqq);
3971                                 cfq_clear_cfqq_wait_request(cfqq);
3972                                 __blk_run_queue(cfqd->queue);
3973                         } else {
3974                                 cfqg_stats_update_idle_time(cfqq->cfqg);
3975                                 cfq_mark_cfqq_must_dispatch(cfqq);
3976                         }
3977                 }
3978         } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3979                 /*
3980                  * not the active queue - expire current slice if it is
3981                  * idle and has expired it's mean thinktime or this new queue
3982                  * has some old slice time left and is of higher priority or
3983                  * this new queue is RT and the current one is BE
3984                  */
3985                 cfq_preempt_queue(cfqd, cfqq);
3986                 __blk_run_queue(cfqd->queue);
3987         }
3988 }
3989
3990 static void cfq_insert_request(struct request_queue *q, struct request *rq)
3991 {
3992         struct cfq_data *cfqd = q->elevator->elevator_data;
3993         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3994
3995         cfq_log_cfqq(cfqd, cfqq, "insert_request");
3996         cfq_init_prio_data(cfqq, RQ_CIC(rq));
3997
3998         rq->fifo_time = jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
3999         list_add_tail(&rq->queuelist, &cfqq->fifo);
4000         cfq_add_rq_rb(rq);
4001         cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
4002                                  rq->cmd_flags);
4003         cfq_rq_enqueued(cfqd, cfqq, rq);
4004 }
4005
4006 /*
4007  * Update hw_tag based on peak queue depth over 50 samples under
4008  * sufficient load.
4009  */
4010 static void cfq_update_hw_tag(struct cfq_data *cfqd)
4011 {
4012         struct cfq_queue *cfqq = cfqd->active_queue;
4013
4014         if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
4015                 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
4016
4017         if (cfqd->hw_tag == 1)
4018                 return;
4019
4020         if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
4021             cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
4022                 return;
4023
4024         /*
4025          * If active queue hasn't enough requests and can idle, cfq might not
4026          * dispatch sufficient requests to hardware. Don't zero hw_tag in this
4027          * case
4028          */
4029         if (cfqq && cfq_cfqq_idle_window(cfqq) &&
4030             cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
4031             CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
4032                 return;
4033
4034         if (cfqd->hw_tag_samples++ < 50)
4035                 return;
4036
4037         if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
4038                 cfqd->hw_tag = 1;
4039         else
4040                 cfqd->hw_tag = 0;
4041 }
4042
4043 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4044 {
4045         struct cfq_io_cq *cic = cfqd->active_cic;
4046
4047         /* If the queue already has requests, don't wait */
4048         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4049                 return false;
4050
4051         /* If there are other queues in the group, don't wait */
4052         if (cfqq->cfqg->nr_cfqq > 1)
4053                 return false;
4054
4055         /* the only queue in the group, but think time is big */
4056         if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
4057                 return false;
4058
4059         if (cfq_slice_used(cfqq))
4060                 return true;
4061
4062         /* if slice left is less than think time, wait busy */
4063         if (cic && sample_valid(cic->ttime.ttime_samples)
4064             && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
4065                 return true;
4066
4067         /*
4068          * If think times is less than a jiffy than ttime_mean=0 and above
4069          * will not be true. It might happen that slice has not expired yet
4070          * but will expire soon (4-5 ns) during select_queue(). To cover the
4071          * case where think time is less than a jiffy, mark the queue wait
4072          * busy if only 1 jiffy is left in the slice.
4073          */
4074         if (cfqq->slice_end - jiffies == 1)
4075                 return true;
4076
4077         return false;
4078 }
4079
4080 static void cfq_completed_request(struct request_queue *q, struct request *rq)
4081 {
4082         struct cfq_queue *cfqq = RQ_CFQQ(rq);
4083         struct cfq_data *cfqd = cfqq->cfqd;
4084         const int sync = rq_is_sync(rq);
4085         unsigned long now;
4086
4087         now = jiffies;
4088         cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
4089                      !!(rq->cmd_flags & REQ_NOIDLE));
4090
4091         cfq_update_hw_tag(cfqd);
4092
4093         WARN_ON(!cfqd->rq_in_driver);
4094         WARN_ON(!cfqq->dispatched);
4095         cfqd->rq_in_driver--;
4096         cfqq->dispatched--;
4097         (RQ_CFQG(rq))->dispatched--;
4098         cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
4099                                      rq_io_start_time_ns(rq), rq->cmd_flags);
4100
4101         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
4102
4103         if (sync) {
4104                 struct cfq_rb_root *st;
4105
4106                 RQ_CIC(rq)->ttime.last_end_request = now;
4107
4108                 if (cfq_cfqq_on_rr(cfqq))
4109                         st = cfqq->service_tree;
4110                 else
4111                         st = st_for(cfqq->cfqg, cfqq_class(cfqq),
4112                                         cfqq_type(cfqq));
4113
4114                 st->ttime.last_end_request = now;
4115                 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
4116                         cfqd->last_delayed_sync = now;
4117         }
4118
4119 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4120         cfqq->cfqg->ttime.last_end_request = now;
4121 #endif
4122
4123         /*
4124          * If this is the active queue, check if it needs to be expired,
4125          * or if we want to idle in case it has no pending requests.
4126          */
4127         if (cfqd->active_queue == cfqq) {
4128                 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
4129
4130                 if (cfq_cfqq_slice_new(cfqq)) {
4131                         cfq_set_prio_slice(cfqd, cfqq);
4132                         cfq_clear_cfqq_slice_new(cfqq);
4133                 }
4134
4135                 /*
4136                  * Should we wait for next request to come in before we expire
4137                  * the queue.
4138                  */
4139                 if (cfq_should_wait_busy(cfqd, cfqq)) {
4140                         unsigned long extend_sl = cfqd->cfq_slice_idle;
4141                         if (!cfqd->cfq_slice_idle)
4142                                 extend_sl = cfqd->cfq_group_idle;
4143                         cfqq->slice_end = jiffies + extend_sl;
4144                         cfq_mark_cfqq_wait_busy(cfqq);
4145                         cfq_log_cfqq(cfqd, cfqq, "will busy wait");
4146                 }
4147
4148                 /*
4149                  * Idling is not enabled on:
4150                  * - expired queues
4151                  * - idle-priority queues
4152                  * - async queues
4153                  * - queues with still some requests queued
4154                  * - when there is a close cooperator
4155                  */
4156                 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
4157                         cfq_slice_expired(cfqd, 1);
4158                 else if (sync && cfqq_empty &&
4159                          !cfq_close_cooperator(cfqd, cfqq)) {
4160                         cfq_arm_slice_timer(cfqd);
4161                 }
4162         }
4163
4164         if (!cfqd->rq_in_driver)
4165                 cfq_schedule_dispatch(cfqd);
4166 }
4167
4168 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
4169 {
4170         if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
4171                 cfq_mark_cfqq_must_alloc_slice(cfqq);
4172                 return ELV_MQUEUE_MUST;
4173         }
4174
4175         return ELV_MQUEUE_MAY;
4176 }
4177
4178 static int cfq_may_queue(struct request_queue *q, int rw)
4179 {
4180         struct cfq_data *cfqd = q->elevator->elevator_data;
4181         struct task_struct *tsk = current;
4182         struct cfq_io_cq *cic;
4183         struct cfq_queue *cfqq;
4184
4185         /*
4186          * don't force setup of a queue from here, as a call to may_queue
4187          * does not necessarily imply that a request actually will be queued.
4188          * so just lookup a possibly existing queue, or return 'may queue'
4189          * if that fails
4190          */
4191         cic = cfq_cic_lookup(cfqd, tsk->io_context);
4192         if (!cic)
4193                 return ELV_MQUEUE_MAY;
4194
4195         cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
4196         if (cfqq) {
4197                 cfq_init_prio_data(cfqq, cic);
4198
4199                 return __cfq_may_queue(cfqq);
4200         }
4201
4202         return ELV_MQUEUE_MAY;
4203 }
4204
4205 /*
4206  * queue lock held here
4207  */
4208 static void cfq_put_request(struct request *rq)
4209 {
4210         struct cfq_queue *cfqq = RQ_CFQQ(rq);
4211
4212         if (cfqq) {
4213                 const int rw = rq_data_dir(rq);
4214
4215                 BUG_ON(!cfqq->allocated[rw]);
4216                 cfqq->allocated[rw]--;
4217
4218                 /* Put down rq reference on cfqg */
4219                 cfqg_put(RQ_CFQG(rq));
4220                 rq->elv.priv[0] = NULL;
4221                 rq->elv.priv[1] = NULL;
4222
4223                 cfq_put_queue(cfqq);
4224         }
4225 }
4226
4227 static struct cfq_queue *
4228 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
4229                 struct cfq_queue *cfqq)
4230 {
4231         cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4232         cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4233         cfq_mark_cfqq_coop(cfqq->new_cfqq);
4234         cfq_put_queue(cfqq);
4235         return cic_to_cfqq(cic, 1);
4236 }
4237
4238 /*
4239  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4240  * was the last process referring to said cfqq.
4241  */
4242 static struct cfq_queue *
4243 split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4244 {
4245         if (cfqq_process_refs(cfqq) == 1) {
4246                 cfqq->pid = current->pid;
4247                 cfq_clear_cfqq_coop(cfqq);
4248                 cfq_clear_cfqq_split_coop(cfqq);
4249                 return cfqq;
4250         }
4251
4252         cic_set_cfqq(cic, NULL, 1);
4253
4254         cfq_put_cooperator(cfqq);
4255
4256         cfq_put_queue(cfqq);
4257         return NULL;
4258 }
4259 /*
4260  * Allocate cfq data structures associated with this request.
4261  */
4262 static int
4263 cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4264                 gfp_t gfp_mask)
4265 {
4266         struct cfq_data *cfqd = q->elevator->elevator_data;
4267         struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4268         const int rw = rq_data_dir(rq);
4269         const bool is_sync = rq_is_sync(rq);
4270         struct cfq_queue *cfqq;
4271
4272         might_sleep_if(gfp_mask & __GFP_WAIT);
4273
4274         spin_lock_irq(q->queue_lock);
4275
4276         check_ioprio_changed(cic, bio);
4277         check_blkcg_changed(cic, bio);
4278 new_queue:
4279         cfqq = cic_to_cfqq(cic, is_sync);
4280         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4281                 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
4282                 cic_set_cfqq(cic, cfqq, is_sync);
4283         } else {
4284                 /*
4285                  * If the queue was seeky for too long, break it apart.
4286                  */
4287                 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4288                         cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4289                         cfqq = split_cfqq(cic, cfqq);
4290                         if (!cfqq)
4291                                 goto new_queue;
4292                 }
4293
4294                 /*
4295                  * Check to see if this queue is scheduled to merge with
4296                  * another, closely cooperating queue.  The merging of
4297                  * queues happens here as it must be done in process context.
4298                  * The reference on new_cfqq was taken in merge_cfqqs.
4299                  */
4300                 if (cfqq->new_cfqq)
4301                         cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4302         }
4303
4304         cfqq->allocated[rw]++;
4305
4306         cfqq->ref++;
4307         cfqg_get(cfqq->cfqg);
4308         rq->elv.priv[0] = cfqq;
4309         rq->elv.priv[1] = cfqq->cfqg;
4310         spin_unlock_irq(q->queue_lock);
4311         return 0;
4312 }
4313
4314 static void cfq_kick_queue(struct work_struct *work)
4315 {
4316         struct cfq_data *cfqd =
4317                 container_of(work, struct cfq_data, unplug_work);
4318         struct request_queue *q = cfqd->queue;
4319
4320         spin_lock_irq(q->queue_lock);
4321         __blk_run_queue(cfqd->queue);
4322         spin_unlock_irq(q->queue_lock);
4323 }
4324
4325 /*
4326  * Timer running if the active_queue is currently idling inside its time slice
4327  */
4328 static void cfq_idle_slice_timer(unsigned long data)
4329 {
4330         struct cfq_data *cfqd = (struct cfq_data *) data;
4331         struct cfq_queue *cfqq;
4332         unsigned long flags;
4333         int timed_out = 1;
4334
4335         cfq_log(cfqd, "idle timer fired");
4336
4337         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4338
4339         cfqq = cfqd->active_queue;
4340         if (cfqq) {
4341                 timed_out = 0;
4342
4343                 /*
4344                  * We saw a request before the queue expired, let it through
4345                  */
4346                 if (cfq_cfqq_must_dispatch(cfqq))
4347                         goto out_kick;
4348
4349                 /*
4350                  * expired
4351                  */
4352                 if (cfq_slice_used(cfqq))
4353                         goto expire;
4354
4355                 /*
4356                  * only expire and reinvoke request handler, if there are
4357                  * other queues with pending requests
4358                  */
4359                 if (!cfqd->busy_queues)
4360                         goto out_cont;
4361
4362                 /*
4363                  * not expired and it has a request pending, let it dispatch
4364                  */
4365                 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4366                         goto out_kick;
4367
4368                 /*
4369                  * Queue depth flag is reset only when the idle didn't succeed
4370                  */
4371                 cfq_clear_cfqq_deep(cfqq);
4372         }
4373 expire:
4374         cfq_slice_expired(cfqd, timed_out);
4375 out_kick:
4376         cfq_schedule_dispatch(cfqd);
4377 out_cont:
4378         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4379 }
4380
4381 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4382 {
4383         del_timer_sync(&cfqd->idle_slice_timer);
4384         cancel_work_sync(&cfqd->unplug_work);
4385 }
4386
4387 static void cfq_put_async_queues(struct cfq_data *cfqd)
4388 {
4389         int i;
4390
4391         for (i = 0; i < IOPRIO_BE_NR; i++) {
4392                 if (cfqd->async_cfqq[0][i])
4393                         cfq_put_queue(cfqd->async_cfqq[0][i]);
4394                 if (cfqd->async_cfqq[1][i])
4395                         cfq_put_queue(cfqd->async_cfqq[1][i]);
4396         }
4397
4398         if (cfqd->async_idle_cfqq)
4399                 cfq_put_queue(cfqd->async_idle_cfqq);
4400 }
4401
4402 static void cfq_exit_queue(struct elevator_queue *e)
4403 {
4404         struct cfq_data *cfqd = e->elevator_data;
4405         struct request_queue *q = cfqd->queue;
4406
4407         cfq_shutdown_timer_wq(cfqd);
4408
4409         spin_lock_irq(q->queue_lock);
4410
4411         if (cfqd->active_queue)
4412                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4413
4414         cfq_put_async_queues(cfqd);
4415
4416         spin_unlock_irq(q->queue_lock);
4417
4418         cfq_shutdown_timer_wq(cfqd);
4419
4420 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4421         blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4422 #else
4423         kfree(cfqd->root_group);
4424 #endif
4425         kfree(cfqd);
4426 }
4427
4428 static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4429 {
4430         struct cfq_data *cfqd;
4431         struct blkcg_gq *blkg __maybe_unused;
4432         int i, ret;
4433         struct elevator_queue *eq;
4434
4435         eq = elevator_alloc(q, e);
4436         if (!eq)
4437                 return -ENOMEM;
4438
4439         cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
4440         if (!cfqd) {
4441                 kobject_put(&eq->kobj);
4442                 return -ENOMEM;
4443         }
4444         eq->elevator_data = cfqd;
4445
4446         cfqd->queue = q;
4447         spin_lock_irq(q->queue_lock);
4448         q->elevator = eq;
4449         spin_unlock_irq(q->queue_lock);
4450
4451         /* Init root service tree */
4452         cfqd->grp_service_tree = CFQ_RB_ROOT;
4453
4454         /* Init root group and prefer root group over other groups by default */
4455 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4456         ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4457         if (ret)
4458                 goto out_free;
4459
4460         cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4461 #else
4462         ret = -ENOMEM;
4463         cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4464                                         GFP_KERNEL, cfqd->queue->node);
4465         if (!cfqd->root_group)
4466                 goto out_free;
4467
4468         cfq_init_cfqg_base(cfqd->root_group);
4469 #endif
4470         cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
4471         cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
4472
4473         /*
4474          * Not strictly needed (since RB_ROOT just clears the node and we
4475          * zeroed cfqd on alloc), but better be safe in case someone decides
4476          * to add magic to the rb code
4477          */
4478         for (i = 0; i < CFQ_PRIO_LISTS; i++)
4479                 cfqd->prio_trees[i] = RB_ROOT;
4480
4481         /*
4482          * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
4483          * Grab a permanent reference to it, so that the normal code flow
4484          * will not attempt to free it.  oom_cfqq is linked to root_group
4485          * but shouldn't hold a reference as it'll never be unlinked.  Lose
4486          * the reference from linking right away.
4487          */
4488         cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4489         cfqd->oom_cfqq.ref++;
4490
4491         spin_lock_irq(q->queue_lock);
4492         cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4493         cfqg_put(cfqd->root_group);
4494         spin_unlock_irq(q->queue_lock);
4495
4496         init_timer(&cfqd->idle_slice_timer);
4497         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4498         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
4499
4500         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4501
4502         cfqd->cfq_quantum = cfq_quantum;
4503         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4504         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4505         cfqd->cfq_back_max = cfq_back_max;
4506         cfqd->cfq_back_penalty = cfq_back_penalty;
4507         cfqd->cfq_slice[0] = cfq_slice_async;
4508         cfqd->cfq_slice[1] = cfq_slice_sync;
4509         cfqd->cfq_target_latency = cfq_target_latency;
4510         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4511         cfqd->cfq_slice_idle = cfq_slice_idle;
4512         cfqd->cfq_group_idle = cfq_group_idle;
4513         cfqd->cfq_latency = 1;
4514         cfqd->hw_tag = -1;
4515         /*
4516          * we optimistically start assuming sync ops weren't delayed in last
4517          * second, in order to have larger depth for async operations.
4518          */
4519         cfqd->last_delayed_sync = jiffies - HZ;
4520         return 0;
4521
4522 out_free:
4523         kfree(cfqd);
4524         kobject_put(&eq->kobj);
4525         return ret;
4526 }
4527
4528 static void cfq_registered_queue(struct request_queue *q)
4529 {
4530         struct elevator_queue *e = q->elevator;
4531         struct cfq_data *cfqd = e->elevator_data;
4532
4533         /*
4534          * Default to IOPS mode with no idling for SSDs
4535          */
4536         if (blk_queue_nonrot(q))
4537                 cfqd->cfq_slice_idle = 0;
4538 }
4539
4540 /*
4541  * sysfs parts below -->
4542  */
4543 static ssize_t
4544 cfq_var_show(unsigned int var, char *page)
4545 {
4546         return sprintf(page, "%u\n", var);
4547 }
4548
4549 static ssize_t
4550 cfq_var_store(unsigned int *var, const char *page, size_t count)
4551 {
4552         char *p = (char *) page;
4553
4554         *var = simple_strtoul(p, &p, 10);
4555         return count;
4556 }
4557
4558 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
4559 static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
4560 {                                                                       \
4561         struct cfq_data *cfqd = e->elevator_data;                       \
4562         unsigned int __data = __VAR;                                    \
4563         if (__CONV)                                                     \
4564                 __data = jiffies_to_msecs(__data);                      \
4565         return cfq_var_show(__data, (page));                            \
4566 }
4567 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4568 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4569 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4570 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4571 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4572 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4573 SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4574 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4575 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4576 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4577 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4578 SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4579 #undef SHOW_FUNCTION
4580
4581 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
4582 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4583 {                                                                       \
4584         struct cfq_data *cfqd = e->elevator_data;                       \
4585         unsigned int __data;                                            \
4586         int ret = cfq_var_store(&__data, (page), count);                \
4587         if (__data < (MIN))                                             \
4588                 __data = (MIN);                                         \
4589         else if (__data > (MAX))                                        \
4590                 __data = (MAX);                                         \
4591         if (__CONV)                                                     \
4592                 *(__PTR) = msecs_to_jiffies(__data);                    \
4593         else                                                            \
4594                 *(__PTR) = __data;                                      \
4595         return ret;                                                     \
4596 }
4597 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4598 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4599                 UINT_MAX, 1);
4600 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4601                 UINT_MAX, 1);
4602 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4603 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4604                 UINT_MAX, 0);
4605 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4606 STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4607 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4608 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4609 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4610                 UINT_MAX, 0);
4611 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4612 STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4613 #undef STORE_FUNCTION
4614
4615 #define CFQ_ATTR(name) \
4616         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4617
4618 static struct elv_fs_entry cfq_attrs[] = {
4619         CFQ_ATTR(quantum),
4620         CFQ_ATTR(fifo_expire_sync),
4621         CFQ_ATTR(fifo_expire_async),
4622         CFQ_ATTR(back_seek_max),
4623         CFQ_ATTR(back_seek_penalty),
4624         CFQ_ATTR(slice_sync),
4625         CFQ_ATTR(slice_async),
4626         CFQ_ATTR(slice_async_rq),
4627         CFQ_ATTR(slice_idle),
4628         CFQ_ATTR(group_idle),
4629         CFQ_ATTR(low_latency),
4630         CFQ_ATTR(target_latency),
4631         __ATTR_NULL
4632 };
4633
4634 static struct elevator_type iosched_cfq = {
4635         .ops = {
4636                 .elevator_merge_fn =            cfq_merge,
4637                 .elevator_merged_fn =           cfq_merged_request,
4638                 .elevator_merge_req_fn =        cfq_merged_requests,
4639                 .elevator_allow_merge_fn =      cfq_allow_merge,
4640                 .elevator_bio_merged_fn =       cfq_bio_merged,
4641                 .elevator_dispatch_fn =         cfq_dispatch_requests,
4642                 .elevator_add_req_fn =          cfq_insert_request,
4643                 .elevator_activate_req_fn =     cfq_activate_request,
4644                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
4645                 .elevator_completed_req_fn =    cfq_completed_request,
4646                 .elevator_former_req_fn =       elv_rb_former_request,
4647                 .elevator_latter_req_fn =       elv_rb_latter_request,
4648                 .elevator_init_icq_fn =         cfq_init_icq,
4649                 .elevator_exit_icq_fn =         cfq_exit_icq,
4650                 .elevator_set_req_fn =          cfq_set_request,
4651                 .elevator_put_req_fn =          cfq_put_request,
4652                 .elevator_may_queue_fn =        cfq_may_queue,
4653                 .elevator_init_fn =             cfq_init_queue,
4654                 .elevator_exit_fn =             cfq_exit_queue,
4655                 .elevator_registered_fn =       cfq_registered_queue,
4656         },
4657         .icq_size       =       sizeof(struct cfq_io_cq),
4658         .icq_align      =       __alignof__(struct cfq_io_cq),
4659         .elevator_attrs =       cfq_attrs,
4660         .elevator_name  =       "cfq",
4661         .elevator_owner =       THIS_MODULE,
4662 };
4663
4664 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4665 static struct blkcg_policy blkcg_policy_cfq = {
4666         .pd_size                = sizeof(struct cfq_group),
4667         .cpd_size               = sizeof(struct cfq_group_data),
4668         .cftypes                = cfq_blkcg_files,
4669
4670         .cpd_init_fn            = cfq_cpd_init,
4671         .pd_init_fn             = cfq_pd_init,
4672         .pd_offline_fn          = cfq_pd_offline,
4673         .pd_reset_stats_fn      = cfq_pd_reset_stats,
4674 };
4675 #endif
4676
4677 static int __init cfq_init(void)
4678 {
4679         int ret;
4680
4681         /*
4682          * could be 0 on HZ < 1000 setups
4683          */
4684         if (!cfq_slice_async)
4685                 cfq_slice_async = 1;
4686         if (!cfq_slice_idle)
4687                 cfq_slice_idle = 1;
4688
4689 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4690         if (!cfq_group_idle)
4691                 cfq_group_idle = 1;
4692
4693         ret = blkcg_policy_register(&blkcg_policy_cfq);
4694         if (ret)
4695                 return ret;
4696 #else
4697         cfq_group_idle = 0;
4698 #endif
4699
4700         ret = -ENOMEM;
4701         cfq_pool = KMEM_CACHE(cfq_queue, 0);
4702         if (!cfq_pool)
4703                 goto err_pol_unreg;
4704
4705         ret = elv_register(&iosched_cfq);
4706         if (ret)
4707                 goto err_free_pool;
4708
4709         return 0;
4710
4711 err_free_pool:
4712         kmem_cache_destroy(cfq_pool);
4713 err_pol_unreg:
4714 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4715         blkcg_policy_unregister(&blkcg_policy_cfq);
4716 #endif
4717         return ret;
4718 }
4719
4720 static void __exit cfq_exit(void)
4721 {
4722 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4723         blkcg_policy_unregister(&blkcg_policy_cfq);
4724 #endif
4725         elv_unregister(&iosched_cfq);
4726         kmem_cache_destroy(cfq_pool);
4727 }
4728
4729 module_init(cfq_init);
4730 module_exit(cfq_exit);
4731
4732 MODULE_AUTHOR("Jens Axboe");
4733 MODULE_LICENSE("GPL");
4734 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");