1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished io
5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8  */
9 #include <linux/module.h>
10 #include <linux/slab.h>
11 #include <linux/blkdev.h>
12 #include <linux/elevator.h>
13 #include <linux/jiffies.h>
14 #include <linux/rbtree.h>
15 #include <linux/ioprio.h>
16 #include <linux/blktrace_api.h>
17 #include "cfq.h"
18
19 /*
20  * tunables
21  */
22 /* max queue in one round of service */
23 static const int cfq_quantum = 8;
24 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
25 /* maximum backwards seek, in KiB */
26 static const int cfq_back_max = 16 * 1024;
27 /* penalty of a backwards seek */
28 static const int cfq_back_penalty = 2;
29 static const int cfq_slice_sync = HZ / 10;
30 static int cfq_slice_async = HZ / 25;
31 static const int cfq_slice_async_rq = 2;
32 static int cfq_slice_idle = HZ / 125;
33 static int cfq_group_idle = HZ / 125;
34 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
35 static const int cfq_hist_divisor = 4;
36
37 /*
38  * offset from end of service tree
39  */
40 #define CFQ_IDLE_DELAY          (HZ / 5)
41
42 /*
43  * below this threshold, we consider thinktime immediate
44  */
45 #define CFQ_MIN_TT              (2)
46
47 #define CFQ_SLICE_SCALE         (5)
48 #define CFQ_HW_QUEUE_MIN        (5)
49 #define CFQ_SERVICE_SHIFT       12
50
51 #define CFQQ_SEEK_THR           (sector_t)(8 * 100)
52 #define CFQQ_CLOSE_THR          (sector_t)(8 * 1024)
53 #define CFQQ_SECT_THR_NONROT    (sector_t)(2 * 32)
54 #define CFQQ_SEEKY(cfqq)        (hweight32(cfqq->seek_history) > 32/8)
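/*
 * For scale (worked numbers, assuming seek_history collects one bit per
 * request, set when that request was farther than CFQQ_SEEK_THR from the
 * previous one): CFQQ_SEEK_THR is 8 * 100 sectors (~400 KiB), and
 * CFQQ_SEEKY() fires once more than 32/8 == 4 of the last 32 requests
 * were that far apart.
 */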
55
56 #define RQ_CIC(rq)              \
57         ((struct cfq_io_context *) (rq)->elevator_private[0])
58 #define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elevator_private[1])
59 #define RQ_CFQG(rq)             (struct cfq_group *) ((rq)->elevator_private[2])
60
61 static struct kmem_cache *cfq_pool;
62 static struct kmem_cache *cfq_ioc_pool;
63
64 static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
65 static struct completion *ioc_gone;
66 static DEFINE_SPINLOCK(ioc_gone_lock);
67
68 static DEFINE_SPINLOCK(cic_index_lock);
69 static DEFINE_IDA(cic_index_ida);
70
71 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
72 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
73 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
74
75 #define sample_valid(samples)   ((samples) > 80)
76 #define rb_entry_cfqg(node)     rb_entry((node), struct cfq_group, rb_node)
77
78 /*
79  * Most of our rbtree usage is for sorting with min extraction, so
80  * if we cache the leftmost node we don't have to walk down the tree
81  * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
82  * move this into the elevator for the rq sorting as well.
83  */
84 struct cfq_rb_root {
85         struct rb_root rb;
86         struct rb_node *left;
87         unsigned count;
88         unsigned total_weight;
89         u64 min_vdisktime;
90 };
91 #define CFQ_RB_ROOT     (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
92                         .count = 0, .min_vdisktime = 0, }
93
94 /*
95  * Per process-grouping structure
96  */
97 struct cfq_queue {
98         /* reference count */
99         int ref;
100         /* various state flags, see below */
101         unsigned int flags;
102         /* parent cfq_data */
103         struct cfq_data *cfqd;
104         /* service_tree member */
105         struct rb_node rb_node;
106         /* service_tree key */
107         unsigned long rb_key;
108         /* prio tree member */
109         struct rb_node p_node;
110         /* prio tree root we belong to, if any */
111         struct rb_root *p_root;
112         /* sorted list of pending requests */
113         struct rb_root sort_list;
114         /* if fifo isn't expired, next request to serve */
115         struct request *next_rq;
116         /* requests queued in sort_list */
117         int queued[2];
118         /* currently allocated requests */
119         int allocated[2];
120         /* fifo list of requests in sort_list */
121         struct list_head fifo;
122
123         /* time when queue got scheduled in to dispatch first request. */
124         unsigned long dispatch_start;
125         unsigned int allocated_slice;
126         unsigned int slice_dispatch;
127         /* time when first request from queue completed and slice started. */
128         unsigned long slice_start;
129         unsigned long slice_end;
130         long slice_resid;
131
132         /* pending metadata requests */
133         int meta_pending;
134         /* number of requests that are on the dispatch list or inside driver */
135         int dispatched;
136
137         /* io prio of this group */
138         unsigned short ioprio, org_ioprio;
139         unsigned short ioprio_class, org_ioprio_class;
140
141         pid_t pid;
142
143         u32 seek_history;
144         sector_t last_request_pos;
145
146         struct cfq_rb_root *service_tree;
147         struct cfq_queue *new_cfqq;
148         struct cfq_group *cfqg;
149         /* Number of sectors dispatched from queue in single dispatch round */
150         unsigned long nr_sectors;
151 };
152
153 /*
154  * First index in the service_trees.
155  * IDLE is handled separately; it is kept on its own service_tree_idle
156  */
157 enum wl_prio_t {
158         BE_WORKLOAD = 0,
159         RT_WORKLOAD = 1,
160         IDLE_WORKLOAD = 2,
161         CFQ_PRIO_NR,
162 };
163
164 /*
165  * Second index in the service_trees.
166  */
167 enum wl_type_t {
168         ASYNC_WORKLOAD = 0,
169         SYNC_NOIDLE_WORKLOAD = 1,
170         SYNC_WORKLOAD = 2
171 };
172
173 /* This is per cgroup per device grouping structure */
174 struct cfq_group {
175         /* group service_tree member */
176         struct rb_node rb_node;
177
178         /* group service_tree key */
179         u64 vdisktime;
180         unsigned int weight;
181
182         /* number of cfqq currently on this group */
183         int nr_cfqq;
184
185         /*
186          * Per group busy queues average. Useful for workload slice calc. We
187          * create the array for each prio class but at run time it is used
188          * only for the RT and BE classes; the slot for the IDLE class remains unused.
189          * This is primarily done to avoid confusion and a gcc warning.
190          */
191         unsigned int busy_queues_avg[CFQ_PRIO_NR];
192         /*
193          * rr lists of queues with requests. We maintain service trees for
194          * RT and BE classes. These trees are subdivided into subclasses
195          * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
196          * class there is no subclassification and all the cfq queues go on
197          * a single tree service_tree_idle.
198          * Counts are embedded in the cfq_rb_root
199          */
200         struct cfq_rb_root service_trees[2][3];
201         struct cfq_rb_root service_tree_idle;
202
203         unsigned long saved_workload_slice;
204         enum wl_type_t saved_workload;
205         enum wl_prio_t saved_serving_prio;
206         struct blkio_group blkg;
207 #ifdef CONFIG_CFQ_GROUP_IOSCHED
208         struct hlist_node cfqd_node;
209         int ref;
210 #endif
211         /* number of requests that are on the dispatch list or inside driver */
212         int dispatched;
213 };
214
215 /*
216  * Per block device queue structure
217  */
218 struct cfq_data {
219         struct request_queue *queue;
220         /* Root service tree for cfq_groups */
221         struct cfq_rb_root grp_service_tree;
222         struct cfq_group root_group;
223
224         /*
225          * The priority currently being served
226          */
227         enum wl_prio_t serving_prio;
228         enum wl_type_t serving_type;
229         unsigned long workload_expires;
230         struct cfq_group *serving_group;
231
232         /*
233          * Each priority tree is sorted by next_request position.  These
234          * trees are used when determining if two or more queues are
235          * interleaving requests (see cfq_close_cooperator).
236          */
237         struct rb_root prio_trees[CFQ_PRIO_LISTS];
238
239         unsigned int busy_queues;
240
241         int rq_in_driver;
242         int rq_in_flight[2];
243
244         /*
245          * queue-depth detection
246          */
247         int rq_queued;
248         int hw_tag;
249         /*
250          * hw_tag can be
250          * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
252          *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
253          *  0 => no NCQ
254          */
255         int hw_tag_est_depth;
256         unsigned int hw_tag_samples;
257
258         /*
259          * idle window management
260          */
261         struct timer_list idle_slice_timer;
262         struct work_struct unplug_work;
263
264         struct cfq_queue *active_queue;
265         struct cfq_io_context *active_cic;
266
267         /*
268          * async queue for each priority case
269          */
270         struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
271         struct cfq_queue *async_idle_cfqq;
272
273         sector_t last_position;
274
275         /*
276          * tunables, see top of file
277          */
278         unsigned int cfq_quantum;
279         unsigned int cfq_fifo_expire[2];
280         unsigned int cfq_back_penalty;
281         unsigned int cfq_back_max;
282         unsigned int cfq_slice[2];
283         unsigned int cfq_slice_async_rq;
284         unsigned int cfq_slice_idle;
285         unsigned int cfq_group_idle;
286         unsigned int cfq_latency;
287
288         unsigned int cic_index;
289         struct list_head cic_list;
290
291         /*
292          * Fallback dummy cfqq for extreme OOM conditions
293          */
294         struct cfq_queue oom_cfqq;
295
296         unsigned long last_delayed_sync;
297
298         /* List of cfq groups being managed on this device */
299         struct hlist_head cfqg_list;
300         struct rcu_head rcu;
301 };
302
303 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
304
305 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
306                                             enum wl_prio_t prio,
307                                             enum wl_type_t type)
308 {
309         if (!cfqg)
310                 return NULL;
311
312         if (prio == IDLE_WORKLOAD)
313                 return &cfqg->service_tree_idle;
314
315         return &cfqg->service_trees[prio][type];
316 }
317
318 enum cfqq_state_flags {
319         CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
320         CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
321         CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
322         CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
323         CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
324         CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
325         CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
326         CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
327         CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
328         CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
329         CFQ_CFQQ_FLAG_split_coop,       /* shared cfqq will be split */
330         CFQ_CFQQ_FLAG_deep,             /* sync cfqq experienced large depth */
331         CFQ_CFQQ_FLAG_wait_busy,        /* Waiting for next request */
332 };
333
334 #define CFQ_CFQQ_FNS(name)                                              \
335 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
336 {                                                                       \
337         (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
338 }                                                                       \
339 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
340 {                                                                       \
341         (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
342 }                                                                       \
343 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
344 {                                                                       \
345         return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
346 }
347
348 CFQ_CFQQ_FNS(on_rr);
349 CFQ_CFQQ_FNS(wait_request);
350 CFQ_CFQQ_FNS(must_dispatch);
351 CFQ_CFQQ_FNS(must_alloc_slice);
352 CFQ_CFQQ_FNS(fifo_expire);
353 CFQ_CFQQ_FNS(idle_window);
354 CFQ_CFQQ_FNS(prio_changed);
355 CFQ_CFQQ_FNS(slice_new);
356 CFQ_CFQQ_FNS(sync);
357 CFQ_CFQQ_FNS(coop);
358 CFQ_CFQQ_FNS(split_coop);
359 CFQ_CFQQ_FNS(deep);
360 CFQ_CFQQ_FNS(wait_busy);
361 #undef CFQ_CFQQ_FNS
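/*
 * As an illustration of the macro above, CFQ_CFQQ_FNS(sync) expands to
 * cfq_mark_cfqq_sync(), cfq_clear_cfqq_sync() and cfq_cfqq_sync(), which
 * set, clear and test the CFQ_CFQQ_FLAG_sync bit in cfqq->flags; every
 * other flag gets the same trio of helpers.
 */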
362
363 #ifdef CONFIG_CFQ_GROUP_IOSCHED
364 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
365         blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
366                         cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
367                         blkg_path(&(cfqq)->cfqg->blkg), ##args);
368
369 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)                          \
370         blk_add_trace_msg((cfqd)->queue, "%s " fmt,                     \
371                                 blkg_path(&(cfqg)->blkg), ##args);      \
372
373 #else
374 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
375         blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
376 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)          do {} while (0);
377 #endif
378 #define cfq_log(cfqd, fmt, args...)     \
379         blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
380
381 /* Traverses through cfq group service trees */
382 #define for_each_cfqg_st(cfqg, i, j, st) \
383         for (i = 0; i <= IDLE_WORKLOAD; i++) \
384                 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
385                         : &cfqg->service_tree_idle; \
386                         (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
387                         (i == IDLE_WORKLOAD && j == 0); \
388                         j++, st = i < IDLE_WORKLOAD ? \
389                         &cfqg->service_trees[i][j]: NULL) \
390
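/*
 * For reference, the bounds above make the loop visit seven trees per
 * group: service_trees[BE][ASYNC/SYNC_NOIDLE/SYNC],
 * service_trees[RT][ASYNC/SYNC_NOIDLE/SYNC], and finally
 * service_tree_idle once (i == IDLE_WORKLOAD, j == 0).
 */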
391
392 static inline bool iops_mode(struct cfq_data *cfqd)
393 {
394         /*
395          * If we are not idling on queues and it is an NCQ drive, requests
396          * execute in parallel and measuring time is not possible in most
397          * cases unless we drive shallower queue depths, which then becomes
398          * a performance bottleneck. In such cases switch to providing
399          * fairness in terms of number of IOs.
400          */
401         if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
402                 return true;
403         else
404                 return false;
405 }
406
407 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
408 {
409         if (cfq_class_idle(cfqq))
410                 return IDLE_WORKLOAD;
411         if (cfq_class_rt(cfqq))
412                 return RT_WORKLOAD;
413         return BE_WORKLOAD;
414 }
415
416
417 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
418 {
419         if (!cfq_cfqq_sync(cfqq))
420                 return ASYNC_WORKLOAD;
421         if (!cfq_cfqq_idle_window(cfqq))
422                 return SYNC_NOIDLE_WORKLOAD;
423         return SYNC_WORKLOAD;
424 }
425
426 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
427                                         struct cfq_data *cfqd,
428                                         struct cfq_group *cfqg)
429 {
430         if (wl == IDLE_WORKLOAD)
431                 return cfqg->service_tree_idle.count;
432
433         return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
434                 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
435                 + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
436 }
437
438 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
439                                         struct cfq_group *cfqg)
440 {
441         return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
442                 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
443 }
444
445 static void cfq_dispatch_insert(struct request_queue *, struct request *);
446 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
447                                        struct io_context *, gfp_t);
448 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
449                                                 struct io_context *);
450
451 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
452                                             bool is_sync)
453 {
454         return cic->cfqq[is_sync];
455 }
456
457 static inline void cic_set_cfqq(struct cfq_io_context *cic,
458                                 struct cfq_queue *cfqq, bool is_sync)
459 {
460         cic->cfqq[is_sync] = cfqq;
461 }
462
463 #define CIC_DEAD_KEY    1ul
464 #define CIC_DEAD_INDEX_SHIFT    1
465
466 static inline void *cfqd_dead_key(struct cfq_data *cfqd)
467 {
468         return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
469 }
470
471 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
472 {
473         struct cfq_data *cfqd = cic->key;
474
475         if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
476                 return NULL;
477
478         return cfqd;
479 }
480
481 /*
482  * We regard a request as SYNC, if it's either a read or has the SYNC bit
483  * set (in which case it could also be direct WRITE).
484  */
485 static inline bool cfq_bio_sync(struct bio *bio)
486 {
487         return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
488 }
489
490 /*
491  * Schedule a run of the queue if there are requests pending and nothing in
492  * the driver that will restart queueing.
493  */
494 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
495 {
496         if (cfqd->busy_queues) {
497                 cfq_log(cfqd, "schedule dispatch");
498                 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
499         }
500 }
501
502 /*
503  * Scale schedule slice based on io priority. Use the sync time slice only
504  * if a queue is marked sync and has sync io queued. A sync queue with async
505  * io only should not get the full sync slice length.
506  */
507 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
508                                  unsigned short prio)
509 {
510         const int base_slice = cfqd->cfq_slice[sync];
511
512         WARN_ON(prio >= IOPRIO_BE_NR);
513
514         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
515 }
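/*
 * Worked example, assuming HZ == 1000 and the default cfq_slice_sync of
 * HZ/10 (100 jiffies): base_slice/CFQ_SLICE_SCALE is 20, so an ioprio 0
 * sync queue gets 100 + 20 * 4 == 180 jiffies, ioprio 4 gets exactly 100,
 * and ioprio 7 gets 100 + 20 * -3 == 40 jiffies.
 */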
516
517 static inline int
518 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
519 {
520         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
521 }
522
523 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
524 {
525         u64 d = delta << CFQ_SERVICE_SHIFT;
526
527         d = d * BLKIO_WEIGHT_DEFAULT;
528         do_div(d, cfqg->weight);
529         return d;
530 }
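/*
 * The shift by CFQ_SERVICE_SHIFT just preserves precision across the
 * divide. For example, a charge of 10 jiffies against a group with twice
 * the default weight becomes (10 << 12) * BLKIO_WEIGHT_DEFAULT /
 * (2 * BLKIO_WEIGHT_DEFAULT) == 5 << 12 of vdisktime, so heavier groups
 * accrue vdisktime more slowly and get picked more often.
 */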
531
532 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
533 {
534         s64 delta = (s64)(vdisktime - min_vdisktime);
535         if (delta > 0)
536                 min_vdisktime = vdisktime;
537
538         return min_vdisktime;
539 }
540
541 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
542 {
543         s64 delta = (s64)(vdisktime - min_vdisktime);
544         if (delta < 0)
545                 min_vdisktime = vdisktime;
546
547         return min_vdisktime;
548 }
549
550 static void update_min_vdisktime(struct cfq_rb_root *st)
551 {
552         u64 vdisktime = st->min_vdisktime;
553         struct cfq_group *cfqg;
554
555         if (st->left) {
556                 cfqg = rb_entry_cfqg(st->left);
557                 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
558         }
559
560         st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
561 }
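/*
 * The u64 subtraction with an s64 cast above gives a wrap-safe comparison:
 * even if vdisktime overflows, the signed delta still orders two values
 * correctly as long as they are within 2^63 of each other, much like the
 * vruntime handling in CFS.
 */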
562
563 /*
564  * get the averaged number of queues of RT/BE priority.
565  * The average is updated with a formula that gives more weight to higher
566  * numbers, so it follows sudden increases quickly and decreases slowly.
567  */
568
569 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
570                                         struct cfq_group *cfqg, bool rt)
571 {
572         unsigned min_q, max_q;
573         unsigned mult  = cfq_hist_divisor - 1;
574         unsigned round = cfq_hist_divisor / 2;
575         unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
576
577         min_q = min(cfqg->busy_queues_avg[rt], busy);
578         max_q = max(cfqg->busy_queues_avg[rt], busy);
579         cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
580                 cfq_hist_divisor;
581         return cfqg->busy_queues_avg[rt];
582 }
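/*
 * Worked example with cfq_hist_divisor == 4 (mult == 3, round == 2): if
 * the stored average is 1 and busy jumps to 5, the new average is
 * (3*5 + 1 + 2)/4 == 4, so it rises almost immediately; if busy then
 * falls back to 1, the next update gives (3*4 + 1 + 2)/4 == 3, so the
 * average decays much more slowly.
 */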
583
584 static inline unsigned
585 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
586 {
587         struct cfq_rb_root *st = &cfqd->grp_service_tree;
588
589         return cfq_target_latency * cfqg->weight / st->total_weight;
590 }
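/*
 * Worked example: with the default cfq_target_latency of 300ms worth of
 * jiffies, a group holding weight 500 out of a total weight of 1500 on
 * the tree gets a group slice of 300 * 500 / 1500 == 100ms.
 */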
591
592 static inline unsigned
593 cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
594 {
595         unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
596         if (cfqd->cfq_latency) {
597                 /*
598                  * interested queues (we consider only the ones with the same
599                  * priority class in the cfq group)
600                  */
601                 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
602                                                 cfq_class_rt(cfqq));
603                 unsigned sync_slice = cfqd->cfq_slice[1];
604                 unsigned expect_latency = sync_slice * iq;
605                 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
606
607                 if (expect_latency > group_slice) {
608                         unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
609                         /* scale low_slice according to IO priority
610                          * and sync vs async */
611                         unsigned low_slice =
612                                 min(slice, base_low_slice * slice / sync_slice);
613                         /* the adapted slice value is scaled to fit all iqs
614                          * into the target latency */
615                         slice = max(slice * group_slice / expect_latency,
616                                     low_slice);
617                 }
618         }
619         return slice;
620 }
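/*
 * Worked example, assuming HZ == 1000 and the default tunables: with four
 * interested sync queues (iq == 4), expect_latency is 4 * 100 == 400
 * jiffies. If the group slice is 300 jiffies, a 100 jiffy base slice is
 * scaled to 100 * 300 / 400 == 75 jiffies; the floor low_slice is
 * min(100, 2 * 8 * 100 / 100) == 16 jiffies, so the result is 75.
 */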
621
622 static inline void
623 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
624 {
625         unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
626
627         cfqq->slice_start = jiffies;
628         cfqq->slice_end = jiffies + slice;
629         cfqq->allocated_slice = slice;
630         cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
631 }
632
633 /*
634  * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
635  * isn't valid until the first request from the dispatch is activated
636  * and the slice time set.
637  */
638 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
639 {
640         if (cfq_cfqq_slice_new(cfqq))
641                 return false;
642         if (time_before(jiffies, cfqq->slice_end))
643                 return false;
644
645         return true;
646 }
647
648 /*
649  * Lifted from AS - choose which of rq1 and rq2 is best served now.
650  * We choose the request that is closest to the head right now. Distance
651  * behind the head is penalized and only allowed to a certain extent.
652  */
653 static struct request *
654 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
655 {
656         sector_t s1, s2, d1 = 0, d2 = 0;
657         unsigned long back_max;
658 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
659 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
660         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
661
662         if (rq1 == NULL || rq1 == rq2)
663                 return rq2;
664         if (rq2 == NULL)
665                 return rq1;
666
667         if (rq_is_sync(rq1) && !rq_is_sync(rq2))
668                 return rq1;
669         else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
670                 return rq2;
671         if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
672                 return rq1;
673         else if ((rq2->cmd_flags & REQ_META) &&
674                  !(rq1->cmd_flags & REQ_META))
675                 return rq2;
676
677         s1 = blk_rq_pos(rq1);
678         s2 = blk_rq_pos(rq2);
679
680         /*
681          * by definition, 1KiB is 2 sectors
682          */
683         back_max = cfqd->cfq_back_max * 2;
684
685         /*
686          * Strict one way elevator _except_ in the case where we allow
687          * short backward seeks which are biased as twice the cost of a
688          * similar forward seek.
689          */
690         if (s1 >= last)
691                 d1 = s1 - last;
692         else if (s1 + back_max >= last)
693                 d1 = (last - s1) * cfqd->cfq_back_penalty;
694         else
695                 wrap |= CFQ_RQ1_WRAP;
696
697         if (s2 >= last)
698                 d2 = s2 - last;
699         else if (s2 + back_max >= last)
700                 d2 = (last - s2) * cfqd->cfq_back_penalty;
701         else
702                 wrap |= CFQ_RQ2_WRAP;
703
704         /* Found required data */
705
706         /*
707          * By doing switch() on the bit mask "wrap" we avoid having to
708          * check two variables for all permutations: --> faster!
709          */
710         switch (wrap) {
711         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
712                 if (d1 < d2)
713                         return rq1;
714                 else if (d2 < d1)
715                         return rq2;
716                 else {
717                         if (s1 >= s2)
718                                 return rq1;
719                         else
720                                 return rq2;
721                 }
722
723         case CFQ_RQ2_WRAP:
724                 return rq1;
725         case CFQ_RQ1_WRAP:
726                 return rq2;
727         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
728         default:
729                 /*
730                  * Since both rqs are wrapped,
731                  * start with the one that's further behind head
732                  * (--> only *one* back seek required),
733                  * since back seek takes more time than forward.
734                  */
735                 if (s1 <= s2)
736                         return rq1;
737                 else
738                         return rq2;
739         }
740 }
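/*
 * Worked example with the defaults (cfq_back_max == 16384 KiB, so
 * back_max == 32768 sectors, cfq_back_penalty == 2): with the head at
 * sector 1000, a request at sector 1200 has d1 == 200, while one at 900
 * sits 100 sectors behind and costs d2 == 100 * 2 == 200; on the tie the
 * request at the higher sector (the forward one) wins. A request more
 * than 32768 sectors behind the head is marked wrapped and loses to any
 * non-wrapped request.
 */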
741
742 /*
743  * The below is the leftmost-cache rbtree addon
744  */
745 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
746 {
747         /* Service tree is empty */
748         if (!root->count)
749                 return NULL;
750
751         if (!root->left)
752                 root->left = rb_first(&root->rb);
753
754         if (root->left)
755                 return rb_entry(root->left, struct cfq_queue, rb_node);
756
757         return NULL;
758 }
759
760 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
761 {
762         if (!root->left)
763                 root->left = rb_first(&root->rb);
764
765         if (root->left)
766                 return rb_entry_cfqg(root->left);
767
768         return NULL;
769 }
770
771 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
772 {
773         rb_erase(n, root);
774         RB_CLEAR_NODE(n);
775 }
776
777 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
778 {
779         if (root->left == n)
780                 root->left = NULL;
781         rb_erase_init(n, &root->rb);
782         --root->count;
783 }
784
785 /*
786  * would be nice to take fifo expire time into account as well
787  */
788 static struct request *
789 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
790                   struct request *last)
791 {
792         struct rb_node *rbnext = rb_next(&last->rb_node);
793         struct rb_node *rbprev = rb_prev(&last->rb_node);
794         struct request *next = NULL, *prev = NULL;
795
796         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
797
798         if (rbprev)
799                 prev = rb_entry_rq(rbprev);
800
801         if (rbnext)
802                 next = rb_entry_rq(rbnext);
803         else {
804                 rbnext = rb_first(&cfqq->sort_list);
805                 if (rbnext && rbnext != &last->rb_node)
806                         next = rb_entry_rq(rbnext);
807         }
808
809         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
810 }
811
812 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
813                                       struct cfq_queue *cfqq)
814 {
815         /*
816          * just an approximation, should be ok.
817          */
818         return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
819                        cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
820 }
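/*
 * Worked example, assuming HZ == 1000 and the defaults: cfq_prio_slice(cfqd,
 * 1, 0) is 180 jiffies (the largest sync slice), so a prio 4 sync queue
 * (slice 100) in a group with three queues gets (3 - 1) * (180 - 100) ==
 * 160 jiffies added to jiffies for its rb_key; queues with smaller slices
 * land further into the future.
 */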
821
822 static inline s64
823 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
824 {
825         return cfqg->vdisktime - st->min_vdisktime;
826 }
827
828 static void
829 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
830 {
831         struct rb_node **node = &st->rb.rb_node;
832         struct rb_node *parent = NULL;
833         struct cfq_group *__cfqg;
834         s64 key = cfqg_key(st, cfqg);
835         int left = 1;
836
837         while (*node != NULL) {
838                 parent = *node;
839                 __cfqg = rb_entry_cfqg(parent);
840
841                 if (key < cfqg_key(st, __cfqg))
842                         node = &parent->rb_left;
843                 else {
844                         node = &parent->rb_right;
845                         left = 0;
846                 }
847         }
848
849         if (left)
850                 st->left = &cfqg->rb_node;
851
852         rb_link_node(&cfqg->rb_node, parent, node);
853         rb_insert_color(&cfqg->rb_node, &st->rb);
854 }
855
856 static void
857 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
858 {
859         struct cfq_rb_root *st = &cfqd->grp_service_tree;
860         struct cfq_group *__cfqg;
861         struct rb_node *n;
862
863         cfqg->nr_cfqq++;
864         if (!RB_EMPTY_NODE(&cfqg->rb_node))
865                 return;
866
867         /*
868          * Currently put the group at the end. Later implement something
869          * so that groups get less vtime based on their weights, so that a
870          * group does not lose everything if it was not continuously backlogged.
871          */
872         n = rb_last(&st->rb);
873         if (n) {
874                 __cfqg = rb_entry_cfqg(n);
875                 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
876         } else
877                 cfqg->vdisktime = st->min_vdisktime;
878
879         __cfq_group_service_tree_add(st, cfqg);
880         st->total_weight += cfqg->weight;
881 }
882
883 static void
884 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
885 {
886         struct cfq_rb_root *st = &cfqd->grp_service_tree;
887
888         BUG_ON(cfqg->nr_cfqq < 1);
889         cfqg->nr_cfqq--;
890
891         /* If there are other cfq queues under this group, don't delete it */
892         if (cfqg->nr_cfqq)
893                 return;
894
895         cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
896         st->total_weight -= cfqg->weight;
897         if (!RB_EMPTY_NODE(&cfqg->rb_node))
898                 cfq_rb_erase(&cfqg->rb_node, st);
899         cfqg->saved_workload_slice = 0;
900         cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
901 }
902
903 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
904 {
905         unsigned int slice_used;
906
907         /*
908          * Queue got expired before even a single request completed or
909          * got expired immediately after first request completion.
910          */
911         if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
912                 /*
913                  * Also charge the seek time incurred to the group, otherwise
914                  * if there are multiple queues in the group, each can dispatch
915                  * a single request on seeky media and cause lots of seek time
916                  * and the group will never know it.
917                  */
918                 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
919                                         1);
920         } else {
921                 slice_used = jiffies - cfqq->slice_start;
922                 if (slice_used > cfqq->allocated_slice)
923                         slice_used = cfqq->allocated_slice;
924         }
925
926         return slice_used;
927 }
928
929 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
930                                 struct cfq_queue *cfqq)
931 {
932         struct cfq_rb_root *st = &cfqd->grp_service_tree;
933         unsigned int used_sl, charge;
934         int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
935                         - cfqg->service_tree_idle.count;
936
937         BUG_ON(nr_sync < 0);
938         used_sl = charge = cfq_cfqq_slice_usage(cfqq);
939
940         if (iops_mode(cfqd))
941                 charge = cfqq->slice_dispatch;
942         else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
943                 charge = cfqq->allocated_slice;
944
945         /* Can't update vdisktime while group is on service tree */
946         cfq_rb_erase(&cfqg->rb_node, st);
947         cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
948         __cfq_group_service_tree_add(st, cfqg);
949
950         /* This group is being expired. Save the context */
951         if (time_after(cfqd->workload_expires, jiffies)) {
952                 cfqg->saved_workload_slice = cfqd->workload_expires
953                                                 - jiffies;
954                 cfqg->saved_workload = cfqd->serving_type;
955                 cfqg->saved_serving_prio = cfqd->serving_prio;
956         } else
957                 cfqg->saved_workload_slice = 0;
958
959         cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
960                                         st->min_vdisktime);
961         cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
962                         " sect=%u", used_sl, cfqq->slice_dispatch, charge,
963                         iops_mode(cfqd), cfqq->nr_sectors);
964         cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
965         cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
966 }
967
968 #ifdef CONFIG_CFQ_GROUP_IOSCHED
969 static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
970 {
971         if (blkg)
972                 return container_of(blkg, struct cfq_group, blkg);
973         return NULL;
974 }
975
976 void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
977                                         unsigned int weight)
978 {
979         cfqg_of_blkg(blkg)->weight = weight;
980 }
981
982 static struct cfq_group *
983 cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
984 {
985         struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
986         struct cfq_group *cfqg = NULL;
987         void *key = cfqd;
988         int i, j;
989         struct cfq_rb_root *st;
990         struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
991         unsigned int major, minor;
992
993         cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
994         if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
995                 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
996                 cfqg->blkg.dev = MKDEV(major, minor);
997                 goto done;
998         }
999         if (cfqg || !create)
1000                 goto done;
1001
1002         cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1003         if (!cfqg)
1004                 goto done;
1005
1006         for_each_cfqg_st(cfqg, i, j, st)
1007                 *st = CFQ_RB_ROOT;
1008         RB_CLEAR_NODE(&cfqg->rb_node);
1009
1010         /*
1011          * Take the initial reference that will be released on destroy.
1012          * This can be thought of as a joint reference by cgroup and
1013          * elevator which will be dropped by either elevator exit
1014          * or cgroup deletion path depending on who is exiting first.
1015          */
1016         cfqg->ref = 1;
1017
1018         /*
1019          * Add group onto cgroup list. It might happen that bdi->dev is
1020          * not initialized yet. Initialize this new group without major
1021          * and minor info and this info will be filled in once a new thread
1022          * comes for IO. See code above.
1023          */
1024         if (bdi->dev) {
1025                 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1026                 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
1027                                         MKDEV(major, minor));
1028         } else
1029                 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
1030                                         0);
1031
1032         cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1033
1034         /* Add group on cfqd list */
1035         hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1036
1037 done:
1038         return cfqg;
1039 }
1040
1041 /*
1042  * Search for the cfq group the current task belongs to. If create = 1, then also
1043  * create the cfq group if it does not exist. request_queue lock must be held.
1044  */
1045 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1046 {
1047         struct cgroup *cgroup;
1048         struct cfq_group *cfqg = NULL;
1049
1050         rcu_read_lock();
1051         cgroup = task_cgroup(current, blkio_subsys_id);
1052         cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
1053         if (!cfqg && create)
1054                 cfqg = &cfqd->root_group;
1055         rcu_read_unlock();
1056         return cfqg;
1057 }
1058
1059 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1060 {
1061         cfqg->ref++;
1062         return cfqg;
1063 }
1064
1065 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1066 {
1067         /* Currently, all async queues are mapped to root group */
1068         if (!cfq_cfqq_sync(cfqq))
1069                 cfqg = &cfqq->cfqd->root_group;
1070
1071         cfqq->cfqg = cfqg;
1072         /* cfqq reference on cfqg */
1073         cfqq->cfqg->ref++;
1074 }
1075
1076 static void cfq_put_cfqg(struct cfq_group *cfqg)
1077 {
1078         struct cfq_rb_root *st;
1079         int i, j;
1080
1081         BUG_ON(cfqg->ref <= 0);
1082         cfqg->ref--;
1083         if (cfqg->ref)
1084                 return;
1085         for_each_cfqg_st(cfqg, i, j, st)
1086                 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1087         kfree(cfqg);
1088 }
1089
1090 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1091 {
1092         /* Something wrong if we are trying to remove same group twice */
1093         BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1094
1095         hlist_del_init(&cfqg->cfqd_node);
1096
1097         /*
1098          * Put the reference taken at the time of creation so that when all
1099          * queues are gone, group can be destroyed.
1100          */
1101         cfq_put_cfqg(cfqg);
1102 }
1103
1104 static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1105 {
1106         struct hlist_node *pos, *n;
1107         struct cfq_group *cfqg;
1108
1109         hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1110                 /*
1111                  * If cgroup removal path got to blk_group first and removed
1112                  * it from cgroup list, then it will take care of destroying
1113                  * cfqg also.
1114                  */
1115                 if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
1116                         cfq_destroy_cfqg(cfqd, cfqg);
1117         }
1118 }
1119
1120 /*
1121  * Blk cgroup controller notification saying that blkio_group object is being
1122  * delinked as the associated cgroup object is going away. That also means that
1123  * no new IO will come in this group. So get rid of this group as soon as
1124  * any pending IO in the group is finished.
1125  *
1126  * This function is called under rcu_read_lock(). key is the rcu protected
1127  * pointer. That means "key" is a valid cfq_data pointer as long as we hold
1128  * the rcu read lock.
1129  *
1130  * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1131  * it should not be NULL as even if the elevator was exiting, the cgroup
1132  * deletion path got to it first.
1133  */
1134 void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1135 {
1136         unsigned long  flags;
1137         struct cfq_data *cfqd = key;
1138
1139         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1140         cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1141         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1142 }
1143
1144 #else /* GROUP_IOSCHED */
1145 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1146 {
1147         return &cfqd->root_group;
1148 }
1149
1150 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1151 {
1152         return cfqg;
1153 }
1154
1155 static inline void
1156 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1157         cfqq->cfqg = cfqg;
1158 }
1159
1160 static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1161 static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1162
1163 #endif /* GROUP_IOSCHED */
1164
1165 /*
1166  * The cfqg->service_trees hold all pending cfq_queues that have
1167  * requests waiting to be processed. They are sorted in the order in
1168  * which we will service the queues.
1169  */
1170 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1171                                  bool add_front)
1172 {
1173         struct rb_node **p, *parent;
1174         struct cfq_queue *__cfqq;
1175         unsigned long rb_key;
1176         struct cfq_rb_root *service_tree;
1177         int left;
1178         int new_cfqq = 1;
1179         int group_changed = 0;
1180
1181         service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1182                                                 cfqq_type(cfqq));
1183         if (cfq_class_idle(cfqq)) {
1184                 rb_key = CFQ_IDLE_DELAY;
1185                 parent = rb_last(&service_tree->rb);
1186                 if (parent && parent != &cfqq->rb_node) {
1187                         __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1188                         rb_key += __cfqq->rb_key;
1189                 } else
1190                         rb_key += jiffies;
1191         } else if (!add_front) {
1192                 /*
1193                  * Get our rb key offset. Subtract any residual slice
1194                  * value carried from last service. A negative resid
1195                  * count indicates slice overrun, and this should position
1196                  * the next service time further away in the tree.
1197                  */
1198                 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1199                 rb_key -= cfqq->slice_resid;
1200                 cfqq->slice_resid = 0;
1201         } else {
1202                 rb_key = -HZ;
1203                 __cfqq = cfq_rb_first(service_tree);
1204                 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1205         }
1206
1207         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1208                 new_cfqq = 0;
1209                 /*
1210                  * same position, nothing more to do
1211                  */
1212                 if (rb_key == cfqq->rb_key &&
1213                     cfqq->service_tree == service_tree)
1214                         return;
1215
1216                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1217                 cfqq->service_tree = NULL;
1218         }
1219
1220         left = 1;
1221         parent = NULL;
1222         cfqq->service_tree = service_tree;
1223         p = &service_tree->rb.rb_node;
1224         while (*p) {
1225                 struct rb_node **n;
1226
1227                 parent = *p;
1228                 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1229
1230                 /*
1231                  * sort by key, which represents service time.
1232                  */
1233                 if (time_before(rb_key, __cfqq->rb_key))
1234                         n = &(*p)->rb_left;
1235                 else {
1236                         n = &(*p)->rb_right;
1237                         left = 0;
1238                 }
1239
1240                 p = n;
1241         }
1242
1243         if (left)
1244                 service_tree->left = &cfqq->rb_node;
1245
1246         cfqq->rb_key = rb_key;
1247         rb_link_node(&cfqq->rb_node, parent, p);
1248         rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1249         service_tree->count++;
1250         if ((add_front || !new_cfqq) && !group_changed)
1251                 return;
1252         cfq_group_service_tree_add(cfqd, cfqq->cfqg);
1253 }
1254
1255 static struct cfq_queue *
1256 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1257                      sector_t sector, struct rb_node **ret_parent,
1258                      struct rb_node ***rb_link)
1259 {
1260         struct rb_node **p, *parent;
1261         struct cfq_queue *cfqq = NULL;
1262
1263         parent = NULL;
1264         p = &root->rb_node;
1265         while (*p) {
1266                 struct rb_node **n;
1267
1268                 parent = *p;
1269                 cfqq = rb_entry(parent, struct cfq_queue, p_node);
1270
1271                 /*
1272                  * Sort strictly based on sector.  Smallest to the left,
1273                  * largest to the right.
1274                  */
1275                 if (sector > blk_rq_pos(cfqq->next_rq))
1276                         n = &(*p)->rb_right;
1277                 else if (sector < blk_rq_pos(cfqq->next_rq))
1278                         n = &(*p)->rb_left;
1279                 else
1280                         break;
1281                 p = n;
1282                 cfqq = NULL;
1283         }
1284
1285         *ret_parent = parent;
1286         if (rb_link)
1287                 *rb_link = p;
1288         return cfqq;
1289 }
1290
1291 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1292 {
1293         struct rb_node **p, *parent;
1294         struct cfq_queue *__cfqq;
1295
1296         if (cfqq->p_root) {
1297                 rb_erase(&cfqq->p_node, cfqq->p_root);
1298                 cfqq->p_root = NULL;
1299         }
1300
1301         if (cfq_class_idle(cfqq))
1302                 return;
1303         if (!cfqq->next_rq)
1304                 return;
1305
1306         cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1307         __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1308                                       blk_rq_pos(cfqq->next_rq), &parent, &p);
1309         if (!__cfqq) {
1310                 rb_link_node(&cfqq->p_node, parent, p);
1311                 rb_insert_color(&cfqq->p_node, cfqq->p_root);
1312         } else
1313                 cfqq->p_root = NULL;
1314 }
1315
1316 /*
1317  * Update cfqq's position in the service tree.
1318  */
1319 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1320 {
1321         /*
1322          * Resorting requires the cfqq to be on the RR list already.
1323          */
1324         if (cfq_cfqq_on_rr(cfqq)) {
1325                 cfq_service_tree_add(cfqd, cfqq, 0);
1326                 cfq_prio_tree_add(cfqd, cfqq);
1327         }
1328 }
1329
1330 /*
1331  * add to busy list of queues for service, trying to be fair in ordering
1332  * the pending list according to last request service
1333  */
1334 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1335 {
1336         cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
1337         BUG_ON(cfq_cfqq_on_rr(cfqq));
1338         cfq_mark_cfqq_on_rr(cfqq);
1339         cfqd->busy_queues++;
1340
1341         cfq_resort_rr_list(cfqd, cfqq);
1342 }
1343
1344 /*
1345  * Called when the cfqq no longer has requests pending, remove it from
1346  * the service tree.
1347  */
1348 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1349 {
1350         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
1351         BUG_ON(!cfq_cfqq_on_rr(cfqq));
1352         cfq_clear_cfqq_on_rr(cfqq);
1353
1354         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1355                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1356                 cfqq->service_tree = NULL;
1357         }
1358         if (cfqq->p_root) {
1359                 rb_erase(&cfqq->p_node, cfqq->p_root);
1360                 cfqq->p_root = NULL;
1361         }
1362
1363         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1364         BUG_ON(!cfqd->busy_queues);
1365         cfqd->busy_queues--;
1366 }
1367
1368 /*
1369  * rb tree support functions
1370  */
1371 static void cfq_del_rq_rb(struct request *rq)
1372 {
1373         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1374         const int sync = rq_is_sync(rq);
1375
1376         BUG_ON(!cfqq->queued[sync]);
1377         cfqq->queued[sync]--;
1378
1379         elv_rb_del(&cfqq->sort_list, rq);
1380
1381         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1382                 /*
1383                  * Queue will be deleted from service tree when we actually
1384                  * expire it later. Right now just remove it from prio tree
1385                  * as it is empty.
1386                  */
1387                 if (cfqq->p_root) {
1388                         rb_erase(&cfqq->p_node, cfqq->p_root);
1389                         cfqq->p_root = NULL;
1390                 }
1391         }
1392 }
1393
1394 static void cfq_add_rq_rb(struct request *rq)
1395 {
1396         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1397         struct cfq_data *cfqd = cfqq->cfqd;
1398         struct request *__alias, *prev;
1399
1400         cfqq->queued[rq_is_sync(rq)]++;
1401
1402         /*
1403          * looks a little odd, but the first insert might return an alias.
1404          * if that happens, put the alias on the dispatch list
1405          */
1406         while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
1407                 cfq_dispatch_insert(cfqd->queue, __alias);
1408
1409         if (!cfq_cfqq_on_rr(cfqq))
1410                 cfq_add_cfqq_rr(cfqd, cfqq);
1411
1412         /*
1413          * check if this request is a better next-serve candidate
1414          */
1415         prev = cfqq->next_rq;
1416         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1417
1418         /*
1419          * adjust priority tree position, if ->next_rq changes
1420          */
1421         if (prev != cfqq->next_rq)
1422                 cfq_prio_tree_add(cfqd, cfqq);
1423
1424         BUG_ON(!cfqq->next_rq);
1425 }
1426
1427 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1428 {
1429         elv_rb_del(&cfqq->sort_list, rq);
1430         cfqq->queued[rq_is_sync(rq)]--;
1431         cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1432                                         rq_data_dir(rq), rq_is_sync(rq));
1433         cfq_add_rq_rb(rq);
1434         cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1435                         &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1436                         rq_is_sync(rq));
1437 }
1438
1439 static struct request *
1440 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1441 {
1442         struct task_struct *tsk = current;
1443         struct cfq_io_context *cic;
1444         struct cfq_queue *cfqq;
1445
1446         cic = cfq_cic_lookup(cfqd, tsk->io_context);
1447         if (!cic)
1448                 return NULL;
1449
1450         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1451         if (cfqq) {
1452                 sector_t sector = bio->bi_sector + bio_sectors(bio);
1453
1454                 return elv_rb_find(&cfqq->sort_list, sector);
1455         }
1456
1457         return NULL;
1458 }
1459
1460 static void cfq_activate_request(struct request_queue *q, struct request *rq)
1461 {
1462         struct cfq_data *cfqd = q->elevator->elevator_data;
1463
1464         cfqd->rq_in_driver++;
1465         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1466                                                 cfqd->rq_in_driver);
1467
1468         cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1469 }
1470
1471 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1472 {
1473         struct cfq_data *cfqd = q->elevator->elevator_data;
1474
1475         WARN_ON(!cfqd->rq_in_driver);
1476         cfqd->rq_in_driver--;
1477         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1478                                                 cfqd->rq_in_driver);
1479 }
1480
1481 static void cfq_remove_request(struct request *rq)
1482 {
1483         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1484
1485         if (cfqq->next_rq == rq)
1486                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1487
1488         list_del_init(&rq->queuelist);
1489         cfq_del_rq_rb(rq);
1490
1491         cfqq->cfqd->rq_queued--;
1492         cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1493                                         rq_data_dir(rq), rq_is_sync(rq));
1494         if (rq->cmd_flags & REQ_META) {
1495                 WARN_ON(!cfqq->meta_pending);
1496                 cfqq->meta_pending--;
1497         }
1498 }
1499
1500 static int cfq_merge(struct request_queue *q, struct request **req,
1501                      struct bio *bio)
1502 {
1503         struct cfq_data *cfqd = q->elevator->elevator_data;
1504         struct request *__rq;
1505
1506         __rq = cfq_find_rq_fmerge(cfqd, bio);
1507         if (__rq && elv_rq_merge_ok(__rq, bio)) {
1508                 *req = __rq;
1509                 return ELEVATOR_FRONT_MERGE;
1510         }
1511
1512         return ELEVATOR_NO_MERGE;
1513 }
1514
1515 static void cfq_merged_request(struct request_queue *q, struct request *req,
1516                                int type)
1517 {
1518         if (type == ELEVATOR_FRONT_MERGE) {
1519                 struct cfq_queue *cfqq = RQ_CFQQ(req);
1520
1521                 cfq_reposition_rq_rb(cfqq, req);
1522         }
1523 }
1524
1525 static void cfq_bio_merged(struct request_queue *q, struct request *req,
1526                                 struct bio *bio)
1527 {
1528         cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
1529                                         bio_data_dir(bio), cfq_bio_sync(bio));
1530 }
1531
1532 static void
1533 cfq_merged_requests(struct request_queue *q, struct request *rq,
1534                     struct request *next)
1535 {
1536         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1537         /*
1538          * reposition in fifo if next is older than rq
1539          */
1540         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1541             time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1542                 list_move(&rq->queuelist, &next->queuelist);
1543                 rq_set_fifo_time(rq, rq_fifo_time(next));
1544         }
1545
1546         if (cfqq->next_rq == next)
1547                 cfqq->next_rq = rq;
1548         cfq_remove_request(next);
1549         cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
1550                                         rq_data_dir(next), rq_is_sync(next));
1551 }
1552
1553 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1554                            struct bio *bio)
1555 {
1556         struct cfq_data *cfqd = q->elevator->elevator_data;
1557         struct cfq_io_context *cic;
1558         struct cfq_queue *cfqq;
1559
1560         /*
1561          * Disallow merge of a sync bio into an async request.
1562          */
1563         if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1564                 return false;
1565
1566         /*
1567          * Lookup the cfqq that this bio will be queued with. Allow
1568          * merge only if rq is queued there.
1569          */
1570         cic = cfq_cic_lookup(cfqd, current->io_context);
1571         if (!cic)
1572                 return false;
1573
1574         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1575         return cfqq == RQ_CFQQ(rq);
1576 }
1577
1578 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1579 {
1580         del_timer(&cfqd->idle_slice_timer);
1581         cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1582 }
1583
1584 static void __cfq_set_active_queue(struct cfq_data *cfqd,
1585                                    struct cfq_queue *cfqq)
1586 {
1587         if (cfqq) {
1588                 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1589                                 cfqd->serving_prio, cfqd->serving_type);
1590                 cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1591                 cfqq->slice_start = 0;
1592                 cfqq->dispatch_start = jiffies;
1593                 cfqq->allocated_slice = 0;
1594                 cfqq->slice_end = 0;
1595                 cfqq->slice_dispatch = 0;
1596                 cfqq->nr_sectors = 0;
1597
1598                 cfq_clear_cfqq_wait_request(cfqq);
1599                 cfq_clear_cfqq_must_dispatch(cfqq);
1600                 cfq_clear_cfqq_must_alloc_slice(cfqq);
1601                 cfq_clear_cfqq_fifo_expire(cfqq);
1602                 cfq_mark_cfqq_slice_new(cfqq);
1603
1604                 cfq_del_timer(cfqd, cfqq);
1605         }
1606
1607         cfqd->active_queue = cfqq;
1608 }
1609
1610 /*
1611  * current cfqq expired its slice (or was too idle), select new one
1612  */
1613 static void
1614 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1615                     bool timed_out)
1616 {
1617         cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1618
1619         if (cfq_cfqq_wait_request(cfqq))
1620                 cfq_del_timer(cfqd, cfqq);
1621
1622         cfq_clear_cfqq_wait_request(cfqq);
1623         cfq_clear_cfqq_wait_busy(cfqq);
1624
1625         /*
1626          * If this cfqq is shared between multiple processes, check to
1627          * make sure that those processes are still issuing I/Os within
1628          * the mean seek distance.  If not, it may be time to break the
1629          * queues apart again.
1630          */
1631         if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
1632                 cfq_mark_cfqq_split_coop(cfqq);
1633
1634         /*
1635          * store what was left of this slice, if the queue idled/timed out
1636          */
1637         if (timed_out) {
1638                 if (cfq_cfqq_slice_new(cfqq))
1639                         cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
1640                 else
1641                         cfqq->slice_resid = cfqq->slice_end - jiffies;
1642                 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1643         }
1644
1645         cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1646
1647         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1648                 cfq_del_cfqq_rr(cfqd, cfqq);
1649
1650         cfq_resort_rr_list(cfqd, cfqq);
1651
1652         if (cfqq == cfqd->active_queue)
1653                 cfqd->active_queue = NULL;
1654
1655         if (cfqd->active_cic) {
1656                 put_io_context(cfqd->active_cic->ioc);
1657                 cfqd->active_cic = NULL;
1658         }
1659 }
1660
1661 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
1662 {
1663         struct cfq_queue *cfqq = cfqd->active_queue;
1664
1665         if (cfqq)
1666                 __cfq_slice_expired(cfqd, cfqq, timed_out);
1667 }
1668
1669 /*
1670  * Get next queue for service. Unless we have a queue preemption,
1671  * we'll simply select the first cfqq in the service tree.
1672  */
1673 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1674 {
1675         struct cfq_rb_root *service_tree =
1676                 service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1677                                         cfqd->serving_type);
1678
1679         if (!cfqd->rq_queued)
1680                 return NULL;
1681
1682         /* There is nothing to dispatch */
1683         if (!service_tree)
1684                 return NULL;
1685         if (RB_EMPTY_ROOT(&service_tree->rb))
1686                 return NULL;
1687         return cfq_rb_first(service_tree);
1688 }
1689
1690 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1691 {
1692         struct cfq_group *cfqg;
1693         struct cfq_queue *cfqq;
1694         int i, j;
1695         struct cfq_rb_root *st;
1696
1697         if (!cfqd->rq_queued)
1698                 return NULL;
1699
1700         cfqg = cfq_get_next_cfqg(cfqd);
1701         if (!cfqg)
1702                 return NULL;
1703
1704         for_each_cfqg_st(cfqg, i, j, st)
1705                 if ((cfqq = cfq_rb_first(st)) != NULL)
1706                         return cfqq;
1707         return NULL;
1708 }
1709
1710 /*
1711  * Get and set a new active queue for service.
1712  */
1713 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
1714                                               struct cfq_queue *cfqq)
1715 {
1716         if (!cfqq)
1717                 cfqq = cfq_get_next_queue(cfqd);
1718
1719         __cfq_set_active_queue(cfqd, cfqq);
1720         return cfqq;
1721 }
1722
1723 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
1724                                           struct request *rq)
1725 {
1726         if (blk_rq_pos(rq) >= cfqd->last_position)
1727                 return blk_rq_pos(rq) - cfqd->last_position;
1728         else
1729                 return cfqd->last_position - blk_rq_pos(rq);
1730 }
1731
1732 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1733                                struct request *rq)
1734 {
1735         return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
1736 }
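
/*
 * For reference: CFQQ_CLOSE_THR is 8 * 1024 = 8192 sectors, so with the
 * block layer's 512-byte sectors a request counts as "close" when it
 * starts within roughly 4MB of cfqd->last_position.
 */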
1737
1738 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1739                                     struct cfq_queue *cur_cfqq)
1740 {
1741         struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1742         struct rb_node *parent, *node;
1743         struct cfq_queue *__cfqq;
1744         sector_t sector = cfqd->last_position;
1745
1746         if (RB_EMPTY_ROOT(root))
1747                 return NULL;
1748
1749         /*
1750          * First, if we find a request starting at the end of the last
1751          * request, choose it.
1752          */
1753         __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1754         if (__cfqq)
1755                 return __cfqq;
1756
1757         /*
1758          * If the exact sector wasn't found, the parent of the NULL leaf
1759          * will contain the closest sector.
1760          */
1761         __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1762         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1763                 return __cfqq;
1764
1765         if (blk_rq_pos(__cfqq->next_rq) < sector)
1766                 node = rb_next(&__cfqq->p_node);
1767         else
1768                 node = rb_prev(&__cfqq->p_node);
1769         if (!node)
1770                 return NULL;
1771
1772         __cfqq = rb_entry(node, struct cfq_queue, p_node);
1773         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1774                 return __cfqq;
1775
1776         return NULL;
1777 }
1778
1779 /*
1780  * cfqd - obvious
1781  * cur_cfqq - passed in so that we don't decide that the current queue is
1782  *            closely cooperating with itself.
1783  *
1784  * So, basically we're assuming that cur_cfqq has dispatched at least
1785  * one request, and that cfqd->last_position reflects a position on the disk
1786  * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
1787  * assumption.
1788  */
1789 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1790                                               struct cfq_queue *cur_cfqq)
1791 {
1792         struct cfq_queue *cfqq;
1793
1794         if (cfq_class_idle(cur_cfqq))
1795                 return NULL;
1796         if (!cfq_cfqq_sync(cur_cfqq))
1797                 return NULL;
1798         if (CFQQ_SEEKY(cur_cfqq))
1799                 return NULL;
1800
1801         /*
1802          * Don't search priority tree if it's the only queue in the group.
1803          */
1804         if (cur_cfqq->cfqg->nr_cfqq == 1)
1805                 return NULL;
1806
1807         /*
1808          * We should notice if some of the queues are cooperating, e.g.
1809          * working closely on the same area of the disk. In that case,
1810          * we can group them together and avoid wasting time idling.
1811          */
1812         cfqq = cfqq_close(cfqd, cur_cfqq);
1813         if (!cfqq)
1814                 return NULL;
1815
1816         /* If new queue belongs to different cfq_group, don't choose it */
1817         if (cur_cfqq->cfqg != cfqq->cfqg)
1818                 return NULL;
1819
1820         /*
1821          * It only makes sense to merge sync queues.
1822          */
1823         if (!cfq_cfqq_sync(cfqq))
1824                 return NULL;
1825         if (CFQQ_SEEKY(cfqq))
1826                 return NULL;
1827
1828         /*
1829          * Do not merge queues of different priority classes
1830          */
1831         if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1832                 return NULL;
1833
1834         return cfqq;
1835 }
1836
1837 /*
1838  * Determine whether we should enforce idle window for this queue.
1839  */
1840
1841 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1842 {
1843         enum wl_prio_t prio = cfqq_prio(cfqq);
1844         struct cfq_rb_root *service_tree = cfqq->service_tree;
1845
1846         BUG_ON(!service_tree);
1847         BUG_ON(!service_tree->count);
1848
1849         if (!cfqd->cfq_slice_idle)
1850                 return false;
1851
1852         /* We never do for idle class queues. */
1853         if (prio == IDLE_WORKLOAD)
1854                 return false;
1855
1856         /* We do for queues that were marked with idle window flag. */
1857         if (cfq_cfqq_idle_window(cfqq) &&
1858            !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1859                 return true;
1860
1861         /*
1862          * Otherwise, we do only if they are the last ones
1863          * in their service tree.
1864          */
1865         if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1866                 return true;
1867         cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1868                         service_tree->count);
1869         return false;
1870 }
1871
1872 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1873 {
1874         struct cfq_queue *cfqq = cfqd->active_queue;
1875         struct cfq_io_context *cic;
1876         unsigned long sl, group_idle = 0;
1877
1878         /*
1879          * SSD device without seek penalty, disable idling. But only do so
1880          * for devices that support queuing, otherwise we still have a problem
1881          * with sync vs async workloads.
1882          */
1883         if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
1884                 return;
1885
1886         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
1887         WARN_ON(cfq_cfqq_slice_new(cfqq));
1888
1889         /*
1890          * idle is disabled, either manually or by past process history
1891          */
1892         if (!cfq_should_idle(cfqd, cfqq)) {
1893                 /* no queue idling. Check for group idling */
1894                 if (cfqd->cfq_group_idle)
1895                         group_idle = cfqd->cfq_group_idle;
1896                 else
1897                         return;
1898         }
1899
1900         /*
1901          * still active requests from this queue, don't idle
1902          */
1903         if (cfqq->dispatched)
1904                 return;
1905
1906         /*
1907          * task has exited, don't wait
1908          */
1909         cic = cfqd->active_cic;
1910         if (!cic || !atomic_read(&cic->ioc->nr_tasks))
1911                 return;
1912
1913         /*
1914          * If our average think time is larger than the remaining time
1915          * slice, then don't idle. This avoids overrunning the allotted
1916          * time slice.
1917          */
1918         if (sample_valid(cic->ttime_samples) &&
1919             (cfqq->slice_end - jiffies < cic->ttime_mean)) {
1920                 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
1921                                 cic->ttime_mean);
1922                 return;
1923         }
1924
1925         /* There are other queues in the group, don't do group idle */
1926         if (group_idle && cfqq->cfqg->nr_cfqq > 1)
1927                 return;
1928
1929         cfq_mark_cfqq_wait_request(cfqq);
1930
1931         if (group_idle)
1932                 sl = cfqd->cfq_group_idle;
1933         else
1934                 sl = cfqd->cfq_slice_idle;
1935
1936         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1937         cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
1938         cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
1939                         group_idle ? 1 : 0);
1940 }
1941
1942 /*
1943  * Move request from internal lists to the request queue dispatch list.
1944  */
1945 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1946 {
1947         struct cfq_data *cfqd = q->elevator->elevator_data;
1948         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1949
1950         cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
1951
1952         cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
1953         cfq_remove_request(rq);
1954         cfqq->dispatched++;
1955         (RQ_CFQG(rq))->dispatched++;
1956         elv_dispatch_sort(q, rq);
1957
1958         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1959         cfqq->nr_sectors += blk_rq_sectors(rq);
1960         cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
1961                                         rq_data_dir(rq), rq_is_sync(rq));
1962 }
1963
1964 /*
1965  * return expired entry, or NULL to just start from scratch in rbtree
1966  */
1967 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
1968 {
1969         struct request *rq = NULL;
1970
1971         if (cfq_cfqq_fifo_expire(cfqq))
1972                 return NULL;
1973
1974         cfq_mark_cfqq_fifo_expire(cfqq);
1975
1976         if (list_empty(&cfqq->fifo))
1977                 return NULL;
1978
1979         rq = rq_entry_fifo(cfqq->fifo.next);
1980         if (time_before(jiffies, rq_fifo_time(rq)))
1981                 rq = NULL;
1982
1983         cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
1984         return rq;
1985 }
1986
1987 static inline int
1988 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1989 {
1990         const int base_rq = cfqd->cfq_slice_async_rq;
1991
1992         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1993
1994         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1995 }
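
/*
 * Worked example, assuming the default cfq_slice_async_rq of 2 and
 * CFQ_PRIO_LISTS == 8: an ioprio 0 queue is allowed up to
 * 2 * (2 + 2 * 7) = 32 dispatches per slice, while ioprio 7 gets only
 * 2 * (2 + 2 * 0) = 4.
 */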
1996
1997 /*
1998  * Must be called with the queue_lock held.
1999  */
2000 static int cfqq_process_refs(struct cfq_queue *cfqq)
2001 {
2002         int process_refs, io_refs;
2003
2004         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2005         process_refs = cfqq->ref - io_refs;
2006         BUG_ON(process_refs < 0);
2007         return process_refs;
2008 }
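
/*
 * Illustration: cfqq->ref counts process references plus one reference
 * per allocated request, so a queue with ref == 5 and 3 requests in
 * allocated[] has 5 - 3 = 2 process references.
 */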
2009
2010 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2011 {
2012         int process_refs, new_process_refs;
2013         struct cfq_queue *__cfqq;
2014
2015         /*
2016          * If there are no process references on the new_cfqq, then it is
2017          * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2018          * chain may have dropped their last reference (not just their
2019          * last process reference).
2020          */
2021         if (!cfqq_process_refs(new_cfqq))
2022                 return;
2023
2024         /* Avoid a circular list and skip interim queue merges */
2025         while ((__cfqq = new_cfqq->new_cfqq)) {
2026                 if (__cfqq == cfqq)
2027                         return;
2028                 new_cfqq = __cfqq;
2029         }
2030
2031         process_refs = cfqq_process_refs(cfqq);
2032         new_process_refs = cfqq_process_refs(new_cfqq);
2033         /*
2034          * If the process for the cfqq has gone away, there is no
2035          * sense in merging the queues.
2036          */
2037         if (process_refs == 0 || new_process_refs == 0)
2038                 return;
2039
2040         /*
2041          * Merge in the direction of the lesser amount of work.
2042          */
2043         if (new_process_refs >= process_refs) {
2044                 cfqq->new_cfqq = new_cfqq;
2045                 new_cfqq->ref += process_refs;
2046         } else {
2047                 new_cfqq->new_cfqq = cfqq;
2048                 cfqq->ref += new_process_refs;
2049         }
2050 }
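
/*
 * Example of the direction choice above: if cfqq has 3 process refs and
 * new_cfqq only 1, the else branch points new_cfqq->new_cfqq at cfqq and
 * bumps cfqq->ref by 1, so the queue with fewer users is the one that
 * gets redirected (and eventually merged) into the busier queue.
 */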
2051
2052 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2053                                 struct cfq_group *cfqg, enum wl_prio_t prio)
2054 {
2055         struct cfq_queue *queue;
2056         int i;
2057         bool key_valid = false;
2058         unsigned long lowest_key = 0;
2059         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2060
2061         for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2062                 /* select the one with lowest rb_key */
2063                 queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
2064                 if (queue &&
2065                     (!key_valid || time_before(queue->rb_key, lowest_key))) {
2066                         lowest_key = queue->rb_key;
2067                         cur_best = i;
2068                         key_valid = true;
2069                 }
2070         }
2071
2072         return cur_best;
2073 }
2074
2075 static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2076 {
2077         unsigned slice;
2078         unsigned count;
2079         struct cfq_rb_root *st;
2080         unsigned group_slice;
2081         enum wl_prio_t original_prio = cfqd->serving_prio;
2082
2083         /* Choose next priority. RT > BE > IDLE */
2084         if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2085                 cfqd->serving_prio = RT_WORKLOAD;
2086         else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2087                 cfqd->serving_prio = BE_WORKLOAD;
2088         else {
2089                 cfqd->serving_prio = IDLE_WORKLOAD;
2090                 cfqd->workload_expires = jiffies + 1;
2091                 return;
2092         }
2093
2094         if (original_prio != cfqd->serving_prio)
2095                 goto new_workload;
2096
2097         /*
2098          * For RT and BE, we also have to choose the type
2099          * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2100          * expiration time.
2101          */
2102         st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2103         count = st->count;
2104
2105         /*
2106          * check workload expiration, and that we still have other queues ready
2107          */
2108         if (count && !time_after(jiffies, cfqd->workload_expires))
2109                 return;
2110
2111 new_workload:
2112         /* otherwise select new workload type */
2113         cfqd->serving_type =
2114                 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2115         st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2116         count = st->count;
2117
2118         /*
2119          * the workload slice is computed as a fraction of target latency
2120          * proportional to the number of queues in that workload, over
2121          * all the queues in the same priority class
2122          */
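        /*
         * Rough example, assuming a single group so that group_slice is
         * close to cfq_target_latency (300ms): if the chosen workload
         * tree holds 2 of the 6 busy queues of this priority class, the
         * computation below gives slice = 300 * 2 / 6 = 100ms, before
         * the async scaling / sync minimum adjustments further down.
         */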
2123         group_slice = cfq_group_slice(cfqd, cfqg);
2124
2125         slice = group_slice * count /
2126                 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2127                       cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2128
2129         if (cfqd->serving_type == ASYNC_WORKLOAD) {
2130                 unsigned int tmp;
2131
2132                 /*
2133                  * Async queues are currently system wide. Just taking
2134                  * proportion of queues within the same group will lead to a
2135                  * higher async ratio system wide, as generally the root group
2136                  * is going to have a higher weight. A more accurate approach
2137                  * would be to calculate the system wide async/sync ratio.
2138                  */
2139                 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2140                 tmp = tmp/cfqd->busy_queues;
2141                 slice = min_t(unsigned, slice, tmp);
2142
2143                 /* async workload slice is scaled down according to
2144                  * the sync/async slice ratio. */
2145                 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2146         } else
2147                 /* sync workload slice is at least 2 * cfq_slice_idle */
2148                 slice = max(slice, 2 * cfqd->cfq_slice_idle);
2149
2150         slice = max_t(unsigned, slice, CFQ_MIN_TT);
2151         cfq_log(cfqd, "workload slice:%d", slice);
2152         cfqd->workload_expires = jiffies + slice;
2153 }
2154
2155 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2156 {
2157         struct cfq_rb_root *st = &cfqd->grp_service_tree;
2158         struct cfq_group *cfqg;
2159
2160         if (RB_EMPTY_ROOT(&st->rb))
2161                 return NULL;
2162         cfqg = cfq_rb_first_group(st);
2163         update_min_vdisktime(st);
2164         return cfqg;
2165 }
2166
2167 static void cfq_choose_cfqg(struct cfq_data *cfqd)
2168 {
2169         struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2170
2171         cfqd->serving_group = cfqg;
2172
2173         /* Restore the workload type data */
2174         if (cfqg->saved_workload_slice) {
2175                 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2176                 cfqd->serving_type = cfqg->saved_workload;
2177                 cfqd->serving_prio = cfqg->saved_serving_prio;
2178         } else
2179                 cfqd->workload_expires = jiffies - 1;
2180
2181         choose_service_tree(cfqd, cfqg);
2182 }
2183
2184 /*
2185  * Select a queue for service. If we have a current active queue,
2186  * check whether to continue servicing it, or retrieve and set a new one.
2187  */
2188 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2189 {
2190         struct cfq_queue *cfqq, *new_cfqq = NULL;
2191
2192         cfqq = cfqd->active_queue;
2193         if (!cfqq)
2194                 goto new_queue;
2195
2196         if (!cfqd->rq_queued)
2197                 return NULL;
2198
2199         /*
2200          * We were waiting for group to get backlogged. Expire the queue
2201          */
2202         if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2203                 goto expire;
2204
2205         /*
2206          * The active queue has run out of time, expire it and select new.
2207          */
2208         if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2209                 /*
2210                  * If the slice had not expired at the completion of the last
2211                  * request, we might not have turned on the wait_busy flag. Don't
2212                  * expire the queue yet. Allow the group to get backlogged.
2213                  *
2214                  * The very fact that we have used the slice means we
2215                  * have been idling all along on this queue and it should be
2216                  * ok to wait for this request to complete.
2217                  */
2218                 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2219                     && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2220                         cfqq = NULL;
2221                         goto keep_queue;
2222                 } else
2223                         goto check_group_idle;
2224         }
2225
2226         /*
2227          * The active queue has requests and isn't expired, allow it to
2228          * dispatch.
2229          */
2230         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2231                 goto keep_queue;
2232
2233         /*
2234          * If another queue has a request waiting within our mean seek
2235          * distance, let it run.  The expire code will check for close
2236          * cooperators and put the close queue at the front of the service
2237          * tree.  If possible, merge the expiring queue with the new cfqq.
2238          */
2239         new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2240         if (new_cfqq) {
2241                 if (!cfqq->new_cfqq)
2242                         cfq_setup_merge(cfqq, new_cfqq);
2243                 goto expire;
2244         }
2245
2246         /*
2247          * No requests pending. If the active queue still has requests in
2248          * flight or is idling for a new request, allow either of these
2249          * conditions to happen (or time out) before selecting a new queue.
2250          */
2251         if (timer_pending(&cfqd->idle_slice_timer)) {
2252                 cfqq = NULL;
2253                 goto keep_queue;
2254         }
2255
2256         /*
2257          * This is a deep seek queue, but the device is much faster than
2258          * the queue can deliver; don't idle.
2259          */
2260         if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2261             (cfq_cfqq_slice_new(cfqq) ||
2262             (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2263                 cfq_clear_cfqq_deep(cfqq);
2264                 cfq_clear_cfqq_idle_window(cfqq);
2265         }
2266
2267         if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2268                 cfqq = NULL;
2269                 goto keep_queue;
2270         }
2271
2272         /*
2273          * If group idle is enabled and there are requests dispatched from
2274          * this group, wait for requests to complete.
2275          */
2276 check_group_idle:
2277         if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
2278             && cfqq->cfqg->dispatched) {
2279                 cfqq = NULL;
2280                 goto keep_queue;
2281         }
2282
2283 expire:
2284         cfq_slice_expired(cfqd, 0);
2285 new_queue:
2286         /*
2287          * Current queue expired. Check if we have to switch to a new
2288          * service tree
2289          */
2290         if (!new_cfqq)
2291                 cfq_choose_cfqg(cfqd);
2292
2293         cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2294 keep_queue:
2295         return cfqq;
2296 }
2297
2298 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2299 {
2300         int dispatched = 0;
2301
2302         while (cfqq->next_rq) {
2303                 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2304                 dispatched++;
2305         }
2306
2307         BUG_ON(!list_empty(&cfqq->fifo));
2308
2309         /* By default cfqq is not expired if it is empty. Do it explicitly */
2310         __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2311         return dispatched;
2312 }
2313
2314 /*
2315  * Drain our current requests. Used for barriers and when switching
2316  * io schedulers on-the-fly.
2317  */
2318 static int cfq_forced_dispatch(struct cfq_data *cfqd)
2319 {
2320         struct cfq_queue *cfqq;
2321         int dispatched = 0;
2322
2323         /* Expire the timeslice of the current active queue first */
2324         cfq_slice_expired(cfqd, 0);
2325         while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
2326                 __cfq_set_active_queue(cfqd, cfqq);
2327                 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2328         }
2329
2330         BUG_ON(cfqd->busy_queues);
2331
2332         cfq_log(cfqd, "forced_dispatch=%d", dispatched);
2333         return dispatched;
2334 }
2335
2336 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2337         struct cfq_queue *cfqq)
2338 {
2339         /* the queue hasn't finished any request, can't estimate */
2340         if (cfq_cfqq_slice_new(cfqq))
2341                 return true;
2342         if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2343                 cfqq->slice_end))
2344                 return true;
2345
2346         return false;
2347 }
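
/*
 * Example, assuming HZ == 1000 so the default cfq_slice_idle is 8ms: with
 * 3 requests already dispatched, the slice is estimated to be "used soon"
 * once less than 3 * 8 = 24ms of it remain.
 */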
2348
2349 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2350 {
2351         unsigned int max_dispatch;
2352
2353         /*
2354          * Drain async requests before we start sync IO
2355          */
2356         if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
2357                 return false;
2358
2359         /*
2360          * If this is an async queue and we have sync IO in flight, let it wait
2361          */
2362         if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
2363                 return false;
2364
2365         max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
2366         if (cfq_class_idle(cfqq))
2367                 max_dispatch = 1;
2368
2369         /*
2370          * Does this cfqq already have too much IO in flight?
2371          */
2372         if (cfqq->dispatched >= max_dispatch) {
2373                 /*
2374                  * idle queue must always only have a single IO in flight
2375                  */
2376                 if (cfq_class_idle(cfqq))
2377                         return false;
2378
2379                 /*
2380                  * We have other queues, don't allow more IO from this one
2381                  */
2382                 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
2383                         return false;
2384
2385                 /*
2386                  * Sole queue user, no limit
2387                  */
2388                 if (cfqd->busy_queues == 1)
2389                         max_dispatch = -1;
2390                 else
2391                         /*
2392                          * Normally we start throttling cfqq when cfq_quantum/2
2393                          * requests have been dispatched. But we can drive
2394                          * deeper queue depths at the beginning of the slice,
2395                          * subject to the upper limit of cfq_quantum.
2396                          */
2397                         max_dispatch = cfqd->cfq_quantum;
2398         }
2399
2400         /*
2401          * Async queues must wait a bit before being allowed dispatch.
2402          * We also ramp up the dispatch depth gradually for async IO,
2403          * based on the last sync IO we serviced
2404          */
2405         if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
2406                 unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
2407                 unsigned int depth;
2408
2409                 depth = last_sync / cfqd->cfq_slice[1];
2410                 if (!depth && !cfqq->dispatched)
2411                         depth = 1;
2412                 if (depth < max_dispatch)
2413                         max_dispatch = depth;
2414         }
2415
2416         /*
2417          * If we're below the current max, allow a dispatch
2418          */
2419         return cfqq->dispatched < max_dispatch;
2420 }
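
/*
 * Illustration of the async ramp-up above, assuming HZ == 1000 and the
 * default 100ms sync slice (cfq_slice[1]): if the last delayed sync IO
 * was seen 250ms ago, depth = 250 / 100 = 2, so an async queue is held
 * to two requests in flight; right after sync IO the computed depth is 0
 * and at most a single async request is allowed out.
 */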
2421
2422 /*
2423  * Dispatch a request from cfqq, moving it to the request queue
2424  * dispatch list.
2425  */
2426 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2427 {
2428         struct request *rq;
2429
2430         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
2431
2432         if (!cfq_may_dispatch(cfqd, cfqq))
2433                 return false;
2434
2435         /*
2436          * follow expired path, else get first next available
2437          */
2438         rq = cfq_check_fifo(cfqq);
2439         if (!rq)
2440                 rq = cfqq->next_rq;
2441
2442         /*
2443          * insert request into driver dispatch list
2444          */
2445         cfq_dispatch_insert(cfqd->queue, rq);
2446
2447         if (!cfqd->active_cic) {
2448                 struct cfq_io_context *cic = RQ_CIC(rq);
2449
2450                 atomic_long_inc(&cic->ioc->refcount);
2451                 cfqd->active_cic = cic;
2452         }
2453
2454         return true;
2455 }
2456
2457 /*
2458  * Find the cfqq that we need to service and move a request from that to the
2459  * dispatch list
2460  */
2461 static int cfq_dispatch_requests(struct request_queue *q, int force)
2462 {
2463         struct cfq_data *cfqd = q->elevator->elevator_data;
2464         struct cfq_queue *cfqq;
2465
2466         if (!cfqd->busy_queues)
2467                 return 0;
2468
2469         if (unlikely(force))
2470                 return cfq_forced_dispatch(cfqd);
2471
2472         cfqq = cfq_select_queue(cfqd);
2473         if (!cfqq)
2474                 return 0;
2475
2476         /*
2477          * Dispatch a request from this cfqq, if it is allowed
2478          */
2479         if (!cfq_dispatch_request(cfqd, cfqq))
2480                 return 0;
2481
2482         cfqq->slice_dispatch++;
2483         cfq_clear_cfqq_must_dispatch(cfqq);
2484
2485         /*
2486          * expire an async queue immediately if it has used up its slice. idle
2487          * queues always expire after 1 dispatch round.
2488          */
2489         if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
2490             cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
2491             cfq_class_idle(cfqq))) {
2492                 cfqq->slice_end = jiffies + 1;
2493                 cfq_slice_expired(cfqd, 0);
2494         }
2495
2496         cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2497         return 1;
2498 }
2499
2500 /*
2501  * task holds one reference to the queue, dropped when task exits. each rq
2502  * in-flight on this queue also holds a reference, dropped when rq is freed.
2503  *
2504  * Each cfq queue took a reference on the parent group. Drop it now.
2505  * queue lock must be held here.
2506  */
2507 static void cfq_put_queue(struct cfq_queue *cfqq)
2508 {
2509         struct cfq_data *cfqd = cfqq->cfqd;
2510         struct cfq_group *cfqg;
2511
2512         BUG_ON(cfqq->ref <= 0);
2513
2514         cfqq->ref--;
2515         if (cfqq->ref)
2516                 return;
2517
2518         cfq_log_cfqq(cfqd, cfqq, "put_queue");
2519         BUG_ON(rb_first(&cfqq->sort_list));
2520         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2521         cfqg = cfqq->cfqg;
2522
2523         if (unlikely(cfqd->active_queue == cfqq)) {
2524                 __cfq_slice_expired(cfqd, cfqq, 0);
2525                 cfq_schedule_dispatch(cfqd);
2526         }
2527
2528         BUG_ON(cfq_cfqq_on_rr(cfqq));
2529         kmem_cache_free(cfq_pool, cfqq);
2530         cfq_put_cfqg(cfqg);
2531 }
2532
2533 /*
2534  * Must always be called with the rcu_read_lock() held
2535  */
2536 static void
2537 __call_for_each_cic(struct io_context *ioc,
2538                     void (*func)(struct io_context *, struct cfq_io_context *))
2539 {
2540         struct cfq_io_context *cic;
2541         struct hlist_node *n;
2542
2543         hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2544                 func(ioc, cic);
2545 }
2546
2547 /*
2548  * Call func for each cic attached to this ioc.
2549  */
2550 static void
2551 call_for_each_cic(struct io_context *ioc,
2552                   void (*func)(struct io_context *, struct cfq_io_context *))
2553 {
2554         rcu_read_lock();
2555         __call_for_each_cic(ioc, func);
2556         rcu_read_unlock();
2557 }
2558
2559 static void cfq_cic_free_rcu(struct rcu_head *head)
2560 {
2561         struct cfq_io_context *cic;
2562
2563         cic = container_of(head, struct cfq_io_context, rcu_head);
2564
2565         kmem_cache_free(cfq_ioc_pool, cic);
2566         elv_ioc_count_dec(cfq_ioc_count);
2567
2568         if (ioc_gone) {
2569                 /*
2570                  * CFQ scheduler is exiting, grab exit lock and check
2571                  * the pending io context count. If it hits zero,
2572                  * complete ioc_gone and set it back to NULL
2573                  */
2574                 spin_lock(&ioc_gone_lock);
2575                 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2576                         complete(ioc_gone);
2577                         ioc_gone = NULL;
2578                 }
2579                 spin_unlock(&ioc_gone_lock);
2580         }
2581 }
2582
2583 static void cfq_cic_free(struct cfq_io_context *cic)
2584 {
2585         call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2586 }
2587
2588 static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2589 {
2590         unsigned long flags;
2591         unsigned long dead_key = (unsigned long) cic->key;
2592
2593         BUG_ON(!(dead_key & CIC_DEAD_KEY));
2594
2595         spin_lock_irqsave(&ioc->lock, flags);
2596         radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
2597         hlist_del_rcu(&cic->cic_list);
2598         spin_unlock_irqrestore(&ioc->lock, flags);
2599
2600         cfq_cic_free(cic);
2601 }
2602
2603 /*
2604  * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2605  * There are only two callers of this: ->dtor(), which is called with the
2606  * rcu_read_lock() held, and ->trim(), which is called with the task lock held.
2607  */
2608 static void cfq_free_io_context(struct io_context *ioc)
2609 {
2610         /*
2611          * ioc->refcount is zero here, or we are called from elv_unregister(),
2612          * so no more cic's are allowed to be linked into this ioc.  So it
2613          * should be ok to iterate over the known list, we will see all cic's
2614          * since no new ones are added.
2615          */
2616         __call_for_each_cic(ioc, cic_free_func);
2617 }
2618
2619 static void cfq_put_cooperator(struct cfq_queue *cfqq)
2620 {
2621         struct cfq_queue *__cfqq, *next;
2622
2623         /*
2624          * If this queue was scheduled to merge with another queue, be
2625          * sure to drop the reference taken on that queue (and others in
2626          * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
2627          */
2628         __cfqq = cfqq->new_cfqq;
2629         while (__cfqq) {
2630                 if (__cfqq == cfqq) {
2631                         WARN(1, "cfqq->new_cfqq loop detected\n");
2632                         break;
2633                 }
2634                 next = __cfqq->new_cfqq;
2635                 cfq_put_queue(__cfqq);
2636                 __cfqq = next;
2637         }
2638 }
2639
2640 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2641 {
2642         if (unlikely(cfqq == cfqd->active_queue)) {
2643                 __cfq_slice_expired(cfqd, cfqq, 0);
2644                 cfq_schedule_dispatch(cfqd);
2645         }
2646
2647         cfq_put_cooperator(cfqq);
2648
2649         cfq_put_queue(cfqq);
2650 }
2651
2652 static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2653                                          struct cfq_io_context *cic)
2654 {
2655         struct io_context *ioc = cic->ioc;
2656
2657         list_del_init(&cic->queue_list);
2658
2659         /*
2660          * Make sure dead mark is seen for dead queues
2661          */
2662         smp_wmb();
2663         cic->key = cfqd_dead_key(cfqd);
2664
2665         if (ioc->ioc_data == cic)
2666                 rcu_assign_pointer(ioc->ioc_data, NULL);
2667
2668         if (cic->cfqq[BLK_RW_ASYNC]) {
2669                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
2670                 cic->cfqq[BLK_RW_ASYNC] = NULL;
2671         }
2672
2673         if (cic->cfqq[BLK_RW_SYNC]) {
2674                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
2675                 cic->cfqq[BLK_RW_SYNC] = NULL;
2676         }
2677 }
2678
2679 static void cfq_exit_single_io_context(struct io_context *ioc,
2680                                        struct cfq_io_context *cic)
2681 {
2682         struct cfq_data *cfqd = cic_to_cfqd(cic);
2683
2684         if (cfqd) {
2685                 struct request_queue *q = cfqd->queue;
2686                 unsigned long flags;
2687
2688                 spin_lock_irqsave(q->queue_lock, flags);
2689
2690                 /*
2691                  * Ensure we get a fresh copy of the ->key to prevent
2692                  * race between exiting task and queue
2693                  */
2694                 smp_read_barrier_depends();
2695                 if (cic->key == cfqd)
2696                         __cfq_exit_single_io_context(cfqd, cic);
2697
2698                 spin_unlock_irqrestore(q->queue_lock, flags);
2699         }
2700 }
2701
2702 /*
2703  * The process that ioc belongs to has exited; we need to clean up
2704  * and put the internal structures we have that belong to that process.
2705  */
2706 static void cfq_exit_io_context(struct io_context *ioc)
2707 {
2708         call_for_each_cic(ioc, cfq_exit_single_io_context);
2709 }
2710
2711 static struct cfq_io_context *
2712 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2713 {
2714         struct cfq_io_context *cic;
2715
2716         cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2717                                                         cfqd->queue->node);
2718         if (cic) {
2719                 cic->last_end_request = jiffies;
2720                 INIT_LIST_HEAD(&cic->queue_list);
2721                 INIT_HLIST_NODE(&cic->cic_list);
2722                 cic->dtor = cfq_free_io_context;
2723                 cic->exit = cfq_exit_io_context;
2724                 elv_ioc_count_inc(cfq_ioc_count);
2725         }
2726
2727         return cic;
2728 }
2729
2730 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2731 {
2732         struct task_struct *tsk = current;
2733         int ioprio_class;
2734
2735         if (!cfq_cfqq_prio_changed(cfqq))
2736                 return;
2737
2738         ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2739         switch (ioprio_class) {
2740         default:
2741                 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
2742         case IOPRIO_CLASS_NONE:
2743                 /*
2744                  * no prio set, inherit CPU scheduling settings
2745                  */
2746                 cfqq->ioprio = task_nice_ioprio(tsk);
2747                 cfqq->ioprio_class = task_nice_ioclass(tsk);
2748                 break;
2749         case IOPRIO_CLASS_RT:
2750                 cfqq->ioprio = task_ioprio(ioc);
2751                 cfqq->ioprio_class = IOPRIO_CLASS_RT;
2752                 break;
2753         case IOPRIO_CLASS_BE:
2754                 cfqq->ioprio = task_ioprio(ioc);
2755                 cfqq->ioprio_class = IOPRIO_CLASS_BE;
2756                 break;
2757         case IOPRIO_CLASS_IDLE:
2758                 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
2759                 cfqq->ioprio = 7;
2760                 cfq_clear_cfqq_idle_window(cfqq);
2761                 break;
2762         }
2763
2764         /*
2765          * keep track of original prio settings in case we have to temporarily
2766          * elevate the priority of this queue
2767          */
2768         cfqq->org_ioprio = cfqq->ioprio;
2769         cfqq->org_ioprio_class = cfqq->ioprio_class;
2770         cfq_clear_cfqq_prio_changed(cfqq);
2771 }
2772
2773 static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
2774 {
2775         struct cfq_data *cfqd = cic_to_cfqd(cic);
2776         struct cfq_queue *cfqq;
2777         unsigned long flags;
2778
2779         if (unlikely(!cfqd))
2780                 return;
2781
2782         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2783
2784         cfqq = cic->cfqq[BLK_RW_ASYNC];
2785         if (cfqq) {
2786                 struct cfq_queue *new_cfqq;
2787                 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
2788                                                 GFP_ATOMIC);
2789                 if (new_cfqq) {
2790                         cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2791                         cfq_put_queue(cfqq);
2792                 }
2793         }
2794
2795         cfqq = cic->cfqq[BLK_RW_SYNC];
2796         if (cfqq)
2797                 cfq_mark_cfqq_prio_changed(cfqq);
2798
2799         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2800 }
2801
2802 static void cfq_ioc_set_ioprio(struct io_context *ioc)
2803 {
2804         call_for_each_cic(ioc, changed_ioprio);
2805         ioc->ioprio_changed = 0;
2806 }
2807
2808 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2809                           pid_t pid, bool is_sync)
2810 {
2811         RB_CLEAR_NODE(&cfqq->rb_node);
2812         RB_CLEAR_NODE(&cfqq->p_node);
2813         INIT_LIST_HEAD(&cfqq->fifo);
2814
2815         cfqq->ref = 0;
2816         cfqq->cfqd = cfqd;
2817
2818         cfq_mark_cfqq_prio_changed(cfqq);
2819
2820         if (is_sync) {
2821                 if (!cfq_class_idle(cfqq))
2822                         cfq_mark_cfqq_idle_window(cfqq);
2823                 cfq_mark_cfqq_sync(cfqq);
2824         }
2825         cfqq->pid = pid;
2826 }
2827
2828 #ifdef CONFIG_CFQ_GROUP_IOSCHED
2829 static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2830 {
2831         struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2832         struct cfq_data *cfqd = cic_to_cfqd(cic);
2833         unsigned long flags;
2834         struct request_queue *q;
2835
2836         if (unlikely(!cfqd))
2837                 return;
2838
2839         q = cfqd->queue;
2840
2841         spin_lock_irqsave(q->queue_lock, flags);
2842
2843         if (sync_cfqq) {
2844                 /*
2845                  * Drop reference to sync queue. A new sync queue will be
2846                  * assigned in new group upon arrival of a fresh request.
2847                  */
2848                 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2849                 cic_set_cfqq(cic, NULL, 1);
2850                 cfq_put_queue(sync_cfqq);
2851         }
2852
2853         spin_unlock_irqrestore(q->queue_lock, flags);
2854 }
2855
2856 static void cfq_ioc_set_cgroup(struct io_context *ioc)
2857 {
2858         call_for_each_cic(ioc, changed_cgroup);
2859         ioc->cgroup_changed = 0;
2860 }
2861 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
2862
2863 static struct cfq_queue *
2864 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2865                      struct io_context *ioc, gfp_t gfp_mask)
2866 {
2867         struct cfq_queue *cfqq, *new_cfqq = NULL;
2868         struct cfq_io_context *cic;
2869         struct cfq_group *cfqg;
2870
2871 retry:
2872         cfqg = cfq_get_cfqg(cfqd, 1);
2873         cic = cfq_cic_lookup(cfqd, ioc);
2874         /* cic always exists here */
2875         cfqq = cic_to_cfqq(cic, is_sync);
2876
2877         /*
2878          * Always try a new alloc if we fell back to the OOM cfqq
2879          * originally, since it should just be a temporary situation.
2880          */
2881         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2882                 cfqq = NULL;
2883                 if (new_cfqq) {
2884                         cfqq = new_cfqq;
2885                         new_cfqq = NULL;
2886                 } else if (gfp_mask & __GFP_WAIT) {
2887                         spin_unlock_irq(cfqd->queue->queue_lock);
2888                         new_cfqq = kmem_cache_alloc_node(cfq_pool,
2889                                         gfp_mask | __GFP_ZERO,
2890                                         cfqd->queue->node);
2891                         spin_lock_irq(cfqd->queue->queue_lock);
2892                         if (new_cfqq)
2893                                 goto retry;
2894                 } else {
2895                         cfqq = kmem_cache_alloc_node(cfq_pool,
2896                                         gfp_mask | __GFP_ZERO,
2897                                         cfqd->queue->node);
2898                 }
2899
2900                 if (cfqq) {
2901                         cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2902                         cfq_init_prio_data(cfqq, ioc);
2903                         cfq_link_cfqq_cfqg(cfqq, cfqg);
2904                         cfq_log_cfqq(cfqd, cfqq, "alloced");
2905                 } else
2906                         cfqq = &cfqd->oom_cfqq;
2907         }
2908
2909         if (new_cfqq)
2910                 kmem_cache_free(cfq_pool, new_cfqq);
2911
2912         return cfqq;
2913 }
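
/*
 * Note on the retry dance above: a __GFP_WAIT allocation may sleep, so
 * the queue_lock is dropped around kmem_cache_alloc_node() and the whole
 * lookup is redone via the retry label, since a thread sharing this
 * io_context may have installed a cfqq meanwhile. If that happened, the
 * speculative allocation is simply freed again; the oom_cfqq fallback
 * keeps this path from ever returning NULL.
 */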
2914
2915 static struct cfq_queue **
2916 cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2917 {
2918         switch (ioprio_class) {
2919         case IOPRIO_CLASS_RT:
2920                 return &cfqd->async_cfqq[0][ioprio];
2921         case IOPRIO_CLASS_BE:
2922                 return &cfqd->async_cfqq[1][ioprio];
2923         case IOPRIO_CLASS_IDLE:
2924                 return &cfqd->async_idle_cfqq;
2925         default:
2926                 BUG();
2927         }
2928 }
2929
2930 static struct cfq_queue *
2931 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2932               gfp_t gfp_mask)
2933 {
2934         const int ioprio = task_ioprio(ioc);
2935         const int ioprio_class = task_ioprio_class(ioc);
2936         struct cfq_queue **async_cfqq = NULL;
2937         struct cfq_queue *cfqq = NULL;
2938
2939         if (!is_sync) {
2940                 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
2941                 cfqq = *async_cfqq;
2942         }
2943
2944         if (!cfqq)
2945                 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
2946
2947         /*
2948          * pin the queue now that it's allocated, scheduler exit will prune it
2949          */
2950         if (!is_sync && !(*async_cfqq)) {
2951                 cfqq->ref++;
2952                 *async_cfqq = cfqq;
2953         }
2954
2955         cfqq->ref++;
2956         return cfqq;
2957 }
2958
2959 /*
2960  * We drop cfq io contexts lazily, so we may find a dead one.
2961  */
2962 static void
2963 cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
2964                   struct cfq_io_context *cic)
2965 {
2966         unsigned long flags;
2967
2968         WARN_ON(!list_empty(&cic->queue_list));
2969         BUG_ON(cic->key != cfqd_dead_key(cfqd));
2970
2971         spin_lock_irqsave(&ioc->lock, flags);
2972
2973         BUG_ON(ioc->ioc_data == cic);
2974
2975         radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
2976         hlist_del_rcu(&cic->cic_list);
2977         spin_unlock_irqrestore(&ioc->lock, flags);
2978
2979         cfq_cic_free(cic);
2980 }
2981
2982 static struct cfq_io_context *
2983 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
2984 {
2985         struct cfq_io_context *cic;
2986         unsigned long flags;
2987
2988         if (unlikely(!ioc))
2989                 return NULL;
2990
2991         rcu_read_lock();
2992
2993         /*
2994          * we maintain a last-hit cache, to avoid browsing over the tree
2995          */
2996         cic = rcu_dereference(ioc->ioc_data);
2997         if (cic && cic->key == cfqd) {
2998                 rcu_read_unlock();
2999                 return cic;
3000         }
3001
3002         do {
3003                 cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
3004                 rcu_read_unlock();
3005                 if (!cic)
3006                         break;
3007                 if (unlikely(cic->key != cfqd)) {
3008                         cfq_drop_dead_cic(cfqd, ioc, cic);
3009                         rcu_read_lock();
3010                         continue;
3011                 }
3012
3013                 spin_lock_irqsave(&ioc->lock, flags);
3014                 rcu_assign_pointer(ioc->ioc_data, cic);
3015                 spin_unlock_irqrestore(&ioc->lock, flags);
3016                 break;
3017         } while (1);
3018
3019         return cic;
3020 }
3021
3022 /*
3023  * Add cic into ioc, using cfqd as the search key. This enables us to lookup
3024  * the process specific cfq io context when entered from the block layer.
3025  * Also adds the cic to a per-cfqd list, used when this queue is removed.
3026  */
3027 static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
3028                         struct cfq_io_context *cic, gfp_t gfp_mask)
3029 {
3030         unsigned long flags;
3031         int ret;
3032
3033         ret = radix_tree_preload(gfp_mask);
3034         if (!ret) {
3035                 cic->ioc = ioc;
3036                 cic->key = cfqd;
3037
3038                 spin_lock_irqsave(&ioc->lock, flags);
3039                 ret = radix_tree_insert(&ioc->radix_root,
3040                                                 cfqd->cic_index, cic);
3041                 if (!ret)
3042                         hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
3043                 spin_unlock_irqrestore(&ioc->lock, flags);
3044
3045                 radix_tree_preload_end();
3046
3047                 if (!ret) {
3048                         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3049                         list_add(&cic->queue_list, &cfqd->cic_list);
3050                         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3051                 }
3052         }
3053
3054         if (ret)
3055                 printk(KERN_ERR "cfq: cic link failed!\n");
3056
3057         return ret;
3058 }
3059
3060 /*
3061  * Setup general io context and cfq io context. There can be several cfq
3062  * io contexts per general io context, if this process is doing io to more
3063  * than one device managed by cfq.
3064  */
3065 static struct cfq_io_context *
3066 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3067 {
3068         struct io_context *ioc = NULL;
3069         struct cfq_io_context *cic;
3070
3071         might_sleep_if(gfp_mask & __GFP_WAIT);
3072
3073         ioc = get_io_context(gfp_mask, cfqd->queue->node);
3074         if (!ioc)
3075                 return NULL;
3076
3077         cic = cfq_cic_lookup(cfqd, ioc);
3078         if (cic)
3079                 goto out;
3080
3081         cic = cfq_alloc_io_context(cfqd, gfp_mask);
3082         if (cic == NULL)
3083                 goto err;
3084
3085         if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
3086                 goto err_free;
3087
3088 out:
3089         smp_read_barrier_depends();
3090         if (unlikely(ioc->ioprio_changed))
3091                 cfq_ioc_set_ioprio(ioc);
3092
3093 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3094         if (unlikely(ioc->cgroup_changed))
3095                 cfq_ioc_set_cgroup(ioc);
3096 #endif
3097         return cic;
3098 err_free:
3099         cfq_cic_free(cic);
3100 err:
3101         put_io_context(ioc);
3102         return NULL;
3103 }
3104
3105 static void
3106 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
3107 {
3108         unsigned long elapsed = jiffies - cic->last_end_request;
3109         unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
3110
3111         cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
3112         cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
3113         cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
3114 }
3115
3116 static void
3117 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3118                        struct request *rq)
3119 {
3120         sector_t sdist = 0;
3121         sector_t n_sec = blk_rq_sectors(rq);
3122         if (cfqq->last_request_pos) {
3123                 if (cfqq->last_request_pos < blk_rq_pos(rq))
3124                         sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3125                 else
3126                         sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3127         }
3128
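             /*
              * seek_history is a per-queue shift register of recent
              * requests: on rotational storage a bit is set for a large
              * seek, on non-rotational storage for a small request.
              */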
3129         cfqq->seek_history <<= 1;
3130         if (blk_queue_nonrot(cfqd->queue))
3131                 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3132         else
3133                 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3134 }
3135
3136 /*
3137  * Disable idle window if the process thinks too long or seeks so much that
3138  * it doesn't matter
3139  */
3140 static void
3141 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3142                        struct cfq_io_context *cic)
3143 {
3144         int old_idle, enable_idle;
3145
3146         /*
3147          * Don't idle for async or idle io prio class
3148          */
3149         if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3150                 return;
3151
3152         enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3153
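             /*
              * A queue with four or more requests queued is considered
              * "deep"; deep queues keep their idle window even when they
              * look seeky (see the check below).
              */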
3154         if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3155                 cfq_mark_cfqq_deep(cfqq);
3156
3157         if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3158                 enable_idle = 0;
3159         else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3160             (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3161                 enable_idle = 0;
3162         else if (sample_valid(cic->ttime_samples)) {
3163                 if (cic->ttime_mean > cfqd->cfq_slice_idle)
3164                         enable_idle = 0;
3165                 else
3166                         enable_idle = 1;
3167         }
3168
3169         if (old_idle != enable_idle) {
3170                 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3171                 if (enable_idle)
3172                         cfq_mark_cfqq_idle_window(cfqq);
3173                 else
3174                         cfq_clear_cfqq_idle_window(cfqq);
3175         }
3176 }
3177
3178 /*
3179  * Check if new_cfqq should preempt the currently active queue. Returns false
3180  * for no (or if we aren't sure); returning true will cause a preempt.
3181  */
3182 static bool
3183 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3184                    struct request *rq)
3185 {
3186         struct cfq_queue *cfqq;
3187
3188         cfqq = cfqd->active_queue;
3189         if (!cfqq)
3190                 return false;
3191
3192         if (cfq_class_idle(new_cfqq))
3193                 return false;
3194
3195         if (cfq_class_idle(cfqq))
3196                 return true;
3197
3198         /*
3199          * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3200          */
3201         if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3202                 return false;
3203
3204         /*
3205          * if the new request is sync, but the currently running queue is
3206          * not, let the sync request have priority.
3207          */
3208         if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3209                 return true;
3210
3211         if (new_cfqq->cfqg != cfqq->cfqg)
3212                 return false;
3213
3214         if (cfq_slice_used(cfqq))
3215                 return true;
3216
3217         /* Allow preemption only if we are idling on sync-noidle tree */
3218         if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3219             cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3220             new_cfqq->service_tree->count == 2 &&
3221             RB_EMPTY_ROOT(&cfqq->sort_list))
3222                 return true;
3223
3224         /*
3225          * So both queues are sync. Let the new request get disk time if
3226          * it's a metadata request and the current queue is doing regular IO.
3227          */
3228         if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
3229                 return true;
3230
3231         /*
3232          * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3233          */
3234         if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3235                 return true;
3236
3237         /* The active queue is empty but is not supposed to be idling - preempt it */
3238         if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3239                 return true;
3240
3241         if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3242                 return false;
3243
3244         /*
3245          * if this request is as-good as one we would expect from the
3246          * current cfqq, let it preempt
3247          */
3248         if (cfq_rq_close(cfqd, cfqq, rq))
3249                 return true;
3250
3251         return false;
3252 }
3253
3254 /*
3255  * cfqq preempts the active queue. if we allowed preempt with no slice left,
3256  * let it have half of its nominal slice.
3257  */
3258 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3259 {
3260         struct cfq_queue *old_cfqq = cfqd->active_queue;
3261
3262         cfq_log_cfqq(cfqd, cfqq, "preempt");
3263         cfq_slice_expired(cfqd, 1);
3264
3265         /*
3266          * workload type is changed, don't save slice, otherwise preempt
3267          * doesn't happen
3268          */
3269         if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
3270                 cfqq->cfqg->saved_workload_slice = 0;
3271
3272         /*
3273  * Put the new queue at the front of the current list,
3274          * so we know that it will be selected next.
3275          */
3276         BUG_ON(!cfq_cfqq_on_rr(cfqq));
3277
3278         cfq_service_tree_add(cfqd, cfqq, 1);
3279
3280         cfqq->slice_end = 0;
3281         cfq_mark_cfqq_slice_new(cfqq);
3282 }
3283
3284 /*
3285  * Called when a new fs request (rq) is added (to cfqq). Check if there's
3286  * something we should do about it
3287  */
3288 static void
3289 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3290                 struct request *rq)
3291 {
3292         struct cfq_io_context *cic = RQ_CIC(rq);
3293
3294         cfqd->rq_queued++;
3295         if (rq->cmd_flags & REQ_META)
3296                 cfqq->meta_pending++;
3297
3298         cfq_update_io_thinktime(cfqd, cic);
3299         cfq_update_io_seektime(cfqd, cfqq, rq);
3300         cfq_update_idle_window(cfqd, cfqq, cic);
3301
3302         cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3303
3304         if (cfqq == cfqd->active_queue) {
3305                 /*
3306                  * Remember that we saw a request from this process, but
3307                  * don't start queuing just yet. Otherwise we risk seeing lots
3308                  * of tiny requests, because we disrupt the normal plugging
3309                  * and merging. If the request is already larger than a single
3310                  * page, let it rip immediately. For that case we assume that
3311                  * merging is already done. Ditto for a busy system that
3312                  * has other work pending: don't risk waiting until the
3313                  * idle timer fires before continuing to work.
3314                  */
3315                 if (cfq_cfqq_wait_request(cfqq)) {
3316                         if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3317                             cfqd->busy_queues > 1) {
3318                                 cfq_del_timer(cfqd, cfqq);
3319                                 cfq_clear_cfqq_wait_request(cfqq);
3320                                 __blk_run_queue(cfqd->queue);
3321                         } else {
3322                                 cfq_blkiocg_update_idle_time_stats(
3323                                                 &cfqq->cfqg->blkg);
3324                                 cfq_mark_cfqq_must_dispatch(cfqq);
3325                         }
3326                 }
3327         } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3328                 /*
3329                  * not the active queue - expire the current slice if it is
3330                  * idle and has exceeded its mean thinktime, or if this new
3331                  * queue has some old slice time left and is of higher
3332                  * priority, or if this new queue is RT and the current one is BE
3333                  */
3334                 cfq_preempt_queue(cfqd, cfqq);
3335                 __blk_run_queue(cfqd->queue);
3336         }
3337 }
3338
3339 static void cfq_insert_request(struct request_queue *q, struct request *rq)
3340 {
3341         struct cfq_data *cfqd = q->elevator->elevator_data;
3342         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3343
3344         cfq_log_cfqq(cfqd, cfqq, "insert_request");
3345         cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
3346
3347         rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3348         list_add_tail(&rq->queuelist, &cfqq->fifo);
3349         cfq_add_rq_rb(rq);
3350         cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3351                         &cfqd->serving_group->blkg, rq_data_dir(rq),
3352                         rq_is_sync(rq));
3353         cfq_rq_enqueued(cfqd, cfqq, rq);
3354 }
3355
3356 /*
3357  * Update hw_tag based on peak queue depth over 50 samples under
3358  * sufficient load.
3359  */
3360 static void cfq_update_hw_tag(struct cfq_data *cfqd)
3361 {
3362         struct cfq_queue *cfqq = cfqd->active_queue;
3363
3364         if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3365                 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3366
3367         if (cfqd->hw_tag == 1)
3368                 return;
3369
3370         if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3371             cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3372                 return;
3373
3374         /*
3375          * If the active queue doesn't have enough requests and can idle, cfq might
3376          * not dispatch sufficient requests to the hardware. Don't zero hw_tag in
3377          * this case.
3378          */
3379         if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3380             cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3381             CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3382                 return;
3383
3384         if (cfqd->hw_tag_samples++ < 50)
3385                 return;
3386
3387         if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3388                 cfqd->hw_tag = 1;
3389         else
3390                 cfqd->hw_tag = 0;
3391 }
3392
3393 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3394 {
3395         struct cfq_io_context *cic = cfqd->active_cic;
3396
3397         /* If the queue already has requests, don't wait */
3398         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3399                 return false;
3400
3401         /* If there are other queues in the group, don't wait */
3402         if (cfqq->cfqg->nr_cfqq > 1)
3403                 return false;
3404
3405         if (cfq_slice_used(cfqq))
3406                 return true;
3407
3408         /* if slice left is less than think time, wait busy */
3409         if (cic && sample_valid(cic->ttime_samples)
3410             && (cfqq->slice_end - jiffies < cic->ttime_mean))
3411                 return true;
3412
3413         /*
3414          * If the think time is less than a jiffy, then ttime_mean is 0 and the
3415          * check above will not be true. It might happen that the slice has not
3416          * expired yet but will expire soon (4-5 ns) during select_queue(). To
3417          * cover the case where the think time is less than a jiffy, mark the
3418          * queue wait busy if only 1 jiffy is left in the slice.
3419          */
3420         if (cfqq->slice_end - jiffies == 1)
3421                 return true;
3422
3423         return false;
3424 }
3425
3426 static void cfq_completed_request(struct request_queue *q, struct request *rq)
3427 {
3428         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3429         struct cfq_data *cfqd = cfqq->cfqd;
3430         const int sync = rq_is_sync(rq);
3431         unsigned long now;
3432
3433         now = jiffies;
3434         cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
3435                      !!(rq->cmd_flags & REQ_NOIDLE));
3436
3437         cfq_update_hw_tag(cfqd);
3438
3439         WARN_ON(!cfqd->rq_in_driver);
3440         WARN_ON(!cfqq->dispatched);
3441         cfqd->rq_in_driver--;
3442         cfqq->dispatched--;
3443         (RQ_CFQG(rq))->dispatched--;
3444         cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
3445                         rq_start_time_ns(rq), rq_io_start_time_ns(rq),
3446                         rq_data_dir(rq), rq_is_sync(rq));
3447
3448         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3449
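             /*
              * A sync request that took at least its fifo expire interval
              * to complete counts as delayed; remember when that last
              * happened so the async depth can be held back (see the
              * comment in cfq_init_queue()).
              */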
3450         if (sync) {
3451                 RQ_CIC(rq)->last_end_request = now;
3452                 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3453                         cfqd->last_delayed_sync = now;
3454         }
3455
3456         /*
3457          * If this is the active queue, check if it needs to be expired,
3458          * or if we want to idle in case it has no pending requests.
3459          */
3460         if (cfqd->active_queue == cfqq) {
3461                 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3462
3463                 if (cfq_cfqq_slice_new(cfqq)) {
3464                         cfq_set_prio_slice(cfqd, cfqq);
3465                         cfq_clear_cfqq_slice_new(cfqq);
3466                 }
3467
3468                 /*
3469                  * Should we wait for the next request to come in before we expire
3470                  * the queue?
3471                  */
3472                 if (cfq_should_wait_busy(cfqd, cfqq)) {
3473                         unsigned long extend_sl = cfqd->cfq_slice_idle;
3474                         if (!cfqd->cfq_slice_idle)
3475                                 extend_sl = cfqd->cfq_group_idle;
3476                         cfqq->slice_end = jiffies + extend_sl;
3477                         cfq_mark_cfqq_wait_busy(cfqq);
3478                         cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3479                 }
3480
3481                 /*
3482                  * Idling is not enabled on:
3483                  * - expired queues
3484                  * - idle-priority queues
3485                  * - async queues
3486                  * - queues with still some requests queued
3487                  * - when there is a close cooperator
3488                  */
3489                 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3490                         cfq_slice_expired(cfqd, 1);
3491                 else if (sync && cfqq_empty &&
3492                          !cfq_close_cooperator(cfqd, cfqq)) {
3493                         cfq_arm_slice_timer(cfqd);
3494                 }
3495         }
3496
3497         if (!cfqd->rq_in_driver)
3498                 cfq_schedule_dispatch(cfqd);
3499 }
3500
3501 /*
3502  * we temporarily boost lower priority queues if they are holding fs exclusive
3503  * resources. they are boosted to normal prio (CLASS_BE/4)
3504  */
3505 static void cfq_prio_boost(struct cfq_queue *cfqq)
3506 {
3507         if (has_fs_excl()) {
3508                 /*
3509                  * boost idle prio on transactions that would lock out other
3510                  * users of the filesystem
3511                  */
3512                 if (cfq_class_idle(cfqq))
3513                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
3514                 if (cfqq->ioprio > IOPRIO_NORM)
3515                         cfqq->ioprio = IOPRIO_NORM;
3516         } else {
3517                 /*
3518                  * unboost the queue (if needed)
3519                  */
3520                 cfqq->ioprio_class = cfqq->org_ioprio_class;
3521                 cfqq->ioprio = cfqq->org_ioprio;
3522         }
3523 }
3524
3525 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3526 {
3527         if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3528                 cfq_mark_cfqq_must_alloc_slice(cfqq);
3529                 return ELV_MQUEUE_MUST;
3530         }
3531
3532         return ELV_MQUEUE_MAY;
3533 }
3534
3535 static int cfq_may_queue(struct request_queue *q, int rw)
3536 {
3537         struct cfq_data *cfqd = q->elevator->elevator_data;
3538         struct task_struct *tsk = current;
3539         struct cfq_io_context *cic;
3540         struct cfq_queue *cfqq;
3541
3542         /*
3543          * don't force setup of a queue from here, as a call to may_queue
3544          * does not necessarily imply that a request actually will be queued.
3545          * So just look up a possibly existing queue, or return 'may queue'
3546          * if that fails
3547          */
3548         cic = cfq_cic_lookup(cfqd, tsk->io_context);
3549         if (!cic)
3550                 return ELV_MQUEUE_MAY;
3551
3552         cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3553         if (cfqq) {
3554                 cfq_init_prio_data(cfqq, cic->ioc);
3555                 cfq_prio_boost(cfqq);
3556
3557                 return __cfq_may_queue(cfqq);
3558         }
3559
3560         return ELV_MQUEUE_MAY;
3561 }
3562
3563 /*
3564  * queue lock held here
3565  */
3566 static void cfq_put_request(struct request *rq)
3567 {
3568         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3569
3570         if (cfqq) {
3571                 const int rw = rq_data_dir(rq);
3572
3573                 BUG_ON(!cfqq->allocated[rw]);
3574                 cfqq->allocated[rw]--;
3575
3576                 put_io_context(RQ_CIC(rq)->ioc);
3577
3578                 rq->elevator_private[0] = NULL;
3579                 rq->elevator_private[1] = NULL;
3580
3581                 /* Put down rq reference on cfqg */
3582                 cfq_put_cfqg(RQ_CFQG(rq));
3583                 rq->elevator_private[2] = NULL;
3584
3585                 cfq_put_queue(cfqq);
3586         }
3587 }
3588
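     /*
      * Switch cic's sync queue over to the queue cfqq was scheduled to merge
      * with: mark the merge target as a cooperator, drop this reference to
      * the old queue and return the new one.
      */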
3589 static struct cfq_queue *
3590 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3591                 struct cfq_queue *cfqq)
3592 {
3593         cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3594         cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3595         cfq_mark_cfqq_coop(cfqq->new_cfqq);
3596         cfq_put_queue(cfqq);
3597         return cic_to_cfqq(cic, 1);
3598 }
3599
3600 /*
3601  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3602  * was the last process referring to said cfqq.
3603  */
3604 static struct cfq_queue *
3605 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3606 {
3607         if (cfqq_process_refs(cfqq) == 1) {
3608                 cfqq->pid = current->pid;
3609                 cfq_clear_cfqq_coop(cfqq);
3610                 cfq_clear_cfqq_split_coop(cfqq);
3611                 return cfqq;
3612         }
3613
3614         cic_set_cfqq(cic, NULL, 1);
3615
3616         cfq_put_cooperator(cfqq);
3617
3618         cfq_put_queue(cfqq);
3619         return NULL;
3620 }
3621 /*
3622  * Allocate cfq data structures associated with this request.
3623  */
3624 static int
3625 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3626 {
3627         struct cfq_data *cfqd = q->elevator->elevator_data;
3628         struct cfq_io_context *cic;
3629         const int rw = rq_data_dir(rq);
3630         const bool is_sync = rq_is_sync(rq);
3631         struct cfq_queue *cfqq;
3632         unsigned long flags;
3633
3634         might_sleep_if(gfp_mask & __GFP_WAIT);
3635
3636         cic = cfq_get_io_context(cfqd, gfp_mask);
3637
3638         spin_lock_irqsave(q->queue_lock, flags);
3639
3640         if (!cic)
3641                 goto queue_fail;
3642
3643 new_queue:
3644         cfqq = cic_to_cfqq(cic, is_sync);
3645         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3646                 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3647                 cic_set_cfqq(cic, cfqq, is_sync);
3648         } else {
3649                 /*
3650                  * If the queue was seeky for too long, break it apart.
3651                  */
3652                 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3653                         cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3654                         cfqq = split_cfqq(cic, cfqq);
3655                         if (!cfqq)
3656                                 goto new_queue;
3657                 }
3658
3659                 /*
3660                  * Check to see if this queue is scheduled to merge with
3661                  * another, closely cooperating queue.  The merging of
3662                  * queues happens here as it must be done in process context.
3663                  * The reference on new_cfqq was taken in merge_cfqqs.
3664                  */
3665                 if (cfqq->new_cfqq)
3666                         cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3667         }
3668
3669         cfqq->allocated[rw]++;
3670
3671         spin_unlock_irqrestore(q->queue_lock, flags);
3672
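             /*
              * The request pins its queue, io context and group until
              * cfq_put_request() drops them again.
              */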
3673         cfqq->ref++;
3674         rq->elevator_private[0] = cic;
3675         rq->elevator_private[1] = cfqq;
3676         rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3677         return 0;
3678
3679 queue_fail:
3680         if (cic)
3681                 put_io_context(cic->ioc);
3682
3683         cfq_schedule_dispatch(cfqd);
3684         spin_unlock_irqrestore(q->queue_lock, flags);
3685         cfq_log(cfqd, "set_request fail");
3686         return 1;
3687 }
3688
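     /*
      * Work handler for cfqd->unplug_work: restart dispatching from process
      * context with the queue lock held.
      */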
3689 static void cfq_kick_queue(struct work_struct *work)
3690 {
3691         struct cfq_data *cfqd =
3692                 container_of(work, struct cfq_data, unplug_work);
3693         struct request_queue *q = cfqd->queue;
3694
3695         spin_lock_irq(q->queue_lock);
3696         __blk_run_queue(cfqd->queue);
3697         spin_unlock_irq(q->queue_lock);
3698 }
3699
3700 /*
3701  * Timer running if the active_queue is currently idling inside its time slice
3702  */
3703 static void cfq_idle_slice_timer(unsigned long data)
3704 {
3705         struct cfq_data *cfqd = (struct cfq_data *) data;
3706         struct cfq_queue *cfqq;
3707         unsigned long flags;
3708         int timed_out = 1;
3709
3710         cfq_log(cfqd, "idle timer fired");
3711
3712         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3713
3714         cfqq = cfqd->active_queue;
3715         if (cfqq) {
3716                 timed_out = 0;
3717
3718                 /*
3719                  * We saw a request before the queue expired, let it through
3720                  */
3721                 if (cfq_cfqq_must_dispatch(cfqq))
3722                         goto out_kick;
3723
3724                 /*
3725                  * expired
3726                  */
3727                 if (cfq_slice_used(cfqq))
3728                         goto expire;
3729
3730                 /*
3731                  * only expire and reinvoke the request handler if there are
3732                  * other queues with pending requests
3733                  */
3734                 if (!cfqd->busy_queues)
3735                         goto out_cont;
3736
3737                 /*
3738                  * not expired and it has a request pending, let it dispatch
3739                  */
3740                 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3741                         goto out_kick;
3742
3743                 /*
3744                  * The queue-depth (deep) flag is cleared only when idling didn't succeed
3745                  */
3746                 cfq_clear_cfqq_deep(cfqq);
3747         }
3748 expire:
3749         cfq_slice_expired(cfqd, timed_out);
3750 out_kick:
3751         cfq_schedule_dispatch(cfqd);
3752 out_cont:
3753         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3754 }
3755
3756 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3757 {
3758         del_timer_sync(&cfqd->idle_slice_timer);
3759         cancel_work_sync(&cfqd->unplug_work);
3760 }
3761
3762 static void cfq_put_async_queues(struct cfq_data *cfqd)
3763 {
3764         int i;
3765
3766         for (i = 0; i < IOPRIO_BE_NR; i++) {
3767                 if (cfqd->async_cfqq[0][i])
3768                         cfq_put_queue(cfqd->async_cfqq[0][i]);
3769                 if (cfqd->async_cfqq[1][i])
3770                         cfq_put_queue(cfqd->async_cfqq[1][i]);
3771         }
3772
3773         if (cfqd->async_idle_cfqq)
3774                 cfq_put_queue(cfqd->async_idle_cfqq);
3775 }
3776
3777 static void cfq_cfqd_free(struct rcu_head *head)
3778 {
3779         kfree(container_of(head, struct cfq_data, rcu));
3780 }
3781
3782 static void cfq_exit_queue(struct elevator_queue *e)
3783 {
3784         struct cfq_data *cfqd = e->elevator_data;
3785         struct request_queue *q = cfqd->queue;
3786
3787         cfq_shutdown_timer_wq(cfqd);
3788
3789         spin_lock_irq(q->queue_lock);
3790
3791         if (cfqd->active_queue)
3792                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3793
3794         while (!list_empty(&cfqd->cic_list)) {
3795                 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3796                                                         struct cfq_io_context,
3797                                                         queue_list);
3798
3799                 __cfq_exit_single_io_context(cfqd, cic);
3800         }
3801
3802         cfq_put_async_queues(cfqd);
3803         cfq_release_cfq_groups(cfqd);
3804         cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);
3805
3806         spin_unlock_irq(q->queue_lock);
3807
3808         cfq_shutdown_timer_wq(cfqd);
3809
3810         spin_lock(&cic_index_lock);
3811         ida_remove(&cic_index_ida, cfqd->cic_index);
3812         spin_unlock(&cic_index_lock);
3813
3814         /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3815         call_rcu(&cfqd->rcu, cfq_cfqd_free);
3816 }
3817
3818 static int cfq_alloc_cic_index(void)
3819 {
3820         int index, error;
3821
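             /*
              * Two-step ida allocation: preallocate memory outside the
              * lock, then grab an id under cic_index_lock. -EAGAIN means
              * the preallocated memory was used up by a racing allocator,
              * so retry.
              */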
3822         do {
3823                 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3824                         return -ENOMEM;
3825
3826                 spin_lock(&cic_index_lock);
3827                 error = ida_get_new(&cic_index_ida, &index);
3828                 spin_unlock(&cic_index_lock);
3829                 if (error && error != -EAGAIN)
3830                         return error;
3831         } while (error);
3832
3833         return index;
3834 }
3835
3836 static void *cfq_init_queue(struct request_queue *q)
3837 {
3838         struct cfq_data *cfqd;
3839         int i, j;
3840         struct cfq_group *cfqg;
3841         struct cfq_rb_root *st;
3842
3843         i = cfq_alloc_cic_index();
3844         if (i < 0)
3845                 return NULL;
3846
3847         cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3848         if (!cfqd)
3849                 return NULL;
3850
3851         /*
3852          * No need to take the queue_lock in this routine, since we are
3853          * initializing the io scheduler and nobody is using cfqd yet.
3854          */
3855         cfqd->cic_index = i;
3856
3857         /* Init root service tree */
3858         cfqd->grp_service_tree = CFQ_RB_ROOT;
3859
3860         /* Init root group */
3861         cfqg = &cfqd->root_group;
3862         for_each_cfqg_st(cfqg, i, j, st)
3863                 *st = CFQ_RB_ROOT;
3864         RB_CLEAR_NODE(&cfqg->rb_node);
3865
3866         /* Give preference to root group over other groups */
3867         cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3868
3869 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3870         /*
3871          * Take a reference to the root group which we never drop. This is just
3872          * to make sure that cfq_put_cfqg() does not try to kfree the root group.
3873          */
3874         cfqg->ref = 1;
3875         rcu_read_lock();
3876         cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
3877                                         (void *)cfqd, 0);
3878         rcu_read_unlock();
3879 #endif
3880         /*
3881          * Not strictly needed (since RB_ROOT just clears the node and we
3882          * zeroed cfqd on alloc), but better be safe in case someone decides
3883          * to add magic to the rb code
3884          */
3885         for (i = 0; i < CFQ_PRIO_LISTS; i++)
3886                 cfqd->prio_trees[i] = RB_ROOT;
3887
3888         /*
3889          * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3890          * Grab a permanent reference to it, so that the normal code flow
3891          * will not attempt to free it.
3892          */
3893         cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3894         cfqd->oom_cfqq.ref++;
3895         cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3896
3897         INIT_LIST_HEAD(&cfqd->cic_list);
3898
3899         cfqd->queue = q;
3900
3901         init_timer(&cfqd->idle_slice_timer);
3902         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
3903         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
3904
3905         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
3906
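             /*
              * Seed the tunables from the module-wide defaults; they can be
              * changed later through the sysfs attributes defined below.
              */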
3907         cfqd->cfq_quantum = cfq_quantum;
3908         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
3909         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
3910         cfqd->cfq_back_max = cfq_back_max;
3911         cfqd->cfq_back_penalty = cfq_back_penalty;
3912         cfqd->cfq_slice[0] = cfq_slice_async;
3913         cfqd->cfq_slice[1] = cfq_slice_sync;
3914         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3915         cfqd->cfq_slice_idle = cfq_slice_idle;
3916         cfqd->cfq_group_idle = cfq_group_idle;
3917         cfqd->cfq_latency = 1;
3918         cfqd->hw_tag = -1;
3919         /*
3920          * we optimistically start by assuming sync ops weren't delayed in the
3921          * last second, in order to allow a larger depth for async operations.
3922          */
3923         cfqd->last_delayed_sync = jiffies - HZ;
3924         return cfqd;
3925 }
3926
3927 static void cfq_slab_kill(void)
3928 {
3929         /*
3930          * Caller already ensured that pending RCU callbacks are completed,
3931          * so we should have no busy allocations at this point.
3932          */
3933         if (cfq_pool)
3934                 kmem_cache_destroy(cfq_pool);
3935         if (cfq_ioc_pool)
3936                 kmem_cache_destroy(cfq_ioc_pool);
3937 }
3938
3939 static int __init cfq_slab_setup(void)
3940 {
3941         cfq_pool = KMEM_CACHE(cfq_queue, 0);
3942         if (!cfq_pool)
3943                 goto fail;
3944
3945         cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
3946         if (!cfq_ioc_pool)
3947                 goto fail;
3948
3949         return 0;
3950 fail:
3951         cfq_slab_kill();
3952         return -ENOMEM;
3953 }
3954
3955 /*
3956  * sysfs parts below -->
3957  */
3958 static ssize_t
3959 cfq_var_show(unsigned int var, char *page)
3960 {
3961         return sprintf(page, "%d\n", var);
3962 }
3963
3964 static ssize_t
3965 cfq_var_store(unsigned int *var, const char *page, size_t count)
3966 {
3967         char *p = (char *) page;
3968
3969         *var = simple_strtoul(p, &p, 10);
3970         return count;
3971 }
3972
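     /*
      * The SHOW_FUNCTION/STORE_FUNCTION macros below generate the sysfs
      * handlers for the tunables; __CONV selects conversion between the
      * internal jiffies representation and the milliseconds shown to
      * user space.
      */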
3973 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
3974 static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
3975 {                                                                       \
3976         struct cfq_data *cfqd = e->elevator_data;                       \
3977         unsigned int __data = __VAR;                                    \
3978         if (__CONV)                                                     \
3979                 __data = jiffies_to_msecs(__data);                      \
3980         return cfq_var_show(__data, (page));                            \
3981 }
3982 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
3983 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
3984 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3985 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3986 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3987 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3988 SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
3989 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3990 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3991 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
3992 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3993 #undef SHOW_FUNCTION
3994
3995 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
3996 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
3997 {                                                                       \
3998         struct cfq_data *cfqd = e->elevator_data;                       \
3999         unsigned int __data;                                            \
4000         int ret = cfq_var_store(&__data, (page), count);                \
4001         if (__data < (MIN))                                             \
4002                 __data = (MIN);                                         \
4003         else if (__data > (MAX))                                        \
4004                 __data = (MAX);                                         \
4005         if (__CONV)                                                     \
4006                 *(__PTR) = msecs_to_jiffies(__data);                    \
4007         else                                                            \
4008                 *(__PTR) = __data;                                      \
4009         return ret;                                                     \
4010 }
4011 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4012 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4013                 UINT_MAX, 1);
4014 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4015                 UINT_MAX, 1);
4016 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4017 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4018                 UINT_MAX, 0);
4019 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4020 STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4021 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4022 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4023 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4024                 UINT_MAX, 0);
4025 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4026 #undef STORE_FUNCTION
4027
4028 #define CFQ_ATTR(name) \
4029         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4030
4031 static struct elv_fs_entry cfq_attrs[] = {
4032         CFQ_ATTR(quantum),
4033         CFQ_ATTR(fifo_expire_sync),
4034         CFQ_ATTR(fifo_expire_async),
4035         CFQ_ATTR(back_seek_max),
4036         CFQ_ATTR(back_seek_penalty),
4037         CFQ_ATTR(slice_sync),
4038         CFQ_ATTR(slice_async),
4039         CFQ_ATTR(slice_async_rq),
4040         CFQ_ATTR(slice_idle),
4041         CFQ_ATTR(group_idle),
4042         CFQ_ATTR(low_latency),
4043         __ATTR_NULL
4044 };
4045
4046 static struct elevator_type iosched_cfq = {
4047         .ops = {
4048                 .elevator_merge_fn =            cfq_merge,
4049                 .elevator_merged_fn =           cfq_merged_request,
4050                 .elevator_merge_req_fn =        cfq_merged_requests,
4051                 .elevator_allow_merge_fn =      cfq_allow_merge,
4052                 .elevator_bio_merged_fn =       cfq_bio_merged,
4053                 .elevator_dispatch_fn =         cfq_dispatch_requests,
4054                 .elevator_add_req_fn =          cfq_insert_request,
4055                 .elevator_activate_req_fn =     cfq_activate_request,
4056                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
4057                 .elevator_completed_req_fn =    cfq_completed_request,
4058                 .elevator_former_req_fn =       elv_rb_former_request,
4059                 .elevator_latter_req_fn =       elv_rb_latter_request,
4060                 .elevator_set_req_fn =          cfq_set_request,
4061                 .elevator_put_req_fn =          cfq_put_request,
4062                 .elevator_may_queue_fn =        cfq_may_queue,
4063                 .elevator_init_fn =             cfq_init_queue,
4064                 .elevator_exit_fn =             cfq_exit_queue,
4065                 .trim =                         cfq_free_io_context,
4066         },
4067         .elevator_attrs =       cfq_attrs,
4068         .elevator_name =        "cfq",
4069         .elevator_owner =       THIS_MODULE,
4070 };
4071
4072 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4073 static struct blkio_policy_type blkio_policy_cfq = {
4074         .ops = {
4075                 .blkio_unlink_group_fn =        cfq_unlink_blkio_group,
4076                 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4077         },
4078         .plid = BLKIO_POLICY_PROP,
4079 };
4080 #else
4081 static struct blkio_policy_type blkio_policy_cfq;
4082 #endif
4083
4084 static int __init cfq_init(void)
4085 {
4086         /*
4087          * could be 0 on HZ < 1000 setups
4088          */
4089         if (!cfq_slice_async)
4090                 cfq_slice_async = 1;
4091         if (!cfq_slice_idle)
4092                 cfq_slice_idle = 1;
4093
4094 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4095         if (!cfq_group_idle)
4096                 cfq_group_idle = 1;
4097 #else
4098                 cfq_group_idle = 0;
4099 #endif
4100         if (cfq_slab_setup())
4101                 return -ENOMEM;
4102
4103         elv_register(&iosched_cfq);
4104         blkio_policy_register(&blkio_policy_cfq);
4105
4106         return 0;
4107 }
4108
4109 static void __exit cfq_exit(void)
4110 {
4111         DECLARE_COMPLETION_ONSTACK(all_gone);
4112         blkio_policy_unregister(&blkio_policy_cfq);
4113         elv_unregister(&iosched_cfq);
4114         ioc_gone = &all_gone;
4115         /* ioc_gone's update must be visible before reading ioc_count */
4116         smp_wmb();
4117
4118         /*
4119          * this also protects us from entering cfq_slab_kill() with
4120          * pending RCU callbacks
4121          */
4122         if (elv_ioc_count_read(cfq_ioc_count))
4123                 wait_for_completion(&all_gone);
4124         ida_destroy(&cic_index_ida);
4125         cfq_slab_kill();
4126 }
4127
4128 module_init(cfq_init);
4129 module_exit(cfq_exit);
4130
4131 MODULE_AUTHOR("Jens Axboe");
4132 MODULE_LICENSE("GPL");
4133 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");