Merge branch 'msm-fix' of git://codeaurora.org/quic/kernel/davidb/linux-msm into...
[cascardo/linux.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by user in the form of handles
70    to more intelligible for kernel form, to make some sanity
71    checks and part of work, which is common to all qdiscs
72    and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
125 /* Protects list of registered TC modules. It is pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/uregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* We know handle. Find qdisc among all qdisc's attached to device
204    (root qdisc, all its children, children of children etc.)
205  */
206
207 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
208 {
209         struct Qdisc *q;
210
211         if (!(root->flags & TCQ_F_BUILTIN) &&
212             root->handle == handle)
213                 return root;
214
215         list_for_each_entry(q, &root->list, list) {
216                 if (q->handle == handle)
217                         return q;
218         }
219         return NULL;
220 }
221
222 static void qdisc_list_add(struct Qdisc *q)
223 {
224         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227
228 void qdisc_list_del(struct Qdisc *q)
229 {
230         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231                 list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234
235 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
236 {
237         struct Qdisc *q;
238
239         q = qdisc_match_from_root(dev->qdisc, handle);
240         if (q)
241                 goto out;
242
243         if (dev_ingress_queue(dev))
244                 q = qdisc_match_from_root(
245                         dev_ingress_queue(dev)->qdisc_sleeping,
246                         handle);
247 out:
248         return q;
249 }
250
251 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252 {
253         unsigned long cl;
254         struct Qdisc *leaf;
255         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
256
257         if (cops == NULL)
258                 return NULL;
259         cl = cops->get(p, classid);
260
261         if (cl == 0)
262                 return NULL;
263         leaf = cops->leaf(p, cl);
264         cops->put(p, cl);
265         return leaf;
266 }
267
268 /* Find queueing discipline by name */
269
270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272         struct Qdisc_ops *q = NULL;
273
274         if (kind) {
275                 read_lock(&qdisc_mod_lock);
276                 for (q = qdisc_base; q; q = q->next) {
277                         if (nla_strcmp(kind, q->id) == 0) {
278                                 if (!try_module_get(q->owner))
279                                         q = NULL;
280                                 break;
281                         }
282                 }
283                 read_unlock(&qdisc_mod_lock);
284         }
285         return q;
286 }
287
288 static struct qdisc_rate_table *qdisc_rtab_list;
289
290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292         struct qdisc_rate_table *rtab;
293
294         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
295                 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
296                         rtab->refcnt++;
297                         return rtab;
298                 }
299         }
300
301         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
302             nla_len(tab) != TC_RTAB_SIZE)
303                 return NULL;
304
305         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
306         if (rtab) {
307                 rtab->rate = *r;
308                 rtab->refcnt = 1;
309                 memcpy(rtab->data, nla_data(tab), 1024);
310                 rtab->next = qdisc_rtab_list;
311                 qdisc_rtab_list = rtab;
312         }
313         return rtab;
314 }
315 EXPORT_SYMBOL(qdisc_get_rtab);
316
317 void qdisc_put_rtab(struct qdisc_rate_table *tab)
318 {
319         struct qdisc_rate_table *rtab, **rtabp;
320
321         if (!tab || --tab->refcnt)
322                 return;
323
324         for (rtabp = &qdisc_rtab_list;
325              (rtab = *rtabp) != NULL;
326              rtabp = &rtab->next) {
327                 if (rtab == tab) {
328                         *rtabp = rtab->next;
329                         kfree(rtab);
330                         return;
331                 }
332         }
333 }
334 EXPORT_SYMBOL(qdisc_put_rtab);
335
336 static LIST_HEAD(qdisc_stab_list);
337 static DEFINE_SPINLOCK(qdisc_stab_lock);
338
339 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
340         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
341         [TCA_STAB_DATA] = { .type = NLA_BINARY },
342 };
343
344 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
345 {
346         struct nlattr *tb[TCA_STAB_MAX + 1];
347         struct qdisc_size_table *stab;
348         struct tc_sizespec *s;
349         unsigned int tsize = 0;
350         u16 *tab = NULL;
351         int err;
352
353         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
354         if (err < 0)
355                 return ERR_PTR(err);
356         if (!tb[TCA_STAB_BASE])
357                 return ERR_PTR(-EINVAL);
358
359         s = nla_data(tb[TCA_STAB_BASE]);
360
361         if (s->tsize > 0) {
362                 if (!tb[TCA_STAB_DATA])
363                         return ERR_PTR(-EINVAL);
364                 tab = nla_data(tb[TCA_STAB_DATA]);
365                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
366         }
367
368         if (tsize != s->tsize || (!tab && tsize > 0))
369                 return ERR_PTR(-EINVAL);
370
371         spin_lock(&qdisc_stab_lock);
372
373         list_for_each_entry(stab, &qdisc_stab_list, list) {
374                 if (memcmp(&stab->szopts, s, sizeof(*s)))
375                         continue;
376                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
377                         continue;
378                 stab->refcnt++;
379                 spin_unlock(&qdisc_stab_lock);
380                 return stab;
381         }
382
383         spin_unlock(&qdisc_stab_lock);
384
385         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
386         if (!stab)
387                 return ERR_PTR(-ENOMEM);
388
389         stab->refcnt = 1;
390         stab->szopts = *s;
391         if (tsize > 0)
392                 memcpy(stab->data, tab, tsize * sizeof(u16));
393
394         spin_lock(&qdisc_stab_lock);
395         list_add_tail(&stab->list, &qdisc_stab_list);
396         spin_unlock(&qdisc_stab_lock);
397
398         return stab;
399 }
400
401 static void stab_kfree_rcu(struct rcu_head *head)
402 {
403         kfree(container_of(head, struct qdisc_size_table, rcu));
404 }
405
406 void qdisc_put_stab(struct qdisc_size_table *tab)
407 {
408         if (!tab)
409                 return;
410
411         spin_lock(&qdisc_stab_lock);
412
413         if (--tab->refcnt == 0) {
414                 list_del(&tab->list);
415                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
416         }
417
418         spin_unlock(&qdisc_stab_lock);
419 }
420 EXPORT_SYMBOL(qdisc_put_stab);
421
422 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
423 {
424         struct nlattr *nest;
425
426         nest = nla_nest_start(skb, TCA_STAB);
427         if (nest == NULL)
428                 goto nla_put_failure;
429         NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
430         nla_nest_end(skb, nest);
431
432         return skb->len;
433
434 nla_put_failure:
435         return -1;
436 }
437
438 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
439 {
440         int pkt_len, slot;
441
442         pkt_len = skb->len + stab->szopts.overhead;
443         if (unlikely(!stab->szopts.tsize))
444                 goto out;
445
446         slot = pkt_len + stab->szopts.cell_align;
447         if (unlikely(slot < 0))
448                 slot = 0;
449
450         slot >>= stab->szopts.cell_log;
451         if (likely(slot < stab->szopts.tsize))
452                 pkt_len = stab->data[slot];
453         else
454                 pkt_len = stab->data[stab->szopts.tsize - 1] *
455                                 (slot / stab->szopts.tsize) +
456                                 stab->data[slot % stab->szopts.tsize];
457
458         pkt_len <<= stab->szopts.size_log;
459 out:
460         if (unlikely(pkt_len < 1))
461                 pkt_len = 1;
462         qdisc_skb_cb(skb)->pkt_len = pkt_len;
463 }
464 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
465
466 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
467 {
468         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
469                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
470                         txt, qdisc->ops->id, qdisc->handle >> 16);
471                 qdisc->flags |= TCQ_F_WARN_NONWC;
472         }
473 }
474 EXPORT_SYMBOL(qdisc_warn_nonwc);
475
476 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
477 {
478         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
479                                                  timer);
480
481         qdisc_unthrottled(wd->qdisc);
482         __netif_schedule(qdisc_root(wd->qdisc));
483
484         return HRTIMER_NORESTART;
485 }
486
487 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
488 {
489         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
490         wd->timer.function = qdisc_watchdog;
491         wd->qdisc = qdisc;
492 }
493 EXPORT_SYMBOL(qdisc_watchdog_init);
494
495 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
496 {
497         ktime_t time;
498
499         if (test_bit(__QDISC_STATE_DEACTIVATED,
500                      &qdisc_root_sleeping(wd->qdisc)->state))
501                 return;
502
503         qdisc_throttled(wd->qdisc);
504         time = ktime_set(0, 0);
505         time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
506         hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
507 }
508 EXPORT_SYMBOL(qdisc_watchdog_schedule);
509
510 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
511 {
512         hrtimer_cancel(&wd->timer);
513         qdisc_unthrottled(wd->qdisc);
514 }
515 EXPORT_SYMBOL(qdisc_watchdog_cancel);
516
517 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
518 {
519         unsigned int size = n * sizeof(struct hlist_head), i;
520         struct hlist_head *h;
521
522         if (size <= PAGE_SIZE)
523                 h = kmalloc(size, GFP_KERNEL);
524         else
525                 h = (struct hlist_head *)
526                         __get_free_pages(GFP_KERNEL, get_order(size));
527
528         if (h != NULL) {
529                 for (i = 0; i < n; i++)
530                         INIT_HLIST_HEAD(&h[i]);
531         }
532         return h;
533 }
534
535 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
536 {
537         unsigned int size = n * sizeof(struct hlist_head);
538
539         if (size <= PAGE_SIZE)
540                 kfree(h);
541         else
542                 free_pages((unsigned long)h, get_order(size));
543 }
544
545 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
546 {
547         struct Qdisc_class_common *cl;
548         struct hlist_node *n, *next;
549         struct hlist_head *nhash, *ohash;
550         unsigned int nsize, nmask, osize;
551         unsigned int i, h;
552
553         /* Rehash when load factor exceeds 0.75 */
554         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
555                 return;
556         nsize = clhash->hashsize * 2;
557         nmask = nsize - 1;
558         nhash = qdisc_class_hash_alloc(nsize);
559         if (nhash == NULL)
560                 return;
561
562         ohash = clhash->hash;
563         osize = clhash->hashsize;
564
565         sch_tree_lock(sch);
566         for (i = 0; i < osize; i++) {
567                 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
568                         h = qdisc_class_hash(cl->classid, nmask);
569                         hlist_add_head(&cl->hnode, &nhash[h]);
570                 }
571         }
572         clhash->hash     = nhash;
573         clhash->hashsize = nsize;
574         clhash->hashmask = nmask;
575         sch_tree_unlock(sch);
576
577         qdisc_class_hash_free(ohash, osize);
578 }
579 EXPORT_SYMBOL(qdisc_class_hash_grow);
580
581 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
582 {
583         unsigned int size = 4;
584
585         clhash->hash = qdisc_class_hash_alloc(size);
586         if (clhash->hash == NULL)
587                 return -ENOMEM;
588         clhash->hashsize  = size;
589         clhash->hashmask  = size - 1;
590         clhash->hashelems = 0;
591         return 0;
592 }
593 EXPORT_SYMBOL(qdisc_class_hash_init);
594
595 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
596 {
597         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
598 }
599 EXPORT_SYMBOL(qdisc_class_hash_destroy);
600
601 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
602                              struct Qdisc_class_common *cl)
603 {
604         unsigned int h;
605
606         INIT_HLIST_NODE(&cl->hnode);
607         h = qdisc_class_hash(cl->classid, clhash->hashmask);
608         hlist_add_head(&cl->hnode, &clhash->hash[h]);
609         clhash->hashelems++;
610 }
611 EXPORT_SYMBOL(qdisc_class_hash_insert);
612
613 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
614                              struct Qdisc_class_common *cl)
615 {
616         hlist_del(&cl->hnode);
617         clhash->hashelems--;
618 }
619 EXPORT_SYMBOL(qdisc_class_hash_remove);
620
621 /* Allocate an unique handle from space managed by kernel
622  * Possible range is [8000-FFFF]:0000 (0x8000 values)
623  */
624 static u32 qdisc_alloc_handle(struct net_device *dev)
625 {
626         int i = 0x8000;
627         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
628
629         do {
630                 autohandle += TC_H_MAKE(0x10000U, 0);
631                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
632                         autohandle = TC_H_MAKE(0x80000000U, 0);
633                 if (!qdisc_lookup(dev, autohandle))
634                         return autohandle;
635                 cond_resched();
636         } while (--i > 0);
637
638         return 0;
639 }
640
641 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
642 {
643         const struct Qdisc_class_ops *cops;
644         unsigned long cl;
645         u32 parentid;
646
647         if (n == 0)
648                 return;
649         while ((parentid = sch->parent)) {
650                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
651                         return;
652
653                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
654                 if (sch == NULL) {
655                         WARN_ON(parentid != TC_H_ROOT);
656                         return;
657                 }
658                 cops = sch->ops->cl_ops;
659                 if (cops->qlen_notify) {
660                         cl = cops->get(sch, parentid);
661                         cops->qlen_notify(sch, cl);
662                         cops->put(sch, cl);
663                 }
664                 sch->q.qlen -= n;
665         }
666 }
667 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
668
669 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
670                                struct nlmsghdr *n, u32 clid,
671                                struct Qdisc *old, struct Qdisc *new)
672 {
673         if (new || old)
674                 qdisc_notify(net, skb, n, clid, old, new);
675
676         if (old)
677                 qdisc_destroy(old);
678 }
679
680 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
681  * to device "dev".
682  *
683  * When appropriate send a netlink notification using 'skb'
684  * and "n".
685  *
686  * On success, destroy old qdisc.
687  */
688
689 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
690                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
691                        struct Qdisc *new, struct Qdisc *old)
692 {
693         struct Qdisc *q = old;
694         struct net *net = dev_net(dev);
695         int err = 0;
696
697         if (parent == NULL) {
698                 unsigned int i, num_q, ingress;
699
700                 ingress = 0;
701                 num_q = dev->num_tx_queues;
702                 if ((q && q->flags & TCQ_F_INGRESS) ||
703                     (new && new->flags & TCQ_F_INGRESS)) {
704                         num_q = 1;
705                         ingress = 1;
706                         if (!dev_ingress_queue(dev))
707                                 return -ENOENT;
708                 }
709
710                 if (dev->flags & IFF_UP)
711                         dev_deactivate(dev);
712
713                 if (new && new->ops->attach) {
714                         new->ops->attach(new);
715                         num_q = 0;
716                 }
717
718                 for (i = 0; i < num_q; i++) {
719                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
720
721                         if (!ingress)
722                                 dev_queue = netdev_get_tx_queue(dev, i);
723
724                         old = dev_graft_qdisc(dev_queue, new);
725                         if (new && i > 0)
726                                 atomic_inc(&new->refcnt);
727
728                         if (!ingress)
729                                 qdisc_destroy(old);
730                 }
731
732                 if (!ingress) {
733                         notify_and_destroy(net, skb, n, classid,
734                                            dev->qdisc, new);
735                         if (new && !new->ops->attach)
736                                 atomic_inc(&new->refcnt);
737                         dev->qdisc = new ? : &noop_qdisc;
738                 } else {
739                         notify_and_destroy(net, skb, n, classid, old, new);
740                 }
741
742                 if (dev->flags & IFF_UP)
743                         dev_activate(dev);
744         } else {
745                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
746
747                 err = -EOPNOTSUPP;
748                 if (cops && cops->graft) {
749                         unsigned long cl = cops->get(parent, classid);
750                         if (cl) {
751                                 err = cops->graft(parent, cl, new, &old);
752                                 cops->put(parent, cl);
753                         } else
754                                 err = -ENOENT;
755                 }
756                 if (!err)
757                         notify_and_destroy(net, skb, n, classid, old, new);
758         }
759         return err;
760 }
761
762 /* lockdep annotation is needed for ingress; egress gets it only for name */
763 static struct lock_class_key qdisc_tx_lock;
764 static struct lock_class_key qdisc_rx_lock;
765
766 /*
767    Allocate and initialize new qdisc.
768
769    Parameters are passed via opt.
770  */
771
772 static struct Qdisc *
773 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
774              struct Qdisc *p, u32 parent, u32 handle,
775              struct nlattr **tca, int *errp)
776 {
777         int err;
778         struct nlattr *kind = tca[TCA_KIND];
779         struct Qdisc *sch;
780         struct Qdisc_ops *ops;
781         struct qdisc_size_table *stab;
782
783         ops = qdisc_lookup_ops(kind);
784 #ifdef CONFIG_MODULES
785         if (ops == NULL && kind != NULL) {
786                 char name[IFNAMSIZ];
787                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
788                         /* We dropped the RTNL semaphore in order to
789                          * perform the module load.  So, even if we
790                          * succeeded in loading the module we have to
791                          * tell the caller to replay the request.  We
792                          * indicate this using -EAGAIN.
793                          * We replay the request because the device may
794                          * go away in the mean time.
795                          */
796                         rtnl_unlock();
797                         request_module("sch_%s", name);
798                         rtnl_lock();
799                         ops = qdisc_lookup_ops(kind);
800                         if (ops != NULL) {
801                                 /* We will try again qdisc_lookup_ops,
802                                  * so don't keep a reference.
803                                  */
804                                 module_put(ops->owner);
805                                 err = -EAGAIN;
806                                 goto err_out;
807                         }
808                 }
809         }
810 #endif
811
812         err = -ENOENT;
813         if (ops == NULL)
814                 goto err_out;
815
816         sch = qdisc_alloc(dev_queue, ops);
817         if (IS_ERR(sch)) {
818                 err = PTR_ERR(sch);
819                 goto err_out2;
820         }
821
822         sch->parent = parent;
823
824         if (handle == TC_H_INGRESS) {
825                 sch->flags |= TCQ_F_INGRESS;
826                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
827                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
828         } else {
829                 if (handle == 0) {
830                         handle = qdisc_alloc_handle(dev);
831                         err = -ENOMEM;
832                         if (handle == 0)
833                                 goto err_out3;
834                 }
835                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
836         }
837
838         sch->handle = handle;
839
840         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
841                 if (tca[TCA_STAB]) {
842                         stab = qdisc_get_stab(tca[TCA_STAB]);
843                         if (IS_ERR(stab)) {
844                                 err = PTR_ERR(stab);
845                                 goto err_out4;
846                         }
847                         rcu_assign_pointer(sch->stab, stab);
848                 }
849                 if (tca[TCA_RATE]) {
850                         spinlock_t *root_lock;
851
852                         err = -EOPNOTSUPP;
853                         if (sch->flags & TCQ_F_MQROOT)
854                                 goto err_out4;
855
856                         if ((sch->parent != TC_H_ROOT) &&
857                             !(sch->flags & TCQ_F_INGRESS) &&
858                             (!p || !(p->flags & TCQ_F_MQROOT)))
859                                 root_lock = qdisc_root_sleeping_lock(sch);
860                         else
861                                 root_lock = qdisc_lock(sch);
862
863                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
864                                                 root_lock, tca[TCA_RATE]);
865                         if (err)
866                                 goto err_out4;
867                 }
868
869                 qdisc_list_add(sch);
870
871                 return sch;
872         }
873 err_out3:
874         dev_put(dev);
875         kfree((char *) sch - sch->padded);
876 err_out2:
877         module_put(ops->owner);
878 err_out:
879         *errp = err;
880         return NULL;
881
882 err_out4:
883         /*
884          * Any broken qdiscs that would require a ops->reset() here?
885          * The qdisc was never in action so it shouldn't be necessary.
886          */
887         qdisc_put_stab(rtnl_dereference(sch->stab));
888         if (ops->destroy)
889                 ops->destroy(sch);
890         goto err_out3;
891 }
892
893 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
894 {
895         struct qdisc_size_table *ostab, *stab = NULL;
896         int err = 0;
897
898         if (tca[TCA_OPTIONS]) {
899                 if (sch->ops->change == NULL)
900                         return -EINVAL;
901                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
902                 if (err)
903                         return err;
904         }
905
906         if (tca[TCA_STAB]) {
907                 stab = qdisc_get_stab(tca[TCA_STAB]);
908                 if (IS_ERR(stab))
909                         return PTR_ERR(stab);
910         }
911
912         ostab = rtnl_dereference(sch->stab);
913         rcu_assign_pointer(sch->stab, stab);
914         qdisc_put_stab(ostab);
915
916         if (tca[TCA_RATE]) {
917                 /* NB: ignores errors from replace_estimator
918                    because change can't be undone. */
919                 if (sch->flags & TCQ_F_MQROOT)
920                         goto out;
921                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
922                                             qdisc_root_sleeping_lock(sch),
923                                             tca[TCA_RATE]);
924         }
925 out:
926         return 0;
927 }
928
929 struct check_loop_arg {
930         struct qdisc_walker     w;
931         struct Qdisc            *p;
932         int                     depth;
933 };
934
935 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
936
937 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
938 {
939         struct check_loop_arg   arg;
940
941         if (q->ops->cl_ops == NULL)
942                 return 0;
943
944         arg.w.stop = arg.w.skip = arg.w.count = 0;
945         arg.w.fn = check_loop_fn;
946         arg.depth = depth;
947         arg.p = p;
948         q->ops->cl_ops->walk(q, &arg.w);
949         return arg.w.stop ? -ELOOP : 0;
950 }
951
952 static int
953 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
954 {
955         struct Qdisc *leaf;
956         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
957         struct check_loop_arg *arg = (struct check_loop_arg *)w;
958
959         leaf = cops->leaf(q, cl);
960         if (leaf) {
961                 if (leaf == arg->p || arg->depth > 7)
962                         return -ELOOP;
963                 return check_loop(leaf, arg->p, arg->depth + 1);
964         }
965         return 0;
966 }
967
968 /*
969  * Delete/get qdisc.
970  */
971
972 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
973 {
974         struct net *net = sock_net(skb->sk);
975         struct tcmsg *tcm = NLMSG_DATA(n);
976         struct nlattr *tca[TCA_MAX + 1];
977         struct net_device *dev;
978         u32 clid = tcm->tcm_parent;
979         struct Qdisc *q = NULL;
980         struct Qdisc *p = NULL;
981         int err;
982
983         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
984         if (!dev)
985                 return -ENODEV;
986
987         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
988         if (err < 0)
989                 return err;
990
991         if (clid) {
992                 if (clid != TC_H_ROOT) {
993                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
994                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
995                                 if (!p)
996                                         return -ENOENT;
997                                 q = qdisc_leaf(p, clid);
998                         } else if (dev_ingress_queue(dev)) {
999                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1000                         }
1001                 } else {
1002                         q = dev->qdisc;
1003                 }
1004                 if (!q)
1005                         return -ENOENT;
1006
1007                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1008                         return -EINVAL;
1009         } else {
1010                 q = qdisc_lookup(dev, tcm->tcm_handle);
1011                 if (!q)
1012                         return -ENOENT;
1013         }
1014
1015         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1016                 return -EINVAL;
1017
1018         if (n->nlmsg_type == RTM_DELQDISC) {
1019                 if (!clid)
1020                         return -EINVAL;
1021                 if (q->handle == 0)
1022                         return -ENOENT;
1023                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1024                 if (err != 0)
1025                         return err;
1026         } else {
1027                 qdisc_notify(net, skb, n, clid, NULL, q);
1028         }
1029         return 0;
1030 }
1031
1032 /*
1033  * Create/change qdisc.
1034  */
1035
1036 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1037 {
1038         struct net *net = sock_net(skb->sk);
1039         struct tcmsg *tcm;
1040         struct nlattr *tca[TCA_MAX + 1];
1041         struct net_device *dev;
1042         u32 clid;
1043         struct Qdisc *q, *p;
1044         int err;
1045
1046 replay:
1047         /* Reinit, just in case something touches this. */
1048         tcm = NLMSG_DATA(n);
1049         clid = tcm->tcm_parent;
1050         q = p = NULL;
1051
1052         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1053         if (!dev)
1054                 return -ENODEV;
1055
1056         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1057         if (err < 0)
1058                 return err;
1059
1060         if (clid) {
1061                 if (clid != TC_H_ROOT) {
1062                         if (clid != TC_H_INGRESS) {
1063                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1064                                 if (!p)
1065                                         return -ENOENT;
1066                                 q = qdisc_leaf(p, clid);
1067                         } else if (dev_ingress_queue_create(dev)) {
1068                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1069                         }
1070                 } else {
1071                         q = dev->qdisc;
1072                 }
1073
1074                 /* It may be default qdisc, ignore it */
1075                 if (q && q->handle == 0)
1076                         q = NULL;
1077
1078                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1079                         if (tcm->tcm_handle) {
1080                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1081                                         return -EEXIST;
1082                                 if (TC_H_MIN(tcm->tcm_handle))
1083                                         return -EINVAL;
1084                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1085                                 if (!q)
1086                                         goto create_n_graft;
1087                                 if (n->nlmsg_flags & NLM_F_EXCL)
1088                                         return -EEXIST;
1089                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1090                                         return -EINVAL;
1091                                 if (q == p ||
1092                                     (p && check_loop(q, p, 0)))
1093                                         return -ELOOP;
1094                                 atomic_inc(&q->refcnt);
1095                                 goto graft;
1096                         } else {
1097                                 if (!q)
1098                                         goto create_n_graft;
1099
1100                                 /* This magic test requires explanation.
1101                                  *
1102                                  *   We know, that some child q is already
1103                                  *   attached to this parent and have choice:
1104                                  *   either to change it or to create/graft new one.
1105                                  *
1106                                  *   1. We are allowed to create/graft only
1107                                  *   if CREATE and REPLACE flags are set.
1108                                  *
1109                                  *   2. If EXCL is set, requestor wanted to say,
1110                                  *   that qdisc tcm_handle is not expected
1111                                  *   to exist, so that we choose create/graft too.
1112                                  *
1113                                  *   3. The last case is when no flags are set.
1114                                  *   Alas, it is sort of hole in API, we
1115                                  *   cannot decide what to do unambiguously.
1116                                  *   For now we select create/graft, if
1117                                  *   user gave KIND, which does not match existing.
1118                                  */
1119                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1120                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1121                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1122                                      (tca[TCA_KIND] &&
1123                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1124                                         goto create_n_graft;
1125                         }
1126                 }
1127         } else {
1128                 if (!tcm->tcm_handle)
1129                         return -EINVAL;
1130                 q = qdisc_lookup(dev, tcm->tcm_handle);
1131         }
1132
1133         /* Change qdisc parameters */
1134         if (q == NULL)
1135                 return -ENOENT;
1136         if (n->nlmsg_flags & NLM_F_EXCL)
1137                 return -EEXIST;
1138         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1139                 return -EINVAL;
1140         err = qdisc_change(q, tca);
1141         if (err == 0)
1142                 qdisc_notify(net, skb, n, clid, NULL, q);
1143         return err;
1144
1145 create_n_graft:
1146         if (!(n->nlmsg_flags & NLM_F_CREATE))
1147                 return -ENOENT;
1148         if (clid == TC_H_INGRESS) {
1149                 if (dev_ingress_queue(dev))
1150                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1151                                          tcm->tcm_parent, tcm->tcm_parent,
1152                                          tca, &err);
1153                 else
1154                         err = -ENOENT;
1155         } else {
1156                 struct netdev_queue *dev_queue;
1157
1158                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1159                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1160                 else if (p)
1161                         dev_queue = p->dev_queue;
1162                 else
1163                         dev_queue = netdev_get_tx_queue(dev, 0);
1164
1165                 q = qdisc_create(dev, dev_queue, p,
1166                                  tcm->tcm_parent, tcm->tcm_handle,
1167                                  tca, &err);
1168         }
1169         if (q == NULL) {
1170                 if (err == -EAGAIN)
1171                         goto replay;
1172                 return err;
1173         }
1174
1175 graft:
1176         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1177         if (err) {
1178                 if (q)
1179                         qdisc_destroy(q);
1180                 return err;
1181         }
1182
1183         return 0;
1184 }
1185
1186 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1187                          u32 pid, u32 seq, u16 flags, int event)
1188 {
1189         struct tcmsg *tcm;
1190         struct nlmsghdr  *nlh;
1191         unsigned char *b = skb_tail_pointer(skb);
1192         struct gnet_dump d;
1193         struct qdisc_size_table *stab;
1194
1195         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1196         tcm = NLMSG_DATA(nlh);
1197         tcm->tcm_family = AF_UNSPEC;
1198         tcm->tcm__pad1 = 0;
1199         tcm->tcm__pad2 = 0;
1200         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1201         tcm->tcm_parent = clid;
1202         tcm->tcm_handle = q->handle;
1203         tcm->tcm_info = atomic_read(&q->refcnt);
1204         NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1205         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1206                 goto nla_put_failure;
1207         q->qstats.qlen = q->q.qlen;
1208
1209         stab = rtnl_dereference(q->stab);
1210         if (stab && qdisc_dump_stab(skb, stab) < 0)
1211                 goto nla_put_failure;
1212
1213         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1214                                          qdisc_root_sleeping_lock(q), &d) < 0)
1215                 goto nla_put_failure;
1216
1217         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1218                 goto nla_put_failure;
1219
1220         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1221             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1222             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1223                 goto nla_put_failure;
1224
1225         if (gnet_stats_finish_copy(&d) < 0)
1226                 goto nla_put_failure;
1227
1228         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1229         return skb->len;
1230
1231 nlmsg_failure:
1232 nla_put_failure:
1233         nlmsg_trim(skb, b);
1234         return -1;
1235 }
1236
1237 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1238 {
1239         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1240 }
1241
1242 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1243                         struct nlmsghdr *n, u32 clid,
1244                         struct Qdisc *old, struct Qdisc *new)
1245 {
1246         struct sk_buff *skb;
1247         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1248
1249         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1250         if (!skb)
1251                 return -ENOBUFS;
1252
1253         if (old && !tc_qdisc_dump_ignore(old)) {
1254                 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
1255                                   0, RTM_DELQDISC) < 0)
1256                         goto err_out;
1257         }
1258         if (new && !tc_qdisc_dump_ignore(new)) {
1259                 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
1260                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1261                         goto err_out;
1262         }
1263
1264         if (skb->len)
1265                 return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1266                                       n->nlmsg_flags & NLM_F_ECHO);
1267
1268 err_out:
1269         kfree_skb(skb);
1270         return -EINVAL;
1271 }
1272
1273 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1274                               struct netlink_callback *cb,
1275                               int *q_idx_p, int s_q_idx)
1276 {
1277         int ret = 0, q_idx = *q_idx_p;
1278         struct Qdisc *q;
1279
1280         if (!root)
1281                 return 0;
1282
1283         q = root;
1284         if (q_idx < s_q_idx) {
1285                 q_idx++;
1286         } else {
1287                 if (!tc_qdisc_dump_ignore(q) &&
1288                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1289                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1290                         goto done;
1291                 q_idx++;
1292         }
1293         list_for_each_entry(q, &root->list, list) {
1294                 if (q_idx < s_q_idx) {
1295                         q_idx++;
1296                         continue;
1297                 }
1298                 if (!tc_qdisc_dump_ignore(q) &&
1299                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1300                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1301                         goto done;
1302                 q_idx++;
1303         }
1304
1305 out:
1306         *q_idx_p = q_idx;
1307         return ret;
1308 done:
1309         ret = -1;
1310         goto out;
1311 }
1312
1313 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1314 {
1315         struct net *net = sock_net(skb->sk);
1316         int idx, q_idx;
1317         int s_idx, s_q_idx;
1318         struct net_device *dev;
1319
1320         s_idx = cb->args[0];
1321         s_q_idx = q_idx = cb->args[1];
1322
1323         rcu_read_lock();
1324         idx = 0;
1325         for_each_netdev_rcu(net, dev) {
1326                 struct netdev_queue *dev_queue;
1327
1328                 if (idx < s_idx)
1329                         goto cont;
1330                 if (idx > s_idx)
1331                         s_q_idx = 0;
1332                 q_idx = 0;
1333
1334                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1335                         goto done;
1336
1337                 dev_queue = dev_ingress_queue(dev);
1338                 if (dev_queue &&
1339                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1340                                        &q_idx, s_q_idx) < 0)
1341                         goto done;
1342
1343 cont:
1344                 idx++;
1345         }
1346
1347 done:
1348         rcu_read_unlock();
1349
1350         cb->args[0] = idx;
1351         cb->args[1] = q_idx;
1352
1353         return skb->len;
1354 }
1355
1356
1357
1358 /************************************************
1359  *      Traffic classes manipulation.           *
1360  ************************************************/
1361
1362
1363
1364 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1365 {
1366         struct net *net = sock_net(skb->sk);
1367         struct tcmsg *tcm = NLMSG_DATA(n);
1368         struct nlattr *tca[TCA_MAX + 1];
1369         struct net_device *dev;
1370         struct Qdisc *q = NULL;
1371         const struct Qdisc_class_ops *cops;
1372         unsigned long cl = 0;
1373         unsigned long new_cl;
1374         u32 pid = tcm->tcm_parent;
1375         u32 clid = tcm->tcm_handle;
1376         u32 qid = TC_H_MAJ(clid);
1377         int err;
1378
1379         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1380         if (!dev)
1381                 return -ENODEV;
1382
1383         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1384         if (err < 0)
1385                 return err;
1386
1387         /*
1388            parent == TC_H_UNSPEC - unspecified parent.
1389            parent == TC_H_ROOT   - class is root, which has no parent.
1390            parent == X:0         - parent is root class.
1391            parent == X:Y         - parent is a node in hierarchy.
1392            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1393
1394            handle == 0:0         - generate handle from kernel pool.
1395            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1396            handle == X:Y         - clear.
1397            handle == X:0         - root class.
1398          */
1399
1400         /* Step 1. Determine qdisc handle X:0 */
1401
1402         if (pid != TC_H_ROOT) {
1403                 u32 qid1 = TC_H_MAJ(pid);
1404
1405                 if (qid && qid1) {
1406                         /* If both majors are known, they must be identical. */
1407                         if (qid != qid1)
1408                                 return -EINVAL;
1409                 } else if (qid1) {
1410                         qid = qid1;
1411                 } else if (qid == 0)
1412                         qid = dev->qdisc->handle;
1413
1414                 /* Now qid is genuine qdisc handle consistent
1415                  * both with parent and child.
1416                  *
1417                  * TC_H_MAJ(pid) still may be unspecified, complete it now.
1418                  */
1419                 if (pid)
1420                         pid = TC_H_MAKE(qid, pid);
1421         } else {
1422                 if (qid == 0)
1423                         qid = dev->qdisc->handle;
1424         }
1425
1426         /* OK. Locate qdisc */
1427         q = qdisc_lookup(dev, qid);
1428         if (!q)
1429                 return -ENOENT;
1430
1431         /* An check that it supports classes */
1432         cops = q->ops->cl_ops;
1433         if (cops == NULL)
1434                 return -EINVAL;
1435
1436         /* Now try to get class */
1437         if (clid == 0) {
1438                 if (pid == TC_H_ROOT)
1439                         clid = qid;
1440         } else
1441                 clid = TC_H_MAKE(qid, clid);
1442
1443         if (clid)
1444                 cl = cops->get(q, clid);
1445
1446         if (cl == 0) {
1447                 err = -ENOENT;
1448                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1449                     !(n->nlmsg_flags & NLM_F_CREATE))
1450                         goto out;
1451         } else {
1452                 switch (n->nlmsg_type) {
1453                 case RTM_NEWTCLASS:
1454                         err = -EEXIST;
1455                         if (n->nlmsg_flags & NLM_F_EXCL)
1456                                 goto out;
1457                         break;
1458                 case RTM_DELTCLASS:
1459                         err = -EOPNOTSUPP;
1460                         if (cops->delete)
1461                                 err = cops->delete(q, cl);
1462                         if (err == 0)
1463                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1464                         goto out;
1465                 case RTM_GETTCLASS:
1466                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1467                         goto out;
1468                 default:
1469                         err = -EINVAL;
1470                         goto out;
1471                 }
1472         }
1473
1474         new_cl = cl;
1475         err = -EOPNOTSUPP;
1476         if (cops->change)
1477                 err = cops->change(q, clid, pid, tca, &new_cl);
1478         if (err == 0)
1479                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1480
1481 out:
1482         if (cl)
1483                 cops->put(q, cl);
1484
1485         return err;
1486 }
1487
1488
1489 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1490                           unsigned long cl,
1491                           u32 pid, u32 seq, u16 flags, int event)
1492 {
1493         struct tcmsg *tcm;
1494         struct nlmsghdr  *nlh;
1495         unsigned char *b = skb_tail_pointer(skb);
1496         struct gnet_dump d;
1497         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1498
1499         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1500         tcm = NLMSG_DATA(nlh);
1501         tcm->tcm_family = AF_UNSPEC;
1502         tcm->tcm__pad1 = 0;
1503         tcm->tcm__pad2 = 0;
1504         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1505         tcm->tcm_parent = q->handle;
1506         tcm->tcm_handle = q->handle;
1507         tcm->tcm_info = 0;
1508         NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1509         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1510                 goto nla_put_failure;
1511
1512         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1513                                          qdisc_root_sleeping_lock(q), &d) < 0)
1514                 goto nla_put_failure;
1515
1516         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1517                 goto nla_put_failure;
1518
1519         if (gnet_stats_finish_copy(&d) < 0)
1520                 goto nla_put_failure;
1521
1522         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1523         return skb->len;
1524
1525 nlmsg_failure:
1526 nla_put_failure:
1527         nlmsg_trim(skb, b);
1528         return -1;
1529 }
1530
1531 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1532                          struct nlmsghdr *n, struct Qdisc *q,
1533                          unsigned long cl, int event)
1534 {
1535         struct sk_buff *skb;
1536         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1537
1538         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1539         if (!skb)
1540                 return -ENOBUFS;
1541
1542         if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1543                 kfree_skb(skb);
1544                 return -EINVAL;
1545         }
1546
1547         return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1548                               n->nlmsg_flags & NLM_F_ECHO);
1549 }
1550
1551 struct qdisc_dump_args {
1552         struct qdisc_walker     w;
1553         struct sk_buff          *skb;
1554         struct netlink_callback *cb;
1555 };
1556
1557 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1558 {
1559         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1560
1561         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1562                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1563 }
1564
1565 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1566                                 struct tcmsg *tcm, struct netlink_callback *cb,
1567                                 int *t_p, int s_t)
1568 {
1569         struct qdisc_dump_args arg;
1570
1571         if (tc_qdisc_dump_ignore(q) ||
1572             *t_p < s_t || !q->ops->cl_ops ||
1573             (tcm->tcm_parent &&
1574              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1575                 (*t_p)++;
1576                 return 0;
1577         }
1578         if (*t_p > s_t)
1579                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1580         arg.w.fn = qdisc_class_dump;
1581         arg.skb = skb;
1582         arg.cb = cb;
1583         arg.w.stop  = 0;
1584         arg.w.skip = cb->args[1];
1585         arg.w.count = 0;
1586         q->ops->cl_ops->walk(q, &arg.w);
1587         cb->args[1] = arg.w.count;
1588         if (arg.w.stop)
1589                 return -1;
1590         (*t_p)++;
1591         return 0;
1592 }
1593
1594 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1595                                struct tcmsg *tcm, struct netlink_callback *cb,
1596                                int *t_p, int s_t)
1597 {
1598         struct Qdisc *q;
1599
1600         if (!root)
1601                 return 0;
1602
1603         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1604                 return -1;
1605
1606         list_for_each_entry(q, &root->list, list) {
1607                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1608                         return -1;
1609         }
1610
1611         return 0;
1612 }
1613
1614 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1615 {
1616         struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
1617         struct net *net = sock_net(skb->sk);
1618         struct netdev_queue *dev_queue;
1619         struct net_device *dev;
1620         int t, s_t;
1621
1622         if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1623                 return 0;
1624         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1625         if (!dev)
1626                 return 0;
1627
1628         s_t = cb->args[0];
1629         t = 0;
1630
1631         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1632                 goto done;
1633
1634         dev_queue = dev_ingress_queue(dev);
1635         if (dev_queue &&
1636             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1637                                 &t, s_t) < 0)
1638                 goto done;
1639
1640 done:
1641         cb->args[0] = t;
1642
1643         dev_put(dev);
1644         return skb->len;
1645 }
1646
1647 /* Main classifier routine: scans classifier chain attached
1648  * to this qdisc, (optionally) tests for protocol and asks
1649  * specific classifiers.
1650  */
1651 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1652                        struct tcf_result *res)
1653 {
1654         __be16 protocol = skb->protocol;
1655         int err;
1656
1657         for (; tp; tp = tp->next) {
1658                 if (tp->protocol != protocol &&
1659                     tp->protocol != htons(ETH_P_ALL))
1660                         continue;
1661                 err = tp->classify(skb, tp, res);
1662
1663                 if (err >= 0) {
1664 #ifdef CONFIG_NET_CLS_ACT
1665                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1666                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1667 #endif
1668                         return err;
1669                 }
1670         }
1671         return -1;
1672 }
1673 EXPORT_SYMBOL(tc_classify_compat);
1674
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is tc_classify_compat().  With it, a
 * TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain.  The number of restarts is kept in the verdict field of
 * skb->tc_verd; once it has reached MAX_REC_LOOP a rate limited notice is
 * logged and TC_ACT_SHOT is returned.
 *
 * Returns the classification result.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	for (;;) {
		int err = tc_classify_compat(skb, tp, res);
		u32 verd;

		if (err != TC_ACT_RECLASSIFY)
			return err;

		verd = G_TC_VERD(skb->tc_verd);
		if (verd >= MAX_REC_LOOP) {
			if (net_ratelimit())
				pr_notice("%s: packet reclassify loop"
					  " rule prio %u protocol %02x\n",
					  tp->q->ops->id,
					  tp->prio & 0xffff,
					  ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}

		/* Account for this round and start over at the chain head. */
		verd++;
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1706
1707 void tcf_destroy(struct tcf_proto *tp)
1708 {
1709         tp->ops->destroy(tp);
1710         module_put(tp->ops->owner);
1711         kfree(tp);
1712 }
1713
1714 void tcf_destroy_chain(struct tcf_proto **fl)
1715 {
1716         struct tcf_proto *tp;
1717
1718         while ((tp = *fl) != NULL) {
1719                 *fl = tp->next;
1720                 tcf_destroy(tp);
1721         }
1722 }
1723 EXPORT_SYMBOL(tcf_destroy_chain);
1724
1725 #ifdef CONFIG_PROC_FS
1726 static int psched_show(struct seq_file *seq, void *v)
1727 {
1728         struct timespec ts;
1729
1730         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1731         seq_printf(seq, "%08x %08x %08x %08x\n",
1732                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1733                    1000000,
1734                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1735
1736         return 0;
1737 }
1738
1739 static int psched_open(struct inode *inode, struct file *file)
1740 {
1741         return single_open(file, psched_show, NULL);
1742 }
1743
1744 static const struct file_operations psched_fops = {
1745         .owner = THIS_MODULE,
1746         .open = psched_open,
1747         .read  = seq_read,
1748         .llseek = seq_lseek,
1749         .release = single_release,
1750 };
1751
1752 static int __net_init psched_net_init(struct net *net)
1753 {
1754         struct proc_dir_entry *e;
1755
1756         e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1757         if (e == NULL)
1758                 return -ENOMEM;
1759
1760         return 0;
1761 }
1762
1763 static void __net_exit psched_net_exit(struct net *net)
1764 {
1765         proc_net_remove(net, "psched");
1766 }
1767 #else
1768 static int __net_init psched_net_init(struct net *net)
1769 {
1770         return 0;
1771 }
1772
1773 static void __net_exit psched_net_exit(struct net *net)
1774 {
1775 }
1776 #endif
1777
1778 static struct pernet_operations psched_net_ops = {
1779         .init = psched_net_init,
1780         .exit = psched_net_exit,
1781 };
1782
1783 static int __init pktsched_init(void)
1784 {
1785         int err;
1786
1787         err = register_pernet_subsys(&psched_net_ops);
1788         if (err) {
1789                 pr_err("pktsched_init: "
1790                        "cannot initialize per netns operations\n");
1791                 return err;
1792         }
1793
1794         register_qdisc(&pfifo_qdisc_ops);
1795         register_qdisc(&bfifo_qdisc_ops);
1796         register_qdisc(&pfifo_head_drop_qdisc_ops);
1797         register_qdisc(&mq_qdisc_ops);
1798
1799         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1800         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1801         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1802         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1803         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1804         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1805
1806         return 0;
1807 }
1808
1809 subsys_initcall(pktsched_init);