net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25 #include <linux/reciprocal_div.h>
  26
  27 #include <net/netlink.h>
  28 #include <net/pkt_sched.h>
  29
  30 #define VERSION "1.3"
  31
  32 /*      Network Emulation Queuing algorithm.
  33         ====================================
  34
  35         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
  36                  Network Emulation Tool"
  37                  [2] Luigi Rizzo, DummyNet for FreeBSD
  38
  39          ----------------------------------------------------------------
  40
  41          This started out as a simple way to delay outgoing packets to
  42          test TCP but has grown to include most of the functionality
  43          of a full blown network emulator like NISTnet. It can delay
  44          packets and add random jitter (and correlation). The random
  45          distribution can be loaded from a table as well to provide
  46          normal, Pareto, or experimental curves. Packet loss,
  47          duplication, and reordering can also be emulated.
  48
  49          This qdisc does not do classification; that can be handled by
  50          layering other disciplines.  It does not need to do bandwidth
  51          control either, since that can be handled by using a token
  52          bucket or other rate control.
  53
  54      Correlated Loss Generator models
  55
  56         Added generation of correlated loss according to the
  57         "Gilbert-Elliot" model, a 4-state markov model.
  58
  59         References:
  60         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  61         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  62         and intuitive loss model for packet networks and its implementation
  63         in the Netem module in the Linux kernel", available in [1]
  64
  65         Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
  66                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  67 */
  68
  69 struct netem_sched_data {
  70         /* internal t(ime)fifo qdisc uses sch->q and sch->limit */
  71
  72         /* optional qdisc for classful handling (NULL at netem init) */
  73         struct Qdisc    *qdisc;
  74
  75         struct qdisc_watchdog watchdog;
  76
  77         psched_tdiff_t latency;
  78         psched_tdiff_t jitter;
  79
  80         u32 loss;
  81         u32 limit;
  82         u32 counter;
  83         u32 gap;
  84         u32 duplicate;
  85         u32 reorder;
  86         u32 corrupt;
  87         u32 rate;
  88         s32 packet_overhead;
  89         u32 cell_size;
  90         u32 cell_size_reciprocal;
  91         s32 cell_overhead;
  92
  93         struct crndstate {
  94                 u32 last;
  95                 u32 rho;
  96         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
  97
  98         struct disttable {
  99                 u32  size;
 100                 s16 table[0];
 101         } *delay_dist;
 102
 103         enum  {
 104                 CLG_RANDOM,
 105                 CLG_4_STATES,
 106                 CLG_GILB_ELL,
 107         } loss_model;
 108
 109         /* Correlated Loss Generation models */
 110         struct clgstate {
 111                 /* state of the Markov chain */
 112                 u8 state;
 113
 114                 /* 4-states and Gilbert-Elliot models */
 115                 u32 a1; /* p13 for 4-states or p for GE */
 116                 u32 a2; /* p31 for 4-states or r for GE */
 117                 u32 a3; /* p32 for 4-states or h for GE */
 118                 u32 a4; /* p14 for 4-states or 1-k for GE */
 119                 u32 a5; /* p23 used only in 4-states */
 120         } clg;
 121
 122 };
 123
 124 /* Time stamp put into socket buffer control block
 125  * Only valid when skbs are in our internal t(ime)fifo queue.
 126  */
 127 struct netem_skb_cb {
 128         psched_time_t   time_to_send;
 129 };
 130
 131 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 132 {
 133         BUILD_BUG_ON(sizeof(skb->cb) <
 134                 sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
 135         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 136 }
 137
 138 /* init_crandom - initialize correlated random number generator
 139  * Use entropy source for initial seed.
 140  */
 141 static void init_crandom(struct crndstate *state, unsigned long rho)
 142 {
 143         state->rho = rho;
 144         state->last = net_random();
 145 }
 146
 147 /* get_crandom - correlated random number generator
 148  * Next number depends on last value.
 149  * rho is scaled to avoid floating point.
 150  */
 151 static u32 get_crandom(struct crndstate *state)
 152 {
 153         u64 value, rho;
 154         unsigned long answer;
 155
 156         if (state->rho == 0)    /* no correlation */
 157                 return net_random();
 158
 159         value = net_random();
 160         rho = (u64)state->rho + 1;
 161         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 162         state->last = answer;
 163         return answer;
 164 }
 165
 166 /* loss_4state - 4-state model loss generator
 167  * Generates losses according to the 4-state Markov chain adopted in
 168  * the GI (General and Intuitive) loss model.
 169  */
 170 static bool loss_4state(struct netem_sched_data *q)
 171 {
 172         struct clgstate *clg = &q->clg;
 173         u32 rnd = net_random();
 174
 175         /*
 176          * Makes a comparison between rnd and the transition
 177          * probabilities outgoing from the current state, then decides the
 178          * next state and if the next packet has to be transmitted or lost.
 179          * The four states correspond to:
 180          *   1 => successfully transmitted packets within a gap period
 181          *   4 => isolated losses within a gap period
 182          *   3 => lost packets within a burst period
 183          *   2 => successfully transmitted packets within a burst period
 184          */
 185         switch (clg->state) {
 186         case 1:
 187                 if (rnd < clg->a4) {
 188                         clg->state = 4;
 189                         return true;
 190                 } else if (clg->a4 < rnd && rnd < clg->a1) {
 191                         clg->state = 3;
 192                         return true;
 193                 } else if (clg->a1 < rnd)
 194                         clg->state = 1;
 195
 196                 break;
 197         case 2:
 198                 if (rnd < clg->a5) {
 199                         clg->state = 3;
 200                         return true;
 201                 } else
 202                         clg->state = 2;
 203
 204                 break;
 205         case 3:
 206                 if (rnd < clg->a3)
 207                         clg->state = 2;
 208                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 209                         clg->state = 1;
 210                         return true;
 211                 } else if (clg->a2 + clg->a3 < rnd) {
 212                         clg->state = 3;
 213                         return true;
 214                 }
 215                 break;
 216         case 4:
 217                 clg->state = 1;
 218                 break;
 219         }
 220
 221         return false;
 222 }
 223
 224 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 225  * Generates losses according to the Gilbert-Elliot loss model or
 226  * its special cases  (Gilbert or Simple Gilbert)
 227  *
 228  * Makes a comparison between random number and the transition
 229  * probabilities outgoing from the current state, then decides the
 230  * next state. A second random number is extracted and the comparison
 231  * with the loss probability of the current state decides if the next
 232  * packet will be transmitted or lost.
 233  */
 234 static bool loss_gilb_ell(struct netem_sched_data *q)
 235 {
 236         struct clgstate *clg = &q->clg;
 237
 238         switch (clg->state) {
 239         case 1:
 240                 if (net_random() < clg->a1)
 241                         clg->state = 2;
 242                 if (net_random() < clg->a4)
 243                         return true;
 244         case 2:
 245                 if (net_random() < clg->a2)
 246                         clg->state = 1;
 247                 if (clg->a3 > net_random())
 248                         return true;
 249         }
 250
 251         return false;
 252 }
 253
 254 static bool loss_event(struct netem_sched_data *q)
 255 {
 256         switch (q->loss_model) {
 257         case CLG_RANDOM:
 258                 /* Random packet drop 0 => none, ~0 => all */
 259                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 260
 261         case CLG_4_STATES:
 262                 /* 4state loss model algorithm (used also for GI model)
 263                 * Extracts a value from the markov 4 state loss generator,
 264                 * if it is 1 drops a packet and if needed writes the event in
 265                 * the kernel logs
 266                 */
 267                 return loss_4state(q);
 268
 269         case CLG_GILB_ELL:
 270                 /* Gilbert-Elliot loss model algorithm
 271                 * Extracts a value from the Gilbert-Elliot loss generator,
 272                 * if it is 1 drops a packet and if needed writes the event in
 273                 * the kernel logs
 274                 */
 275                 return loss_gilb_ell(q);
 276         }
 277
 278         return false;   /* not reached */
 279 }
 280
 281
 282 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 283  * std deviation sigma.  Uses table lookup to approximate the desired
 284  * distribution, and a uniformly-distributed pseudo-random source.
 285  */
 286 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
 287                                 struct crndstate *state,
 288                                 const struct disttable *dist)
 289 {
 290         psched_tdiff_t x;
 291         long t;
 292         u32 rnd;
 293
 294         if (sigma == 0)
 295                 return mu;
 296
 297         rnd = get_crandom(state);
 298
 299         /* default uniform distribution */
 300         if (dist == NULL)
 301                 return (rnd % (2*sigma)) - sigma + mu;
 302
 303         t = dist->table[rnd % dist->size];
 304         x = (sigma % NETEM_DIST_SCALE) * t;
 305         if (x >= 0)
 306                 x += NETEM_DIST_SCALE/2;
 307         else
 308                 x -= NETEM_DIST_SCALE/2;
 309
 310         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 311 }
 312
 313 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
 314 {
 315         u64 ticks;
 316
 317         len += q->packet_overhead;
 318
 319         if (q->cell_size) {
 320                 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 321
 322                 if (len > cells * q->cell_size) /* extra cell needed for remainder */
 323                         cells++;
 324                 len = cells * (q->cell_size + q->cell_overhead);
 325         }
 326
 327         ticks = (u64)len * NSEC_PER_SEC;
 328
 329         do_div(ticks, q->rate);
 330         return PSCHED_NS2TICKS(ticks);
 331 }
 332
 333 static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 334 {
 335         struct sk_buff_head *list = &sch->q;
 336         psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
 337         struct sk_buff *skb;
 338
 339         if (likely(skb_queue_len(list) < sch->limit)) {
 340                 skb = skb_peek_tail(list);
 341                 /* Optimize for add at tail */
 342                 if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
 343                         return qdisc_enqueue_tail(nskb, sch);
 344
 345                 skb_queue_reverse_walk(list, skb) {
 346                         if (tnext >= netem_skb_cb(skb)->time_to_send)
 347                                 break;
 348                 }
 349
 350                 __skb_queue_after(list, skb, nskb);
 351                 sch->qstats.backlog += qdisc_pkt_len(nskb);
 352                 return NET_XMIT_SUCCESS;
 353         }
 354
 355         return qdisc_reshape_fail(nskb, sch);
 356 }
 357
 358 /*
 359  * Insert one skb into qdisc.
 360  * Note: parent depends on return value to account for queue length.
 361  *      NET_XMIT_DROP: queue length didn't change.
 362  *      NET_XMIT_SUCCESS: one skb was queued.
 363  */
 364 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 365 {
 366         struct netem_sched_data *q = qdisc_priv(sch);
 367         /* We don't fill cb now as skb_unshare() may invalidate it */
 368         struct netem_skb_cb *cb;
 369         struct sk_buff *skb2;
 370         int ret;
 371         int count = 1;
 372
 373         /* Random duplication */
 374         if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 375                 ++count;
 376
 377         /* Drop packet? */
 378         if (loss_event(q))
 379                 --count;
 380
 381         if (count == 0) {
 382                 sch->qstats.drops++;
 383                 kfree_skb(skb);
 384                 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 385         }
 386
 387         skb_orphan(skb);
 388
 389         /*
 390          * If we need to duplicate packet, then re-insert at top of the
 391          * qdisc tree, since parent queuer expects that only one
 392          * skb will be queued.
 393          */
 394         if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 395                 struct Qdisc *rootq = qdisc_root(sch);
 396                 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 397                 q->duplicate = 0;
 398
 399                 qdisc_enqueue_root(skb2, rootq);
 400                 q->duplicate = dupsave;
 401         }
 402
 403         /*
 404          * Randomized packet corruption.
 405          * Make copy if needed since we are modifying
 406          * If packet is going to be hardware checksummed, then
 407          * do it now in software before we mangle it.
 408          */
 409         if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 410                 if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 411                     (skb->ip_summed == CHECKSUM_PARTIAL &&
 412                      skb_checksum_help(skb))) {
 413                         sch->qstats.drops++;
 414                         return NET_XMIT_DROP;
 415                 }
 416
 417                 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
 418         }
 419
 420         cb = netem_skb_cb(skb);
 421         if (q->gap == 0 ||              /* not doing reordering */
 422             q->counter < q->gap ||      /* inside last reordering gap */
 423             q->reorder < get_crandom(&q->reorder_cor)) {
 424                 psched_time_t now;
 425                 psched_tdiff_t delay;
 426
 427                 delay = tabledist(q->latency, q->jitter,
 428                                   &q->delay_cor, q->delay_dist);
 429
 430                 now = psched_get_time();
 431
 432                 if (q->rate) {
 433                         struct sk_buff_head *list = &sch->q;
 434
 435                         delay += packet_len_2_sched_time(skb->len, q);
 436
 437                         if (!skb_queue_empty(list)) {
 438                                 /*
 439                                  * Last packet in queue is reference point (now).
 440                                  * First packet in queue is already in flight,
 441                                  * calculate this time bonus and substract
 442                                  * from delay.
 443                                  */
 444                                 delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
 445                                 now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
 446                         }
 447                 }
 448
 449                 cb->time_to_send = now + delay;
 450                 ++q->counter;
 451                 ret = tfifo_enqueue(skb, sch);
 452         } else {
 453                 /*
 454                  * Do re-ordering by putting one out of N packets at the front
 455                  * of the queue.
 456                  */
 457                 cb->time_to_send = psched_get_time();
 458                 q->counter = 0;
 459
 460                 __skb_queue_head(&sch->q, skb);
 461                 q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
 462                 q->qdisc->qstats.requeues++;
 463                 ret = NET_XMIT_SUCCESS;
 464         }
 465
 466         if (ret != NET_XMIT_SUCCESS) {
 467                 if (net_xmit_drop_count(ret)) {
 468                         sch->qstats.drops++;
 469                         return ret;
 470                 }
 471         }
 472
 473         return NET_XMIT_SUCCESS;
 474 }
 475
 476 static unsigned int netem_drop(struct Qdisc *sch)
 477 {
 478         struct netem_sched_data *q = qdisc_priv(sch);
 479         unsigned int len;
 480
 481         len = qdisc_queue_drop(sch);
 482         if (!len && q->qdisc && q->qdisc->ops->drop)
 483             len = q->qdisc->ops->drop(q->qdisc);
 484         if (len)
 485                 sch->qstats.drops++;
 486
 487         return len;
 488 }
 489
 490 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 491 {
 492         struct netem_sched_data *q = qdisc_priv(sch);
 493         struct sk_buff *skb;
 494
 495         if (qdisc_is_throttled(sch))
 496                 return NULL;
 497
 498 tfifo_dequeue:
 499         skb = qdisc_peek_head(sch);
 500         if (skb) {
 501                 const struct netem_skb_cb *cb = netem_skb_cb(skb);
 502
 503                 /* if more time remaining? */
 504                 if (cb->time_to_send <= psched_get_time()) {
 505                         skb = qdisc_dequeue_tail(sch);
 506                         if (unlikely(!skb))
 507                                 goto qdisc_dequeue;
 508
 509 #ifdef CONFIG_NET_CLS_ACT
 510                         /*
 511                          * If it's at ingress let's pretend the delay is
 512                          * from the network (tstamp will be updated).
 513                          */
 514                         if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
 515                                 skb->tstamp.tv64 = 0;
 516 #endif
 517
 518                         if (q->qdisc) {
 519                                 int err = qdisc_enqueue(skb, q->qdisc);
 520
 521                                 if (unlikely(err != NET_XMIT_SUCCESS)) {
 522                                         if (net_xmit_drop_count(err)) {
 523                                                 sch->qstats.drops++;
 524                                                 qdisc_tree_decrease_qlen(sch, 1);
 525                                         }
 526                                 }
 527                                 goto tfifo_dequeue;
 528                         }
 529 deliver:
 530                         qdisc_unthrottled(sch);
 531                         qdisc_bstats_update(sch, skb);
 532                         return skb;
 533                 }
 534
 535                 if (q->qdisc) {
 536                         skb = q->qdisc->ops->dequeue(q->qdisc);
 537                         if (skb)
 538                                 goto deliver;
 539                 }
 540                 qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
 541         }
 542
 543 qdisc_dequeue:
 544         if (q->qdisc) {
 545                 skb = q->qdisc->ops->dequeue(q->qdisc);
 546                 if (skb)
 547                         goto deliver;
 548         }
 549         return NULL;
 550 }
 551
 552 static void netem_reset(struct Qdisc *sch)
 553 {
 554         struct netem_sched_data *q = qdisc_priv(sch);
 555
 556         qdisc_reset_queue(sch);
 557         if (q->qdisc)
 558                 qdisc_reset(q->qdisc);
 559         qdisc_watchdog_cancel(&q->watchdog);
 560 }
 561
 562 static void dist_free(struct disttable *d)
 563 {
 564         if (d) {
 565                 if (is_vmalloc_addr(d))
 566                         vfree(d);
 567                 else
 568                         kfree(d);
 569         }
 570 }
 571
 572 /*
 573  * Distribution data is a variable size payload containing
 574  * signed 16 bit values.
 575  */
 576 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 577 {
 578         struct netem_sched_data *q = qdisc_priv(sch);
 579         size_t n = nla_len(attr)/sizeof(__s16);
 580         const __s16 *data = nla_data(attr);
 581         spinlock_t *root_lock;
 582         struct disttable *d;
 583         int i;
 584         size_t s;
 585
 586         if (n > NETEM_DIST_MAX)
 587                 return -EINVAL;
 588
 589         s = sizeof(struct disttable) + n * sizeof(s16);
 590         d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
 591         if (!d)
 592                 d = vmalloc(s);
 593         if (!d)
 594                 return -ENOMEM;
 595
 596         d->size = n;
 597         for (i = 0; i < n; i++)
 598                 d->table[i] = data[i];
 599
 600         root_lock = qdisc_root_sleeping_lock(sch);
 601
 602         spin_lock_bh(root_lock);
 603         swap(q->delay_dist, d);
 604         spin_unlock_bh(root_lock);
 605
 606         dist_free(d);
 607         return 0;
 608 }
 609
 610 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
 611 {
 612         struct netem_sched_data *q = qdisc_priv(sch);
 613         const struct tc_netem_corr *c = nla_data(attr);
 614
 615         init_crandom(&q->delay_cor, c->delay_corr);
 616         init_crandom(&q->loss_cor, c->loss_corr);
 617         init_crandom(&q->dup_cor, c->dup_corr);
 618 }
 619
 620 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
 621 {
 622         struct netem_sched_data *q = qdisc_priv(sch);
 623         const struct tc_netem_reorder *r = nla_data(attr);
 624
 625         q->reorder = r->probability;
 626         init_crandom(&q->reorder_cor, r->correlation);
 627 }
 628
 629 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
 630 {
 631         struct netem_sched_data *q = qdisc_priv(sch);
 632         const struct tc_netem_corrupt *r = nla_data(attr);
 633
 634         q->corrupt = r->probability;
 635         init_crandom(&q->corrupt_cor, r->correlation);
 636 }
 637
 638 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
 639 {
 640         struct netem_sched_data *q = qdisc_priv(sch);
 641         const struct tc_netem_rate *r = nla_data(attr);
 642
 643         q->rate = r->rate;
 644         q->packet_overhead = r->packet_overhead;
 645         q->cell_size = r->cell_size;
 646         if (q->cell_size)
 647                 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 648         q->cell_overhead = r->cell_overhead;
 649 }
 650
 651 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 652 {
 653         struct netem_sched_data *q = qdisc_priv(sch);
 654         const struct nlattr *la;
 655         int rem;
 656
 657         nla_for_each_nested(la, attr, rem) {
 658                 u16 type = nla_type(la);
 659
 660                 switch(type) {
 661                 case NETEM_LOSS_GI: {
 662                         const struct tc_netem_gimodel *gi = nla_data(la);
 663
 664                         if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 665                                 pr_info("netem: incorrect gi model size\n");
 666                                 return -EINVAL;
 667                         }
 668
 669                         q->loss_model = CLG_4_STATES;
 670
 671                         q->clg.state = 1;
 672                         q->clg.a1 = gi->p13;
 673                         q->clg.a2 = gi->p31;
 674                         q->clg.a3 = gi->p32;
 675                         q->clg.a4 = gi->p14;
 676                         q->clg.a5 = gi->p23;
 677                         break;
 678                 }
 679
 680                 case NETEM_LOSS_GE: {
 681                         const struct tc_netem_gemodel *ge = nla_data(la);
 682
 683                         if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 684                                 pr_info("netem: incorrect ge model size\n");
 685                                 return -EINVAL;
 686                         }
 687
 688                         q->loss_model = CLG_GILB_ELL;
 689                         q->clg.state = 1;
 690                         q->clg.a1 = ge->p;
 691                         q->clg.a2 = ge->r;
 692                         q->clg.a3 = ge->h;
 693                         q->clg.a4 = ge->k1;
 694                         break;
 695                 }
 696
 697                 default:
 698                         pr_info("netem: unknown loss type %u\n", type);
 699                         return -EINVAL;
 700                 }
 701         }
 702
 703         return 0;
 704 }
 705
 706 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 707         [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 708         [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 709         [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 710         [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 711         [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 712 };
 713
 714 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 715                       const struct nla_policy *policy, int len)
 716 {
 717         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 718
 719         if (nested_len < 0) {
 720                 pr_info("netem: invalid attributes len %d\n", nested_len);
 721                 return -EINVAL;
 722         }
 723
 724         if (nested_len >= nla_attr_size(0))
 725                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 726                                  nested_len, policy);
 727
 728         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 729         return 0;
 730 }
 731
 732 /* Parse netlink message to set options */
 733 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 734 {
 735         struct netem_sched_data *q = qdisc_priv(sch);
 736         struct nlattr *tb[TCA_NETEM_MAX + 1];
 737         struct tc_netem_qopt *qopt;
 738         int ret;
 739
 740         if (opt == NULL)
 741                 return -EINVAL;
 742
 743         qopt = nla_data(opt);
 744         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 745         if (ret < 0)
 746                 return ret;
 747
 748         sch->limit = qopt->limit;
 749
 750         q->latency = qopt->latency;
 751         q->jitter = qopt->jitter;
 752         q->limit = qopt->limit;
 753         q->gap = qopt->gap;
 754         q->counter = 0;
 755         q->loss = qopt->loss;
 756         q->duplicate = qopt->duplicate;
 757
 758         /* for compatibility with earlier versions.
 759          * if gap is set, need to assume 100% probability
 760          */
 761         if (q->gap)
 762                 q->reorder = ~0;
 763
 764         if (tb[TCA_NETEM_CORR])
 765                 get_correlation(sch, tb[TCA_NETEM_CORR]);
 766
 767         if (tb[TCA_NETEM_DELAY_DIST]) {
 768                 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
 769                 if (ret)
 770                         return ret;
 771         }
 772
 773         if (tb[TCA_NETEM_REORDER])
 774                 get_reorder(sch, tb[TCA_NETEM_REORDER]);
 775
 776         if (tb[TCA_NETEM_CORRUPT])
 777                 get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
 778
 779         if (tb[TCA_NETEM_RATE])
 780                 get_rate(sch, tb[TCA_NETEM_RATE]);
 781
 782         q->loss_model = CLG_RANDOM;
 783         if (tb[TCA_NETEM_LOSS])
 784                 ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
 785
 786         return ret;
 787 }
 788
 789 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 790 {
 791         struct netem_sched_data *q = qdisc_priv(sch);
 792         int ret;
 793
 794         if (!opt)
 795                 return -EINVAL;
 796
 797         qdisc_watchdog_init(&q->watchdog, sch);
 798
 799         q->loss_model = CLG_RANDOM;
 800         ret = netem_change(sch, opt);
 801         if (ret)
 802                 pr_info("netem: change failed\n");
 803         return ret;
 804 }
 805
 806 static void netem_destroy(struct Qdisc *sch)
 807 {
 808         struct netem_sched_data *q = qdisc_priv(sch);
 809
 810         qdisc_watchdog_cancel(&q->watchdog);
 811         if (q->qdisc)
 812                 qdisc_destroy(q->qdisc);
 813         dist_free(q->delay_dist);
 814 }
 815
 816 static int dump_loss_model(const struct netem_sched_data *q,
 817                            struct sk_buff *skb)
 818 {
 819         struct nlattr *nest;
 820
 821         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
 822         if (nest == NULL)
 823                 goto nla_put_failure;
 824
 825         switch (q->loss_model) {
 826         case CLG_RANDOM:
 827                 /* legacy loss model */
 828                 nla_nest_cancel(skb, nest);
 829                 return 0;       /* no data */
 830
 831         case CLG_4_STATES: {
 832                 struct tc_netem_gimodel gi = {
 833                         .p13 = q->clg.a1,
 834                         .p31 = q->clg.a2,
 835                         .p32 = q->clg.a3,
 836                         .p14 = q->clg.a4,
 837                         .p23 = q->clg.a5,
 838                 };
 839
 840                 NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
 841                 break;
 842         }
 843         case CLG_GILB_ELL: {
 844                 struct tc_netem_gemodel ge = {
 845                         .p = q->clg.a1,
 846                         .r = q->clg.a2,
 847                         .h = q->clg.a3,
 848                         .k1 = q->clg.a4,
 849                 };
 850
 851                 NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
 852                 break;
 853         }
 854         }
 855
 856         nla_nest_end(skb, nest);
 857         return 0;
 858
 859 nla_put_failure:
 860         nla_nest_cancel(skb, nest);
 861         return -1;
 862 }
 863
 864 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 865 {
 866         const struct netem_sched_data *q = qdisc_priv(sch);
 867         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
 868         struct tc_netem_qopt qopt;
 869         struct tc_netem_corr cor;
 870         struct tc_netem_reorder reorder;
 871         struct tc_netem_corrupt corrupt;
 872         struct tc_netem_rate rate;
 873
 874         qopt.latency = q->latency;
 875         qopt.jitter = q->jitter;
 876         qopt.limit = q->limit;
 877         qopt.loss = q->loss;
 878         qopt.gap = q->gap;
 879         qopt.duplicate = q->duplicate;
 880         NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
 881
 882         cor.delay_corr = q->delay_cor.rho;
 883         cor.loss_corr = q->loss_cor.rho;
 884         cor.dup_corr = q->dup_cor.rho;
 885         NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
 886
 887         reorder.probability = q->reorder;
 888         reorder.correlation = q->reorder_cor.rho;
 889         NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
 890
 891         corrupt.probability = q->corrupt;
 892         corrupt.correlation = q->corrupt_cor.rho;
 893         NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
 894
 895         rate.rate = q->rate;
 896         rate.packet_overhead = q->packet_overhead;
 897         rate.cell_size = q->cell_size;
 898         rate.cell_overhead = q->cell_overhead;
 899         NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
 900
 901         if (dump_loss_model(q, skb) != 0)
 902                 goto nla_put_failure;
 903
 904         return nla_nest_end(skb, nla);
 905
 906 nla_put_failure:
 907         nlmsg_trim(skb, nla);
 908         return -1;
 909 }
 910
 911 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
 912                           struct sk_buff *skb, struct tcmsg *tcm)
 913 {
 914         struct netem_sched_data *q = qdisc_priv(sch);
 915
 916         if (cl != 1 || !q->qdisc)       /* only one class */
 917                 return -ENOENT;
 918
 919         tcm->tcm_handle |= TC_H_MIN(1);
 920         tcm->tcm_info = q->qdisc->handle;
 921
 922         return 0;
 923 }
 924
 925 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 926                      struct Qdisc **old)
 927 {
 928         struct netem_sched_data *q = qdisc_priv(sch);
 929
 930         sch_tree_lock(sch);
 931         *old = q->qdisc;
 932         q->qdisc = new;
 933         if (*old) {
 934                 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
 935                 qdisc_reset(*old);
 936         }
 937         sch_tree_unlock(sch);
 938
 939         return 0;
 940 }
 941
 942 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
 943 {
 944         struct netem_sched_data *q = qdisc_priv(sch);
 945         return q->qdisc;
 946 }
 947
 948 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
 949 {
 950         return 1;
 951 }
 952
 /*
  * netem_put - release a class reference obtained through netem_get()
  * @sch: the netem qdisc (not used here)
  * @arg: class identifier (not used here)
  */
 static void netem_put(struct Qdisc *sch, unsigned long arg)
 {
         /* netem_get() takes no reference, so there is nothing to drop */
 }
 956
 957 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 958 {
 959         if (!walker->stop) {
 960                 if (walker->count >= walker->skip)
 961                         if (walker->fn(sch, 1, walker) < 0) {
 962                                 walker->stop = 1;
 963                                 return;
 964                         }
 965                 walker->count++;
 966         }
 967 }
 968
 969 static const struct Qdisc_class_ops netem_class_ops = {
 970         .graft          =       netem_graft,
 971         .leaf           =       netem_leaf,
 972         .get            =       netem_get,
 973         .put            =       netem_put,
 974         .walk           =       netem_walk,
 975         .dump           =       netem_dump_class,
 976 };
 977
 978 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
 979         .id             =       "netem",
 980         .cl_ops         =       &netem_class_ops,
 981         .priv_size      =       sizeof(struct netem_sched_data),
 982         .enqueue        =       netem_enqueue,
 983         .dequeue        =       netem_dequeue,
 984         .peek           =       qdisc_peek_dequeued,
 985         .drop           =       netem_drop,
 986         .init           =       netem_init,
 987         .reset          =       netem_reset,
 988         .destroy        =       netem_destroy,
 989         .change         =       netem_change,
 990         .dump           =       netem_dump,
 991         .owner          =       THIS_MODULE,
 992 };
 993
 994
 995 static int __init netem_module_init(void)
 996 {
 997         pr_info("netem: version " VERSION "\n");
 998         return register_qdisc(&netem_qdisc_ops);
 999 }
1000 static void __exit netem_module_exit(void)
1001 {
1002         unregister_qdisc(&netem_qdisc_ops);
1003 }
1004 module_init(netem_module_init)
1005 module_exit(netem_module_exit)
1006 MODULE_LICENSE("GPL");