net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25 #include <linux/reciprocal_div.h>
  26 #include <linux/rbtree.h>
  27
  28 #include <net/netlink.h>
  29 #include <net/pkt_sched.h>
  30 #include <net/inet_ecn.h>
  31
  32 #define VERSION "1.3"
  33
  34 /*      Network Emulation Queuing algorithm.
  35         ====================================
  36
  37         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
  38                  Network Emulation Tool"
  39                  [2] Luigi Rizzo, DummyNet for FreeBSD
  40
  41          ----------------------------------------------------------------
  42
  43          This started out as a simple way to delay outgoing packets to
  44          test TCP but has grown to include most of the functionality
  45          of a full blown network emulator like NISTnet. It can delay
  46          packets and add random jitter (and correlation). The random
  47          distribution can be loaded from a table as well to provide
  48          normal, Pareto, or experimental curves. Packet loss,
  49          duplication, and reordering can also be emulated.
  50
  51          This qdisc does not do classification that can be handled in
  52          layering other disciplines.  It does not need to do bandwidth
  53          control either since that can be handled by using token
  54          bucket or other rate control.
  55
  56      Correlated Loss Generator models
  57
  58         Added generation of correlated loss according to the
  59         "Gilbert-Elliot" model, a 4-state markov model.
  60
  61         References:
  62         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  63         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  64         and intuitive loss model for packet networks and its implementation
  65         in the Netem module in the Linux kernel", available in [1]
  66
  67         Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
  68                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  69 */
  70
  71 struct netem_sched_data {
  72         /* internal t(ime)fifo qdisc uses t_root and sch->limit */
  73         struct rb_root t_root;
  74
  75         /* optional qdisc for classful handling (NULL at netem init) */
  76         struct Qdisc    *qdisc;
  77
  78         struct qdisc_watchdog watchdog;
  79
  80         psched_tdiff_t latency;
  81         psched_tdiff_t jitter;
  82
  83         u32 loss;
  84         u32 ecn;
  85         u32 limit;
  86         u32 counter;
  87         u32 gap;
  88         u32 duplicate;
  89         u32 reorder;
  90         u32 corrupt;
  91         u32 rate;
  92         s32 packet_overhead;
  93         u32 cell_size;
  94         u32 cell_size_reciprocal;
  95         s32 cell_overhead;
  96
  97         struct crndstate {
  98                 u32 last;
  99                 u32 rho;
 100         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 101
 102         struct disttable {
 103                 u32  size;
 104                 s16 table[0];
 105         } *delay_dist;
 106
 107         enum  {
 108                 CLG_RANDOM,
 109                 CLG_4_STATES,
 110                 CLG_GILB_ELL,
 111         } loss_model;
 112
 113         /* Correlated Loss Generation models */
 114         struct clgstate {
 115                 /* state of the Markov chain */
 116                 u8 state;
 117
 118                 /* 4-states and Gilbert-Elliot models */
 119                 u32 a1; /* p13 for 4-states or p for GE */
 120                 u32 a2; /* p31 for 4-states or r for GE */
 121                 u32 a3; /* p32 for 4-states or h for GE */
 122                 u32 a4; /* p14 for 4-states or 1-k for GE */
 123                 u32 a5; /* p23 used only in 4-states */
 124         } clg;
 125
 126 };
 127
/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * The structure is stored in the qdisc private part of the skb control
 * block and is reached through the netem_skb_cb() accessor.
 */
struct netem_skb_cb {
        /* scheduler time from which the skb is allowed to leave the tfifo */
        psched_time_t   time_to_send;
        /* skb->tstamp as found at enqueue; written back on dequeue because
         * the tstamp field is reused as rb_node storage while queued
         */
        ktime_t         tstamp_save;
};
 135
 136 /* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
 137  * to hold a rb_node structure.
 138  *
 139  * If struct sk_buff layout is changed, the following checks will complain.
 140  */
 141 static struct rb_node *netem_rb_node(struct sk_buff *skb)
 142 {
 143         BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
 144         BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
 145                      offsetof(struct sk_buff, next) + sizeof(skb->next));
 146         BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
 147                      offsetof(struct sk_buff, prev) + sizeof(skb->prev));
 148         BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
 149                                               sizeof(skb->prev) +
 150                                               sizeof(skb->tstamp));
 151         return (struct rb_node *)&skb->next;
 152 }
 153
 154 static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
 155 {
 156         return (struct sk_buff *)rb;
 157 }
 158
 159 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 160 {
 161         /* we assume we can use skb next/prev/tstamp as storage for rb_node */
 162         qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 163         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 164 }
 165
 166 /* init_crandom - initialize correlated random number generator
 167  * Use entropy source for initial seed.
 168  */
 169 static void init_crandom(struct crndstate *state, unsigned long rho)
 170 {
 171         state->rho = rho;
 172         state->last = net_random();
 173 }
 174
 175 /* get_crandom - correlated random number generator
 176  * Next number depends on last value.
 177  * rho is scaled to avoid floating point.
 178  */
 179 static u32 get_crandom(struct crndstate *state)
 180 {
 181         u64 value, rho;
 182         unsigned long answer;
 183
 184         if (state->rho == 0)    /* no correlation */
 185                 return net_random();
 186
 187         value = net_random();
 188         rho = (u64)state->rho + 1;
 189         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 190         state->last = answer;
 191         return answer;
 192 }
 193
 194 /* loss_4state - 4-state model loss generator
 195  * Generates losses according to the 4-state Markov chain adopted in
 196  * the GI (General and Intuitive) loss model.
 197  */
 198 static bool loss_4state(struct netem_sched_data *q)
 199 {
 200         struct clgstate *clg = &q->clg;
 201         u32 rnd = net_random();
 202
 203         /*
 204          * Makes a comparison between rnd and the transition
 205          * probabilities outgoing from the current state, then decides the
 206          * next state and if the next packet has to be transmitted or lost.
 207          * The four states correspond to:
 208          *   1 => successfully transmitted packets within a gap period
 209          *   4 => isolated losses within a gap period
 210          *   3 => lost packets within a burst period
 211          *   2 => successfully transmitted packets within a burst period
 212          */
 213         switch (clg->state) {
 214         case 1:
 215                 if (rnd < clg->a4) {
 216                         clg->state = 4;
 217                         return true;
 218                 } else if (clg->a4 < rnd && rnd < clg->a1) {
 219                         clg->state = 3;
 220                         return true;
 221                 } else if (clg->a1 < rnd)
 222                         clg->state = 1;
 223
 224                 break;
 225         case 2:
 226                 if (rnd < clg->a5) {
 227                         clg->state = 3;
 228                         return true;
 229                 } else
 230                         clg->state = 2;
 231
 232                 break;
 233         case 3:
 234                 if (rnd < clg->a3)
 235                         clg->state = 2;
 236                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 237                         clg->state = 1;
 238                         return true;
 239                 } else if (clg->a2 + clg->a3 < rnd) {
 240                         clg->state = 3;
 241                         return true;
 242                 }
 243                 break;
 244         case 4:
 245                 clg->state = 1;
 246                 break;
 247         }
 248
 249         return false;
 250 }
 251
 252 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 253  * Generates losses according to the Gilbert-Elliot loss model or
 254  * its special cases  (Gilbert or Simple Gilbert)
 255  *
 256  * Makes a comparison between random number and the transition
 257  * probabilities outgoing from the current state, then decides the
 258  * next state. A second random number is extracted and the comparison
 259  * with the loss probability of the current state decides if the next
 260  * packet will be transmitted or lost.
 261  */
 262 static bool loss_gilb_ell(struct netem_sched_data *q)
 263 {
 264         struct clgstate *clg = &q->clg;
 265
 266         switch (clg->state) {
 267         case 1:
 268                 if (net_random() < clg->a1)
 269                         clg->state = 2;
 270                 if (net_random() < clg->a4)
 271                         return true;
 272         case 2:
 273                 if (net_random() < clg->a2)
 274                         clg->state = 1;
 275                 if (clg->a3 > net_random())
 276                         return true;
 277         }
 278
 279         return false;
 280 }
 281
 282 static bool loss_event(struct netem_sched_data *q)
 283 {
 284         switch (q->loss_model) {
 285         case CLG_RANDOM:
 286                 /* Random packet drop 0 => none, ~0 => all */
 287                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 288
 289         case CLG_4_STATES:
 290                 /* 4state loss model algorithm (used also for GI model)
 291                 * Extracts a value from the markov 4 state loss generator,
 292                 * if it is 1 drops a packet and if needed writes the event in
 293                 * the kernel logs
 294                 */
 295                 return loss_4state(q);
 296
 297         case CLG_GILB_ELL:
 298                 /* Gilbert-Elliot loss model algorithm
 299                 * Extracts a value from the Gilbert-Elliot loss generator,
 300                 * if it is 1 drops a packet and if needed writes the event in
 301                 * the kernel logs
 302                 */
 303                 return loss_gilb_ell(q);
 304         }
 305
 306         return false;   /* not reached */
 307 }
 308
 309
 310 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 311  * std deviation sigma.  Uses table lookup to approximate the desired
 312  * distribution, and a uniformly-distributed pseudo-random source.
 313  */
 314 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
 315                                 struct crndstate *state,
 316                                 const struct disttable *dist)
 317 {
 318         psched_tdiff_t x;
 319         long t;
 320         u32 rnd;
 321
 322         if (sigma == 0)
 323                 return mu;
 324
 325         rnd = get_crandom(state);
 326
 327         /* default uniform distribution */
 328         if (dist == NULL)
 329                 return (rnd % (2*sigma)) - sigma + mu;
 330
 331         t = dist->table[rnd % dist->size];
 332         x = (sigma % NETEM_DIST_SCALE) * t;
 333         if (x >= 0)
 334                 x += NETEM_DIST_SCALE/2;
 335         else
 336                 x -= NETEM_DIST_SCALE/2;
 337
 338         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 339 }
 340
 341 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
 342 {
 343         u64 ticks;
 344
 345         len += q->packet_overhead;
 346
 347         if (q->cell_size) {
 348                 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 349
 350                 if (len > cells * q->cell_size) /* extra cell needed for remainder */
 351                         cells++;
 352                 len = cells * (q->cell_size + q->cell_overhead);
 353         }
 354
 355         ticks = (u64)len * NSEC_PER_SEC;
 356
 357         do_div(ticks, q->rate);
 358         return PSCHED_NS2TICKS(ticks);
 359 }
 360
 361 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 362 {
 363         struct netem_sched_data *q = qdisc_priv(sch);
 364         psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
 365         struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 366
 367         while (*p) {
 368                 struct sk_buff *skb;
 369
 370                 parent = *p;
 371                 skb = netem_rb_to_skb(parent);
 372                 if (tnext >= netem_skb_cb(skb)->time_to_send)
 373                         p = &parent->rb_right;
 374                 else
 375                         p = &parent->rb_left;
 376         }
 377         rb_link_node(netem_rb_node(nskb), parent, p);
 378         rb_insert_color(netem_rb_node(nskb), &q->t_root);
 379         sch->q.qlen++;
 380 }
 381
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *      NET_XMIT_DROP: queue length didn't change.
 *      NET_XMIT_SUCCESS: one skb was queued.
 *
 * The packet goes through these steps, in order: duplication decision,
 * loss decision (optionally turned into an ECN mark), orphaning,
 * re-injection of the duplicate at the root qdisc, corruption, queue
 * limit check and finally either delayed insertion into the tfifo
 * rbtree or, when reordering triggers, insertion at the head of sch->q.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        /* We don't fill cb now as skb_unshare() may invalidate it */
        struct netem_skb_cb *cb;
        struct sk_buff *skb2;
        /* number of copies of this packet to queue: 0, 1 or 2 */
        int count = 1;

        /* Random duplication */
        if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
                ++count;

        /* Drop packet? */
        if (loss_event(q)) {
                /* with ECN enabled a successful CE mark replaces the drop;
                 * the event is still counted in the drop statistics
                 */
                if (q->ecn && INET_ECN_set_ce(skb))
                        sch->qstats.drops++; /* mark packet */
                else
                        --count;
        }
        if (count == 0) {
                /* lost and not duplicated: free it, but report success
                 * (with the bypass flag) instead of NET_XMIT_DROP
                 */
                sch->qstats.drops++;
                kfree_skb(skb);
                return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        }

        /* If a delay is expected, orphan the skb. (orphaning usually takes
         * place at TX completion time, so _before_ the link transit delay)
         * Ideally, this orphaning should be done after the rate limiting
         * module, because this breaks TCP Small Queue, and other mechanisms
         * based on socket sk_wmem_alloc.
         */
        if (q->latency || q->jitter)
                skb_orphan(skb);

        /*
         * If we need to duplicate packet, then re-insert at top of the
         * qdisc tree, since parent queuer expects that only one
         * skb will be queued.
         * If the clone cannot be allocated the duplicate is silently
         * skipped and only the original is queued.
         */
        if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
                struct Qdisc *rootq = qdisc_root(sch);
                u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
                q->duplicate = 0;

                qdisc_enqueue_root(skb2, rootq);
                q->duplicate = dupsave;
        }

        /*
         * Randomized packet corruption.
         * Make copy if needed since we are modifying
         * If packet is going to be hardware checksummed, then
         * do it now in software before we mangle it.
         */
        if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
                /* skb may be NULL here when skb_unshare() failed; it is
                 * passed to qdisc_drop() as is
                 */
                if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
                    (skb->ip_summed == CHECKSUM_PARTIAL &&
                     skb_checksum_help(skb)))
                        return qdisc_drop(skb, sch);

                /* flip one random bit of one random byte in the linear
                 * (skb_headlen) part of the packet
                 */
                skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
        }

        /* only skbs sitting on the sch->q list are compared to the limit */
        if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
                return qdisc_reshape_fail(skb, sch);

        sch->qstats.backlog += qdisc_pkt_len(skb);

        /* skb can no longer be replaced, so cb is safe to use from here */
        cb = netem_skb_cb(skb);
        if (q->gap == 0 ||              /* not doing reordering */
            q->counter < q->gap - 1 ||  /* inside last reordering gap */
            q->reorder < get_crandom(&q->reorder_cor)) {
                psched_time_t now;
                psched_tdiff_t delay;

                delay = tabledist(q->latency, q->jitter,
                                  &q->delay_cor, q->delay_dist);

                now = psched_get_time();

                if (q->rate) {
                        struct sk_buff *last;

                        /* rb_last() may return NULL for an empty tree; the
                         * conversion keeps it NULL and the test below
                         * catches it
                         */
                        if (!skb_queue_empty(&sch->q))
                                last = skb_peek_tail(&sch->q);
                        else
                                last = netem_rb_to_skb(rb_last(&q->t_root));
                        if (last) {
                                /*
                                 * Last packet in queue is reference point (now),
                                 * calculate this time bonus and subtract
                                 * from delay.
                                 */
                                delay -= netem_skb_cb(last)->time_to_send - now;
                                delay = max_t(psched_tdiff_t, 0, delay);
                                now = netem_skb_cb(last)->time_to_send;
                        }

                        /* add the time needed to transmit this packet */
                        delay += packet_len_2_sched_time(skb->len, q);
                }

                cb->time_to_send = now + delay;
                /* tstamp is about to be overwritten by the rb_node */
                cb->tstamp_save = skb->tstamp;
                ++q->counter;
                tfifo_enqueue(skb, sch);
        } else {
                /*
                 * Do re-ordering by putting one out of N packets at the front
                 * of the queue.
                 */
                cb->time_to_send = psched_get_time();
                q->counter = 0;

                __skb_queue_head(&sch->q, skb);
                sch->qstats.requeues++;
        }

        return NET_XMIT_SUCCESS;
}
 507
 508 static unsigned int netem_drop(struct Qdisc *sch)
 509 {
 510         struct netem_sched_data *q = qdisc_priv(sch);
 511         unsigned int len;
 512
 513         len = qdisc_queue_drop(sch);
 514
 515         if (!len) {
 516                 struct rb_node *p = rb_first(&q->t_root);
 517
 518                 if (p) {
 519                         struct sk_buff *skb = netem_rb_to_skb(p);
 520
 521                         rb_erase(p, &q->t_root);
 522                         sch->q.qlen--;
 523                         skb->next = NULL;
 524                         skb->prev = NULL;
 525                         len = qdisc_pkt_len(skb);
 526                         kfree_skb(skb);
 527                 }
 528         }
 529         if (!len && q->qdisc && q->qdisc->ops->drop)
 530             len = q->qdisc->ops->drop(q->qdisc);
 531         if (len)
 532                 sch->qstats.drops++;
 533
 534         return len;
 535 }
 536
/*
 * Hand the next packet that is ready to be sent to the caller.
 *
 * Sources are tried in this order: the sch->q list (packets placed
 * there by the reordering path of enqueue), the earliest skb of the
 * tfifo rbtree if its time_to_send has been reached, and the child
 * qdisc.  When a child qdisc is attached, due packets from the rbtree
 * are moved into the child instead of being returned directly.
 *
 * Returns NULL when the qdisc is throttled or nothing is ready; if the
 * earliest skb is not yet due the watchdog is scheduled for its
 * time_to_send.
 */
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        struct rb_node *p;

        if (qdisc_is_throttled(sch))
                return NULL;

tfifo_dequeue:
        skb = __skb_dequeue(&sch->q);
        if (skb) {
                /* common exit for every path that found a packet */
deliver:
                sch->qstats.backlog -= qdisc_pkt_len(skb);
                qdisc_unthrottled(sch);
                qdisc_bstats_update(sch, skb);
                return skb;
        }
        /* earliest scheduled skb, if any */
        p = rb_first(&q->t_root);
        if (p) {
                psched_time_t time_to_send;

                skb = netem_rb_to_skb(p);

                /* if more time remaining? */
                time_to_send = netem_skb_cb(skb)->time_to_send;
                if (time_to_send <= psched_get_time()) {
                        rb_erase(p, &q->t_root);

                        sch->q.qlen--;
                        /* give back the fields that held the rb_node */
                        skb->next = NULL;
                        skb->prev = NULL;
                        skb->tstamp = netem_skb_cb(skb)->tstamp_save;

#ifdef CONFIG_NET_CLS_ACT
                        /*
                         * If it's at ingress let's pretend the delay is
                         * from the network (tstamp will be updated).
                         */
                        if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
                                skb->tstamp.tv64 = 0;
#endif

                        if (q->qdisc) {
                                /* pass the due packet on to the child and
                                 * start over
                                 */
                                int err = qdisc_enqueue(skb, q->qdisc);

                                if (unlikely(err != NET_XMIT_SUCCESS)) {
                                        if (net_xmit_drop_count(err)) {
                                                sch->qstats.drops++;
                                                qdisc_tree_decrease_qlen(sch, 1);
                                        }
                                }
                                goto tfifo_dequeue;
                        }
                        goto deliver;
                }

                /* earliest skb is not due yet: the child may still have
                 * something to send, otherwise wake up when it is due
                 */
                if (q->qdisc) {
                        skb = q->qdisc->ops->dequeue(q->qdisc);
                        if (skb)
                                goto deliver;
                }
                qdisc_watchdog_schedule(&q->watchdog, time_to_send);
        }

        /* reached with an empty rbtree or after arming the watchdog */
        if (q->qdisc) {
                skb = q->qdisc->ops->dequeue(q->qdisc);
                if (skb)
                        goto deliver;
        }
        return NULL;
}
 609
 610 static void netem_reset(struct Qdisc *sch)
 611 {
 612         struct netem_sched_data *q = qdisc_priv(sch);
 613
 614         qdisc_reset_queue(sch);
 615         if (q->qdisc)
 616                 qdisc_reset(q->qdisc);
 617         qdisc_watchdog_cancel(&q->watchdog);
 618 }
 619
 620 static void dist_free(struct disttable *d)
 621 {
 622         if (d) {
 623                 if (is_vmalloc_addr(d))
 624                         vfree(d);
 625                 else
 626                         kfree(d);
 627         }
 628 }
 629
 630 /*
 631  * Distribution data is a variable size payload containing
 632  * signed 16 bit values.
 633  */
 634 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 635 {
 636         struct netem_sched_data *q = qdisc_priv(sch);
 637         size_t n = nla_len(attr)/sizeof(__s16);
 638         const __s16 *data = nla_data(attr);
 639         spinlock_t *root_lock;
 640         struct disttable *d;
 641         int i;
 642         size_t s;
 643
 644         if (n > NETEM_DIST_MAX)
 645                 return -EINVAL;
 646
 647         s = sizeof(struct disttable) + n * sizeof(s16);
 648         d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
 649         if (!d)
 650                 d = vmalloc(s);
 651         if (!d)
 652                 return -ENOMEM;
 653
 654         d->size = n;
 655         for (i = 0; i < n; i++)
 656                 d->table[i] = data[i];
 657
 658         root_lock = qdisc_root_sleeping_lock(sch);
 659
 660         spin_lock_bh(root_lock);
 661         swap(q->delay_dist, d);
 662         spin_unlock_bh(root_lock);
 663
 664         dist_free(d);
 665         return 0;
 666 }
 667
 668 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
 669 {
 670         struct netem_sched_data *q = qdisc_priv(sch);
 671         const struct tc_netem_corr *c = nla_data(attr);
 672
 673         init_crandom(&q->delay_cor, c->delay_corr);
 674         init_crandom(&q->loss_cor, c->loss_corr);
 675         init_crandom(&q->dup_cor, c->dup_corr);
 676 }
 677
 678 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
 679 {
 680         struct netem_sched_data *q = qdisc_priv(sch);
 681         const struct tc_netem_reorder *r = nla_data(attr);
 682
 683         q->reorder = r->probability;
 684         init_crandom(&q->reorder_cor, r->correlation);
 685 }
 686
 687 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
 688 {
 689         struct netem_sched_data *q = qdisc_priv(sch);
 690         const struct tc_netem_corrupt *r = nla_data(attr);
 691
 692         q->corrupt = r->probability;
 693         init_crandom(&q->corrupt_cor, r->correlation);
 694 }
 695
 696 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
 697 {
 698         struct netem_sched_data *q = qdisc_priv(sch);
 699         const struct tc_netem_rate *r = nla_data(attr);
 700
 701         q->rate = r->rate;
 702         q->packet_overhead = r->packet_overhead;
 703         q->cell_size = r->cell_size;
 704         if (q->cell_size)
 705                 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 706         q->cell_overhead = r->cell_overhead;
 707 }
 708
 709 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 710 {
 711         struct netem_sched_data *q = qdisc_priv(sch);
 712         const struct nlattr *la;
 713         int rem;
 714
 715         nla_for_each_nested(la, attr, rem) {
 716                 u16 type = nla_type(la);
 717
 718                 switch(type) {
 719                 case NETEM_LOSS_GI: {
 720                         const struct tc_netem_gimodel *gi = nla_data(la);
 721
 722                         if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 723                                 pr_info("netem: incorrect gi model size\n");
 724                                 return -EINVAL;
 725                         }
 726
 727                         q->loss_model = CLG_4_STATES;
 728
 729                         q->clg.state = 1;
 730                         q->clg.a1 = gi->p13;
 731                         q->clg.a2 = gi->p31;
 732                         q->clg.a3 = gi->p32;
 733                         q->clg.a4 = gi->p14;
 734                         q->clg.a5 = gi->p23;
 735                         break;
 736                 }
 737
 738                 case NETEM_LOSS_GE: {
 739                         const struct tc_netem_gemodel *ge = nla_data(la);
 740
 741                         if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 742                                 pr_info("netem: incorrect ge model size\n");
 743                                 return -EINVAL;
 744                         }
 745
 746                         q->loss_model = CLG_GILB_ELL;
 747                         q->clg.state = 1;
 748                         q->clg.a1 = ge->p;
 749                         q->clg.a2 = ge->r;
 750                         q->clg.a3 = ge->h;
 751                         q->clg.a4 = ge->k1;
 752                         break;
 753                 }
 754
 755                 default:
 756                         pr_info("netem: unknown loss type %u\n", type);
 757                         return -EINVAL;
 758                 }
 759         }
 760
 761         return 0;
 762 }
 763
 764 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 765         [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 766         [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 767         [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 768         [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 769         [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 770         [TCA_NETEM_ECN]         = { .type = NLA_U32 },
 771 };
 772
 773 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 774                       const struct nla_policy *policy, int len)
 775 {
 776         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 777
 778         if (nested_len < 0) {
 779                 pr_info("netem: invalid attributes len %d\n", nested_len);
 780                 return -EINVAL;
 781         }
 782
 783         if (nested_len >= nla_attr_size(0))
 784                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 785                                  nested_len, policy);
 786
 787         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 788         return 0;
 789 }
 790
 791 /* Parse netlink message to set options */
 792 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 793 {
 794         struct netem_sched_data *q = qdisc_priv(sch);
 795         struct nlattr *tb[TCA_NETEM_MAX + 1];
 796         struct tc_netem_qopt *qopt;
 797         int ret;
 798
 799         if (opt == NULL)
 800                 return -EINVAL;
 801
 802         qopt = nla_data(opt);
 803         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 804         if (ret < 0)
 805                 return ret;
 806
 807         sch->limit = qopt->limit;
 808
 809         q->latency = qopt->latency;
 810         q->jitter = qopt->jitter;
 811         q->limit = qopt->limit;
 812         q->gap = qopt->gap;
 813         q->counter = 0;
 814         q->loss = qopt->loss;
 815         q->duplicate = qopt->duplicate;
 816
 817         /* for compatibility with earlier versions.
 818          * if gap is set, need to assume 100% probability
 819          */
 820         if (q->gap)
 821                 q->reorder = ~0;
 822
 823         if (tb[TCA_NETEM_CORR])
 824                 get_correlation(sch, tb[TCA_NETEM_CORR]);
 825
 826         if (tb[TCA_NETEM_DELAY_DIST]) {
 827                 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
 828                 if (ret)
 829                         return ret;
 830         }
 831
 832         if (tb[TCA_NETEM_REORDER])
 833                 get_reorder(sch, tb[TCA_NETEM_REORDER]);
 834
 835         if (tb[TCA_NETEM_CORRUPT])
 836                 get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
 837
 838         if (tb[TCA_NETEM_RATE])
 839                 get_rate(sch, tb[TCA_NETEM_RATE]);
 840
 841         if (tb[TCA_NETEM_ECN])
 842                 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 843
 844         q->loss_model = CLG_RANDOM;
 845         if (tb[TCA_NETEM_LOSS])
 846                 ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
 847
 848         return ret;
 849 }
 850
 851 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 852 {
 853         struct netem_sched_data *q = qdisc_priv(sch);
 854         int ret;
 855
 856         if (!opt)
 857                 return -EINVAL;
 858
 859         qdisc_watchdog_init(&q->watchdog, sch);
 860
 861         q->loss_model = CLG_RANDOM;
 862         ret = netem_change(sch, opt);
 863         if (ret)
 864                 pr_info("netem: change failed\n");
 865         return ret;
 866 }
 867
 868 static void netem_destroy(struct Qdisc *sch)
 869 {
 870         struct netem_sched_data *q = qdisc_priv(sch);
 871
 872         qdisc_watchdog_cancel(&q->watchdog);
 873         if (q->qdisc)
 874                 qdisc_destroy(q->qdisc);
 875         dist_free(q->delay_dist);
 876 }
 877
 878 static int dump_loss_model(const struct netem_sched_data *q,
 879                            struct sk_buff *skb)
 880 {
 881         struct nlattr *nest;
 882
 883         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
 884         if (nest == NULL)
 885                 goto nla_put_failure;
 886
 887         switch (q->loss_model) {
 888         case CLG_RANDOM:
 889                 /* legacy loss model */
 890                 nla_nest_cancel(skb, nest);
 891                 return 0;       /* no data */
 892
 893         case CLG_4_STATES: {
 894                 struct tc_netem_gimodel gi = {
 895                         .p13 = q->clg.a1,
 896                         .p31 = q->clg.a2,
 897                         .p32 = q->clg.a3,
 898                         .p14 = q->clg.a4,
 899                         .p23 = q->clg.a5,
 900                 };
 901
 902                 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
 903                         goto nla_put_failure;
 904                 break;
 905         }
 906         case CLG_GILB_ELL: {
 907                 struct tc_netem_gemodel ge = {
 908                         .p = q->clg.a1,
 909                         .r = q->clg.a2,
 910                         .h = q->clg.a3,
 911                         .k1 = q->clg.a4,
 912                 };
 913
 914                 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
 915                         goto nla_put_failure;
 916                 break;
 917         }
 918         }
 919
 920         nla_nest_end(skb, nest);
 921         return 0;
 922
 923 nla_put_failure:
 924         nla_nest_cancel(skb, nest);
 925         return -1;
 926 }
 927
 928 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 929 {
 930         const struct netem_sched_data *q = qdisc_priv(sch);
 931         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
 932         struct tc_netem_qopt qopt;
 933         struct tc_netem_corr cor;
 934         struct tc_netem_reorder reorder;
 935         struct tc_netem_corrupt corrupt;
 936         struct tc_netem_rate rate;
 937
 938         qopt.latency = q->latency;
 939         qopt.jitter = q->jitter;
 940         qopt.limit = q->limit;
 941         qopt.loss = q->loss;
 942         qopt.gap = q->gap;
 943         qopt.duplicate = q->duplicate;
 944         if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
 945                 goto nla_put_failure;
 946
 947         cor.delay_corr = q->delay_cor.rho;
 948         cor.loss_corr = q->loss_cor.rho;
 949         cor.dup_corr = q->dup_cor.rho;
 950         if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
 951                 goto nla_put_failure;
 952
 953         reorder.probability = q->reorder;
 954         reorder.correlation = q->reorder_cor.rho;
 955         if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
 956                 goto nla_put_failure;
 957
 958         corrupt.probability = q->corrupt;
 959         corrupt.correlation = q->corrupt_cor.rho;
 960         if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
 961                 goto nla_put_failure;
 962
 963         rate.rate = q->rate;
 964         rate.packet_overhead = q->packet_overhead;
 965         rate.cell_size = q->cell_size;
 966         rate.cell_overhead = q->cell_overhead;
 967         if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
 968                 goto nla_put_failure;
 969
 970         if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
 971                 goto nla_put_failure;
 972
 973         if (dump_loss_model(q, skb) != 0)
 974                 goto nla_put_failure;
 975
 976         return nla_nest_end(skb, nla);
 977
 978 nla_put_failure:
 979         nlmsg_trim(skb, nla);
 980         return -1;
 981 }
 982
 983 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
 984                           struct sk_buff *skb, struct tcmsg *tcm)
 985 {
 986         struct netem_sched_data *q = qdisc_priv(sch);
 987
 988         if (cl != 1 || !q->qdisc)       /* only one class */
 989                 return -ENOENT;
 990
 991         tcm->tcm_handle |= TC_H_MIN(1);
 992         tcm->tcm_info = q->qdisc->handle;
 993
 994         return 0;
 995 }
 996
 997 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 998                      struct Qdisc **old)
 999 {
1000         struct netem_sched_data *q = qdisc_priv(sch);
1001
1002         sch_tree_lock(sch);
1003         *old = q->qdisc;
1004         q->qdisc = new;
1005         if (*old) {
1006                 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1007                 qdisc_reset(*old);
1008         }
1009         sch_tree_unlock(sch);
1010
1011         return 0;
1012 }
1013
1014 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1015 {
1016         struct netem_sched_data *q = qdisc_priv(sch);
1017         return q->qdisc;
1018 }
1019
1020 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
1021 {
1022         return 1;
1023 }
1024
/* Class put hook: intentionally empty, netem_get() acquires nothing. */
static void netem_put(struct Qdisc *sch, unsigned long arg)
{
        /* nothing to release */
}
1028
1029 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1030 {
1031         if (!walker->stop) {
1032                 if (walker->count >= walker->skip)
1033                         if (walker->fn(sch, 1, walker) < 0) {
1034                                 walker->stop = 1;
1035                                 return;
1036                         }
1037                 walker->count++;
1038         }
1039 }
1040
1041 static const struct Qdisc_class_ops netem_class_ops = {
1042         .graft          =       netem_graft,
1043         .leaf           =       netem_leaf,
1044         .get            =       netem_get,
1045         .put            =       netem_put,
1046         .walk           =       netem_walk,
1047         .dump           =       netem_dump_class,
1048 };
1049
1050 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1051         .id             =       "netem",
1052         .cl_ops         =       &netem_class_ops,
1053         .priv_size      =       sizeof(struct netem_sched_data),
1054         .enqueue        =       netem_enqueue,
1055         .dequeue        =       netem_dequeue,
1056         .peek           =       qdisc_peek_dequeued,
1057         .drop           =       netem_drop,
1058         .init           =       netem_init,
1059         .reset          =       netem_reset,
1060         .destroy        =       netem_destroy,
1061         .change         =       netem_change,
1062         .dump           =       netem_dump,
1063         .owner          =       THIS_MODULE,
1064 };
1065
1066
1067 static int __init netem_module_init(void)
1068 {
1069         pr_info("netem: version " VERSION "\n");
1070         return register_qdisc(&netem_qdisc_ops);
1071 }
1072 static void __exit netem_module_exit(void)
1073 {
1074         unregister_qdisc(&netem_qdisc_ops);
1075 }
1076 module_init(netem_module_init)
1077 module_exit(netem_module_exit)
1078 MODULE_LICENSE("GPL");