net/core: Handle csum for CHECKSUM_COMPLETE VXLAN forwarding
net/core/dev.c (cascardo/linux.git)
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <net/mpls.h>
122 #include <linux/ipv6.h>
123 #include <linux/in.h>
124 #include <linux/jhash.h>
125 #include <linux/random.h>
126 #include <trace/events/napi.h>
127 #include <trace/events/net.h>
128 #include <trace/events/skb.h>
129 #include <linux/pci.h>
130 #include <linux/inetdevice.h>
131 #include <linux/cpu_rmap.h>
132 #include <linux/static_key.h>
133 #include <linux/hashtable.h>
134 #include <linux/vmalloc.h>
135 #include <linux/if_macvlan.h>
136 #include <linux/errqueue.h>
137 #include <linux/hrtimer.h>
138
139 #include "net-sysfs.h"
140
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146
147 static DEFINE_SPINLOCK(ptype_lock);
148 static DEFINE_SPINLOCK(offload_lock);
149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
150 struct list_head ptype_all __read_mostly;       /* Taps */
151 static struct list_head offload_base __read_mostly;
152
153 static int netif_rx_internal(struct sk_buff *skb);
154 static int call_netdevice_notifiers_info(unsigned long val,
155                                          struct net_device *dev,
156                                          struct netdev_notifier_info *info);
157
158 /*
159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
160  * semaphore.
161  *
162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
163  *
164  * Writers must hold the rtnl semaphore while they loop through the
165  * dev_base_head list, and hold dev_base_lock for writing when they do the
166  * actual updates.  This allows pure readers to access the list even
167  * while a writer is preparing to update it.
168  *
169  * To put it another way, dev_base_lock is held for writing only to
170  * protect against pure readers; the rtnl semaphore provides the
171  * protection against other writers.
172  *
173  * See, for example usages, register_netdevice() and
174  * unregister_netdevice(), which must be called with the rtnl
175  * semaphore held.
176  */
177 DEFINE_RWLOCK(dev_base_lock);
178 EXPORT_SYMBOL(dev_base_lock);
179
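/* Illustrative sketch, not part of the original file: a pure reader can walk
 * the device list under RCU instead of taking dev_base_lock for reading.
 * The function name below is a made-up example.
 */
#if 0
static void example_dump_device_names(struct net *net)
{
	struct net_device *dev;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		pr_info("device: %s\n", dev->name);
	rcu_read_unlock();
}
#endif
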
180 /* protects napi_hash addition/deletion and napi_gen_id */
181 static DEFINE_SPINLOCK(napi_hash_lock);
182
183 static unsigned int napi_gen_id;
184 static DEFINE_HASHTABLE(napi_hash, 8);
185
186 static seqcount_t devnet_rename_seq;
187
188 static inline void dev_base_seq_inc(struct net *net)
189 {
190         while (++net->dev_base_seq == 0);
191 }
192
193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
194 {
195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
196
197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
198 }
199
200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
201 {
202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
203 }
204
205 static inline void rps_lock(struct softnet_data *sd)
206 {
207 #ifdef CONFIG_RPS
208         spin_lock(&sd->input_pkt_queue.lock);
209 #endif
210 }
211
212 static inline void rps_unlock(struct softnet_data *sd)
213 {
214 #ifdef CONFIG_RPS
215         spin_unlock(&sd->input_pkt_queue.lock);
216 #endif
217 }
218
219 /* Device list insertion */
220 static void list_netdevice(struct net_device *dev)
221 {
222         struct net *net = dev_net(dev);
223
224         ASSERT_RTNL();
225
226         write_lock_bh(&dev_base_lock);
227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
229         hlist_add_head_rcu(&dev->index_hlist,
230                            dev_index_hash(net, dev->ifindex));
231         write_unlock_bh(&dev_base_lock);
232
233         dev_base_seq_inc(net);
234 }
235
236 /* Device list removal
237  * caller must respect a RCU grace period before freeing/reusing dev
238  */
239 static void unlist_netdevice(struct net_device *dev)
240 {
241         ASSERT_RTNL();
242
243         /* Unlink dev from the device chain */
244         write_lock_bh(&dev_base_lock);
245         list_del_rcu(&dev->dev_list);
246         hlist_del_rcu(&dev->name_hlist);
247         hlist_del_rcu(&dev->index_hlist);
248         write_unlock_bh(&dev_base_lock);
249
250         dev_base_seq_inc(dev_net(dev));
251 }
252
253 /*
254  *      Our notifier list
255  */
256
257 static RAW_NOTIFIER_HEAD(netdev_chain);
258
259 /*
260  *      Device drivers call our routines to queue packets here. We empty the
261  *      queue in the local softnet handler.
262  */
263
264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
265 EXPORT_PER_CPU_SYMBOL(softnet_data);
266
267 #ifdef CONFIG_LOCKDEP
268 /*
269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
270  * according to dev->type
271  */
272 static const unsigned short netdev_lock_type[] =
273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
288
289 static const char *const netdev_lock_name[] =
290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
305
306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
308
309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
310 {
311         int i;
312
313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
314                 if (netdev_lock_type[i] == dev_type)
315                         return i;
316         /* the last key is used by default */
317         return ARRAY_SIZE(netdev_lock_type) - 1;
318 }
319
320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
321                                                  unsigned short dev_type)
322 {
323         int i;
324
325         i = netdev_lock_pos(dev_type);
326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
327                                    netdev_lock_name[i]);
328 }
329
330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
331 {
332         int i;
333
334         i = netdev_lock_pos(dev->type);
335         lockdep_set_class_and_name(&dev->addr_list_lock,
336                                    &netdev_addr_lock_key[i],
337                                    netdev_lock_name[i]);
338 }
339 #else
340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
341                                                  unsigned short dev_type)
342 {
343 }
344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
345 {
346 }
347 #endif
348
349 /*******************************************************************************
350
351                 Protocol management and registration routines
352
353 *******************************************************************************/
354
355 /*
356  *      Add a protocol ID to the list. Now that the input handler is
357  *      smarter we can dispense with all the messy stuff that used to be
358  *      here.
359  *
360  *      BEWARE!!! Protocol handlers, mangling input packets,
361  *      MUST BE last in hash buckets and checking protocol handlers
362  *      MUST start from promiscuous ptype_all chain in net_bh.
363  *      It is true now, do not change it.
364  *      Explanation follows: if protocol handler, mangling packet, will
365  *      be the first on list, it is not able to sense, that packet
366  *      is cloned and should be copied-on-write, so that it will
367  *      change it and subsequent readers will get broken packet.
368  *                                                      --ANK (980803)
369  */
370
371 static inline struct list_head *ptype_head(const struct packet_type *pt)
372 {
373         if (pt->type == htons(ETH_P_ALL))
374                 return &ptype_all;
375         else
376                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
377 }
378
379 /**
380  *      dev_add_pack - add packet handler
381  *      @pt: packet type declaration
382  *
383  *      Add a protocol handler to the networking stack. The passed &packet_type
384  *      is linked into kernel lists and may not be freed until it has been
385  *      removed from the kernel lists.
386  *
387  *      This call does not sleep, therefore it cannot guarantee that all
388  *      CPUs that are in the middle of receiving packets will see the new
389  *      packet type (until the next received packet).
390  */
391
392 void dev_add_pack(struct packet_type *pt)
393 {
394         struct list_head *head = ptype_head(pt);
395
396         spin_lock(&ptype_lock);
397         list_add_rcu(&pt->list, head);
398         spin_unlock(&ptype_lock);
399 }
400 EXPORT_SYMBOL(dev_add_pack);
401
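/* Illustrative sketch, not part of the original file: a hypothetical tap that
 * registers for every protocol with dev_add_pack(). The handler and variable
 * names are assumptions for illustration only; a real user would pair this
 * with dev_remove_pack() on teardown.
 */
#if 0
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* A tap receives its own reference to the skb; release it here. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),	/* lands on the ptype_all list */
	.func	= example_tap_rcv,
};

/* e.g. from module init:  dev_add_pack(&example_tap); */
#endif
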
402 /**
403  *      __dev_remove_pack        - remove packet handler
404  *      @pt: packet type declaration
405  *
406  *      Remove a protocol handler that was previously added to the kernel
407  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
408  *      from the kernel lists and can be freed or reused once this function
409  *      returns.
410  *
411  *      The packet type might still be in use by receivers
412  *      and must not be freed until after all the CPU's have gone
413  *      through a quiescent state.
414  */
415 void __dev_remove_pack(struct packet_type *pt)
416 {
417         struct list_head *head = ptype_head(pt);
418         struct packet_type *pt1;
419
420         spin_lock(&ptype_lock);
421
422         list_for_each_entry(pt1, head, list) {
423                 if (pt == pt1) {
424                         list_del_rcu(&pt->list);
425                         goto out;
426                 }
427         }
428
429         pr_warn("dev_remove_pack: %p not found\n", pt);
430 out:
431         spin_unlock(&ptype_lock);
432 }
433 EXPORT_SYMBOL(__dev_remove_pack);
434
435 /**
436  *      dev_remove_pack  - remove packet handler
437  *      @pt: packet type declaration
438  *
439  *      Remove a protocol handler that was previously added to the kernel
440  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
441  *      from the kernel lists and can be freed or reused once this function
442  *      returns.
443  *
444  *      This call sleeps to guarantee that no CPU is looking at the packet
445  *      type after return.
446  */
447 void dev_remove_pack(struct packet_type *pt)
448 {
449         __dev_remove_pack(pt);
450
451         synchronize_net();
452 }
453 EXPORT_SYMBOL(dev_remove_pack);
454
455
456 /**
457  *      dev_add_offload - register offload handlers
458  *      @po: protocol offload declaration
459  *
460  *      Add protocol offload handlers to the networking stack. The passed
461  *      &proto_offload is linked into kernel lists and may not be freed until
462  *      it has been removed from the kernel lists.
463  *
464  *      This call does not sleep, therefore it cannot guarantee that all
465  *      CPUs that are in the middle of receiving packets will see the new
466  *      offload handlers (until the next received packet).
467  */
468 void dev_add_offload(struct packet_offload *po)
469 {
470         struct list_head *head = &offload_base;
471
472         spin_lock(&offload_lock);
473         list_add_rcu(&po->list, head);
474         spin_unlock(&offload_lock);
475 }
476 EXPORT_SYMBOL(dev_add_offload);
477
478 /**
479  *      __dev_remove_offload     - remove offload handler
480  *      @po: packet offload declaration
481  *
482  *      Remove a protocol offload handler that was previously added to the
483  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
484  *      is removed from the kernel lists and can be freed or reused once this
485  *      function returns.
486  *
487  *      The packet type might still be in use by receivers
488  *      and must not be freed until after all the CPU's have gone
489  *      through a quiescent state.
490  */
491 static void __dev_remove_offload(struct packet_offload *po)
492 {
493         struct list_head *head = &offload_base;
494         struct packet_offload *po1;
495
496         spin_lock(&offload_lock);
497
498         list_for_each_entry(po1, head, list) {
499                 if (po == po1) {
500                         list_del_rcu(&po->list);
501                         goto out;
502                 }
503         }
504
505         pr_warn("dev_remove_offload: %p not found\n", po);
506 out:
507         spin_unlock(&offload_lock);
508 }
509
510 /**
511  *      dev_remove_offload       - remove packet offload handler
512  *      @po: packet offload declaration
513  *
514  *      Remove a packet offload handler that was previously added to the kernel
515  *      offload handlers by dev_add_offload(). The passed &offload_type is
516  *      removed from the kernel lists and can be freed or reused once this
517  *      function returns.
518  *
519  *      This call sleeps to guarantee that no CPU is looking at the packet
520  *      type after return.
521  */
522 void dev_remove_offload(struct packet_offload *po)
523 {
524         __dev_remove_offload(po);
525
526         synchronize_net();
527 }
528 EXPORT_SYMBOL(dev_remove_offload);
529
530 /******************************************************************************
531
532                       Device Boot-time Settings Routines
533
534 *******************************************************************************/
535
536 /* Boot time configuration table */
537 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
538
539 /**
540  *      netdev_boot_setup_add   - add new setup entry
541  *      @name: name of the device
542  *      @map: configured settings for the device
543  *
544  *      Adds a new setup entry to the dev_boot_setup list.  The function
545  *      returns 0 on error and 1 on success.  This is a generic routine for
546  *      all netdevices.
547  */
548 static int netdev_boot_setup_add(char *name, struct ifmap *map)
549 {
550         struct netdev_boot_setup *s;
551         int i;
552
553         s = dev_boot_setup;
554         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
555                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
556                         memset(s[i].name, 0, sizeof(s[i].name));
557                         strlcpy(s[i].name, name, IFNAMSIZ);
558                         memcpy(&s[i].map, map, sizeof(s[i].map));
559                         break;
560                 }
561         }
562
563         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
564 }
565
566 /**
567  *      netdev_boot_setup_check - check boot time settings
568  *      @dev: the netdevice
569  *
570  *      Check boot time settings for the device.
571  *      The found settings are applied to the device so that they can be
572  *      used later during device probing.
573  *      Returns 0 if no settings are found, 1 if they are.
574  */
575 int netdev_boot_setup_check(struct net_device *dev)
576 {
577         struct netdev_boot_setup *s = dev_boot_setup;
578         int i;
579
580         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
581                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
582                     !strcmp(dev->name, s[i].name)) {
583                         dev->irq        = s[i].map.irq;
584                         dev->base_addr  = s[i].map.base_addr;
585                         dev->mem_start  = s[i].map.mem_start;
586                         dev->mem_end    = s[i].map.mem_end;
587                         return 1;
588                 }
589         }
590         return 0;
591 }
592 EXPORT_SYMBOL(netdev_boot_setup_check);
593
594
595 /**
596  *      netdev_boot_base        - get address from boot time settings
597  *      @prefix: prefix for network device
598  *      @unit: id for network device
599  *
600  *      Check boot time settings for the base address of device.
601  *      The found settings are applied to the device so that they can be
602  *      used later during device probing.
603  *      Returns 0 if no settings are found.
604  */
605 unsigned long netdev_boot_base(const char *prefix, int unit)
606 {
607         const struct netdev_boot_setup *s = dev_boot_setup;
608         char name[IFNAMSIZ];
609         int i;
610
611         sprintf(name, "%s%d", prefix, unit);
612
613         /*
614          * If device already registered then return base of 1
615          * to indicate not to probe for this interface
616          */
617         if (__dev_get_by_name(&init_net, name))
618                 return 1;
619
620         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
621                 if (!strcmp(name, s[i].name))
622                         return s[i].map.base_addr;
623         return 0;
624 }
625
626 /*
627  * Saves at boot time configured settings for any netdevice.
628  */
629 int __init netdev_boot_setup(char *str)
630 {
631         int ints[5];
632         struct ifmap map;
633
634         str = get_options(str, ARRAY_SIZE(ints), ints);
635         if (!str || !*str)
636                 return 0;
637
638         /* Save settings */
639         memset(&map, 0, sizeof(map));
640         if (ints[0] > 0)
641                 map.irq = ints[1];
642         if (ints[0] > 1)
643                 map.base_addr = ints[2];
644         if (ints[0] > 2)
645                 map.mem_start = ints[3];
646         if (ints[0] > 3)
647                 map.mem_end = ints[4];
648
649         /* Add new entry to the list */
650         return netdev_boot_setup_add(str, &map);
651 }
652
653 __setup("netdev=", netdev_boot_setup);
654
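/* For example, booting with "netdev=9,0x300,0,0,eth0" records irq 9 and I/O
 * base 0x300 for eth0, which netdev_boot_setup_check() applies later during
 * probing. (The values here are only an illustration.)
 */
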
655 /*******************************************************************************
656
657                             Device Interface Subroutines
658
659 *******************************************************************************/
660
661 /**
662  *      __dev_get_by_name       - find a device by its name
663  *      @net: the applicable net namespace
664  *      @name: name to find
665  *
666  *      Find an interface by name. Must be called under RTNL semaphore
667  *      or @dev_base_lock. If the name is found a pointer to the device
668  *      is returned. If the name is not found then %NULL is returned. The
669  *      reference counters are not incremented so the caller must be
670  *      careful with locks.
671  */
672
673 struct net_device *__dev_get_by_name(struct net *net, const char *name)
674 {
675         struct net_device *dev;
676         struct hlist_head *head = dev_name_hash(net, name);
677
678         hlist_for_each_entry(dev, head, name_hlist)
679                 if (!strncmp(dev->name, name, IFNAMSIZ))
680                         return dev;
681
682         return NULL;
683 }
684 EXPORT_SYMBOL(__dev_get_by_name);
685
686 /**
687  *      dev_get_by_name_rcu     - find a device by its name
688  *      @net: the applicable net namespace
689  *      @name: name to find
690  *
691  *      Find an interface by name.
692  *      If the name is found a pointer to the device is returned.
693  *      If the name is not found then %NULL is returned.
694  *      The reference counters are not incremented so the caller must be
695  *      careful with locks. The caller must hold RCU lock.
696  */
697
698 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
699 {
700         struct net_device *dev;
701         struct hlist_head *head = dev_name_hash(net, name);
702
703         hlist_for_each_entry_rcu(dev, head, name_hlist)
704                 if (!strncmp(dev->name, name, IFNAMSIZ))
705                         return dev;
706
707         return NULL;
708 }
709 EXPORT_SYMBOL(dev_get_by_name_rcu);
710
711 /**
712  *      dev_get_by_name         - find a device by its name
713  *      @net: the applicable net namespace
714  *      @name: name to find
715  *
716  *      Find an interface by name. This can be called from any
717  *      context and does its own locking. The returned handle has
718  *      the usage count incremented and the caller must use dev_put() to
719  *      release it when it is no longer needed. %NULL is returned if no
720  *      matching device is found.
721  */
722
723 struct net_device *dev_get_by_name(struct net *net, const char *name)
724 {
725         struct net_device *dev;
726
727         rcu_read_lock();
728         dev = dev_get_by_name_rcu(net, name);
729         if (dev)
730                 dev_hold(dev);
731         rcu_read_unlock();
732         return dev;
733 }
734 EXPORT_SYMBOL(dev_get_by_name);
735
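/* Illustrative sketch, not part of the original file: the hold/put pattern
 * expected of dev_get_by_name() callers. The function name and "eth0" are
 * assumptions for illustration only.
 */
#if 0
static void example_lookup_eth0(void)
{
	struct net_device *dev;

	dev = dev_get_by_name(&init_net, "eth0");
	if (!dev)
		return;
	/* ... use dev ... */
	dev_put(dev);		/* drop the reference taken above */
}
#endif
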
736 /**
737  *      __dev_get_by_index - find a device by its ifindex
738  *      @net: the applicable net namespace
739  *      @ifindex: index of device
740  *
741  *      Search for an interface by index. Returns a pointer to the device,
742  *      or %NULL if the device is not found. The device has not
743  *      had its reference counter increased so the caller must be careful
744  *      about locking. The caller must hold either the RTNL semaphore
745  *      or @dev_base_lock.
746  */
747
748 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
749 {
750         struct net_device *dev;
751         struct hlist_head *head = dev_index_hash(net, ifindex);
752
753         hlist_for_each_entry(dev, head, index_hlist)
754                 if (dev->ifindex == ifindex)
755                         return dev;
756
757         return NULL;
758 }
759 EXPORT_SYMBOL(__dev_get_by_index);
760
761 /**
762  *      dev_get_by_index_rcu - find a device by its ifindex
763  *      @net: the applicable net namespace
764  *      @ifindex: index of device
765  *
766  *      Search for an interface by index. Returns a pointer to the device,
767  *      or %NULL if the device is not found. The device has not
768  *      had its reference counter increased so the caller must be careful
769  *      about locking. The caller must hold RCU lock.
770  */
771
772 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
773 {
774         struct net_device *dev;
775         struct hlist_head *head = dev_index_hash(net, ifindex);
776
777         hlist_for_each_entry_rcu(dev, head, index_hlist)
778                 if (dev->ifindex == ifindex)
779                         return dev;
780
781         return NULL;
782 }
783 EXPORT_SYMBOL(dev_get_by_index_rcu);
784
785
786 /**
787  *      dev_get_by_index - find a device by its ifindex
788  *      @net: the applicable net namespace
789  *      @ifindex: index of device
790  *
791  *      Search for an interface by index. Returns a pointer to the device,
792  *      or NULL if the device is not found. The device returned has
793  *      had a reference added and the pointer is safe until the user calls
794  *      dev_put to indicate they have finished with it.
795  */
796
797 struct net_device *dev_get_by_index(struct net *net, int ifindex)
798 {
799         struct net_device *dev;
800
801         rcu_read_lock();
802         dev = dev_get_by_index_rcu(net, ifindex);
803         if (dev)
804                 dev_hold(dev);
805         rcu_read_unlock();
806         return dev;
807 }
808 EXPORT_SYMBOL(dev_get_by_index);
809
810 /**
811  *      netdev_get_name - get a netdevice name, knowing its ifindex.
812  *      @net: network namespace
813  *      @name: a pointer to the buffer where the name will be stored.
814  *      @ifindex: the ifindex of the interface to get the name from.
815  *
816  *      The use of raw_seqcount_begin() and cond_resched() before
817  *      retrying is required as we want to give the writers a chance
818  *      to complete when CONFIG_PREEMPT is not set.
819  */
820 int netdev_get_name(struct net *net, char *name, int ifindex)
821 {
822         struct net_device *dev;
823         unsigned int seq;
824
825 retry:
826         seq = raw_seqcount_begin(&devnet_rename_seq);
827         rcu_read_lock();
828         dev = dev_get_by_index_rcu(net, ifindex);
829         if (!dev) {
830                 rcu_read_unlock();
831                 return -ENODEV;
832         }
833
834         strcpy(name, dev->name);
835         rcu_read_unlock();
836         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
837                 cond_resched();
838                 goto retry;
839         }
840
841         return 0;
842 }
843
844 /**
845  *      dev_getbyhwaddr_rcu - find a device by its hardware address
846  *      @net: the applicable net namespace
847  *      @type: media type of device
848  *      @ha: hardware address
849  *
850  *      Search for an interface by MAC address. Returns a pointer to the
851  *      device, or NULL if the device is not found.
852  *      The caller must hold RCU or RTNL.
853  *      The returned device has not had its ref count increased
854  *      and the caller must therefore be careful about locking
855  *
856  */
857
858 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
859                                        const char *ha)
860 {
861         struct net_device *dev;
862
863         for_each_netdev_rcu(net, dev)
864                 if (dev->type == type &&
865                     !memcmp(dev->dev_addr, ha, dev->addr_len))
866                         return dev;
867
868         return NULL;
869 }
870 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
871
872 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
873 {
874         struct net_device *dev;
875
876         ASSERT_RTNL();
877         for_each_netdev(net, dev)
878                 if (dev->type == type)
879                         return dev;
880
881         return NULL;
882 }
883 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
884
885 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
886 {
887         struct net_device *dev, *ret = NULL;
888
889         rcu_read_lock();
890         for_each_netdev_rcu(net, dev)
891                 if (dev->type == type) {
892                         dev_hold(dev);
893                         ret = dev;
894                         break;
895                 }
896         rcu_read_unlock();
897         return ret;
898 }
899 EXPORT_SYMBOL(dev_getfirstbyhwtype);
900
901 /**
902  *      __dev_get_by_flags - find any device with given flags
903  *      @net: the applicable net namespace
904  *      @if_flags: IFF_* values
905  *      @mask: bitmask of bits in if_flags to check
906  *
907  *      Search for any interface with the given flags. Returns a pointer to
908  *      the device, or NULL if no matching device is found. Must be called inside
909  *      rtnl_lock(), and result refcount is unchanged.
910  */
911
912 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
913                                       unsigned short mask)
914 {
915         struct net_device *dev, *ret;
916
917         ASSERT_RTNL();
918
919         ret = NULL;
920         for_each_netdev(net, dev) {
921                 if (((dev->flags ^ if_flags) & mask) == 0) {
922                         ret = dev;
923                         break;
924                 }
925         }
926         return ret;
927 }
928 EXPORT_SYMBOL(__dev_get_by_flags);
929
930 /**
931  *      dev_valid_name - check if name is okay for network device
932  *      @name: name string
933  *
934  *      Network device names need to be valid file names to
935  *      allow sysfs to work.  We also disallow any kind of
936  *      whitespace.
937  */
938 bool dev_valid_name(const char *name)
939 {
940         if (*name == '\0')
941                 return false;
942         if (strlen(name) >= IFNAMSIZ)
943                 return false;
944         if (!strcmp(name, ".") || !strcmp(name, ".."))
945                 return false;
946
947         while (*name) {
948                 if (*name == '/' || isspace(*name))
949                         return false;
950                 name++;
951         }
952         return true;
953 }
954 EXPORT_SYMBOL(dev_valid_name);
955
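/* For example, "eth0" is a valid name, while "", ".", "..", names containing
 * '/' or whitespace, and names of IFNAMSIZ or more characters are rejected.
 */
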
956 /**
957  *      __dev_alloc_name - allocate a name for a device
958  *      @net: network namespace to allocate the device name in
959  *      @name: name format string
960  *      @buf:  scratch buffer and result name string
961  *
962  *      Passed a format string - eg "lt%d" - it will try to find a suitable
963  *      id. It scans the list of devices to build up a free map, then chooses
964  *      the first empty slot. The caller must hold the dev_base or rtnl lock
965  *      while allocating the name and adding the device in order to avoid
966  *      duplicates.
967  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
968  *      Returns the number of the unit assigned or a negative errno code.
969  */
970
971 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
972 {
973         int i = 0;
974         const char *p;
975         const int max_netdevices = 8*PAGE_SIZE;
976         unsigned long *inuse;
977         struct net_device *d;
978
979         p = strnchr(name, IFNAMSIZ-1, '%');
980         if (p) {
981                 /*
982                  * Verify the string as this thing may have come from
983                  * the user.  There must be either one "%d" and no other "%"
984                  * characters.
985                  */
986                 if (p[1] != 'd' || strchr(p + 2, '%'))
987                         return -EINVAL;
988
989                 /* Use one page as a bit array of possible slots */
990                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
991                 if (!inuse)
992                         return -ENOMEM;
993
994                 for_each_netdev(net, d) {
995                         if (!sscanf(d->name, name, &i))
996                                 continue;
997                         if (i < 0 || i >= max_netdevices)
998                                 continue;
999
1000                         /*  avoid cases where sscanf is not exact inverse of printf */
1001                         snprintf(buf, IFNAMSIZ, name, i);
1002                         if (!strncmp(buf, d->name, IFNAMSIZ))
1003                                 set_bit(i, inuse);
1004                 }
1005
1006                 i = find_first_zero_bit(inuse, max_netdevices);
1007                 free_page((unsigned long) inuse);
1008         }
1009
1010         if (buf != name)
1011                 snprintf(buf, IFNAMSIZ, name, i);
1012         if (!__dev_get_by_name(net, buf))
1013                 return i;
1014
1015         /* It is possible to run out of possible slots
1016          * when the name is long and there isn't enough space left
1017          * for the digits, or if all bits are used.
1018          */
1019         return -ENFILE;
1020 }
1021
1022 /**
1023  *      dev_alloc_name - allocate a name for a device
1024  *      @dev: device
1025  *      @name: name format string
1026  *
1027  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1028  *      id. It scans the list of devices to build up a free map, then chooses
1029  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1030  *      while allocating the name and adding the device in order to avoid
1031  *      duplicates.
1032  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033  *      Returns the number of the unit assigned or a negative errno code.
1034  */
1035
1036 int dev_alloc_name(struct net_device *dev, const char *name)
1037 {
1038         char buf[IFNAMSIZ];
1039         struct net *net;
1040         int ret;
1041
1042         BUG_ON(!dev_net(dev));
1043         net = dev_net(dev);
1044         ret = __dev_alloc_name(net, name, buf);
1045         if (ret >= 0)
1046                 strlcpy(dev->name, buf, IFNAMSIZ);
1047         return ret;
1048 }
1049 EXPORT_SYMBOL(dev_alloc_name);
1050
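/* For example, dev_alloc_name(dev, "eth%d") fills dev->name with the first
 * unused "ethN" name in dev's namespace, following the format-string
 * convention described above.
 */
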
1051 static int dev_alloc_name_ns(struct net *net,
1052                              struct net_device *dev,
1053                              const char *name)
1054 {
1055         char buf[IFNAMSIZ];
1056         int ret;
1057
1058         ret = __dev_alloc_name(net, name, buf);
1059         if (ret >= 0)
1060                 strlcpy(dev->name, buf, IFNAMSIZ);
1061         return ret;
1062 }
1063
1064 static int dev_get_valid_name(struct net *net,
1065                               struct net_device *dev,
1066                               const char *name)
1067 {
1068         BUG_ON(!net);
1069
1070         if (!dev_valid_name(name))
1071                 return -EINVAL;
1072
1073         if (strchr(name, '%'))
1074                 return dev_alloc_name_ns(net, dev, name);
1075         else if (__dev_get_by_name(net, name))
1076                 return -EEXIST;
1077         else if (dev->name != name)
1078                 strlcpy(dev->name, name, IFNAMSIZ);
1079
1080         return 0;
1081 }
1082
1083 /**
1084  *      dev_change_name - change name of a device
1085  *      @dev: device
1086  *      @newname: name (or format string) must be at least IFNAMSIZ
1087  *
1088  *      Change the name of a device; a format string such as "eth%d" can be
1089  *      passed for wildcarding.
1090  */
1091 int dev_change_name(struct net_device *dev, const char *newname)
1092 {
1093         unsigned char old_assign_type;
1094         char oldname[IFNAMSIZ];
1095         int err = 0;
1096         int ret;
1097         struct net *net;
1098
1099         ASSERT_RTNL();
1100         BUG_ON(!dev_net(dev));
1101
1102         net = dev_net(dev);
1103         if (dev->flags & IFF_UP)
1104                 return -EBUSY;
1105
1106         write_seqcount_begin(&devnet_rename_seq);
1107
1108         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1109                 write_seqcount_end(&devnet_rename_seq);
1110                 return 0;
1111         }
1112
1113         memcpy(oldname, dev->name, IFNAMSIZ);
1114
1115         err = dev_get_valid_name(net, dev, newname);
1116         if (err < 0) {
1117                 write_seqcount_end(&devnet_rename_seq);
1118                 return err;
1119         }
1120
1121         if (oldname[0] && !strchr(oldname, '%'))
1122                 netdev_info(dev, "renamed from %s\n", oldname);
1123
1124         old_assign_type = dev->name_assign_type;
1125         dev->name_assign_type = NET_NAME_RENAMED;
1126
1127 rollback:
1128         ret = device_rename(&dev->dev, dev->name);
1129         if (ret) {
1130                 memcpy(dev->name, oldname, IFNAMSIZ);
1131                 dev->name_assign_type = old_assign_type;
1132                 write_seqcount_end(&devnet_rename_seq);
1133                 return ret;
1134         }
1135
1136         write_seqcount_end(&devnet_rename_seq);
1137
1138         netdev_adjacent_rename_links(dev, oldname);
1139
1140         write_lock_bh(&dev_base_lock);
1141         hlist_del_rcu(&dev->name_hlist);
1142         write_unlock_bh(&dev_base_lock);
1143
1144         synchronize_rcu();
1145
1146         write_lock_bh(&dev_base_lock);
1147         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1148         write_unlock_bh(&dev_base_lock);
1149
1150         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1151         ret = notifier_to_errno(ret);
1152
1153         if (ret) {
1154                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1155                 if (err >= 0) {
1156                         err = ret;
1157                         write_seqcount_begin(&devnet_rename_seq);
1158                         memcpy(dev->name, oldname, IFNAMSIZ);
1159                         memcpy(oldname, newname, IFNAMSIZ);
1160                         dev->name_assign_type = old_assign_type;
1161                         old_assign_type = NET_NAME_RENAMED;
1162                         goto rollback;
1163                 } else {
1164                         pr_err("%s: name change rollback failed: %d\n",
1165                                dev->name, ret);
1166                 }
1167         }
1168
1169         return err;
1170 }
1171
1172 /**
1173  *      dev_set_alias - change ifalias of a device
1174  *      @dev: device
1175  *      @alias: name up to IFALIASZ
1176  *      @len: limit of bytes to copy from info
1177  *
1178  *      Set ifalias for a device.
1179  */
1180 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181 {
1182         char *new_ifalias;
1183
1184         ASSERT_RTNL();
1185
1186         if (len >= IFALIASZ)
1187                 return -EINVAL;
1188
1189         if (!len) {
1190                 kfree(dev->ifalias);
1191                 dev->ifalias = NULL;
1192                 return 0;
1193         }
1194
1195         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196         if (!new_ifalias)
1197                 return -ENOMEM;
1198         dev->ifalias = new_ifalias;
1199
1200         strlcpy(dev->ifalias, alias, len+1);
1201         return len;
1202 }
1203
1204
1205 /**
1206  *      netdev_features_change - device changes features
1207  *      @dev: device to cause notification
1208  *
1209  *      Called to indicate a device has changed features.
1210  */
1211 void netdev_features_change(struct net_device *dev)
1212 {
1213         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1214 }
1215 EXPORT_SYMBOL(netdev_features_change);
1216
1217 /**
1218  *      netdev_state_change - device changes state
1219  *      @dev: device to cause notification
1220  *
1221  *      Called to indicate a device has changed state. This function calls
1222  *      the notifier chains for netdev_chain and sends a NEWLINK message
1223  *      to the routing socket.
1224  */
1225 void netdev_state_change(struct net_device *dev)
1226 {
1227         if (dev->flags & IFF_UP) {
1228                 struct netdev_notifier_change_info change_info;
1229
1230                 change_info.flags_changed = 0;
1231                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232                                               &change_info.info);
1233                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1234         }
1235 }
1236 EXPORT_SYMBOL(netdev_state_change);
1237
1238 /**
1239  *      netdev_notify_peers - notify network peers about existence of @dev
1240  *      @dev: network device
1241  *
1242  * Generate traffic such that interested network peers are aware of
1243  * @dev, such as by generating a gratuitous ARP. This may be used when
1244  * a device wants to inform the rest of the network about some sort of
1245  * reconfiguration such as a failover event or virtual machine
1246  * migration.
1247  */
1248 void netdev_notify_peers(struct net_device *dev)
1249 {
1250         rtnl_lock();
1251         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1252         rtnl_unlock();
1253 }
1254 EXPORT_SYMBOL(netdev_notify_peers);
1255
1256 static int __dev_open(struct net_device *dev)
1257 {
1258         const struct net_device_ops *ops = dev->netdev_ops;
1259         int ret;
1260
1261         ASSERT_RTNL();
1262
1263         if (!netif_device_present(dev))
1264                 return -ENODEV;
1265
1266         /* Block netpoll from trying to do any rx path servicing.
1267          * If we don't do this there is a chance ndo_poll_controller
1268          * or ndo_poll may be running while we open the device
1269          */
1270         netpoll_poll_disable(dev);
1271
1272         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273         ret = notifier_to_errno(ret);
1274         if (ret)
1275                 return ret;
1276
1277         set_bit(__LINK_STATE_START, &dev->state);
1278
1279         if (ops->ndo_validate_addr)
1280                 ret = ops->ndo_validate_addr(dev);
1281
1282         if (!ret && ops->ndo_open)
1283                 ret = ops->ndo_open(dev);
1284
1285         netpoll_poll_enable(dev);
1286
1287         if (ret)
1288                 clear_bit(__LINK_STATE_START, &dev->state);
1289         else {
1290                 dev->flags |= IFF_UP;
1291                 dev_set_rx_mode(dev);
1292                 dev_activate(dev);
1293                 add_device_randomness(dev->dev_addr, dev->addr_len);
1294         }
1295
1296         return ret;
1297 }
1298
1299 /**
1300  *      dev_open        - prepare an interface for use.
1301  *      @dev:   device to open
1302  *
1303  *      Takes a device from down to up state. The device's private open
1304  *      function is invoked and then the multicast lists are loaded. Finally
1305  *      the device is moved into the up state and a %NETDEV_UP message is
1306  *      sent to the netdev notifier chain.
1307  *
1308  *      Calling this function on an active interface is a nop. On a failure
1309  *      a negative errno code is returned.
1310  */
1311 int dev_open(struct net_device *dev)
1312 {
1313         int ret;
1314
1315         if (dev->flags & IFF_UP)
1316                 return 0;
1317
1318         ret = __dev_open(dev);
1319         if (ret < 0)
1320                 return ret;
1321
1322         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323         call_netdevice_notifiers(NETDEV_UP, dev);
1324
1325         return ret;
1326 }
1327 EXPORT_SYMBOL(dev_open);
1328
1329 static int __dev_close_many(struct list_head *head)
1330 {
1331         struct net_device *dev;
1332
1333         ASSERT_RTNL();
1334         might_sleep();
1335
1336         list_for_each_entry(dev, head, close_list) {
1337                 /* Temporarily disable netpoll until the interface is down */
1338                 netpoll_poll_disable(dev);
1339
1340                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1341
1342                 clear_bit(__LINK_STATE_START, &dev->state);
1343
1344                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1345                  * can be even on different cpu. So just clear netif_running().
1346                  *
1347                  * dev->stop() will invoke napi_disable() on all of its
1348                  * napi_struct instances on this device.
1349                  */
1350                 smp_mb__after_atomic(); /* Commit netif_running(). */
1351         }
1352
1353         dev_deactivate_many(head);
1354
1355         list_for_each_entry(dev, head, close_list) {
1356                 const struct net_device_ops *ops = dev->netdev_ops;
1357
1358                 /*
1359                  *      Call the device specific close. This cannot fail.
1360                  *      Only if device is UP
1361                  *
1362                  *      We allow it to be called even after a DETACH hot-plug
1363                  *      event.
1364                  */
1365                 if (ops->ndo_stop)
1366                         ops->ndo_stop(dev);
1367
1368                 dev->flags &= ~IFF_UP;
1369                 netpoll_poll_enable(dev);
1370         }
1371
1372         return 0;
1373 }
1374
1375 static int __dev_close(struct net_device *dev)
1376 {
1377         int retval;
1378         LIST_HEAD(single);
1379
1380         list_add(&dev->close_list, &single);
1381         retval = __dev_close_many(&single);
1382         list_del(&single);
1383
1384         return retval;
1385 }
1386
1387 static int dev_close_many(struct list_head *head)
1388 {
1389         struct net_device *dev, *tmp;
1390
1391         /* Remove the devices that don't need to be closed */
1392         list_for_each_entry_safe(dev, tmp, head, close_list)
1393                 if (!(dev->flags & IFF_UP))
1394                         list_del_init(&dev->close_list);
1395
1396         __dev_close_many(head);
1397
1398         list_for_each_entry_safe(dev, tmp, head, close_list) {
1399                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1401                 list_del_init(&dev->close_list);
1402         }
1403
1404         return 0;
1405 }
1406
1407 /**
1408  *      dev_close - shutdown an interface.
1409  *      @dev: device to shutdown
1410  *
1411  *      This function moves an active device into down state. A
1412  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *      chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418         if (dev->flags & IFF_UP) {
1419                 LIST_HEAD(single);
1420
1421                 list_add(&dev->close_list, &single);
1422                 dev_close_many(&single);
1423                 list_del(&single);
1424         }
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
1428
1429
1430 /**
1431  *      dev_disable_lro - disable Large Receive Offload on a device
1432  *      @dev: device
1433  *
1434  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *      called under RTNL.  This is needed if received packets may be
1436  *      forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440         struct net_device *lower_dev;
1441         struct list_head *iter;
1442
1443         dev->wanted_features &= ~NETIF_F_LRO;
1444         netdev_update_features(dev);
1445
1446         if (unlikely(dev->features & NETIF_F_LRO))
1447                 netdev_WARN(dev, "failed to disable LRO!\n");
1448
1449         netdev_for_each_lower_dev(dev, lower_dev, iter)
1450                 dev_disable_lro(lower_dev);
1451 }
1452 EXPORT_SYMBOL(dev_disable_lro);
1453
1454 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1455                                    struct net_device *dev)
1456 {
1457         struct netdev_notifier_info info;
1458
1459         netdev_notifier_info_init(&info, dev);
1460         return nb->notifier_call(nb, val, &info);
1461 }
1462
1463 static int dev_boot_phase = 1;
1464
1465 /**
1466  *      register_netdevice_notifier - register a network notifier block
1467  *      @nb: notifier
1468  *
1469  *      Register a notifier to be called when network device events occur.
1470  *      The notifier passed is linked into the kernel structures and must
1471  *      not be reused until it has been unregistered. A negative errno code
1472  *      is returned on a failure.
1473  *
1474  *      When registered, all registration and up events are replayed
1475  *      to the new notifier to allow the device to have a race-free
1476  *      view of the network device list.
1477  */
1478
1479 int register_netdevice_notifier(struct notifier_block *nb)
1480 {
1481         struct net_device *dev;
1482         struct net_device *last;
1483         struct net *net;
1484         int err;
1485
1486         rtnl_lock();
1487         err = raw_notifier_chain_register(&netdev_chain, nb);
1488         if (err)
1489                 goto unlock;
1490         if (dev_boot_phase)
1491                 goto unlock;
1492         for_each_net(net) {
1493                 for_each_netdev(net, dev) {
1494                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1495                         err = notifier_to_errno(err);
1496                         if (err)
1497                                 goto rollback;
1498
1499                         if (!(dev->flags & IFF_UP))
1500                                 continue;
1501
1502                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1503                 }
1504         }
1505
1506 unlock:
1507         rtnl_unlock();
1508         return err;
1509
1510 rollback:
1511         last = dev;
1512         for_each_net(net) {
1513                 for_each_netdev(net, dev) {
1514                         if (dev == last)
1515                                 goto outroll;
1516
1517                         if (dev->flags & IFF_UP) {
1518                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1519                                                         dev);
1520                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1521                         }
1522                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1523                 }
1524         }
1525
1526 outroll:
1527         raw_notifier_chain_unregister(&netdev_chain, nb);
1528         goto unlock;
1529 }
1530 EXPORT_SYMBOL(register_netdevice_notifier);
1531
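/* Illustrative sketch, not part of the original file: a minimal notifier that
 * logs NETDEV_UP events. All names here are assumptions for illustration only.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		pr_info("%s is now up\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/* e.g. from module init:  register_netdevice_notifier(&example_netdev_nb); */
#endif
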
1532 /**
1533  *      unregister_netdevice_notifier - unregister a network notifier block
1534  *      @nb: notifier
1535  *
1536  *      Unregister a notifier previously registered by
1537  *      register_netdevice_notifier(). The notifier is unlinked from the
1538  *      kernel structures and may then be reused. A negative errno code
1539  *      is returned on a failure.
1540  *
1541  *      After unregistering unregister and down device events are synthesized
1542  *      for all devices on the device list to the removed notifier to remove
1543  *      the need for special case cleanup code.
1544  */
1545
1546 int unregister_netdevice_notifier(struct notifier_block *nb)
1547 {
1548         struct net_device *dev;
1549         struct net *net;
1550         int err;
1551
1552         rtnl_lock();
1553         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1554         if (err)
1555                 goto unlock;
1556
1557         for_each_net(net) {
1558                 for_each_netdev(net, dev) {
1559                         if (dev->flags & IFF_UP) {
1560                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1561                                                         dev);
1562                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1563                         }
1564                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1565                 }
1566         }
1567 unlock:
1568         rtnl_unlock();
1569         return err;
1570 }
1571 EXPORT_SYMBOL(unregister_netdevice_notifier);
1572
1573 /**
1574  *      call_netdevice_notifiers_info - call all network notifier blocks
1575  *      @val: value passed unmodified to notifier function
1576  *      @dev: net_device pointer passed unmodified to notifier function
1577  *      @info: notifier information data
1578  *
1579  *      Call all network notifier blocks.  Parameters and return value
1580  *      are as for raw_notifier_call_chain().
1581  */
1582
1583 static int call_netdevice_notifiers_info(unsigned long val,
1584                                          struct net_device *dev,
1585                                          struct netdev_notifier_info *info)
1586 {
1587         ASSERT_RTNL();
1588         netdev_notifier_info_init(info, dev);
1589         return raw_notifier_call_chain(&netdev_chain, val, info);
1590 }
1591
1592 /**
1593  *      call_netdevice_notifiers - call all network notifier blocks
1594  *      @val: value passed unmodified to notifier function
1595  *      @dev: net_device pointer passed unmodified to notifier function
1596  *
1597  *      Call all network notifier blocks.  Parameters and return value
1598  *      are as for raw_notifier_call_chain().
1599  */
1600
1601 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1602 {
1603         struct netdev_notifier_info info;
1604
1605         return call_netdevice_notifiers_info(val, dev, &info);
1606 }
1607 EXPORT_SYMBOL(call_netdevice_notifiers);
1608
1609 static struct static_key netstamp_needed __read_mostly;
1610 #ifdef HAVE_JUMP_LABEL
1611 /* We are not allowed to call static_key_slow_dec() from irq context.
1612  * If net_disable_timestamp() is called from irq context, defer the
1613  * static_key_slow_dec() calls.
1614  */
1615 static atomic_t netstamp_needed_deferred;
1616 #endif
1617
1618 void net_enable_timestamp(void)
1619 {
1620 #ifdef HAVE_JUMP_LABEL
1621         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1622
1623         if (deferred) {
1624                 while (--deferred)
1625                         static_key_slow_dec(&netstamp_needed);
1626                 return;
1627         }
1628 #endif
1629         static_key_slow_inc(&netstamp_needed);
1630 }
1631 EXPORT_SYMBOL(net_enable_timestamp);
1632
1633 void net_disable_timestamp(void)
1634 {
1635 #ifdef HAVE_JUMP_LABEL
1636         if (in_interrupt()) {
1637                 atomic_inc(&netstamp_needed_deferred);
1638                 return;
1639         }
1640 #endif
1641         static_key_slow_dec(&netstamp_needed);
1642 }
1643 EXPORT_SYMBOL(net_disable_timestamp);
1644
1645 static inline void net_timestamp_set(struct sk_buff *skb)
1646 {
1647         skb->tstamp.tv64 = 0;
1648         if (static_key_false(&netstamp_needed))
1649                 __net_timestamp(skb);
1650 }
1651
1652 #define net_timestamp_check(COND, SKB)                  \
1653         if (static_key_false(&netstamp_needed)) {               \
1654                 if ((COND) && !(SKB)->tstamp.tv64)      \
1655                         __net_timestamp(SKB);           \
1656         }                                               \
1657
1658 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1659 {
1660         unsigned int len;
1661
1662         if (!(dev->flags & IFF_UP))
1663                 return false;
1664
1665         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666         if (skb->len <= len)
1667                 return true;
1668
1669         /* if TSO is enabled, we don't care about the length as the packet
1670          * could be forwarded without being segmented before
1671          */
1672         if (skb_is_gso(skb))
1673                 return true;
1674
1675         return false;
1676 }
1677 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1678
1679 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680 {
1681         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683                         atomic_long_inc(&dev->rx_dropped);
1684                         kfree_skb(skb);
1685                         return NET_RX_DROP;
1686                 }
1687         }
1688
1689         if (unlikely(!is_skb_forwardable(dev, skb))) {
1690                 atomic_long_inc(&dev->rx_dropped);
1691                 kfree_skb(skb);
1692                 return NET_RX_DROP;
1693         }
1694
1695         skb_scrub_packet(skb, true);
1696         skb->protocol = eth_type_trans(skb, dev);
1697         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1698
1699         return 0;
1700 }
1701 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1702
1703 /**
1704  * dev_forward_skb - loopback an skb to another netif
1705  *
1706  * @dev: destination network device
1707  * @skb: buffer to forward
1708  *
1709  * return values:
1710  *      NET_RX_SUCCESS  (no congestion)
1711  *      NET_RX_DROP     (packet was dropped, but freed)
1712  *
1713  * dev_forward_skb can be used for injecting an skb from the
1714  * start_xmit function of one device into the receive queue
1715  * of another device.
1716  *
1717  * The receiving device may be in another namespace, so
1718  * we have to clear all information in the skb that could
1719  * impact namespace isolation.
1720  */
1721 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1722 {
1723         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1724 }
1725 EXPORT_SYMBOL_GPL(dev_forward_skb);
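
/*
 * Illustrative sketch, not part of dev.c: how a veth-like driver could
 * hand frames to its peer with dev_forward_skb().  "example_pair_priv"
 * and its ->peer member are hypothetical.
 */
struct example_pair_priv {
        struct net_device *peer;
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct example_pair_priv *priv = netdev_priv(dev);

        /* dev_forward_skb() scrubs the skb for the namespace crossing and
         * injects it into the peer's receive path; the skb is always
         * consumed, so there is nothing to retry on NET_RX_DROP.
         */
        if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
                dev->stats.tx_dropped++;

        return NETDEV_TX_OK;
}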
1726
1727 static inline int deliver_skb(struct sk_buff *skb,
1728                               struct packet_type *pt_prev,
1729                               struct net_device *orig_dev)
1730 {
1731         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1732                 return -ENOMEM;
1733         atomic_inc(&skb->users);
1734         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1735 }
1736
1737 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1738 {
1739         if (!ptype->af_packet_priv || !skb->sk)
1740                 return false;
1741
1742         if (ptype->id_match)
1743                 return ptype->id_match(ptype, skb->sk);
1744         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1745                 return true;
1746
1747         return false;
1748 }
1749
1750 /*
1751  *      Support routine. Sends outgoing frames to any network
1752  *      taps currently in use.
1753  */
1754
1755 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1756 {
1757         struct packet_type *ptype;
1758         struct sk_buff *skb2 = NULL;
1759         struct packet_type *pt_prev = NULL;
1760
1761         rcu_read_lock();
1762         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1763                 /* Never send packets back to the socket
1764                  * they originated from - MvS (miquels@drinkel.ow.org)
1765                  */
1766                 if ((ptype->dev == dev || !ptype->dev) &&
1767                     (!skb_loop_sk(ptype, skb))) {
1768                         if (pt_prev) {
1769                                 deliver_skb(skb2, pt_prev, skb->dev);
1770                                 pt_prev = ptype;
1771                                 continue;
1772                         }
1773
1774                         skb2 = skb_clone(skb, GFP_ATOMIC);
1775                         if (!skb2)
1776                                 break;
1777
1778                         net_timestamp_set(skb2);
1779
1780                         /* skb->nh should be correctly
1781                            set by the sender, so that the second statement is
1782                            just protection against buggy protocols.
1783                          */
1784                         skb_reset_mac_header(skb2);
1785
1786                         if (skb_network_header(skb2) < skb2->data ||
1787                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1788                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1789                                                      ntohs(skb2->protocol),
1790                                                      dev->name);
1791                                 skb_reset_network_header(skb2);
1792                         }
1793
1794                         skb2->transport_header = skb2->network_header;
1795                         skb2->pkt_type = PACKET_OUTGOING;
1796                         pt_prev = ptype;
1797                 }
1798         }
1799         if (pt_prev)
1800                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1801         rcu_read_unlock();
1802 }
1803
1804 /**
1805  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1806  * @dev: Network device
1807  * @txq: number of queues available
1808  *
1809  * If real_num_tx_queues is changed the tc mappings may no longer be
1810  * valid. To resolve this, verify that each tc mapping remains valid and,
1811  * if not, zero the mapping. With no priorities mapping to an
1812  * offset/count pair it will no longer be used. In the worst case, if TC0
1813  * is invalid, nothing can be done, so priority mappings are disabled. It is
1814  * expected that drivers will fix this mapping if they can before
1815  * calling netif_set_real_num_tx_queues.
1816  */
1817 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1818 {
1819         int i;
1820         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1821
1822         /* If TC0 is invalidated disable TC mapping */
1823         if (tc->offset + tc->count > txq) {
1824                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1825                 dev->num_tc = 0;
1826                 return;
1827         }
1828
1829         /* Invalidated prio to tc mappings set to TC0 */
1830         for (i = 1; i < TC_BITMASK + 1; i++) {
1831                 int q = netdev_get_prio_tc_map(dev, i);
1832
1833                 tc = &dev->tc_to_txq[q];
1834                 if (tc->offset + tc->count > txq) {
1835                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1836                                 i, q);
1837                         netdev_set_prio_tc_map(dev, i, 0);
1838                 }
1839         }
1840 }
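
/*
 * Illustrative sketch, not part of dev.c: how a multiqueue driver might
 * publish a two-traffic-class layout that the validation above can later
 * check against real_num_tx_queues.  The queue counts are made up.
 */
static void example_setup_two_tc(struct net_device *dev)
{
        int prio;

        netdev_set_num_tc(dev, 2);
        /* TC0 uses queues 0-3, TC1 uses queues 4-7 */
        netdev_set_tc_queue(dev, 0, 4, 0);
        netdev_set_tc_queue(dev, 1, 4, 4);

        /* priorities 0-3 map to TC0, everything else to TC1 */
        for (prio = 0; prio <= TC_BITMASK; prio++)
                netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
}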
1841
1842 #ifdef CONFIG_XPS
1843 static DEFINE_MUTEX(xps_map_mutex);
1844 #define xmap_dereference(P)             \
1845         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1846
1847 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1848                                         int cpu, u16 index)
1849 {
1850         struct xps_map *map = NULL;
1851         int pos;
1852
1853         if (dev_maps)
1854                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1855
1856         for (pos = 0; map && pos < map->len; pos++) {
1857                 if (map->queues[pos] == index) {
1858                         if (map->len > 1) {
1859                                 map->queues[pos] = map->queues[--map->len];
1860                         } else {
1861                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1862                                 kfree_rcu(map, rcu);
1863                                 map = NULL;
1864                         }
1865                         break;
1866                 }
1867         }
1868
1869         return map;
1870 }
1871
1872 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1873 {
1874         struct xps_dev_maps *dev_maps;
1875         int cpu, i;
1876         bool active = false;
1877
1878         mutex_lock(&xps_map_mutex);
1879         dev_maps = xmap_dereference(dev->xps_maps);
1880
1881         if (!dev_maps)
1882                 goto out_no_maps;
1883
1884         for_each_possible_cpu(cpu) {
1885                 for (i = index; i < dev->num_tx_queues; i++) {
1886                         if (!remove_xps_queue(dev_maps, cpu, i))
1887                                 break;
1888                 }
1889                 if (i == dev->num_tx_queues)
1890                         active = true;
1891         }
1892
1893         if (!active) {
1894                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1895                 kfree_rcu(dev_maps, rcu);
1896         }
1897
1898         for (i = index; i < dev->num_tx_queues; i++)
1899                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1900                                              NUMA_NO_NODE);
1901
1902 out_no_maps:
1903         mutex_unlock(&xps_map_mutex);
1904 }
1905
1906 static struct xps_map *expand_xps_map(struct xps_map *map,
1907                                       int cpu, u16 index)
1908 {
1909         struct xps_map *new_map;
1910         int alloc_len = XPS_MIN_MAP_ALLOC;
1911         int i, pos;
1912
1913         for (pos = 0; map && pos < map->len; pos++) {
1914                 if (map->queues[pos] != index)
1915                         continue;
1916                 return map;
1917         }
1918
1919         /* Need to add queue to this CPU's existing map */
1920         if (map) {
1921                 if (pos < map->alloc_len)
1922                         return map;
1923
1924                 alloc_len = map->alloc_len * 2;
1925         }
1926
1927         /* Need to allocate new map to store queue on this CPU's map */
1928         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1929                                cpu_to_node(cpu));
1930         if (!new_map)
1931                 return NULL;
1932
1933         for (i = 0; i < pos; i++)
1934                 new_map->queues[i] = map->queues[i];
1935         new_map->alloc_len = alloc_len;
1936         new_map->len = pos;
1937
1938         return new_map;
1939 }
1940
1941 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1942                         u16 index)
1943 {
1944         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1945         struct xps_map *map, *new_map;
1946         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1947         int cpu, numa_node_id = -2;
1948         bool active = false;
1949
1950         mutex_lock(&xps_map_mutex);
1951
1952         dev_maps = xmap_dereference(dev->xps_maps);
1953
1954         /* allocate memory for queue storage */
1955         for_each_online_cpu(cpu) {
1956                 if (!cpumask_test_cpu(cpu, mask))
1957                         continue;
1958
1959                 if (!new_dev_maps)
1960                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1961                 if (!new_dev_maps) {
1962                         mutex_unlock(&xps_map_mutex);
1963                         return -ENOMEM;
1964                 }
1965
1966                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1967                                  NULL;
1968
1969                 map = expand_xps_map(map, cpu, index);
1970                 if (!map)
1971                         goto error;
1972
1973                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1974         }
1975
1976         if (!new_dev_maps)
1977                 goto out_no_new_maps;
1978
1979         for_each_possible_cpu(cpu) {
1980                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1981                         /* add queue to CPU maps */
1982                         int pos = 0;
1983
1984                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1985                         while ((pos < map->len) && (map->queues[pos] != index))
1986                                 pos++;
1987
1988                         if (pos == map->len)
1989                                 map->queues[map->len++] = index;
1990 #ifdef CONFIG_NUMA
1991                         if (numa_node_id == -2)
1992                                 numa_node_id = cpu_to_node(cpu);
1993                         else if (numa_node_id != cpu_to_node(cpu))
1994                                 numa_node_id = -1;
1995 #endif
1996                 } else if (dev_maps) {
1997                         /* fill in the new device map from the old device map */
1998                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1999                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2000                 }
2001
2002         }
2003
2004         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2005
2006         /* Cleanup old maps */
2007         if (dev_maps) {
2008                 for_each_possible_cpu(cpu) {
2009                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2010                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2011                         if (map && map != new_map)
2012                                 kfree_rcu(map, rcu);
2013                 }
2014
2015                 kfree_rcu(dev_maps, rcu);
2016         }
2017
2018         dev_maps = new_dev_maps;
2019         active = true;
2020
2021 out_no_new_maps:
2022         /* update Tx queue numa node */
2023         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2024                                      (numa_node_id >= 0) ? numa_node_id :
2025                                      NUMA_NO_NODE);
2026
2027         if (!dev_maps)
2028                 goto out_no_maps;
2029
2030         /* removes queue from unused CPUs */
2031         for_each_possible_cpu(cpu) {
2032                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2033                         continue;
2034
2035                 if (remove_xps_queue(dev_maps, cpu, index))
2036                         active = true;
2037         }
2038
2039         /* free map if not active */
2040         if (!active) {
2041                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2042                 kfree_rcu(dev_maps, rcu);
2043         }
2044
2045 out_no_maps:
2046         mutex_unlock(&xps_map_mutex);
2047
2048         return 0;
2049 error:
2050         /* remove any maps that we added */
2051         for_each_possible_cpu(cpu) {
2052                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2053                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2054                                  NULL;
2055                 if (new_map && new_map != map)
2056                         kfree(new_map);
2057         }
2058
2059         mutex_unlock(&xps_map_mutex);
2060
2061         kfree(new_dev_maps);
2062         return -ENOMEM;
2063 }
2064 EXPORT_SYMBOL(netif_set_xps_queue);
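
/*
 * Illustrative sketch, not part of dev.c: a driver pinning each of its TX
 * queues to one CPU through netif_set_xps_queue().  It assumes online CPUs
 * are numbered 0..N-1 and ignores errors, which a real driver should not.
 */
static void example_default_xps(struct net_device *dev)
{
        unsigned int i;

        for (i = 0; i < dev->real_num_tx_queues; i++)
                netif_set_xps_queue(dev, cpumask_of(i % num_online_cpus()), i);
}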
2065
2066 #endif
2067 /*
2068  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2069  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2070  */
2071 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2072 {
2073         int rc;
2074
2075         if (txq < 1 || txq > dev->num_tx_queues)
2076                 return -EINVAL;
2077
2078         if (dev->reg_state == NETREG_REGISTERED ||
2079             dev->reg_state == NETREG_UNREGISTERING) {
2080                 ASSERT_RTNL();
2081
2082                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2083                                                   txq);
2084                 if (rc)
2085                         return rc;
2086
2087                 if (dev->num_tc)
2088                         netif_setup_tc(dev, txq);
2089
2090                 if (txq < dev->real_num_tx_queues) {
2091                         qdisc_reset_all_tx_gt(dev, txq);
2092 #ifdef CONFIG_XPS
2093                         netif_reset_xps_queues_gt(dev, txq);
2094 #endif
2095                 }
2096         }
2097
2098         dev->real_num_tx_queues = txq;
2099         return 0;
2100 }
2101 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2102
2103 #ifdef CONFIG_SYSFS
2104 /**
2105  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2106  *      @dev: Network device
2107  *      @rxq: Actual number of RX queues
2108  *
2109  *      This must be called either with the rtnl_lock held or before
2110  *      registration of the net device.  Returns 0 on success, or a
2111  *      negative error code.  If called before registration, it always
2112  *      succeeds.
2113  */
2114 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2115 {
2116         int rc;
2117
2118         if (rxq < 1 || rxq > dev->num_rx_queues)
2119                 return -EINVAL;
2120
2121         if (dev->reg_state == NETREG_REGISTERED) {
2122                 ASSERT_RTNL();
2123
2124                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2125                                                   rxq);
2126                 if (rc)
2127                         return rc;
2128         }
2129
2130         dev->real_num_rx_queues = rxq;
2131         return 0;
2132 }
2133 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2134 #endif
2135
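/*
 * Illustrative sketch, not part of dev.c: resizing both active queue sets
 * of an already-registered device; the rtnl lock is required by both
 * helpers in that case.  "example_set_channels" is hypothetical.
 */
static int example_set_channels(struct net_device *dev, unsigned int count)
{
        int err;

        rtnl_lock();
        err = netif_set_real_num_tx_queues(dev, count);
        if (!err)
                err = netif_set_real_num_rx_queues(dev, count);
        rtnl_unlock();

        return err;
}
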
2136 /**
2137  * netif_get_num_default_rss_queues - default number of RSS queues
2138  *
2139  * This routine should set an upper limit on the number of RSS queues
2140  * used by default by multiqueue devices.
2141  */
2142 int netif_get_num_default_rss_queues(void)
2143 {
2144         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2145 }
2146 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
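
/*
 * Illustrative sketch, not part of dev.c: using the default RSS limit
 * above when allocating a multiqueue ethernet device (no private area,
 * for brevity).
 */
static struct net_device *example_alloc_mq_netdev(void)
{
        return alloc_etherdev_mq(0, netif_get_num_default_rss_queues());
}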
2147
2148 static inline void __netif_reschedule(struct Qdisc *q)
2149 {
2150         struct softnet_data *sd;
2151         unsigned long flags;
2152
2153         local_irq_save(flags);
2154         sd = this_cpu_ptr(&softnet_data);
2155         q->next_sched = NULL;
2156         *sd->output_queue_tailp = q;
2157         sd->output_queue_tailp = &q->next_sched;
2158         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159         local_irq_restore(flags);
2160 }
2161
2162 void __netif_schedule(struct Qdisc *q)
2163 {
2164         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2165                 __netif_reschedule(q);
2166 }
2167 EXPORT_SYMBOL(__netif_schedule);
2168
2169 struct dev_kfree_skb_cb {
2170         enum skb_free_reason reason;
2171 };
2172
2173 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2174 {
2175         return (struct dev_kfree_skb_cb *)skb->cb;
2176 }
2177
2178 void netif_schedule_queue(struct netdev_queue *txq)
2179 {
2180         rcu_read_lock();
2181         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2182                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2183
2184                 __netif_schedule(q);
2185         }
2186         rcu_read_unlock();
2187 }
2188 EXPORT_SYMBOL(netif_schedule_queue);
2189
2190 /**
2191  *      netif_wake_subqueue - allow sending packets on subqueue
2192  *      @dev: network device
2193  *      @queue_index: sub queue index
2194  *
2195  * Resume individual transmit queue of a device with multiple transmit queues.
2196  */
2197 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2198 {
2199         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2200
2201         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2202                 struct Qdisc *q;
2203
2204                 rcu_read_lock();
2205                 q = rcu_dereference(txq->qdisc);
2206                 __netif_schedule(q);
2207                 rcu_read_unlock();
2208         }
2209 }
2210 EXPORT_SYMBOL(netif_wake_subqueue);
2211
2212 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2213 {
2214         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2215                 struct Qdisc *q;
2216
2217                 rcu_read_lock();
2218                 q = rcu_dereference(dev_queue->qdisc);
2219                 __netif_schedule(q);
2220                 rcu_read_unlock();
2221         }
2222 }
2223 EXPORT_SYMBOL(netif_tx_wake_queue);
2224
2225 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2226 {
2227         unsigned long flags;
2228
2229         if (likely(atomic_read(&skb->users) == 1)) {
2230                 smp_rmb();
2231                 atomic_set(&skb->users, 0);
2232         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2233                 return;
2234         }
2235         get_kfree_skb_cb(skb)->reason = reason;
2236         local_irq_save(flags);
2237         skb->next = __this_cpu_read(softnet_data.completion_queue);
2238         __this_cpu_write(softnet_data.completion_queue, skb);
2239         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2240         local_irq_restore(flags);
2241 }
2242 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2243
2244 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2245 {
2246         if (in_irq() || irqs_disabled())
2247                 __dev_kfree_skb_irq(skb, reason);
2248         else
2249                 dev_kfree_skb(skb);
2250 }
2251 EXPORT_SYMBOL(__dev_kfree_skb_any);
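
/*
 * Illustrative sketch, not part of dev.c: a TX-completion routine that can
 * be reached from hard-irq context, so it must use the *_any variants
 * built on __dev_kfree_skb_any() instead of plain consume_skb()/kfree_skb().
 */
static void example_tx_complete(struct sk_buff *skb, bool xmit_ok)
{
        if (xmit_ok)
                dev_consume_skb_any(skb);       /* delivered: not a drop */
        else
                dev_kfree_skb_any(skb);         /* accounted as a drop */
}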
2252
2253
2254 /**
2255  * netif_device_detach - mark device as removed
2256  * @dev: network device
2257  *
2258  * Mark device as removed from system and therefore no longer available.
2259  */
2260 void netif_device_detach(struct net_device *dev)
2261 {
2262         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2263             netif_running(dev)) {
2264                 netif_tx_stop_all_queues(dev);
2265         }
2266 }
2267 EXPORT_SYMBOL(netif_device_detach);
2268
2269 /**
2270  * netif_device_attach - mark device as attached
2271  * @dev: network device
2272  *
2273  * Mark device as attached from system and restart if needed.
2274  */
2275 void netif_device_attach(struct net_device *dev)
2276 {
2277         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2278             netif_running(dev)) {
2279                 netif_tx_wake_all_queues(dev);
2280                 __netdev_watchdog_up(dev);
2281         }
2282 }
2283 EXPORT_SYMBOL(netif_device_attach);
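
/*
 * Illustrative sketch, not part of dev.c: the usual suspend/resume pairing
 * around the two helpers above.  The hardware power calls are elided.
 */
static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);       /* stop TX before the HW goes away */
        /* ... power the hardware down ... */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        /* ... power the hardware back up ... */
        netif_device_attach(dev);       /* restart queues and the watchdog */
        return 0;
}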
2284
2285 static void skb_warn_bad_offload(const struct sk_buff *skb)
2286 {
2287         static const netdev_features_t null_features = 0;
2288         struct net_device *dev = skb->dev;
2289         const char *driver = "";
2290
2291         if (!net_ratelimit())
2292                 return;
2293
2294         if (dev && dev->dev.parent)
2295                 driver = dev_driver_string(dev->dev.parent);
2296
2297         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2298              "gso_type=%d ip_summed=%d\n",
2299              driver, dev ? &dev->features : &null_features,
2300              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2301              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2302              skb_shinfo(skb)->gso_type, skb->ip_summed);
2303 }
2304
2305 /*
2306  * Invalidate the hardware checksum when a packet is to be mangled, and
2307  * complete the checksum manually on the outgoing path.
2308  */
2309 int skb_checksum_help(struct sk_buff *skb)
2310 {
2311         __wsum csum;
2312         int ret = 0, offset;
2313
2314         if (skb->ip_summed == CHECKSUM_COMPLETE)
2315                 goto out_set_summed;
2316
2317         if (unlikely(skb_shinfo(skb)->gso_size)) {
2318                 skb_warn_bad_offload(skb);
2319                 return -EINVAL;
2320         }
2321
2322         /* Before computing a checksum, we should make sure no frag could
2323          * be modified by an external entity: the checksum could be wrong.
2324          */
2325         if (skb_has_shared_frag(skb)) {
2326                 ret = __skb_linearize(skb);
2327                 if (ret)
2328                         goto out;
2329         }
2330
2331         offset = skb_checksum_start_offset(skb);
2332         BUG_ON(offset >= skb_headlen(skb));
2333         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2334
2335         offset += skb->csum_offset;
2336         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2337
2338         if (skb_cloned(skb) &&
2339             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2340                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2341                 if (ret)
2342                         goto out;
2343         }
2344
2345         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2346 out_set_summed:
2347         skb->ip_summed = CHECKSUM_NONE;
2348 out:
2349         return ret;
2350 }
2351 EXPORT_SYMBOL(skb_checksum_help);
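
/*
 * Illustrative sketch, not part of dev.c: the classic driver fallback of
 * completing a CHECKSUM_PARTIAL frame in software when the hardware
 * cannot checksum this particular packet.
 */
static int example_tx_csum(struct sk_buff *skb, bool hw_can_csum)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
                return skb_checksum_help(skb);  /* writes the final __sum16 */

        return 0;
}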
2352
2353 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2354 {
2355         unsigned int vlan_depth = skb->mac_len;
2356         __be16 type = skb->protocol;
2357
2358         /* Tunnel gso handlers can set protocol to ethernet. */
2359         if (type == htons(ETH_P_TEB)) {
2360                 struct ethhdr *eth;
2361
2362                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2363                         return 0;
2364
2365                 eth = (struct ethhdr *)skb_mac_header(skb);
2366                 type = eth->h_proto;
2367         }
2368
2369         /* if skb->protocol is 802.1Q/AD then the header should already be
2370          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2371          * ETH_HLEN otherwise
2372          */
2373         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2374                 if (vlan_depth) {
2375                         if (WARN_ON(vlan_depth < VLAN_HLEN))
2376                                 return 0;
2377                         vlan_depth -= VLAN_HLEN;
2378                 } else {
2379                         vlan_depth = ETH_HLEN;
2380                 }
2381                 do {
2382                         struct vlan_hdr *vh;
2383
2384                         if (unlikely(!pskb_may_pull(skb,
2385                                                     vlan_depth + VLAN_HLEN)))
2386                                 return 0;
2387
2388                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2389                         type = vh->h_vlan_encapsulated_proto;
2390                         vlan_depth += VLAN_HLEN;
2391                 } while (type == htons(ETH_P_8021Q) ||
2392                          type == htons(ETH_P_8021AD));
2393         }
2394
2395         *depth = vlan_depth;
2396
2397         return type;
2398 }
2399
2400 /**
2401  *      skb_mac_gso_segment - mac layer segmentation handler.
2402  *      @skb: buffer to segment
2403  *      @features: features for the output path (see dev->features)
2404  */
2405 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2406                                     netdev_features_t features)
2407 {
2408         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2409         struct packet_offload *ptype;
2410         int vlan_depth = skb->mac_len;
2411         __be16 type = skb_network_protocol(skb, &vlan_depth);
2412
2413         if (unlikely(!type))
2414                 return ERR_PTR(-EINVAL);
2415
2416         __skb_pull(skb, vlan_depth);
2417
2418         rcu_read_lock();
2419         list_for_each_entry_rcu(ptype, &offload_base, list) {
2420                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2421                         segs = ptype->callbacks.gso_segment(skb, features);
2422                         break;
2423                 }
2424         }
2425         rcu_read_unlock();
2426
2427         __skb_push(skb, skb->data - skb_mac_header(skb));
2428
2429         return segs;
2430 }
2431 EXPORT_SYMBOL(skb_mac_gso_segment);
2432
2433
2434 /* openvswitch calls this on the rx path, so we need a different check.
2435  */
2436 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2437 {
2438         if (tx_path)
2439                 return skb->ip_summed != CHECKSUM_PARTIAL;
2440         else
2441                 return skb->ip_summed == CHECKSUM_NONE;
2442 }
2443
2444 /**
2445  *      __skb_gso_segment - Perform segmentation on skb.
2446  *      @skb: buffer to segment
2447  *      @features: features for the output path (see dev->features)
2448  *      @tx_path: whether it is called in TX path
2449  *
2450  *      This function segments the given skb and returns a list of segments.
2451  *
2452  *      It may return NULL if the skb requires no segmentation.  This is
2453  *      only possible when GSO is used for verifying header integrity.
2454  */
2455 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2456                                   netdev_features_t features, bool tx_path)
2457 {
2458         if (unlikely(skb_needs_check(skb, tx_path))) {
2459                 int err;
2460
2461                 skb_warn_bad_offload(skb);
2462
2463                 err = skb_cow_head(skb, 0);
2464                 if (err < 0)
2465                         return ERR_PTR(err);
2466         }
2467
2468         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2469         SKB_GSO_CB(skb)->encap_level = 0;
2470
2471         skb_reset_mac_header(skb);
2472         skb_reset_mac_len(skb);
2473
2474         return skb_mac_gso_segment(skb, features);
2475 }
2476 EXPORT_SYMBOL(__skb_gso_segment);
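
/*
 * Illustrative sketch, not part of dev.c: software-segmenting an skb and
 * walking the resulting list, as a tunnel or driver might when the device
 * lacks the required GSO feature.  Error handling is minimal.
 */
static void example_segment_and_send(struct sk_buff *skb,
                                     netdev_features_t features)
{
        struct sk_buff *segs = skb_gso_segment(skb, features);

        if (IS_ERR(segs)) {
                kfree_skb(skb);
                return;
        }
        if (!segs) {                    /* no segmentation was needed */
                dev_queue_xmit(skb);
                return;
        }

        consume_skb(skb);               /* payload now lives in the segments */
        while (segs) {
                struct sk_buff *next = segs->next;

                segs->next = NULL;
                dev_queue_xmit(segs);
                segs = next;
        }
}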
2477
2478 /* Take action when hardware reception checksum errors are detected. */
2479 #ifdef CONFIG_BUG
2480 void netdev_rx_csum_fault(struct net_device *dev)
2481 {
2482         if (net_ratelimit()) {
2483                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2484                 dump_stack();
2485         }
2486 }
2487 EXPORT_SYMBOL(netdev_rx_csum_fault);
2488 #endif
2489
2490 /* Actually, we should eliminate this check as soon as we know that:
2491  * 1. An IOMMU is present and allows mapping all the memory.
2492  * 2. No high memory really exists on this machine.
2493  */
2494
2495 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2496 {
2497 #ifdef CONFIG_HIGHMEM
2498         int i;
2499         if (!(dev->features & NETIF_F_HIGHDMA)) {
2500                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2501                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2502                         if (PageHighMem(skb_frag_page(frag)))
2503                                 return 1;
2504                 }
2505         }
2506
2507         if (PCI_DMA_BUS_IS_PHYS) {
2508                 struct device *pdev = dev->dev.parent;
2509
2510                 if (!pdev)
2511                         return 0;
2512                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2513                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2514                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2515                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2516                                 return 1;
2517                 }
2518         }
2519 #endif
2520         return 0;
2521 }
2522
2523 /* If this is an MPLS offload request, verify we are testing hardware MPLS
2524  * features instead of the standard features for the netdev.
2525  */
2526 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2527 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2528                                            netdev_features_t features,
2529                                            __be16 type)
2530 {
2531         if (eth_p_mpls(type))
2532                 features &= skb->dev->mpls_features;
2533
2534         return features;
2535 }
2536 #else
2537 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2538                                            netdev_features_t features,
2539                                            __be16 type)
2540 {
2541         return features;
2542 }
2543 #endif
2544
2545 static netdev_features_t harmonize_features(struct sk_buff *skb,
2546         netdev_features_t features)
2547 {
2548         int tmp;
2549         __be16 type;
2550
2551         type = skb_network_protocol(skb, &tmp);
2552         features = net_mpls_features(skb, features, type);
2553
2554         if (skb->ip_summed != CHECKSUM_NONE &&
2555             !can_checksum_protocol(features, type)) {
2556                 features &= ~NETIF_F_ALL_CSUM;
2557         } else if (illegal_highdma(skb->dev, skb)) {
2558                 features &= ~NETIF_F_SG;
2559         }
2560
2561         return features;
2562 }
2563
2564 netdev_features_t netif_skb_features(struct sk_buff *skb)
2565 {
2566         const struct net_device *dev = skb->dev;
2567         netdev_features_t features = dev->features;
2568         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2569         __be16 protocol = skb->protocol;
2570
2571         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2572                 features &= ~NETIF_F_GSO_MASK;
2573
2574         if (!vlan_tx_tag_present(skb)) {
2575                 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2576                              protocol == htons(ETH_P_8021AD))) {
2577                         struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2578                         protocol = veh->h_vlan_encapsulated_proto;
2579                 } else {
2580                         return harmonize_features(skb, features);
2581                 }
2582         }
2583
2584         features = netdev_intersect_features(features,
2585                                              dev->vlan_features |
2586                                              NETIF_F_HW_VLAN_CTAG_TX |
2587                                              NETIF_F_HW_VLAN_STAG_TX);
2588
2589         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2590                 features = netdev_intersect_features(features,
2591                                                      NETIF_F_SG |
2592                                                      NETIF_F_HIGHDMA |
2593                                                      NETIF_F_FRAGLIST |
2594                                                      NETIF_F_GEN_CSUM |
2595                                                      NETIF_F_HW_VLAN_CTAG_TX |
2596                                                      NETIF_F_HW_VLAN_STAG_TX);
2597
2598         return harmonize_features(skb, features);
2599 }
2600 EXPORT_SYMBOL(netif_skb_features);
2601
2602 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2603                     struct netdev_queue *txq, bool more)
2604 {
2605         unsigned int len;
2606         int rc;
2607
2608         if (!list_empty(&ptype_all))
2609                 dev_queue_xmit_nit(skb, dev);
2610
2611         len = skb->len;
2612         trace_net_dev_start_xmit(skb, dev);
2613         rc = netdev_start_xmit(skb, dev, txq, more);
2614         trace_net_dev_xmit(skb, rc, dev, len);
2615
2616         return rc;
2617 }
2618
2619 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2620                                     struct netdev_queue *txq, int *ret)
2621 {
2622         struct sk_buff *skb = first;
2623         int rc = NETDEV_TX_OK;
2624
2625         while (skb) {
2626                 struct sk_buff *next = skb->next;
2627
2628                 skb->next = NULL;
2629                 rc = xmit_one(skb, dev, txq, next != NULL);
2630                 if (unlikely(!dev_xmit_complete(rc))) {
2631                         skb->next = next;
2632                         goto out;
2633                 }
2634
2635                 skb = next;
2636                 if (netif_xmit_stopped(txq) && skb) {
2637                         rc = NETDEV_TX_BUSY;
2638                         break;
2639                 }
2640         }
2641
2642 out:
2643         *ret = rc;
2644         return skb;
2645 }
2646
2647 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2648                                           netdev_features_t features)
2649 {
2650         if (vlan_tx_tag_present(skb) &&
2651             !vlan_hw_offload_capable(features, skb->vlan_proto))
2652                 skb = __vlan_hwaccel_push_inside(skb);
2653         return skb;
2654 }
2655
2656 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2657 {
2658         netdev_features_t features;
2659
2660         if (skb->next)
2661                 return skb;
2662
2663         features = netif_skb_features(skb);
2664         skb = validate_xmit_vlan(skb, features);
2665         if (unlikely(!skb))
2666                 goto out_null;
2667
2668         /* If this is an encapsulation offload request, verify we are
2669          * testing hardware encapsulation features instead of the
2670          * standard features for the netdev.
2671          */
2672         if (skb->encapsulation)
2673                 features &= dev->hw_enc_features;
2674
2675         if (netif_needs_gso(dev, skb, features)) {
2676                 struct sk_buff *segs;
2677
2678                 segs = skb_gso_segment(skb, features);
2679                 if (IS_ERR(segs)) {
2680                         goto out_kfree_skb;
2681                 } else if (segs) {
2682                         consume_skb(skb);
2683                         skb = segs;
2684                 }
2685         } else {
2686                 if (skb_needs_linearize(skb, features) &&
2687                     __skb_linearize(skb))
2688                         goto out_kfree_skb;
2689
2690                 /* If packet is not checksummed and device does not
2691                  * support checksumming for this protocol, complete
2692                  * checksumming here.
2693                  */
2694                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2695                         if (skb->encapsulation)
2696                                 skb_set_inner_transport_header(skb,
2697                                                                skb_checksum_start_offset(skb));
2698                         else
2699                                 skb_set_transport_header(skb,
2700                                                          skb_checksum_start_offset(skb));
2701                         if (!(features & NETIF_F_ALL_CSUM) &&
2702                             skb_checksum_help(skb))
2703                                 goto out_kfree_skb;
2704                 }
2705         }
2706
2707         return skb;
2708
2709 out_kfree_skb:
2710         kfree_skb(skb);
2711 out_null:
2712         return NULL;
2713 }
2714
2715 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2716 {
2717         struct sk_buff *next, *head = NULL, *tail;
2718
2719         for (; skb != NULL; skb = next) {
2720                 next = skb->next;
2721                 skb->next = NULL;
2722
2723                 /* in case skb won't be segmented, point to itself */
2724                 skb->prev = skb;
2725
2726                 skb = validate_xmit_skb(skb, dev);
2727                 if (!skb)
2728                         continue;
2729
2730                 if (!head)
2731                         head = skb;
2732                 else
2733                         tail->next = skb;
2734                 /* If skb was segmented, skb->prev points to
2735                  * the last segment. If not, it still contains skb.
2736                  */
2737                 tail = skb->prev;
2738         }
2739         return head;
2740 }
2741
2742 static void qdisc_pkt_len_init(struct sk_buff *skb)
2743 {
2744         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2745
2746         qdisc_skb_cb(skb)->pkt_len = skb->len;
2747
2748         /* To get a more precise estimation of bytes sent on the wire,
2749          * we add to pkt_len the header size of all segments
2750          */
2751         if (shinfo->gso_size)  {
2752                 unsigned int hdr_len;
2753                 u16 gso_segs = shinfo->gso_segs;
2754
2755                 /* mac layer + network layer */
2756                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2757
2758                 /* + transport layer */
2759                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2760                         hdr_len += tcp_hdrlen(skb);
2761                 else
2762                         hdr_len += sizeof(struct udphdr);
2763
2764                 if (shinfo->gso_type & SKB_GSO_DODGY)
2765                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2766                                                 shinfo->gso_size);
2767
2768                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2769         }
2770 }
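
/*
 * Worked example for the estimate above (illustrative numbers): a TSO
 * frame with a 54 byte ethernet+IPv4+TCP header, gso_size = 1448 and
 * skb->len = 65214 carries 65160 bytes of payload, so gso_segs = 45 and
 * pkt_len becomes 65214 + (45 - 1) * 54 = 67590, i.e. 45 wire frames of
 * 1502 bytes each.
 */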
2771
2772 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2773                                  struct net_device *dev,
2774                                  struct netdev_queue *txq)
2775 {
2776         spinlock_t *root_lock = qdisc_lock(q);
2777         bool contended;
2778         int rc;
2779
2780         qdisc_pkt_len_init(skb);
2781         qdisc_calculate_pkt_len(skb, q);
2782         /*
2783          * Heuristic to force contended enqueues to serialize on a
2784          * separate lock before trying to get qdisc main lock.
2785          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2786          * often and dequeue packets faster.
2787          */
2788         contended = qdisc_is_running(q);
2789         if (unlikely(contended))
2790                 spin_lock(&q->busylock);
2791
2792         spin_lock(root_lock);
2793         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2794                 kfree_skb(skb);
2795                 rc = NET_XMIT_DROP;
2796         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2797                    qdisc_run_begin(q)) {
2798                 /*
2799                  * This is a work-conserving queue; there are no old skbs
2800                  * waiting to be sent out; and the qdisc is not running -
2801                  * xmit the skb directly.
2802                  */
2803
2804                 qdisc_bstats_update(q, skb);
2805
2806                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2807                         if (unlikely(contended)) {
2808                                 spin_unlock(&q->busylock);
2809                                 contended = false;
2810                         }
2811                         __qdisc_run(q);
2812                 } else
2813                         qdisc_run_end(q);
2814
2815                 rc = NET_XMIT_SUCCESS;
2816         } else {
2817                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2818                 if (qdisc_run_begin(q)) {
2819                         if (unlikely(contended)) {
2820                                 spin_unlock(&q->busylock);
2821                                 contended = false;
2822                         }
2823                         __qdisc_run(q);
2824                 }
2825         }
2826         spin_unlock(root_lock);
2827         if (unlikely(contended))
2828                 spin_unlock(&q->busylock);
2829         return rc;
2830 }
2831
2832 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2833 static void skb_update_prio(struct sk_buff *skb)
2834 {
2835         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2836
2837         if (!skb->priority && skb->sk && map) {
2838                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2839
2840                 if (prioidx < map->priomap_len)
2841                         skb->priority = map->priomap[prioidx];
2842         }
2843 }
2844 #else
2845 #define skb_update_prio(skb)
2846 #endif
2847
2848 static DEFINE_PER_CPU(int, xmit_recursion);
2849 #define RECURSION_LIMIT 10
2850
2851 /**
2852  *      dev_loopback_xmit - loop back @skb
2853  *      @skb: buffer to transmit
2854  */
2855 int dev_loopback_xmit(struct sk_buff *skb)
2856 {
2857         skb_reset_mac_header(skb);
2858         __skb_pull(skb, skb_network_offset(skb));
2859         skb->pkt_type = PACKET_LOOPBACK;
2860         skb->ip_summed = CHECKSUM_UNNECESSARY;
2861         WARN_ON(!skb_dst(skb));
2862         skb_dst_force(skb);
2863         netif_rx_ni(skb);
2864         return 0;
2865 }
2866 EXPORT_SYMBOL(dev_loopback_xmit);
2867
2868 /**
2869  *      __dev_queue_xmit - transmit a buffer
2870  *      @skb: buffer to transmit
2871  *      @accel_priv: private data used for L2 forwarding offload
2872  *
2873  *      Queue a buffer for transmission to a network device. The caller must
2874  *      have set the device and priority and built the buffer before calling
2875  *      this function. The function can be called from an interrupt.
2876  *
2877  *      A negative errno code is returned on a failure. A success does not
2878  *      guarantee the frame will be transmitted as it may be dropped due
2879  *      to congestion or traffic shaping.
2880  *
2881  * -----------------------------------------------------------------------------------
2882  *      I notice this method can also return errors from the queue disciplines,
2883  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2884  *      be positive.
2885  *
2886  *      Regardless of the return value, the skb is consumed, so it is currently
2887  *      difficult to retry a send to this method.  (You can bump the ref count
2888  *      before sending to hold a reference for retry if you are careful.)
2889  *
2890  *      When calling this method, interrupts MUST be enabled.  This is because
2891  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2892  *          --BLG
2893  */
2894 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2895 {
2896         struct net_device *dev = skb->dev;
2897         struct netdev_queue *txq;
2898         struct Qdisc *q;
2899         int rc = -ENOMEM;
2900
2901         skb_reset_mac_header(skb);
2902
2903         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2904                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2905
2906         /* Disable soft irqs for various locks below. Also
2907          * stops preemption for RCU.
2908          */
2909         rcu_read_lock_bh();
2910
2911         skb_update_prio(skb);
2912
2913         /* If device/qdisc don't need skb->dst, release it right now while
2914          * it's hot in this CPU's cache.
2915          */
2916         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2917                 skb_dst_drop(skb);
2918         else
2919                 skb_dst_force(skb);
2920
2921         txq = netdev_pick_tx(dev, skb, accel_priv);
2922         q = rcu_dereference_bh(txq->qdisc);
2923
2924 #ifdef CONFIG_NET_CLS_ACT
2925         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2926 #endif
2927         trace_net_dev_queue(skb);
2928         if (q->enqueue) {
2929                 rc = __dev_xmit_skb(skb, q, dev, txq);
2930                 goto out;
2931         }
2932
2933         /* The device has no queue. Common case for software devices:
2934            loopback, all sorts of tunnels...
2935
2936            Really, it is unlikely that netif_tx_lock protection is necessary
2937            here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2938            counters.)
2939            However, it is possible that they rely on the protection
2940            made by us here.
2941
2942            Check this and shoot the lock. It is not prone to deadlocks.
2943            Or shoot the noqueue qdisc, it is even simpler 8)
2944          */
2945         if (dev->flags & IFF_UP) {
2946                 int cpu = smp_processor_id(); /* ok because BHs are off */
2947
2948                 if (txq->xmit_lock_owner != cpu) {
2949
2950                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2951                                 goto recursion_alert;
2952
2953                         skb = validate_xmit_skb(skb, dev);
2954                         if (!skb)
2955                                 goto drop;
2956
2957                         HARD_TX_LOCK(dev, txq, cpu);
2958
2959                         if (!netif_xmit_stopped(txq)) {
2960                                 __this_cpu_inc(xmit_recursion);
2961                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2962                                 __this_cpu_dec(xmit_recursion);
2963                                 if (dev_xmit_complete(rc)) {
2964                                         HARD_TX_UNLOCK(dev, txq);
2965                                         goto out;
2966                                 }
2967                         }
2968                         HARD_TX_UNLOCK(dev, txq);
2969                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2970                                              dev->name);
2971                 } else {
2972                         /* Recursion is detected! It is possible,
2973                          * unfortunately
2974                          */
2975 recursion_alert:
2976                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2977                                              dev->name);
2978                 }
2979         }
2980
2981         rc = -ENETDOWN;
2982 drop:
2983         rcu_read_unlock_bh();
2984
2985         atomic_long_inc(&dev->tx_dropped);
2986         kfree_skb_list(skb);
2987         return rc;
2988 out:
2989         rcu_read_unlock_bh();
2990         return rc;
2991 }
2992
2993 int dev_queue_xmit(struct sk_buff *skb)
2994 {
2995         return __dev_queue_xmit(skb, NULL);
2996 }
2997 EXPORT_SYMBOL(dev_queue_xmit);
2998
2999 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3000 {
3001         return __dev_queue_xmit(skb, accel_priv);
3002 }
3003 EXPORT_SYMBOL(dev_queue_xmit_accel);
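
/*
 * Illustrative sketch, not part of dev.c: the minimal path a
 * packet-generating module follows: build an skb, attach it to a device
 * and hand it to dev_queue_xmit().  The link-layer header contents and
 * the protocol value are placeholders.
 */
static int example_send_raw(struct net_device *dev, const void *buf, size_t len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb(dev, LL_RESERVED_SPACE(dev) + len);
        if (!skb)
                return -ENOMEM;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        memcpy(skb_put(skb, len), buf, len);
        skb->protocol = htons(ETH_P_802_3);     /* placeholder protocol */

        /* consumes the skb; may also return positive qdisc codes such as
         * NET_XMIT_DROP (see the comment above __dev_queue_xmit()).
         */
        return dev_queue_xmit(skb);
}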
3004
3005
3006 /*=======================================================================
3007                         Receiver routines
3008   =======================================================================*/
3009
3010 int netdev_max_backlog __read_mostly = 1000;
3011 EXPORT_SYMBOL(netdev_max_backlog);
3012
3013 int netdev_tstamp_prequeue __read_mostly = 1;
3014 int netdev_budget __read_mostly = 300;
3015 int weight_p __read_mostly = 64;            /* old backlog weight */
3016
3017 /* Called with irq disabled */
3018 static inline void ____napi_schedule(struct softnet_data *sd,
3019                                      struct napi_struct *napi)
3020 {
3021         list_add_tail(&napi->poll_list, &sd->poll_list);
3022         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3023 }
3024
3025 #ifdef CONFIG_RPS
3026
3027 /* One global table that all flow-based protocols share. */
3028 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3029 EXPORT_SYMBOL(rps_sock_flow_table);
3030
3031 struct static_key rps_needed __read_mostly;
3032
3033 static struct rps_dev_flow *
3034 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3035             struct rps_dev_flow *rflow, u16 next_cpu)
3036 {
3037         if (next_cpu != RPS_NO_CPU) {
3038 #ifdef CONFIG_RFS_ACCEL
3039                 struct netdev_rx_queue *rxqueue;
3040                 struct rps_dev_flow_table *flow_table;
3041                 struct rps_dev_flow *old_rflow;
3042                 u32 flow_id;
3043                 u16 rxq_index;
3044                 int rc;
3045
3046                 /* Should we steer this flow to a different hardware queue? */
3047                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3048                     !(dev->features & NETIF_F_NTUPLE))
3049                         goto out;
3050                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3051                 if (rxq_index == skb_get_rx_queue(skb))
3052                         goto out;
3053
3054                 rxqueue = dev->_rx + rxq_index;
3055                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3056                 if (!flow_table)
3057                         goto out;
3058                 flow_id = skb_get_hash(skb) & flow_table->mask;
3059                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3060                                                         rxq_index, flow_id);
3061                 if (rc < 0)
3062                         goto out;
3063                 old_rflow = rflow;
3064                 rflow = &flow_table->flows[flow_id];
3065                 rflow->filter = rc;
3066                 if (old_rflow->filter == rflow->filter)
3067                         old_rflow->filter = RPS_NO_FILTER;
3068         out:
3069 #endif
3070                 rflow->last_qtail =
3071                         per_cpu(softnet_data, next_cpu).input_queue_head;
3072         }
3073
3074         rflow->cpu = next_cpu;
3075         return rflow;
3076 }
3077
3078 /*
3079  * get_rps_cpu is called from netif_receive_skb and returns the target
3080  * CPU from the RPS map of the receiving queue for a given skb.
3081  * rcu_read_lock must be held on entry.
3082  */
3083 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3084                        struct rps_dev_flow **rflowp)
3085 {
3086         struct netdev_rx_queue *rxqueue;
3087         struct rps_map *map;
3088         struct rps_dev_flow_table *flow_table;
3089         struct rps_sock_flow_table *sock_flow_table;
3090         int cpu = -1;
3091         u16 tcpu;
3092         u32 hash;
3093
3094         if (skb_rx_queue_recorded(skb)) {
3095                 u16 index = skb_get_rx_queue(skb);
3096                 if (unlikely(index >= dev->real_num_rx_queues)) {
3097                         WARN_ONCE(dev->real_num_rx_queues > 1,
3098                                   "%s received packet on queue %u, but number "
3099                                   "of RX queues is %u\n",
3100                                   dev->name, index, dev->real_num_rx_queues);
3101                         goto done;
3102                 }
3103                 rxqueue = dev->_rx + index;
3104         } else
3105                 rxqueue = dev->_rx;
3106
3107         map = rcu_dereference(rxqueue->rps_map);
3108         if (map) {
3109                 if (map->len == 1 &&
3110                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3111                         tcpu = map->cpus[0];
3112                         if (cpu_online(tcpu))
3113                                 cpu = tcpu;
3114                         goto done;
3115                 }
3116         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3117                 goto done;
3118         }
3119
3120         skb_reset_network_header(skb);
3121         hash = skb_get_hash(skb);
3122         if (!hash)
3123                 goto done;
3124
3125         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3126         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3127         if (flow_table && sock_flow_table) {
3128                 u16 next_cpu;
3129                 struct rps_dev_flow *rflow;
3130
3131                 rflow = &flow_table->flows[hash & flow_table->mask];
3132                 tcpu = rflow->cpu;
3133
3134                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3135
3136                 /*
3137                  * If the desired CPU (where last recvmsg was done) is
3138                  * different from current CPU (one in the rx-queue flow
3139                  * table entry), switch if one of the following holds:
3140                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3141                  *   - Current CPU is offline.
3142                  *   - The current CPU's queue tail has advanced beyond the
3143                  *     last packet that was enqueued using this table entry.
3144                  *     This guarantees that all previous packets for the flow
3145                  *     have been dequeued, thus preserving in-order delivery.
3146                  */
3147                 if (unlikely(tcpu != next_cpu) &&
3148                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3149                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3150                       rflow->last_qtail)) >= 0)) {
3151                         tcpu = next_cpu;
3152                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3153                 }
3154
3155                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3156                         *rflowp = rflow;
3157                         cpu = tcpu;
3158                         goto done;
3159                 }
3160         }
3161
3162         if (map) {
3163                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3164                 if (cpu_online(tcpu)) {
3165                         cpu = tcpu;
3166                         goto done;
3167                 }
3168         }
3169
3170 done:
3171         return cpu;
3172 }
3173
3174 #ifdef CONFIG_RFS_ACCEL
3175
3176 /**
3177  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3178  * @dev: Device on which the filter was set
3179  * @rxq_index: RX queue index
3180  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3181  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3182  *
3183  * Drivers that implement ndo_rx_flow_steer() should periodically call
3184  * this function for each installed filter and remove the filters for
3185  * which it returns %true.
3186  */
3187 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3188                          u32 flow_id, u16 filter_id)
3189 {
3190         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3191         struct rps_dev_flow_table *flow_table;
3192         struct rps_dev_flow *rflow;
3193         bool expire = true;
3194         int cpu;
3195
3196         rcu_read_lock();
3197         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3198         if (flow_table && flow_id <= flow_table->mask) {
3199                 rflow = &flow_table->flows[flow_id];
3200                 cpu = ACCESS_ONCE(rflow->cpu);
3201                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3202                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3203                            rflow->last_qtail) <
3204                      (int)(10 * flow_table->mask)))
3205                         expire = false;
3206         }
3207         rcu_read_unlock();
3208         return expire;
3209 }
3210 EXPORT_SYMBOL(rps_may_expire_flow);
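
/* Illustrative sketch (hypothetical, not compiled): how a driver that
 * implements ndo_rx_flow_steer() might periodically walk its installed
 * filters, as suggested in the kernel-doc above, and expire the ones for
 * which rps_may_expire_flow() returns true. struct my_filter and the
 * surrounding bookkeeping are assumptions made for the example.
 */
#if 0	/* example only */
struct my_filter {
	bool	used;
	u16	rxq_index;
	u32	flow_id;
	u16	filter_id;
};

static void my_expire_rfs_filters(struct net_device *dev,
				  struct my_filter *tbl, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].used)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			/* remove the hardware filter here, then forget it */
			tbl[i].used = false;
		}
	}
}
#endif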
3211
3212 #endif /* CONFIG_RFS_ACCEL */
3213
3214 /* Called from hardirq (IPI) context */
3215 static void rps_trigger_softirq(void *data)
3216 {
3217         struct softnet_data *sd = data;
3218
3219         ____napi_schedule(sd, &sd->backlog);
3220         sd->received_rps++;
3221 }
3222
3223 #endif /* CONFIG_RPS */
3224
3225 /*
3226  * Check if this softnet_data structure belongs to another CPU.
3227  * If so, queue it on our IPI list and return 1.
3228  * Otherwise, return 0.
3229  */
3230 static int rps_ipi_queued(struct softnet_data *sd)
3231 {
3232 #ifdef CONFIG_RPS
3233         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3234
3235         if (sd != mysd) {
3236                 sd->rps_ipi_next = mysd->rps_ipi_list;
3237                 mysd->rps_ipi_list = sd;
3238
3239                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3240                 return 1;
3241         }
3242 #endif /* CONFIG_RPS */
3243         return 0;
3244 }
3245
3246 #ifdef CONFIG_NET_FLOW_LIMIT
3247 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3248 #endif
3249
3250 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3251 {
3252 #ifdef CONFIG_NET_FLOW_LIMIT
3253         struct sd_flow_limit *fl;
3254         struct softnet_data *sd;
3255         unsigned int old_flow, new_flow;
3256
3257         if (qlen < (netdev_max_backlog >> 1))
3258                 return false;
3259
3260         sd = this_cpu_ptr(&softnet_data);
3261
3262         rcu_read_lock();
3263         fl = rcu_dereference(sd->flow_limit);
3264         if (fl) {
3265                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3266                 old_flow = fl->history[fl->history_head];
3267                 fl->history[fl->history_head] = new_flow;
3268
3269                 fl->history_head++;
3270                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3271
3272                 if (likely(fl->buckets[old_flow]))
3273                         fl->buckets[old_flow]--;
3274
3275                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3276                         fl->count++;
3277                         rcu_read_unlock();
3278                         return true;
3279                 }
3280         }
3281         rcu_read_unlock();
3282 #endif
3283         return false;
3284 }
3285
3286 /*
3287  * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
3288  * queue (which may be a remote CPU's queue).
3289  */
3290 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3291                               unsigned int *qtail)
3292 {
3293         struct softnet_data *sd;
3294         unsigned long flags;
3295         unsigned int qlen;
3296
3297         sd = &per_cpu(softnet_data, cpu);
3298
3299         local_irq_save(flags);
3300
3301         rps_lock(sd);
3302         qlen = skb_queue_len(&sd->input_pkt_queue);
3303         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3304                 if (qlen) {
3305 enqueue:
3306                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3307                         input_queue_tail_incr_save(sd, qtail);
3308                         rps_unlock(sd);
3309                         local_irq_restore(flags);
3310                         return NET_RX_SUCCESS;
3311                 }
3312
3313                 /* Schedule NAPI for the backlog device.
3314                  * We can use a non-atomic operation since we own the queue lock.
3315                  */
3316                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3317                         if (!rps_ipi_queued(sd))
3318                                 ____napi_schedule(sd, &sd->backlog);
3319                 }
3320                 goto enqueue;
3321         }
3322
3323         sd->dropped++;
3324         rps_unlock(sd);
3325
3326         local_irq_restore(flags);
3327
3328         atomic_long_inc(&skb->dev->rx_dropped);
3329         kfree_skb(skb);
3330         return NET_RX_DROP;
3331 }
3332
3333 static int netif_rx_internal(struct sk_buff *skb)
3334 {
3335         int ret;
3336
3337         net_timestamp_check(netdev_tstamp_prequeue, skb);
3338
3339         trace_netif_rx(skb);
3340 #ifdef CONFIG_RPS
3341         if (static_key_false(&rps_needed)) {
3342                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3343                 int cpu;
3344
3345                 preempt_disable();
3346                 rcu_read_lock();
3347
3348                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3349                 if (cpu < 0)
3350                         cpu = smp_processor_id();
3351
3352                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3353
3354                 rcu_read_unlock();
3355                 preempt_enable();
3356         } else
3357 #endif
3358         {
3359                 unsigned int qtail;
3360                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3361                 put_cpu();
3362         }
3363         return ret;
3364 }
3365
3366 /**
3367  *      netif_rx        -       post buffer to the network code
3368  *      @skb: buffer to post
3369  *
3370  *      This function receives a packet from a device driver and queues it for
3371  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3372  *      may be dropped during processing for congestion control or by the
3373  *      protocol layers.
3374  *
3375  *      return values:
3376  *      NET_RX_SUCCESS  (no congestion)
3377  *      NET_RX_DROP     (packet was dropped)
3378  *
3379  */
3380
3381 int netif_rx(struct sk_buff *skb)
3382 {
3383         trace_netif_rx_entry(skb);
3384
3385         return netif_rx_internal(skb);
3386 }
3387 EXPORT_SYMBOL(netif_rx);
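
/* Illustrative sketch (hypothetical, not compiled): the classic non-NAPI
 * receive path. A driver copies the frame into a fresh skb from its
 * interrupt handler, sets the protocol with eth_type_trans() and hands the
 * skb to netif_rx(); from process context, netif_rx_ni() below would be
 * used instead. my_legacy_rx() is an assumed driver-internal helper.
 */
#if 0	/* example only */
static void my_legacy_rx(struct net_device *dev, void *buf, unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), buf, len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);		/* queues the skb on the per-CPU backlog */
}
#endif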
3388
3389 int netif_rx_ni(struct sk_buff *skb)
3390 {
3391         int err;
3392
3393         trace_netif_rx_ni_entry(skb);
3394
3395         preempt_disable();
3396         err = netif_rx_internal(skb);
3397         if (local_softirq_pending())
3398                 do_softirq();
3399         preempt_enable();
3400
3401         return err;
3402 }
3403 EXPORT_SYMBOL(netif_rx_ni);
3404
3405 static void net_tx_action(struct softirq_action *h)
3406 {
3407         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3408
3409         if (sd->completion_queue) {
3410                 struct sk_buff *clist;
3411
3412                 local_irq_disable();
3413                 clist = sd->completion_queue;
3414                 sd->completion_queue = NULL;
3415                 local_irq_enable();
3416
3417                 while (clist) {
3418                         struct sk_buff *skb = clist;
3419                         clist = clist->next;
3420
3421                         WARN_ON(atomic_read(&skb->users));
3422                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3423                                 trace_consume_skb(skb);
3424                         else
3425                                 trace_kfree_skb(skb, net_tx_action);
3426                         __kfree_skb(skb);
3427                 }
3428         }
3429
3430         if (sd->output_queue) {
3431                 struct Qdisc *head;
3432
3433                 local_irq_disable();
3434                 head = sd->output_queue;
3435                 sd->output_queue = NULL;
3436                 sd->output_queue_tailp = &sd->output_queue;
3437                 local_irq_enable();
3438
3439                 while (head) {
3440                         struct Qdisc *q = head;
3441                         spinlock_t *root_lock;
3442
3443                         head = head->next_sched;
3444
3445                         root_lock = qdisc_lock(q);
3446                         if (spin_trylock(root_lock)) {
3447                                 smp_mb__before_atomic();
3448                                 clear_bit(__QDISC_STATE_SCHED,
3449                                           &q->state);
3450                                 qdisc_run(q);
3451                                 spin_unlock(root_lock);
3452                         } else {
3453                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3454                                               &q->state)) {
3455                                         __netif_reschedule(q);
3456                                 } else {
3457                                         smp_mb__before_atomic();
3458                                         clear_bit(__QDISC_STATE_SCHED,
3459                                                   &q->state);
3460                                 }
3461                         }
3462                 }
3463         }
3464 }
3465
3466 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3467     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3468 /* This hook is defined here for ATM LANE */
3469 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3470                              unsigned char *addr) __read_mostly;
3471 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3472 #endif
3473
3474 #ifdef CONFIG_NET_CLS_ACT
3475 /* TODO: Maybe we should just force sch_ingress to be compiled in
3476  * whenever CONFIG_NET_CLS_ACT is. Otherwise we currently pay for a few
3477  * useless instructions (a compare and two extra stores) when the
3478  * ingress scheduler is not built but CONFIG_NET_CLS_ACT is enabled.
3479  * NOTE: This doesn't affect functionality; without the ingress
3480  * scheduler, you simply can't add policies on ingress.
3481  *
3482  */
3483 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3484 {
3485         struct net_device *dev = skb->dev;
3486         u32 ttl = G_TC_RTTL(skb->tc_verd);
3487         int result = TC_ACT_OK;
3488         struct Qdisc *q;
3489
3490         if (unlikely(MAX_RED_LOOP < ttl++)) {
3491                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3492                                      skb->skb_iif, dev->ifindex);
3493                 return TC_ACT_SHOT;
3494         }
3495
3496         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3497         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3498
3499         q = rcu_dereference(rxq->qdisc);
3500         if (q != &noop_qdisc) {
3501                 spin_lock(qdisc_lock(q));
3502                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3503                         result = qdisc_enqueue_root(skb, q);
3504                 spin_unlock(qdisc_lock(q));
3505         }
3506
3507         return result;
3508 }
3509
3510 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3511                                          struct packet_type **pt_prev,
3512                                          int *ret, struct net_device *orig_dev)
3513 {
3514         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3515
3516         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3517                 goto out;
3518
3519         if (*pt_prev) {
3520                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3521                 *pt_prev = NULL;
3522         }
3523
3524         switch (ing_filter(skb, rxq)) {
3525         case TC_ACT_SHOT:
3526         case TC_ACT_STOLEN:
3527                 kfree_skb(skb);
3528                 return NULL;
3529         }
3530
3531 out:
3532         skb->tc_verd = 0;
3533         return skb;
3534 }
3535 #endif
3536
3537 /**
3538  *      netdev_rx_handler_register - register receive handler
3539  *      @dev: device to register a handler for
3540  *      @rx_handler: receive handler to register
3541  *      @rx_handler_data: data pointer that is used by rx handler
3542  *
3543  *      Register a receive handler for a device. This handler will then be
3544  *      called from __netif_receive_skb. A negative errno code is returned
3545  *      on a failure.
3546  *
3547  *      The caller must hold the rtnl_mutex.
3548  *
3549  *      For a general description of rx_handler, see enum rx_handler_result.
3550  */
3551 int netdev_rx_handler_register(struct net_device *dev,
3552                                rx_handler_func_t *rx_handler,
3553                                void *rx_handler_data)
3554 {
3555         ASSERT_RTNL();
3556
3557         if (dev->rx_handler)
3558                 return -EBUSY;
3559
3560         /* Note: rx_handler_data must be set before rx_handler */
3561         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3562         rcu_assign_pointer(dev->rx_handler, rx_handler);
3563
3564         return 0;
3565 }
3566 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3567
3568 /**
3569  *      netdev_rx_handler_unregister - unregister receive handler
3570  *      @dev: device to unregister a handler from
3571  *
3572  *      Unregister a receive handler from a device.
3573  *
3574  *      The caller must hold the rtnl_mutex.
3575  */
3576 void netdev_rx_handler_unregister(struct net_device *dev)
3577 {
3578
3579         ASSERT_RTNL();
3580         RCU_INIT_POINTER(dev->rx_handler, NULL);
3581         /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3582          * section is guaranteed to see a non-NULL rx_handler_data
3583          * as well.
3584          */
3585         synchronize_net();
3586         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3587 }
3588 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
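
/* Illustrative sketch (hypothetical, not compiled): how an upper device,
 * in the spirit of bridge/macvlan/bonding, might attach and detach an
 * rx_handler on a lower device under the RTNL lock. struct my_port and the
 * my_* helpers are assumptions made for the example.
 */
#if 0	/* example only */
struct my_port {
	struct net_device *lower_dev;
};

static rx_handler_result_t my_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);

	/* inspect or steal the packet here; for now, let it continue up */
	(void)port;
	return RX_HANDLER_PASS;
}

static int my_attach(struct my_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port->lower_dev,
					 my_rx_handler, port);
	rtnl_unlock();
	return err;
}

static void my_detach(struct my_port *port)
{
	rtnl_lock();
	netdev_rx_handler_unregister(port->lower_dev);
	rtnl_unlock();
}
#endif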
3589
3590 /*
3591  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3592  * the special handling of PFMEMALLOC skbs.
3593  */
3594 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3595 {
3596         switch (skb->protocol) {
3597         case htons(ETH_P_ARP):
3598         case htons(ETH_P_IP):
3599         case htons(ETH_P_IPV6):
3600         case htons(ETH_P_8021Q):
3601         case htons(ETH_P_8021AD):
3602                 return true;
3603         default:
3604                 return false;
3605         }
3606 }
3607
3608 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3609 {
3610         struct packet_type *ptype, *pt_prev;
3611         rx_handler_func_t *rx_handler;
3612         struct net_device *orig_dev;
3613         struct net_device *null_or_dev;
3614         bool deliver_exact = false;
3615         int ret = NET_RX_DROP;
3616         __be16 type;
3617
3618         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3619
3620         trace_netif_receive_skb(skb);
3621
3622         orig_dev = skb->dev;
3623
3624         skb_reset_network_header(skb);
3625         if (!skb_transport_header_was_set(skb))
3626                 skb_reset_transport_header(skb);
3627         skb_reset_mac_len(skb);
3628
3629         pt_prev = NULL;
3630
3631         rcu_read_lock();
3632
3633 another_round:
3634         skb->skb_iif = skb->dev->ifindex;
3635
3636         __this_cpu_inc(softnet_data.processed);
3637
3638         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3639             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3640                 skb = skb_vlan_untag(skb);
3641                 if (unlikely(!skb))
3642                         goto unlock;
3643         }
3644
3645 #ifdef CONFIG_NET_CLS_ACT
3646         if (skb->tc_verd & TC_NCLS) {
3647                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3648                 goto ncls;
3649         }
3650 #endif
3651
3652         if (pfmemalloc)
3653                 goto skip_taps;
3654
3655         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3656                 if (!ptype->dev || ptype->dev == skb->dev) {
3657                         if (pt_prev)
3658                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3659                         pt_prev = ptype;
3660                 }
3661         }
3662
3663 skip_taps:
3664 #ifdef CONFIG_NET_CLS_ACT
3665         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3666         if (!skb)
3667                 goto unlock;
3668 ncls:
3669 #endif
3670
3671         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3672                 goto drop;
3673
3674         if (vlan_tx_tag_present(skb)) {
3675                 if (pt_prev) {
3676                         ret = deliver_skb(skb, pt_prev, orig_dev);
3677                         pt_prev = NULL;
3678                 }
3679                 if (vlan_do_receive(&skb))
3680                         goto another_round;
3681                 else if (unlikely(!skb))
3682                         goto unlock;
3683         }
3684
3685         rx_handler = rcu_dereference(skb->dev->rx_handler);
3686         if (rx_handler) {
3687                 if (pt_prev) {
3688                         ret = deliver_skb(skb, pt_prev, orig_dev);
3689                         pt_prev = NULL;
3690                 }
3691                 switch (rx_handler(&skb)) {
3692                 case RX_HANDLER_CONSUMED:
3693                         ret = NET_RX_SUCCESS;
3694                         goto unlock;
3695                 case RX_HANDLER_ANOTHER:
3696                         goto another_round;
3697                 case RX_HANDLER_EXACT:
3698                         deliver_exact = true;
3699                 case RX_HANDLER_PASS:
3700                         break;
3701                 default:
3702                         BUG();
3703                 }
3704         }
3705
3706         if (unlikely(vlan_tx_tag_present(skb))) {
3707                 if (vlan_tx_tag_get_id(skb))
3708                         skb->pkt_type = PACKET_OTHERHOST;
3709                 /* Note: we might in the future use the prio bits
3710                  * and set skb->priority as in vlan_do_receive().
3711                  * For the time being, just ignore the Priority Code Point.
3712                  */
3713                 skb->vlan_tci = 0;
3714         }
3715
3716         /* deliver only exact match when indicated */
3717         null_or_dev = deliver_exact ? skb->dev : NULL;
3718
3719         type = skb->protocol;
3720         list_for_each_entry_rcu(ptype,
3721                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3722                 if (ptype->type == type &&
3723                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3724                      ptype->dev == orig_dev)) {
3725                         if (pt_prev)
3726                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3727                         pt_prev = ptype;
3728                 }
3729         }
3730
3731         if (pt_prev) {
3732                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3733                         goto drop;
3734                 else
3735                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3736         } else {
3737 drop:
3738                 atomic_long_inc(&skb->dev->rx_dropped);
3739                 kfree_skb(skb);
3740                 /* Jamal, now you will not be able to escape explaining
3741                  * to me how you were going to use this. :-)
3742                  */
3743                 ret = NET_RX_DROP;
3744         }
3745
3746 unlock:
3747         rcu_read_unlock();
3748         return ret;
3749 }
3750
3751 static int __netif_receive_skb(struct sk_buff *skb)
3752 {
3753         int ret;
3754
3755         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3756                 unsigned long pflags = current->flags;
3757
3758                 /*
3759                  * PFMEMALLOC skbs are special, they should
3760                  * - be delivered to SOCK_MEMALLOC sockets only
3761                  * - stay away from userspace
3762                  * - have bounded memory usage
3763                  *
3764                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3765                  * context down to all allocation sites.
3766                  */
3767                 current->flags |= PF_MEMALLOC;
3768                 ret = __netif_receive_skb_core(skb, true);
3769                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3770         } else
3771                 ret = __netif_receive_skb_core(skb, false);
3772
3773         return ret;
3774 }
3775
3776 static int netif_receive_skb_internal(struct sk_buff *skb)
3777 {
3778         net_timestamp_check(netdev_tstamp_prequeue, skb);
3779
3780         if (skb_defer_rx_timestamp(skb))
3781                 return NET_RX_SUCCESS;
3782
3783 #ifdef CONFIG_RPS
3784         if (static_key_false(&rps_needed)) {
3785                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3786                 int cpu, ret;
3787
3788                 rcu_read_lock();
3789
3790                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3791
3792                 if (cpu >= 0) {
3793                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3794                         rcu_read_unlock();
3795                         return ret;
3796                 }
3797                 rcu_read_unlock();
3798         }
3799 #endif
3800         return __netif_receive_skb(skb);
3801 }
3802
3803 /**
3804  *      netif_receive_skb - process receive buffer from network
3805  *      @skb: buffer to process
3806  *
3807  *      netif_receive_skb() is the main receive data processing function.
3808  *      It always succeeds. The buffer may be dropped during processing
3809  *      for congestion control or by the protocol layers.
3810  *
3811  *      This function may only be called from softirq context and interrupts
3812  *      should be enabled.
3813  *
3814  *      Return values (usually ignored):
3815  *      NET_RX_SUCCESS: no congestion
3816  *      NET_RX_DROP: packet was dropped
3817  */
3818 int netif_receive_skb(struct sk_buff *skb)
3819 {
3820         trace_netif_receive_skb_entry(skb);
3821
3822         return netif_receive_skb_internal(skb);
3823 }
3824 EXPORT_SYMBOL(netif_receive_skb);
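
/* Illustrative sketch (hypothetical, not compiled): a minimal NAPI poll
 * handler delivering completed buffers through netif_receive_skb(), which
 * must be called from softirq context with interrupts enabled as documented
 * above. my_next_completed_rx() is an assumed driver-internal helper; the
 * completion half of the contract is shown with napi_complete_done() later.
 */
#if 0	/* example only */
static int my_basic_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = my_next_completed_rx(napi))) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		netif_receive_skb(skb);
		work++;
	}
	return work;
}
#endif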
3825
3826 /* Network device is going away; flush any packets still pending.
3827  * Called with irqs disabled.
3828  */
3829 static void flush_backlog(void *arg)
3830 {
3831         struct net_device *dev = arg;
3832         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3833         struct sk_buff *skb, *tmp;
3834
3835         rps_lock(sd);
3836         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3837                 if (skb->dev == dev) {
3838                         __skb_unlink(skb, &sd->input_pkt_queue);
3839                         kfree_skb(skb);
3840                         input_queue_head_incr(sd);
3841                 }
3842         }
3843         rps_unlock(sd);
3844
3845         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3846                 if (skb->dev == dev) {
3847                         __skb_unlink(skb, &sd->process_queue);
3848                         kfree_skb(skb);
3849                         input_queue_head_incr(sd);
3850                 }
3851         }
3852 }
3853
3854 static int napi_gro_complete(struct sk_buff *skb)
3855 {
3856         struct packet_offload *ptype;
3857         __be16 type = skb->protocol;
3858         struct list_head *head = &offload_base;
3859         int err = -ENOENT;
3860
3861         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3862
3863         if (NAPI_GRO_CB(skb)->count == 1) {
3864                 skb_shinfo(skb)->gso_size = 0;
3865                 goto out;
3866         }
3867
3868         rcu_read_lock();
3869         list_for_each_entry_rcu(ptype, head, list) {
3870                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3871                         continue;
3872
3873                 err = ptype->callbacks.gro_complete(skb, 0);
3874                 break;
3875         }
3876         rcu_read_unlock();
3877
3878         if (err) {
3879                 WARN_ON(&ptype->list == head);
3880                 kfree_skb(skb);
3881                 return NET_RX_SUCCESS;
3882         }
3883
3884 out:
3885         return netif_receive_skb_internal(skb);
3886 }
3887
3888 /* napi->gro_list contains packets ordered by age, with the
3889  * youngest packets at its head.
3890  * Complete skbs in reverse order to reduce latencies.
3891  */
3892 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3893 {
3894         struct sk_buff *skb, *prev = NULL;
3895
3896         /* scan list and build reverse chain */
3897         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3898                 skb->prev = prev;
3899                 prev = skb;
3900         }
3901
3902         for (skb = prev; skb; skb = prev) {
3903                 skb->next = NULL;
3904
3905                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3906                         return;
3907
3908                 prev = skb->prev;
3909                 napi_gro_complete(skb);
3910                 napi->gro_count--;
3911         }
3912
3913         napi->gro_list = NULL;
3914 }
3915 EXPORT_SYMBOL(napi_gro_flush);
3916
3917 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3918 {
3919         struct sk_buff *p;
3920         unsigned int maclen = skb->dev->hard_header_len;
3921         u32 hash = skb_get_hash_raw(skb);
3922
3923         for (p = napi->gro_list; p; p = p->next) {
3924                 unsigned long diffs;
3925
3926                 NAPI_GRO_CB(p)->flush = 0;
3927
3928                 if (hash != skb_get_hash_raw(p)) {
3929                         NAPI_GRO_CB(p)->same_flow = 0;
3930                         continue;
3931                 }
3932
3933                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3934                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3935                 if (maclen == ETH_HLEN)
3936                         diffs |= compare_ether_header(skb_mac_header(p),
3937                                                       skb_mac_header(skb));
3938                 else if (!diffs)
3939                         diffs = memcmp(skb_mac_header(p),
3940                                        skb_mac_header(skb),
3941                                        maclen);
3942                 NAPI_GRO_CB(p)->same_flow = !diffs;
3943         }
3944 }
3945
3946 static void skb_gro_reset_offset(struct sk_buff *skb)
3947 {
3948         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3949         const skb_frag_t *frag0 = &pinfo->frags[0];
3950
3951         NAPI_GRO_CB(skb)->data_offset = 0;
3952         NAPI_GRO_CB(skb)->frag0 = NULL;
3953         NAPI_GRO_CB(skb)->frag0_len = 0;
3954
3955         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3956             pinfo->nr_frags &&
3957             !PageHighMem(skb_frag_page(frag0))) {
3958                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3959                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3960         }
3961 }
3962
3963 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3964 {
3965         struct skb_shared_info *pinfo = skb_shinfo(skb);
3966
3967         BUG_ON(skb->end - skb->tail < grow);
3968
3969         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3970
3971         skb->data_len -= grow;
3972         skb->tail += grow;
3973
3974         pinfo->frags[0].page_offset += grow;
3975         skb_frag_size_sub(&pinfo->frags[0], grow);
3976
3977         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3978                 skb_frag_unref(skb, 0);
3979                 memmove(pinfo->frags, pinfo->frags + 1,
3980                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3981         }
3982 }
3983
3984 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3985 {
3986         struct sk_buff **pp = NULL;
3987         struct packet_offload *ptype;
3988         __be16 type = skb->protocol;
3989         struct list_head *head = &offload_base;
3990         int same_flow;
3991         enum gro_result ret;
3992         int grow;
3993
3994         if (!(skb->dev->features & NETIF_F_GRO))
3995                 goto normal;
3996
3997         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
3998                 goto normal;
3999
4000         gro_list_prepare(napi, skb);
4001
4002         rcu_read_lock();
4003         list_for_each_entry_rcu(ptype, head, list) {
4004                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4005                         continue;
4006
4007                 skb_set_network_header(skb, skb_gro_offset(skb));
4008                 skb_reset_mac_len(skb);
4009                 NAPI_GRO_CB(skb)->same_flow = 0;
4010                 NAPI_GRO_CB(skb)->flush = 0;
4011                 NAPI_GRO_CB(skb)->free = 0;
4012                 NAPI_GRO_CB(skb)->udp_mark = 0;
4013
4014                 /* Setup for GRO checksum validation */
4015                 switch (skb->ip_summed) {
4016                 case CHECKSUM_COMPLETE:
4017                         NAPI_GRO_CB(skb)->csum = skb->csum;
4018                         NAPI_GRO_CB(skb)->csum_valid = 1;
4019                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4020                         break;
4021                 case CHECKSUM_UNNECESSARY:
4022                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4023                         NAPI_GRO_CB(skb)->csum_valid = 0;
4024                         break;
4025                 default:
4026                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4027                         NAPI_GRO_CB(skb)->csum_valid = 0;
4028                 }
4029
4030                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4031                 break;
4032         }
4033         rcu_read_unlock();
4034
4035         if (&ptype->list == head)
4036                 goto normal;
4037
4038         same_flow = NAPI_GRO_CB(skb)->same_flow;
4039         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4040
4041         if (pp) {
4042                 struct sk_buff *nskb = *pp;
4043
4044                 *pp = nskb->next;
4045                 nskb->next = NULL;
4046                 napi_gro_complete(nskb);
4047                 napi->gro_count--;
4048         }
4049
4050         if (same_flow)
4051                 goto ok;
4052
4053         if (NAPI_GRO_CB(skb)->flush)
4054                 goto normal;
4055
4056         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4057                 struct sk_buff *nskb = napi->gro_list;
4058
4059                 /* locate the end of the list to select the 'oldest' flow */
4060                 while (nskb->next) {
4061                         pp = &nskb->next;
4062                         nskb = *pp;
4063                 }
4064                 *pp = NULL;
4065                 nskb->next = NULL;
4066                 napi_gro_complete(nskb);
4067         } else {
4068                 napi->gro_count++;
4069         }
4070         NAPI_GRO_CB(skb)->count = 1;
4071         NAPI_GRO_CB(skb)->age = jiffies;
4072         NAPI_GRO_CB(skb)->last = skb;
4073         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4074         skb->next = napi->gro_list;
4075         napi->gro_list = skb;
4076         ret = GRO_HELD;
4077
4078 pull:
4079         grow = skb_gro_offset(skb) - skb_headlen(skb);
4080         if (grow > 0)
4081                 gro_pull_from_frag0(skb, grow);
4082 ok:
4083         return ret;
4084
4085 normal:
4086         ret = GRO_NORMAL;
4087         goto pull;
4088 }
4089
4090 struct packet_offload *gro_find_receive_by_type(__be16 type)
4091 {
4092         struct list_head *offload_head = &offload_base;
4093         struct packet_offload *ptype;
4094
4095         list_for_each_entry_rcu(ptype, offload_head, list) {
4096                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4097                         continue;
4098                 return ptype;
4099         }
4100         return NULL;
4101 }
4102 EXPORT_SYMBOL(gro_find_receive_by_type);
4103
4104 struct packet_offload *gro_find_complete_by_type(__be16 type)
4105 {
4106         struct list_head *offload_head = &offload_base;
4107         struct packet_offload *ptype;
4108
4109         list_for_each_entry_rcu(ptype, offload_head, list) {
4110                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4111                         continue;
4112                 return ptype;
4113         }
4114         return NULL;
4115 }
4116 EXPORT_SYMBOL(gro_find_complete_by_type);
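
/* Illustrative sketch (hypothetical, not compiled): an encapsulation GRO
 * handler, in the spirit of the UDP tunnel offloads, dispatching to the
 * inner protocol's offload after stripping its own header. The function
 * name and the inner_type parameter are assumptions made for the example.
 */
#if 0	/* example only */
static struct sk_buff **my_encap_gro_receive(struct sk_buff **head,
					     struct sk_buff *skb,
					     __be16 inner_type)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;

	rcu_read_lock();
	ptype = gro_find_receive_by_type(inner_type);
	if (ptype)
		pp = ptype->callbacks.gro_receive(head, skb);
	else
		NAPI_GRO_CB(skb)->flush = 1;	/* no offload: don't hold it */
	rcu_read_unlock();

	return pp;
}
#endif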
4117
4118 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4119 {
4120         switch (ret) {
4121         case GRO_NORMAL:
4122                 if (netif_receive_skb_internal(skb))
4123                         ret = GRO_DROP;
4124                 break;
4125
4126         case GRO_DROP:
4127                 kfree_skb(skb);
4128                 break;
4129
4130         case GRO_MERGED_FREE:
4131                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4132                         kmem_cache_free(skbuff_head_cache, skb);
4133                 else
4134                         __kfree_skb(skb);
4135                 break;
4136
4137         case GRO_HELD:
4138         case GRO_MERGED:
4139                 break;
4140         }
4141
4142         return ret;
4143 }
4144
4145 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4146 {
4147         trace_napi_gro_receive_entry(skb);
4148
4149         skb_gro_reset_offset(skb);
4150
4151         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4152 }
4153 EXPORT_SYMBOL(napi_gro_receive);
4154
4155 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4156 {
4157         if (unlikely(skb->pfmemalloc)) {
4158                 consume_skb(skb);
4159                 return;
4160         }
4161         __skb_pull(skb, skb_headlen(skb));
4162         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4163         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4164         skb->vlan_tci = 0;
4165         skb->dev = napi->dev;
4166         skb->skb_iif = 0;
4167         skb->encapsulation = 0;
4168         skb_shinfo(skb)->gso_type = 0;
4169         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4170
4171         napi->skb = skb;
4172 }
4173
4174 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4175 {
4176         struct sk_buff *skb = napi->skb;
4177
4178         if (!skb) {
4179                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4180                 napi->skb = skb;
4181         }
4182         return skb;
4183 }
4184 EXPORT_SYMBOL(napi_get_frags);
4185
4186 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4187                                       struct sk_buff *skb,
4188                                       gro_result_t ret)
4189 {
4190         switch (ret) {
4191         case GRO_NORMAL:
4192         case GRO_HELD:
4193                 __skb_push(skb, ETH_HLEN);
4194                 skb->protocol = eth_type_trans(skb, skb->dev);
4195                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4196                         ret = GRO_DROP;
4197                 break;
4198
4199         case GRO_DROP:
4200         case GRO_MERGED_FREE:
4201                 napi_reuse_skb(napi, skb);
4202                 break;
4203
4204         case GRO_MERGED:
4205                 break;
4206         }
4207
4208         return ret;
4209 }
4210
4211 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4212  * Drivers may call both napi_gro_frags() and napi_gro_receive(), so
4213  * we copy the ethernet header into skb->data to have a common layout.
4214  */
4215 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4216 {
4217         struct sk_buff *skb = napi->skb;
4218         const struct ethhdr *eth;
4219         unsigned int hlen = sizeof(*eth);
4220
4221         napi->skb = NULL;
4222
4223         skb_reset_mac_header(skb);
4224         skb_gro_reset_offset(skb);
4225
4226         eth = skb_gro_header_fast(skb, 0);
4227         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4228                 eth = skb_gro_header_slow(skb, hlen, 0);
4229                 if (unlikely(!eth)) {
4230                         napi_reuse_skb(napi, skb);
4231                         return NULL;
4232                 }
4233         } else {
4234                 gro_pull_from_frag0(skb, hlen);
4235                 NAPI_GRO_CB(skb)->frag0 += hlen;
4236                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4237         }
4238         __skb_pull(skb, hlen);
4239
4240         /*
4241          * This works because the only protocols we care about don't require
4242          * special handling.
4243          * We'll fix it up properly in napi_frags_finish()
4244          */
4245         skb->protocol = eth->h_proto;
4246
4247         return skb;
4248 }
4249
4250 gro_result_t napi_gro_frags(struct napi_struct *napi)
4251 {
4252         struct sk_buff *skb = napi_frags_skb(napi);
4253
4254         if (!skb)
4255                 return GRO_DROP;
4256
4257         trace_napi_gro_frags_entry(skb);
4258
4259         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4260 }
4261 EXPORT_SYMBOL(napi_gro_frags);
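
/* Illustrative sketch (hypothetical, not compiled): the frag-based receive
 * pattern. The driver takes the skb pre-allocated by napi_get_frags(),
 * attaches its page fragment(s) and calls napi_gro_frags(), letting the
 * core parse the ethernet header as described in napi_frags_skb() above.
 * my_rx_one_frag() and the rough truesize accounting are assumptions made
 * for the example.
 */
#if 0	/* example only */
static int my_rx_one_frag(struct napi_struct *napi, struct page *page,
			  unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb)
		return -ENOMEM;

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;	/* rough accounting for the example */

	napi_gro_frags(napi);
	return 0;
}
#endif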
4262
4263 /* Compute the checksum from gro_offset and return the folded value
4264  * after adding in any pseudo checksum.
4265  */
4266 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4267 {
4268         __wsum wsum;
4269         __sum16 sum;
4270
4271         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4272
4273         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4274         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4275         if (likely(!sum)) {
4276                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4277                     !skb->csum_complete_sw)
4278                         netdev_rx_csum_fault(skb->dev);
4279         }
4280
4281         NAPI_GRO_CB(skb)->csum = wsum;
4282         NAPI_GRO_CB(skb)->csum_valid = 1;
4283
4284         return sum;
4285 }
4286 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4287
4288 /*
4289  * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4290  * Note: called with local irqs disabled, but exits with local irqs enabled.
4291  */
4292 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4293 {
4294 #ifdef CONFIG_RPS
4295         struct softnet_data *remsd = sd->rps_ipi_list;
4296
4297         if (remsd) {
4298                 sd->rps_ipi_list = NULL;
4299
4300                 local_irq_enable();
4301
4302                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4303                 while (remsd) {
4304                         struct softnet_data *next = remsd->rps_ipi_next;
4305
4306                         if (cpu_online(remsd->cpu))
4307                                 smp_call_function_single_async(remsd->cpu,
4308                                                            &remsd->csd);
4309                         remsd = next;
4310                 }
4311         } else
4312 #endif
4313                 local_irq_enable();
4314 }
4315
4316 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4317 {
4318 #ifdef CONFIG_RPS
4319         return sd->rps_ipi_list != NULL;
4320 #else
4321         return false;
4322 #endif
4323 }
4324
4325 static int process_backlog(struct napi_struct *napi, int quota)
4326 {
4327         int work = 0;
4328         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4329
4330         /* Check if we have pending IPIs; it is better to send them now
4331          * rather than waiting for net_rx_action() to end.
4332          */
4333         if (sd_has_rps_ipi_waiting(sd)) {
4334                 local_irq_disable();
4335                 net_rps_action_and_irq_enable(sd);
4336         }
4337
4338         napi->weight = weight_p;
4339         local_irq_disable();
4340         while (1) {
4341                 struct sk_buff *skb;
4342
4343                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4344                         local_irq_enable();
4345                         __netif_receive_skb(skb);
4346                         local_irq_disable();
4347                         input_queue_head_incr(sd);
4348                         if (++work >= quota) {
4349                                 local_irq_enable();
4350                                 return work;
4351                         }
4352                 }
4353
4354                 rps_lock(sd);
4355                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4356                         /*
4357                          * Inline a custom version of __napi_complete().
4358                          * Only the current CPU owns and manipulates this napi,
4359                          * and NAPI_STATE_SCHED is the only possible flag set
4360                          * on the backlog.
4361                          * We can use a plain write instead of clear_bit(),
4362                          * and we don't need an smp_mb() memory barrier.
4363                          */
4364                         napi->state = 0;
4365                         rps_unlock(sd);
4366
4367                         break;
4368                 }
4369
4370                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4371                                            &sd->process_queue);
4372                 rps_unlock(sd);
4373         }
4374         local_irq_enable();
4375
4376         return work;
4377 }
4378
4379 /**
4380  * __napi_schedule - schedule for receive
4381  * @n: entry to schedule
4382  *
4383  * The entry's receive function will be scheduled to run.
4384  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4385  */
4386 void __napi_schedule(struct napi_struct *n)
4387 {
4388         unsigned long flags;
4389
4390         local_irq_save(flags);
4391         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4392         local_irq_restore(flags);
4393 }
4394 EXPORT_SYMBOL(__napi_schedule);
4395
4396 /**
4397  * __napi_schedule_irqoff - schedule for receive
4398  * @n: entry to schedule
4399  *
4400  * Variant of __napi_schedule() assuming hard irqs are masked
4401  */
4402 void __napi_schedule_irqoff(struct napi_struct *n)
4403 {
4404         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4405 }
4406 EXPORT_SYMBOL(__napi_schedule_irqoff);
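
/* Illustrative sketch (hypothetical, not compiled): the canonical RX
 * interrupt handler. Hard irqs are masked here, so after claiming the NAPI
 * instance with napi_schedule_prep() the _irqoff variant can be used.
 * struct my_rx_ring and my_hw_disable_rx_irq() are assumptions made for
 * the example.
 */
#if 0	/* example only */
struct my_rx_ring {
	struct napi_struct napi;
	/* ... hypothetical hardware state ... */
};

static irqreturn_t my_rx_irq(int irq, void *data)
{
	struct my_rx_ring *ring = data;

	my_hw_disable_rx_irq(ring);		/* hypothetical register write */
	if (napi_schedule_prep(&ring->napi))
		__napi_schedule_irqoff(&ring->napi);
	return IRQ_HANDLED;
}
#endif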
4407
4408 void __napi_complete(struct napi_struct *n)
4409 {
4410         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4411
4412         list_del_init(&n->poll_list);
4413         smp_mb__before_atomic();
4414         clear_bit(NAPI_STATE_SCHED, &n->state);
4415 }
4416 EXPORT_SYMBOL(__napi_complete);
4417
4418 void napi_complete_done(struct napi_struct *n, int work_done)
4419 {
4420         unsigned long flags;
4421
4422         /*
4423          * Don't let napi dequeue from the cpu poll list,
4424          * just in case it is running on a different cpu.
4425          */
4426         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4427                 return;
4428
4429         if (n->gro_list) {
4430                 unsigned long timeout = 0;
4431
4432                 if (work_done)
4433                         timeout = n->dev->gro_flush_timeout;
4434
4435                 if (timeout)
4436                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4437                                       HRTIMER_MODE_REL_PINNED);
4438                 else
4439                         napi_gro_flush(n, false);
4440         }
4441         if (likely(list_empty(&n->poll_list))) {
4442                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4443         } else {
4444                 /* If n->poll_list is not empty, we need to mask irqs */
4445                 local_irq_save(flags);
4446                 __napi_complete(n);
4447                 local_irq_restore(flags);
4448         }
4449 }
4450 EXPORT_SYMBOL(napi_complete_done);
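
/* Illustrative sketch (hypothetical, not compiled): the completion half of
 * the NAPI contract. If the poll consumed less than its budget, report the
 * work done with napi_complete_done() (which may arm the gro_flush_timeout
 * timer instead of flushing GRO immediately) and re-enable device
 * interrupts; if the budget was exhausted, return without completing so
 * net_rx_action() repolls. It reuses the hypothetical struct my_rx_ring
 * from the sketch above; my_clean_rx_ring() and my_hw_enable_rx_irq() are
 * assumptions as well.
 */
#if 0	/* example only */
static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_rx_ring *ring = container_of(napi, struct my_rx_ring, napi);
	int work = my_clean_rx_ring(ring, budget);

	if (work < budget) {
		napi_complete_done(napi, work);
		my_hw_enable_rx_irq(ring);
	}
	return work;
}
#endif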
4451
4452 /* must be called under rcu_read_lock(), as we dont take a reference */
4453 struct napi_struct *napi_by_id(unsigned int napi_id)
4454 {
4455         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4456         struct napi_struct *napi;
4457
4458         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4459                 if (napi->napi_id == napi_id)
4460                         return napi;
4461
4462         return NULL;
4463 }
4464 EXPORT_SYMBOL_GPL(napi_by_id);
4465
4466 void napi_hash_add(struct napi_struct *napi)
4467 {
4468         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4469
4470                 spin_lock(&napi_hash_lock);
4471
4472                 /* 0 is not a valid id, and we also skip an id that is
4473                  * already taken; we expect both events to be extremely rare.
4474                  */
4475                 napi->napi_id = 0;
4476                 while (!napi->napi_id) {
4477                         napi->napi_id = ++napi_gen_id;
4478                         if (napi_by_id(napi->napi_id))
4479                                 napi->napi_id = 0;
4480                 }
4481
4482                 hlist_add_head_rcu(&napi->napi_hash_node,
4483                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4484
4485                 spin_unlock(&napi_hash_lock);
4486         }
4487 }
4488 EXPORT_SYMBOL_GPL(napi_hash_add);
4489
4490 /* Warning: the caller is responsible for making sure an RCU grace period
4491  * has elapsed before freeing the memory containing @napi.
4492  */
4493 void napi_hash_del(struct napi_struct *napi)
4494 {
4495         spin_lock(&napi_hash_lock);
4496
4497         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4498                 hlist_del_rcu(&napi->napi_hash_node);
4499
4500         spin_unlock(&napi_hash_lock);
4501 }
4502 EXPORT_SYMBOL_GPL(napi_hash_del);
4503
4504 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4505 {
4506         struct napi_struct *napi;
4507
4508         napi = container_of(timer, struct napi_struct, timer);
4509         if (napi->gro_list)
4510                 napi_schedule(napi);
4511
4512         return HRTIMER_NORESTART;
4513 }
4514
4515 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4516                     int (*poll)(struct napi_struct *, int), int weight)
4517 {
4518         INIT_LIST_HEAD(&napi->poll_list);
4519         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4520         napi->timer.function = napi_watchdog;
4521         napi->gro_count = 0;
4522         napi->gro_list = NULL;
4523         napi->skb = NULL;
4524         napi->poll = poll;
4525         if (weight > NAPI_POLL_WEIGHT)
4526                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4527                             weight, dev->name);
4528         napi->weight = weight;
4529         list_add(&napi->dev_list, &dev->napi_list);
4530         napi->dev = dev;
4531 #ifdef CONFIG_NETPOLL
4532         spin_lock_init(&napi->poll_lock);
4533         napi->poll_owner = -1;
4534 #endif
4535         set_bit(NAPI_STATE_SCHED, &napi->state);
4536 }
4537 EXPORT_SYMBOL(netif_napi_add);
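
/* Illustrative sketch (hypothetical, not compiled): typical driver setup.
 * The NAPI instance is registered with its poll handler at probe time and
 * enabled from ndo_open; NAPI_POLL_WEIGHT is the conventional weight.
 * It reuses the hypothetical struct my_rx_ring and my_poll() from the
 * sketches above, and assumes the ring is the netdev private area.
 */
#if 0	/* example only */
static int my_probe_napi(struct net_device *dev, struct my_rx_ring *ring)
{
	netif_napi_add(dev, &ring->napi, my_poll, NAPI_POLL_WEIGHT);
	return 0;
}

static int my_open(struct net_device *dev)
{
	struct my_rx_ring *ring = netdev_priv(dev);

	napi_enable(&ring->napi);
	my_hw_enable_rx_irq(ring);		/* hypothetical */
	return 0;
}
#endif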
4538
4539 void napi_disable(struct napi_struct *n)
4540 {
4541         might_sleep();
4542         set_bit(NAPI_STATE_DISABLE, &n->state);
4543
4544         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4545                 msleep(1);
4546
4547         hrtimer_cancel(&n->timer);
4548
4549         clear_bit(NAPI_STATE_DISABLE, &n->state);
4550 }
4551 EXPORT_SYMBOL(napi_disable);
4552
4553 void netif_napi_del(struct napi_struct *napi)
4554 {
4555         list_del_init(&napi->dev_list);
4556         napi_free_frags(napi);
4557
4558         kfree_skb_list(napi->gro_list);
4559         napi->gro_list = NULL;
4560         napi->gro_count = 0;
4561 }
4562 EXPORT_SYMBOL(netif_napi_del);
4563
4564 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4565 {
4566         void *have;
4567         int work, weight;
4568
4569         list_del_init(&n->poll_list);
4570
4571         have = netpoll_poll_lock(n);
4572
4573         weight = n->weight;
4574
4575         /* This NAPI_STATE_SCHED test is for avoiding a race
4576          * with netpoll's poll_napi().  Only the entity which
4577          * obtains the lock and sees NAPI_STATE_SCHED set will
4578          * actually make the ->poll() call.  Therefore we avoid
4579          * accidentally calling ->poll() when NAPI is not scheduled.
4580          */
4581         work = 0;
4582         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4583                 work = n->poll(n, weight);
4584                 trace_napi_poll(n);
4585         }
4586
4587         WARN_ON_ONCE(work > weight);
4588
4589         if (likely(work < weight))
4590                 goto out_unlock;
4591
4592         /* Drivers must not modify the NAPI state if they
4593          * consume the entire weight.  In such cases this code
4594          * still "owns" the NAPI instance and therefore can
4595          * move the instance around on the list at-will.
4596          */
4597         if (unlikely(napi_disable_pending(n))) {
4598                 napi_complete(n);
4599                 goto out_unlock;
4600         }
4601
4602         if (n->gro_list) {
4603                 /* Flush packets that are too old.
4604                  * If HZ < 1000, flush all packets.
4605                  */
4606                 napi_gro_flush(n, HZ >= 1000);
4607         }
4608
4609         /* Some drivers may have called napi_schedule
4610          * prior to exhausting their budget.
4611          */
4612         if (unlikely(!list_empty(&n->poll_list))) {
4613                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4614                              n->dev ? n->dev->name : "backlog");
4615                 goto out_unlock;
4616         }
4617
4618         list_add_tail(&n->poll_list, repoll);
4619
4620 out_unlock:
4621         netpoll_poll_unlock(have);
4622
4623         return work;
4624 }
4625
4626 static void net_rx_action(struct softirq_action *h)
4627 {
4628         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4629         unsigned long time_limit = jiffies + 2;
4630         int budget = netdev_budget;
4631         LIST_HEAD(list);
4632         LIST_HEAD(repoll);
4633
4634         local_irq_disable();
4635         list_splice_init(&sd->poll_list, &list);
4636         local_irq_enable();
4637
4638         for (;;) {
4639                 struct napi_struct *n;
4640
4641                 if (list_empty(&list)) {
4642                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4643                                 return;
4644                         break;
4645                 }
4646
4647                 n = list_first_entry(&list, struct napi_struct, poll_list);
4648                 budget -= napi_poll(n, &repoll);
4649
4650                 /* If softirq window is exhausted then punt.
4651                  * Allow this to run for up to 2 jiffies, which gives
4652                  * an average latency of 1.5/HZ.
4653                  */
4654                 if (unlikely(budget <= 0 ||
4655                              time_after_eq(jiffies, time_limit))) {
4656                         sd->time_squeeze++;
4657                         break;
4658                 }
4659         }
4660
4661         local_irq_disable();
4662
4663         list_splice_tail_init(&sd->poll_list, &list);
4664         list_splice_tail(&repoll, &list);
4665         list_splice(&list, &sd->poll_list);
4666         if (!list_empty(&sd->poll_list))
4667                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4668
4669         net_rps_action_and_irq_enable(sd);
4670 }
4671
4672 struct netdev_adjacent {
4673         struct net_device *dev;
4674
4675         /* upper master flag, there can only be one master device per list */
4676         bool master;
4677
4678         /* counter for the number of times this device was added to us */
4679         u16 ref_nr;
4680
4681         /* private field for the users */
4682         void *private;
4683
4684         struct list_head list;
4685         struct rcu_head rcu;
4686 };
4687
4688 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4689                                                  struct net_device *adj_dev,
4690                                                  struct list_head *adj_list)
4691 {
4692         struct netdev_adjacent *adj;
4693
4694         list_for_each_entry(adj, adj_list, list) {
4695                 if (adj->dev == adj_dev)
4696                         return adj;
4697         }
4698         return NULL;
4699 }
4700
4701 /**
4702  * netdev_has_upper_dev - Check if device is linked to an upper device
4703  * @dev: device
4704  * @upper_dev: upper device to check
4705  *
4706  * Find out if a device is linked to the specified upper device and return
4707  * true if it is. Note that this checks only the immediate upper device,
4708  * not the complete stack of devices. The caller must hold the RTNL lock.
4709  */
4710 bool netdev_has_upper_dev(struct net_device *dev,
4711                           struct net_device *upper_dev)
4712 {
4713         ASSERT_RTNL();
4714
4715         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4716 }
4717 EXPORT_SYMBOL(netdev_has_upper_dev);
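
/*
 * Illustration only (not part of dev.c): a hypothetical caller that uses
 * netdev_has_upper_dev() under RTNL before attempting to stack two
 * devices; example_stack_on_top is an invented name.
 */
#if 0
static int example_stack_on_top(struct net_device *dev,
				struct net_device *upper_dev)
{
	ASSERT_RTNL();

	/* Already linked directly to this upper device? */
	if (netdev_has_upper_dev(dev, upper_dev))
		return -EEXIST;

	return netdev_upper_dev_link(dev, upper_dev);
}
#endif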
4718
4719 /**
4720  * netdev_has_any_upper_dev - Check if device is linked to some device
4721  * @dev: device
4722  *
4723  * Find out if a device is linked to an upper device and return true in case
4724  * it is. The caller must hold the RTNL lock.
4725  */
4726 static bool netdev_has_any_upper_dev(struct net_device *dev)
4727 {
4728         ASSERT_RTNL();
4729
4730         return !list_empty(&dev->all_adj_list.upper);
4731 }
4732
4733 /**
4734  * netdev_master_upper_dev_get - Get master upper device
4735  * @dev: device
4736  *
4737  * Find a master upper device and return pointer to it or NULL in case
4738  * it's not there. The caller must hold the RTNL lock.
4739  */
4740 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4741 {
4742         struct netdev_adjacent *upper;
4743
4744         ASSERT_RTNL();
4745
4746         if (list_empty(&dev->adj_list.upper))
4747                 return NULL;
4748
4749         upper = list_first_entry(&dev->adj_list.upper,
4750                                  struct netdev_adjacent, list);
4751         if (likely(upper->master))
4752                 return upper->dev;
4753         return NULL;
4754 }
4755 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4756
4757 void *netdev_adjacent_get_private(struct list_head *adj_list)
4758 {
4759         struct netdev_adjacent *adj;
4760
4761         adj = list_entry(adj_list, struct netdev_adjacent, list);
4762
4763         return adj->private;
4764 }
4765 EXPORT_SYMBOL(netdev_adjacent_get_private);
4766
4767 /**
4768  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4769  * @dev: device
4770  * @iter: list_head ** of the current position
4771  *
4772  * Gets the next device from the dev's upper list, starting from iter
4773  * position. The caller must hold RCU read lock.
4774  */
4775 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4776                                                  struct list_head **iter)
4777 {
4778         struct netdev_adjacent *upper;
4779
4780         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4781
4782         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4783
4784         if (&upper->list == &dev->adj_list.upper)
4785                 return NULL;
4786
4787         *iter = &upper->list;
4788
4789         return upper->dev;
4790 }
4791 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
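
/*
 * Illustration only (not part of dev.c): walking the immediate upper
 * devices of "dev" with the iterator above, under the RCU read lock.
 * example_walk_uppers is an invented name.
 */
#if 0
static void example_walk_uppers(struct net_device *dev)
{
	struct net_device *upper;
	struct list_head *iter = &dev->adj_list.upper;

	rcu_read_lock();
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		pr_debug("%s is an immediate upper device of %s\n",
			 upper->name, dev->name);
	rcu_read_unlock();
}
#endif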
4792
4793 /**
4794  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4795  * @dev: device
4796  * @iter: list_head ** of the current position
4797  *
4798  * Gets the next device from the dev's upper list, starting from iter
4799  * position. The caller must hold RCU read lock.
4800  */
4801 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4802                                                      struct list_head **iter)
4803 {
4804         struct netdev_adjacent *upper;
4805
4806         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4807
4808         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4809
4810         if (&upper->list == &dev->all_adj_list.upper)
4811                 return NULL;
4812
4813         *iter = &upper->list;
4814
4815         return upper->dev;
4816 }
4817 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4818
4819 /**
4820  * netdev_lower_get_next_private - Get the next ->private from the
4821  *                                 lower neighbour list
4822  * @dev: device
4823  * @iter: list_head ** of the current position
4824  *
4825  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4826  * list, starting from iter position. The caller must hold either hold the
4827  * RTNL lock or its own locking that guarantees that the neighbour lower
4828  * list will remain unchainged.
4829  */
4830 void *netdev_lower_get_next_private(struct net_device *dev,
4831                                     struct list_head **iter)
4832 {
4833         struct netdev_adjacent *lower;
4834
4835         lower = list_entry(*iter, struct netdev_adjacent, list);
4836
4837         if (&lower->list == &dev->adj_list.lower)
4838                 return NULL;
4839
4840         *iter = lower->list.next;
4841
4842         return lower->private;
4843 }
4844 EXPORT_SYMBOL(netdev_lower_get_next_private);
4845
4846 /**
4847  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4848  *                                     lower neighbour list, RCU
4849  *                                     variant
4850  * @dev: device
4851  * @iter: list_head ** of the current position
4852  *
4853  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4854  * list, starting from iter position. The caller must hold RCU read lock.
4855  */
4856 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4857                                         struct list_head **iter)
4858 {
4859         struct netdev_adjacent *lower;
4860
4861         WARN_ON_ONCE(!rcu_read_lock_held());
4862
4863         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4864
4865         if (&lower->list == &dev->adj_list.lower)
4866                 return NULL;
4867
4868         *iter = &lower->list;
4869
4870         return lower->private;
4871 }
4872 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4873
4874 /**
4875  * netdev_lower_get_next - Get the next device from the lower neighbour
4876  *                         list
4877  * @dev: device
4878  * @iter: list_head ** of the current position
4879  *
4880  * Gets the next netdev_adjacent from the dev's lower neighbour
4881  * list, starting from iter position. The caller must hold the RTNL lock or
4882  * its own locking that guarantees that the neighbour lower
4883  * list will remain unchanged.
4884  */
4885 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4886 {
4887         struct netdev_adjacent *lower;
4888
4889         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4890
4891         if (&lower->list == &dev->adj_list.lower)
4892                 return NULL;
4893
4894         *iter = &lower->list;
4895
4896         return lower->dev;
4897 }
4898 EXPORT_SYMBOL(netdev_lower_get_next);
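
/*
 * Illustration only (not part of dev.c): the usual way to consume the
 * iterator above is through the netdev_for_each_lower_dev() helper, as
 * dev_get_nest_level() does further down.  example_walk_lowers is an
 * invented name.
 */
#if 0
static void example_walk_lowers(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;

	ASSERT_RTNL();		/* or equivalent caller-provided locking */
	netdev_for_each_lower_dev(dev, lower, iter)
		pr_debug("%s is an immediate lower device of %s\n",
			 lower->name, dev->name);
}
#endif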
4899
4900 /**
4901  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4902  *                                     lower neighbour list, RCU
4903  *                                     variant
4904  * @dev: device
4905  *
4906  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4907  * list. The caller must hold RCU read lock.
4908  */
4909 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4910 {
4911         struct netdev_adjacent *lower;
4912
4913         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4914                         struct netdev_adjacent, list);
4915         if (lower)
4916                 return lower->private;
4917         return NULL;
4918 }
4919 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4920
4921 /**
4922  * netdev_master_upper_dev_get_rcu - Get master upper device
4923  * @dev: device
4924  *
4925  * Find a master upper device and return pointer to it or NULL in case
4926  * it's not there. The caller must hold the RCU read lock.
4927  */
4928 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4929 {
4930         struct netdev_adjacent *upper;
4931
4932         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4933                                        struct netdev_adjacent, list);
4934         if (upper && likely(upper->master))
4935                 return upper->dev;
4936         return NULL;
4937 }
4938 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
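
/*
 * Illustration only (not part of dev.c): checking for a master device
 * from a context that only holds the RCU read lock.  example_has_master
 * is an invented name.
 */
#if 0
static bool example_has_master(struct net_device *dev)
{
	bool ret;

	rcu_read_lock();
	ret = netdev_master_upper_dev_get_rcu(dev) != NULL;
	rcu_read_unlock();

	return ret;
}
#endif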
4939
4940 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4941                               struct net_device *adj_dev,
4942                               struct list_head *dev_list)
4943 {
4944         char linkname[IFNAMSIZ+7];
4945         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4946                 "upper_%s" : "lower_%s", adj_dev->name);
4947         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4948                                  linkname);
4949 }
4950 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4951                                char *name,
4952                                struct list_head *dev_list)
4953 {
4954         char linkname[IFNAMSIZ+7];
4955         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4956                 "upper_%s" : "lower_%s", name);
4957         sysfs_remove_link(&(dev->dev.kobj), linkname);
4958 }
4959
4960 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4961                                                  struct net_device *adj_dev,
4962                                                  struct list_head *dev_list)
4963 {
4964         return (dev_list == &dev->adj_list.upper ||
4965                 dev_list == &dev->adj_list.lower) &&
4966                 net_eq(dev_net(dev), dev_net(adj_dev));
4967 }
4968
4969 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4970                                         struct net_device *adj_dev,
4971                                         struct list_head *dev_list,
4972                                         void *private, bool master)
4973 {
4974         struct netdev_adjacent *adj;
4975         int ret;
4976
4977         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4978
4979         if (adj) {
4980                 adj->ref_nr++;
4981                 return 0;
4982         }
4983
4984         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4985         if (!adj)
4986                 return -ENOMEM;
4987
4988         adj->dev = adj_dev;
4989         adj->master = master;
4990         adj->ref_nr = 1;
4991         adj->private = private;
4992         dev_hold(adj_dev);
4993
4994         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4995                  adj_dev->name, dev->name, adj_dev->name);
4996
4997         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4998                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4999                 if (ret)
5000                         goto free_adj;
5001         }
5002
5003         /* Ensure that the master link is always the first item in the list. */
5004         if (master) {
5005                 ret = sysfs_create_link(&(dev->dev.kobj),
5006                                         &(adj_dev->dev.kobj), "master");
5007                 if (ret)
5008                         goto remove_symlinks;
5009
5010                 list_add_rcu(&adj->list, dev_list);
5011         } else {
5012                 list_add_tail_rcu(&adj->list, dev_list);
5013         }
5014
5015         return 0;
5016
5017 remove_symlinks:
5018         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5019                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5020 free_adj:
5021         kfree(adj);
5022         dev_put(adj_dev);
5023
5024         return ret;
5025 }
5026
5027 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5028                                          struct net_device *adj_dev,
5029                                          struct list_head *dev_list)
5030 {
5031         struct netdev_adjacent *adj;
5032
5033         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5034
5035         if (!adj) {
5036                 pr_err("tried to remove device %s from %s\n",
5037                        dev->name, adj_dev->name);
5038                 BUG();
5039         }
5040
5041         if (adj->ref_nr > 1) {
5042                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5043                          adj->ref_nr-1);
5044                 adj->ref_nr--;
5045                 return;
5046         }
5047
5048         if (adj->master)
5049                 sysfs_remove_link(&(dev->dev.kobj), "master");
5050
5051         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5052                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5053
5054         list_del_rcu(&adj->list);
5055         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5056                  adj_dev->name, dev->name, adj_dev->name);
5057         dev_put(adj_dev);
5058         kfree_rcu(adj, rcu);
5059 }
5060
5061 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5062                                             struct net_device *upper_dev,
5063                                             struct list_head *up_list,
5064                                             struct list_head *down_list,
5065                                             void *private, bool master)
5066 {
5067         int ret;
5068
5069         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5070                                            master);
5071         if (ret)
5072                 return ret;
5073
5074         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5075                                            false);
5076         if (ret) {
5077                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5078                 return ret;
5079         }
5080
5081         return 0;
5082 }
5083
5084 static int __netdev_adjacent_dev_link(struct net_device *dev,
5085                                       struct net_device *upper_dev)
5086 {
5087         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5088                                                 &dev->all_adj_list.upper,
5089                                                 &upper_dev->all_adj_list.lower,
5090                                                 NULL, false);
5091 }
5092
5093 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5094                                                struct net_device *upper_dev,
5095                                                struct list_head *up_list,
5096                                                struct list_head *down_list)
5097 {
5098         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5099         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5100 }
5101
5102 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5103                                          struct net_device *upper_dev)
5104 {
5105         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5106                                            &dev->all_adj_list.upper,
5107                                            &upper_dev->all_adj_list.lower);
5108 }
5109
5110 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5111                                                 struct net_device *upper_dev,
5112                                                 void *private, bool master)
5113 {
5114         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5115
5116         if (ret)
5117                 return ret;
5118
5119         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5120                                                &dev->adj_list.upper,
5121                                                &upper_dev->adj_list.lower,
5122                                                private, master);
5123         if (ret) {
5124                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5125                 return ret;
5126         }
5127
5128         return 0;
5129 }
5130
5131 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5132                                                    struct net_device *upper_dev)
5133 {
5134         __netdev_adjacent_dev_unlink(dev, upper_dev);
5135         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5136                                            &dev->adj_list.upper,
5137                                            &upper_dev->adj_list.lower);
5138 }
5139
5140 static int __netdev_upper_dev_link(struct net_device *dev,
5141                                    struct net_device *upper_dev, bool master,
5142                                    void *private)
5143 {
5144         struct netdev_adjacent *i, *j, *to_i, *to_j;
5145         int ret = 0;
5146
5147         ASSERT_RTNL();
5148
5149         if (dev == upper_dev)
5150                 return -EBUSY;
5151
5152         /* To prevent loops, check that dev is not an upper device of upper_dev. */
5153         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5154                 return -EBUSY;
5155
5156         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5157                 return -EEXIST;
5158
5159         if (master && netdev_master_upper_dev_get(dev))
5160                 return -EBUSY;
5161
5162         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5163                                                    master);
5164         if (ret)
5165                 return ret;
5166
5167         /* Now that we linked these devs, make all the upper_dev's
5168          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5169          * vice versa, and don't forget the devices themselves. All of these
5170          * links are non-neighbours.
5171          */
5172         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5173                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5174                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5175                                  i->dev->name, j->dev->name);
5176                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5177                         if (ret)
5178                                 goto rollback_mesh;
5179                 }
5180         }
5181
5182         /* add dev to every upper_dev's upper device */
5183         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5184                 pr_debug("linking %s's upper device %s with %s\n",
5185                          upper_dev->name, i->dev->name, dev->name);
5186                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5187                 if (ret)
5188                         goto rollback_upper_mesh;
5189         }
5190
5191         /* add upper_dev to every dev's lower device */
5192         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5193                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5194                          i->dev->name, upper_dev->name);
5195                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5196                 if (ret)
5197                         goto rollback_lower_mesh;
5198         }
5199
5200         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5201         return 0;
5202
5203 rollback_lower_mesh:
5204         to_i = i;
5205         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5206                 if (i == to_i)
5207                         break;
5208                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5209         }
5210
5211         i = NULL;
5212
5213 rollback_upper_mesh:
5214         to_i = i;
5215         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5216                 if (i == to_i)
5217                         break;
5218                 __netdev_adjacent_dev_unlink(dev, i->dev);
5219         }
5220
5221         i = j = NULL;
5222
5223 rollback_mesh:
5224         to_i = i;
5225         to_j = j;
5226         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5227                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5228                         if (i == to_i && j == to_j)
5229                                 break;
5230                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5231                 }
5232                 if (i == to_i)
5233                         break;
5234         }
5235
5236         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5237
5238         return ret;
5239 }
5240
5241 /**
5242  * netdev_upper_dev_link - Add a link to the upper device
5243  * @dev: device
5244  * @upper_dev: new upper device
5245  *
5246  * Adds a link to a device which is upper to this one. The caller must hold
5247  * the RTNL lock. On a failure a negative errno code is returned.
5248  * On success the reference counts are adjusted and the function
5249  * returns zero.
5250  */
5251 int netdev_upper_dev_link(struct net_device *dev,
5252                           struct net_device *upper_dev)
5253 {
5254         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5255 }
5256 EXPORT_SYMBOL(netdev_upper_dev_link);
5257
5258 /**
5259  * netdev_master_upper_dev_link - Add a master link to the upper device
5260  * @dev: device
5261  * @upper_dev: new upper device
5262  *
5263  * Adds a link to a device which is upper to this one. In this case, only
5264  * one master upper device can be linked, although other non-master devices
5265  * might be linked as well. The caller must hold the RTNL lock.
5266  * On a failure a negative errno code is returned. On success the reference
5267  * counts are adjusted and the function returns zero.
5268  */
5269 int netdev_master_upper_dev_link(struct net_device *dev,
5270                                  struct net_device *upper_dev)
5271 {
5272         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5273 }
5274 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5275
5276 int netdev_master_upper_dev_link_private(struct net_device *dev,
5277                                          struct net_device *upper_dev,
5278                                          void *private)
5279 {
5280         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5281 }
5282 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5283
5284 /**
5285  * netdev_upper_dev_unlink - Removes a link to upper device
5286  * @dev: device
5287  * @upper_dev: upper device to unlink
5288  *
5289  * Removes a link to a device which is upper to this one. The caller must hold
5290  * the RTNL lock.
5291  */
5292 void netdev_upper_dev_unlink(struct net_device *dev,
5293                              struct net_device *upper_dev)
5294 {
5295         struct netdev_adjacent *i, *j;
5296         ASSERT_RTNL();
5297
5298         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5299
5300         /* Here is the tricky part. We must remove all dev's lower
5301          * devices from all upper_dev's upper devices and vice
5302          * versa, to maintain the graph relationship.
5303          */
5304         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5305                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5306                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5307
5308         /* also remove the devices themselves from the lower/upper device
5309          * lists
5310          */
5311         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5312                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5313
5314         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5315                 __netdev_adjacent_dev_unlink(dev, i->dev);
5316
5317         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5318 }
5319 EXPORT_SYMBOL(netdev_upper_dev_unlink);
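
/*
 * Illustration only (not part of dev.c): a master/slave style pairing of
 * the link and unlink helpers above, as a bonding-like driver might use
 * them.  example_enslave and example_release are invented names.
 */
#if 0
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();

	/* Only one master per device; fails with -EBUSY if slave has one. */
	return netdev_master_upper_dev_link(slave, master);
}

static void example_release(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(slave, master);
}
#endif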
5320
5321 void netdev_adjacent_add_links(struct net_device *dev)
5322 {
5323         struct netdev_adjacent *iter;
5324
5325         struct net *net = dev_net(dev);
5326
5327         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5328                 if (!net_eq(net, dev_net(iter->dev)))
5329                         continue;
5330                 netdev_adjacent_sysfs_add(iter->dev, dev,
5331                                           &iter->dev->adj_list.lower);
5332                 netdev_adjacent_sysfs_add(dev, iter->dev,
5333                                           &dev->adj_list.upper);
5334         }
5335
5336         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5337                 if (!net_eq(net, dev_net(iter->dev)))
5338                         continue;
5339                 netdev_adjacent_sysfs_add(iter->dev, dev,
5340                                           &iter->dev->adj_list.upper);
5341                 netdev_adjacent_sysfs_add(dev, iter->dev,
5342                                           &dev->adj_list.lower);
5343         }
5344 }
5345
5346 void netdev_adjacent_del_links(struct net_device *dev)
5347 {
5348         struct netdev_adjacent *iter;
5349
5350         struct net *net = dev_net(dev);
5351
5352         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5353                 if (!net_eq(net, dev_net(iter->dev)))
5354                         continue;
5355                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5356                                           &iter->dev->adj_list.lower);
5357                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5358                                           &dev->adj_list.upper);
5359         }
5360
5361         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5362                 if (!net_eq(net, dev_net(iter->dev)))
5363                         continue;
5364                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5365                                           &iter->dev->adj_list.upper);
5366                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5367                                           &dev->adj_list.lower);
5368         }
5369 }
5370
5371 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5372 {
5373         struct netdev_adjacent *iter;
5374
5375         struct net *net = dev_net(dev);
5376
5377         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5378                 if (!net_eq(net, dev_net(iter->dev)))
5379                         continue;
5380                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5381                                           &iter->dev->adj_list.lower);
5382                 netdev_adjacent_sysfs_add(iter->dev, dev,
5383                                           &iter->dev->adj_list.lower);
5384         }
5385
5386         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5387                 if (!net_eq(net, dev_net(iter->dev)))
5388                         continue;
5389                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5390                                           &iter->dev->adj_list.upper);
5391                 netdev_adjacent_sysfs_add(iter->dev, dev,
5392                                           &iter->dev->adj_list.upper);
5393         }
5394 }
5395
5396 void *netdev_lower_dev_get_private(struct net_device *dev,
5397                                    struct net_device *lower_dev)
5398 {
5399         struct netdev_adjacent *lower;
5400
5401         if (!lower_dev)
5402                 return NULL;
5403         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5404         if (!lower)
5405                 return NULL;
5406
5407         return lower->private;
5408 }
5409 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5410
5411
5412 int dev_get_nest_level(struct net_device *dev,
5413                        bool (*type_check)(struct net_device *dev))
5414 {
5415         struct net_device *lower = NULL;
5416         struct list_head *iter;
5417         int max_nest = -1;
5418         int nest;
5419
5420         ASSERT_RTNL();
5421
5422         netdev_for_each_lower_dev(dev, lower, iter) {
5423                 nest = dev_get_nest_level(lower, type_check);
5424                 if (max_nest < nest)
5425                         max_nest = nest;
5426         }
5427
5428         if (type_check(dev))
5429                 max_nest++;
5430
5431         return max_nest;
5432 }
5433 EXPORT_SYMBOL(dev_get_nest_level);
5434
5435 static void dev_change_rx_flags(struct net_device *dev, int flags)
5436 {
5437         const struct net_device_ops *ops = dev->netdev_ops;
5438
5439         if (ops->ndo_change_rx_flags)
5440                 ops->ndo_change_rx_flags(dev, flags);
5441 }
5442
5443 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5444 {
5445         unsigned int old_flags = dev->flags;
5446         kuid_t uid;
5447         kgid_t gid;
5448
5449         ASSERT_RTNL();
5450
5451         dev->flags |= IFF_PROMISC;
5452         dev->promiscuity += inc;
5453         if (dev->promiscuity == 0) {
5454                 /*
5455                  * Avoid overflow.
5456                  * If inc causes overflow, untouch promisc and return error.
5457                  */
5458                 if (inc < 0)
5459                         dev->flags &= ~IFF_PROMISC;
5460                 else {
5461                         dev->promiscuity -= inc;
5462                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5463                                 dev->name);
5464                         return -EOVERFLOW;
5465                 }
5466         }
5467         if (dev->flags != old_flags) {
5468                 pr_info("device %s %s promiscuous mode\n",
5469                         dev->name,
5470                         dev->flags & IFF_PROMISC ? "entered" : "left");
5471                 if (audit_enabled) {
5472                         current_uid_gid(&uid, &gid);
5473                         audit_log(current->audit_context, GFP_ATOMIC,
5474                                 AUDIT_ANOM_PROMISCUOUS,
5475                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5476                                 dev->name, (dev->flags & IFF_PROMISC),
5477                                 (old_flags & IFF_PROMISC),
5478                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5479                                 from_kuid(&init_user_ns, uid),
5480                                 from_kgid(&init_user_ns, gid),
5481                                 audit_get_sessionid(current));
5482                 }
5483
5484                 dev_change_rx_flags(dev, IFF_PROMISC);
5485         }
5486         if (notify)
5487                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5488         return 0;
5489 }
5490
5491 /**
5492  *      dev_set_promiscuity     - update promiscuity count on a device
5493  *      @dev: device
5494  *      @inc: modifier
5495  *
5496  *      Add or remove promiscuity from a device. While the count in the device
5497  *      remains above zero the interface remains promiscuous. Once it hits zero
5498  *      the device reverts back to normal filtering operation. A negative inc
5499  *      value is used to drop promiscuity on the device.
5500  *      Return 0 if successful or a negative errno code on error.
5501  */
5502 int dev_set_promiscuity(struct net_device *dev, int inc)
5503 {
5504         unsigned int old_flags = dev->flags;
5505         int err;
5506
5507         err = __dev_set_promiscuity(dev, inc, true);
5508         if (err < 0)
5509                 return err;
5510         if (dev->flags != old_flags)
5511                 dev_set_rx_mode(dev);
5512         return err;
5513 }
5514 EXPORT_SYMBOL(dev_set_promiscuity);
5515
5516 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5517 {
5518         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5519
5520         ASSERT_RTNL();
5521
5522         dev->flags |= IFF_ALLMULTI;
5523         dev->allmulti += inc;
5524         if (dev->allmulti == 0) {
5525                 /*
5526                  * Avoid overflow.
5527                  * If inc causes overflow, untouch allmulti and return error.
5528                  */
5529                 if (inc < 0)
5530                         dev->flags &= ~IFF_ALLMULTI;
5531                 else {
5532                         dev->allmulti -= inc;
5533                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5534                                 dev->name);
5535                         return -EOVERFLOW;
5536                 }
5537         }
5538         if (dev->flags ^ old_flags) {
5539                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5540                 dev_set_rx_mode(dev);
5541                 if (notify)
5542                         __dev_notify_flags(dev, old_flags,
5543                                            dev->gflags ^ old_gflags);
5544         }
5545         return 0;
5546 }
5547
5548 /**
5549  *      dev_set_allmulti        - update allmulti count on a device
5550  *      @dev: device
5551  *      @inc: modifier
5552  *
5553  *      Add or remove reception of all multicast frames to a device. While the
5554  *      count in the device remains above zero the interface remains listening
5555  *      count in the device remains above zero the interface remains listening
5556  *      to all multicast frames. Once it hits zero the device reverts back to normal
5557  *      when releasing a resource needing all multicasts.
5558  *      Return 0 if successful or a negative errno code on error.
5559  */
5560
5561 int dev_set_allmulti(struct net_device *dev, int inc)
5562 {
5563         return __dev_set_allmulti(dev, inc, true);
5564 }
5565 EXPORT_SYMBOL(dev_set_allmulti);
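
/*
 * Illustration only (not part of dev.c): the promiscuity and allmulti
 * helpers above are reference counted, so callers pair a +1 with a -1.
 * example_set_listen_all is an invented name.
 */
#if 0
static int example_set_listen_all(struct net_device *dev, bool enable)
{
	int inc = enable ? 1 : -1;
	int err;

	ASSERT_RTNL();

	err = dev_set_promiscuity(dev, inc);
	if (err)
		return err;

	err = dev_set_allmulti(dev, inc);
	if (err)
		dev_set_promiscuity(dev, -inc);	/* roll back on failure */

	return err;
}
#endif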
5566
5567 /*
5568  *      Upload unicast and multicast address lists to device and
5569  *      configure RX filtering. When the device doesn't support unicast
5570  *      filtering it is put in promiscuous mode while unicast addresses
5571  *      are present.
5572  */
5573 void __dev_set_rx_mode(struct net_device *dev)
5574 {
5575         const struct net_device_ops *ops = dev->netdev_ops;
5576
5577         /* dev_open will call this function so the list will stay sane. */
5578         if (!(dev->flags&IFF_UP))
5579                 return;
5580
5581         if (!netif_device_present(dev))
5582                 return;
5583
5584         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5585                 /* Unicast address changes may only happen under the rtnl,
5586                  * therefore calling __dev_set_promiscuity here is safe.
5587                  */
5588                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5589                         __dev_set_promiscuity(dev, 1, false);
5590                         dev->uc_promisc = true;
5591                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5592                         __dev_set_promiscuity(dev, -1, false);
5593                         dev->uc_promisc = false;
5594                 }
5595         }
5596
5597         if (ops->ndo_set_rx_mode)
5598                 ops->ndo_set_rx_mode(dev);
5599 }
5600
5601 void dev_set_rx_mode(struct net_device *dev)
5602 {
5603         netif_addr_lock_bh(dev);
5604         __dev_set_rx_mode(dev);
5605         netif_addr_unlock_bh(dev);
5606 }
5607
5608 /**
5609  *      dev_get_flags - get flags reported to userspace
5610  *      @dev: device
5611  *
5612  *      Get the combination of flag bits exported through APIs to userspace.
5613  */
5614 unsigned int dev_get_flags(const struct net_device *dev)
5615 {
5616         unsigned int flags;
5617
5618         flags = (dev->flags & ~(IFF_PROMISC |
5619                                 IFF_ALLMULTI |
5620                                 IFF_RUNNING |
5621                                 IFF_LOWER_UP |
5622                                 IFF_DORMANT)) |
5623                 (dev->gflags & (IFF_PROMISC |
5624                                 IFF_ALLMULTI));
5625
5626         if (netif_running(dev)) {
5627                 if (netif_oper_up(dev))
5628                         flags |= IFF_RUNNING;
5629                 if (netif_carrier_ok(dev))
5630                         flags |= IFF_LOWER_UP;
5631                 if (netif_dormant(dev))
5632                         flags |= IFF_DORMANT;
5633         }
5634
5635         return flags;
5636 }
5637 EXPORT_SYMBOL(dev_get_flags);
5638
5639 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5640 {
5641         unsigned int old_flags = dev->flags;
5642         int ret;
5643
5644         ASSERT_RTNL();
5645
5646         /*
5647          *      Set the flags on our device.
5648          */
5649
5650         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5651                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5652                                IFF_AUTOMEDIA)) |
5653                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5654                                     IFF_ALLMULTI));
5655
5656         /*
5657          *      Load in the correct multicast list now the flags have changed.
5658          */
5659
5660         if ((old_flags ^ flags) & IFF_MULTICAST)
5661                 dev_change_rx_flags(dev, IFF_MULTICAST);
5662
5663         dev_set_rx_mode(dev);
5664
5665         /*
5666          *      Have we downed the interface? We handle IFF_UP ourselves
5667          *      according to user attempts to set it, rather than blindly
5668          *      setting it.
5669          */
5670
5671         ret = 0;
5672         if ((old_flags ^ flags) & IFF_UP)
5673                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5674
5675         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5676                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5677                 unsigned int old_flags = dev->flags;
5678
5679                 dev->gflags ^= IFF_PROMISC;
5680
5681                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5682                         if (dev->flags != old_flags)
5683                                 dev_set_rx_mode(dev);
5684         }
5685
5686         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5687            is important. Some (broken) drivers set IFF_PROMISC when
5688            IFF_ALLMULTI is requested, without asking us and without reporting it.
5689          */
5690         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5691                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5692
5693                 dev->gflags ^= IFF_ALLMULTI;
5694                 __dev_set_allmulti(dev, inc, false);
5695         }
5696
5697         return ret;
5698 }
5699
5700 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5701                         unsigned int gchanges)
5702 {
5703         unsigned int changes = dev->flags ^ old_flags;
5704
5705         if (gchanges)
5706                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5707
5708         if (changes & IFF_UP) {
5709                 if (dev->flags & IFF_UP)
5710                         call_netdevice_notifiers(NETDEV_UP, dev);
5711                 else
5712                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5713         }
5714
5715         if (dev->flags & IFF_UP &&
5716             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5717                 struct netdev_notifier_change_info change_info;
5718
5719                 change_info.flags_changed = changes;
5720                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5721                                               &change_info.info);
5722         }
5723 }
5724
5725 /**
5726  *      dev_change_flags - change device settings
5727  *      @dev: device
5728  *      @flags: device state flags
5729  *
5730  *      Change settings on a device based on the supplied state flags. The flags are
5731  *      in the userspace exported format.
5732  */
5733 int dev_change_flags(struct net_device *dev, unsigned int flags)
5734 {
5735         int ret;
5736         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5737
5738         ret = __dev_change_flags(dev, flags);
5739         if (ret < 0)
5740                 return ret;
5741
5742         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5743         __dev_notify_flags(dev, old_flags, changes);
5744         return ret;
5745 }
5746 EXPORT_SYMBOL(dev_change_flags);
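
/*
 * Illustration only (not part of dev.c): bringing an interface up from
 * kernel code by round-tripping through the userspace-visible flags.
 * example_bring_up is an invented name.
 */
#if 0
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();

	return err;
}
#endif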
5747
5748 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5749 {
5750         const struct net_device_ops *ops = dev->netdev_ops;
5751
5752         if (ops->ndo_change_mtu)
5753                 return ops->ndo_change_mtu(dev, new_mtu);
5754
5755         dev->mtu = new_mtu;
5756         return 0;
5757 }
5758
5759 /**
5760  *      dev_set_mtu - Change maximum transfer unit
5761  *      @dev: device
5762  *      @new_mtu: new transfer unit
5763  *
5764  *      Change the maximum transfer size of the network device.
5765  */
5766 int dev_set_mtu(struct net_device *dev, int new_mtu)
5767 {
5768         int err, orig_mtu;
5769
5770         if (new_mtu == dev->mtu)
5771                 return 0;
5772
5773         /*      MTU must not be negative.    */
5774         if (new_mtu < 0)
5775                 return -EINVAL;
5776
5777         if (!netif_device_present(dev))
5778                 return -ENODEV;
5779
5780         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5781         err = notifier_to_errno(err);
5782         if (err)
5783                 return err;
5784
5785         orig_mtu = dev->mtu;
5786         err = __dev_set_mtu(dev, new_mtu);
5787
5788         if (!err) {
5789                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5790                 err = notifier_to_errno(err);
5791                 if (err) {
5792                         /* set the MTU back and notify everyone again,
5793                          * so that they have a chance to revert the changes.
5794                          */
5795                         __dev_set_mtu(dev, orig_mtu);
5796                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5797                 }
5798         }
5799         return err;
5800 }
5801 EXPORT_SYMBOL(dev_set_mtu);
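
/*
 * Illustration only (not part of dev.c): changing the MTU from kernel
 * code; the notifier calls above are expected to run under the RTNL
 * lock.  example_set_jumbo_mtu is an invented name and 9000 is an
 * arbitrary jumbo-frame value.
 */
#if 0
static int example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();

	return err;
}
#endif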
5802
5803 /**
5804  *      dev_set_group - Change group this device belongs to
5805  *      @dev: device
5806  *      @new_group: group this device should belong to
5807  */
5808 void dev_set_group(struct net_device *dev, int new_group)
5809 {
5810         dev->group = new_group;
5811 }
5812 EXPORT_SYMBOL(dev_set_group);
5813
5814 /**
5815  *      dev_set_mac_address - Change Media Access Control Address
5816  *      @dev: device
5817  *      @sa: new address
5818  *
5819  *      Change the hardware (MAC) address of the device
5820  */
5821 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5822 {
5823         const struct net_device_ops *ops = dev->netdev_ops;
5824         int err;
5825
5826         if (!ops->ndo_set_mac_address)
5827                 return -EOPNOTSUPP;
5828         if (sa->sa_family != dev->type)
5829                 return -EINVAL;
5830         if (!netif_device_present(dev))
5831                 return -ENODEV;
5832         err = ops->ndo_set_mac_address(dev, sa);
5833         if (err)
5834                 return err;
5835         dev->addr_assign_type = NET_ADDR_SET;
5836         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5837         add_device_randomness(dev->dev_addr, dev->addr_len);
5838         return 0;
5839 }
5840 EXPORT_SYMBOL(dev_set_mac_address);
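
/*
 * Illustration only (not part of dev.c): building the sockaddr that
 * dev_set_mac_address() expects, assuming an Ethernet-style device.
 * example_set_mac is an invented name.
 */
#if 0
static int example_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;

	if (!is_valid_ether_addr(addr))
		return -EADDRNOTAVAIL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	ASSERT_RTNL();
	return dev_set_mac_address(dev, &sa);
}
#endif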
5841
5842 /**
5843  *      dev_change_carrier - Change device carrier
5844  *      @dev: device
5845  *      @new_carrier: new value
5846  *
5847  *      Change device carrier
5848  */
5849 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5850 {
5851         const struct net_device_ops *ops = dev->netdev_ops;
5852
5853         if (!ops->ndo_change_carrier)
5854                 return -EOPNOTSUPP;
5855         if (!netif_device_present(dev))
5856                 return -ENODEV;
5857         return ops->ndo_change_carrier(dev, new_carrier);
5858 }
5859 EXPORT_SYMBOL(dev_change_carrier);
5860
5861 /**
5862  *      dev_get_phys_port_id - Get device physical port ID
5863  *      @dev: device
5864  *      @ppid: port ID
5865  *
5866  *      Get device physical port ID
5867  */
5868 int dev_get_phys_port_id(struct net_device *dev,
5869                          struct netdev_phys_item_id *ppid)
5870 {
5871         const struct net_device_ops *ops = dev->netdev_ops;
5872
5873         if (!ops->ndo_get_phys_port_id)
5874                 return -EOPNOTSUPP;
5875         return ops->ndo_get_phys_port_id(dev, ppid);
5876 }
5877 EXPORT_SYMBOL(dev_get_phys_port_id);
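
/*
 * Illustration only (not part of dev.c): dumping the physical port ID of
 * a device that implements ndo_get_phys_port_id.  example_show_phys_port
 * is an invented name.
 */
#if 0
static void example_show_phys_port(struct net_device *dev)
{
	struct netdev_phys_item_id ppid;

	if (dev_get_phys_port_id(dev, &ppid) == 0)
		pr_debug("%s: phys port id %*phN\n",
			 dev->name, ppid.id_len, ppid.id);
}
#endif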
5878
5879 /**
5880  *      dev_new_index   -       allocate an ifindex
5881  *      @net: the applicable net namespace
5882  *
5883  *      Returns a suitable unique value for a new device interface
5884  *      number.  The caller must hold the rtnl semaphore or the
5885  *      dev_base_lock to be sure it remains unique.
5886  */
5887 static int dev_new_index(struct net *net)
5888 {
5889         int ifindex = net->ifindex;
5890         for (;;) {
5891                 if (++ifindex <= 0)
5892                         ifindex = 1;
5893                 if (!__dev_get_by_index(net, ifindex))
5894                         return net->ifindex = ifindex;
5895         }
5896 }
5897
5898 /* Delayed registration/unregistration */
5899 static LIST_HEAD(net_todo_list);
5900 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5901
5902 static void net_set_todo(struct net_device *dev)
5903 {
5904         list_add_tail(&dev->todo_list, &net_todo_list);
5905         dev_net(dev)->dev_unreg_count++;
5906 }
5907
5908 static void rollback_registered_many(struct list_head *head)
5909 {
5910         struct net_device *dev, *tmp;
5911         LIST_HEAD(close_head);
5912
5913         BUG_ON(dev_boot_phase);
5914         ASSERT_RTNL();
5915
5916         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5917                 /* Some devices get here without having been
5918                  * registered, as part of initialization unwind.
5919                  * Remove those devices and proceed with the remaining.
5920                  */
5921                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5922                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5923                                  dev->name, dev);
5924
5925                         WARN_ON(1);
5926                         list_del(&dev->unreg_list);
5927                         continue;
5928                 }
5929                 dev->dismantle = true;
5930                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5931         }
5932
5933         /* If device is running, close it first. */
5934         list_for_each_entry(dev, head, unreg_list)
5935                 list_add_tail(&dev->close_list, &close_head);
5936         dev_close_many(&close_head);
5937
5938         list_for_each_entry(dev, head, unreg_list) {
5939                 /* And unlink it from device chain. */
5940                 unlist_netdevice(dev);
5941
5942                 dev->reg_state = NETREG_UNREGISTERING;
5943         }
5944
5945         synchronize_net();
5946
5947         list_for_each_entry(dev, head, unreg_list) {
5948                 struct sk_buff *skb = NULL;
5949
5950                 /* Shutdown queueing discipline. */
5951                 dev_shutdown(dev);
5952
5953
5954                 /* Notify protocols that we are about to destroy
5955                    this device. They should clean up all of their state.
5956                 */
5957                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5958
5959                 if (!dev->rtnl_link_ops ||
5960                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5961                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
5962                                                      GFP_KERNEL);
5963
5964                 /*
5965                  *      Flush the unicast and multicast chains
5966                  */
5967                 dev_uc_flush(dev);
5968                 dev_mc_flush(dev);
5969
5970                 if (dev->netdev_ops->ndo_uninit)
5971                         dev->netdev_ops->ndo_uninit(dev);
5972
5973                 if (skb)
5974                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
5975
5976                 /* The notifier chain MUST have detached us from all upper devices. */
5977                 WARN_ON(netdev_has_any_upper_dev(dev));
5978
5979                 /* Remove entries from kobject tree */
5980                 netdev_unregister_kobject(dev);
5981 #ifdef CONFIG_XPS
5982                 /* Remove XPS queueing entries */
5983                 netif_reset_xps_queues_gt(dev, 0);
5984 #endif
5985         }
5986
5987         synchronize_net();
5988
5989         list_for_each_entry(dev, head, unreg_list)
5990                 dev_put(dev);
5991 }
5992
5993 static void rollback_registered(struct net_device *dev)
5994 {
5995         LIST_HEAD(single);
5996
5997         list_add(&dev->unreg_list, &single);
5998         rollback_registered_many(&single);
5999         list_del(&single);
6000 }
6001
6002 static netdev_features_t netdev_fix_features(struct net_device *dev,
6003         netdev_features_t features)
6004 {
6005         /* Fix illegal checksum combinations */
6006         if ((features & NETIF_F_HW_CSUM) &&
6007             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6008                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6009                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6010         }
6011
6012         /* TSO requires that SG is present as well. */
6013         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6014                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6015                 features &= ~NETIF_F_ALL_TSO;
6016         }
6017
6018         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6019                                         !(features & NETIF_F_IP_CSUM)) {
6020                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6021                 features &= ~NETIF_F_TSO;
6022                 features &= ~NETIF_F_TSO_ECN;
6023         }
6024
6025         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6026                                          !(features & NETIF_F_IPV6_CSUM)) {
6027                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6028                 features &= ~NETIF_F_TSO6;
6029         }
6030
6031         /* TSO ECN requires that TSO is present as well. */
6032         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6033                 features &= ~NETIF_F_TSO_ECN;
6034
6035         /* Software GSO depends on SG. */
6036         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6037                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6038                 features &= ~NETIF_F_GSO;
6039         }
6040
6041         /* UFO needs SG and checksumming */
6042         if (features & NETIF_F_UFO) {
6043                 /* maybe split UFO into V4 and V6? */
6044                 if (!((features & NETIF_F_GEN_CSUM) ||
6045                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6046                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6047                         netdev_dbg(dev,
6048                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6049                         features &= ~NETIF_F_UFO;
6050                 }
6051
6052                 if (!(features & NETIF_F_SG)) {
6053                         netdev_dbg(dev,
6054                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6055                         features &= ~NETIF_F_UFO;
6056                 }
6057         }
6058
6059 #ifdef CONFIG_NET_RX_BUSY_POLL
6060         if (dev->netdev_ops->ndo_busy_poll)
6061                 features |= NETIF_F_BUSY_POLL;
6062         else
6063 #endif
6064                 features &= ~NETIF_F_BUSY_POLL;
6065
6066         return features;
6067 }
6068
6069 int __netdev_update_features(struct net_device *dev)
6070 {
6071         netdev_features_t features;
6072         int err = 0;
6073
6074         ASSERT_RTNL();
6075
6076         features = netdev_get_wanted_features(dev);
6077
6078         if (dev->netdev_ops->ndo_fix_features)
6079                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6080
6081         /* driver might be less strict about feature dependencies */
6082         features = netdev_fix_features(dev, features);
6083
6084         if (dev->features == features)
6085                 return 0;
6086
6087         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6088                 &dev->features, &features);
6089
6090         if (dev->netdev_ops->ndo_set_features)
6091                 err = dev->netdev_ops->ndo_set_features(dev, features);
6092
6093         if (unlikely(err < 0)) {
6094                 netdev_err(dev,
6095                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6096                         err, &features, &dev->features);
6097                 return -1;
6098         }
6099
6100         if (!err)
6101                 dev->features = features;
6102
6103         return 1;
6104 }
6105
6106 /**
6107  *      netdev_update_features - recalculate device features
6108  *      @dev: the device to check
6109  *
6110  *      Recalculate the dev->features set and send notifications if it
6111  *      has changed. Should be called after driver- or hardware-dependent
6112  *      conditions that influence the feature set might have changed.
6113  */
6114 void netdev_update_features(struct net_device *dev)
6115 {
6116         if (__netdev_update_features(dev))
6117                 netdev_features_change(dev);
6118 }
6119 EXPORT_SYMBOL(netdev_update_features);
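
/*
 * Editor's example (illustrative sketch, not part of dev.c): a driver that
 * has just toggled a hardware capability asks the core to recompute
 * dev->features.  RTNL is already held in the contexts where this is
 * normally done (ethtool ops, ndo callbacks); foo_* names are hypothetical.
 */
static void foo_set_rx_vlan_offload(struct net_device *dev, bool enable)
{
        ASSERT_RTNL();

        if (enable)
                dev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX;
        else
                dev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_RX;

        /* Recomputes the feature set and sends a notification on change */
        netdev_update_features(dev);
}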
6120
6121 /**
6122  *      netdev_change_features - recalculate device features
6123  *      @dev: the device to check
6124  *
6125  *      Recalculate the dev->features set and send notifications even
6126  *      if it has not changed. Should be called instead of
6127  *      netdev_update_features() if dev->vlan_features might also have
6128  *      changed, so that the changes can be propagated to stacked
6129  *      VLAN devices.
6130  */
6131 void netdev_change_features(struct net_device *dev)
6132 {
6133         __netdev_update_features(dev);
6134         netdev_features_change(dev);
6135 }
6136 EXPORT_SYMBOL(netdev_change_features);
6137
6138 /**
6139  *      netif_stacked_transfer_operstate -      transfer operstate
6140  *      @rootdev: the root or lower level device to transfer state from
6141  *      @dev: the device to transfer operstate to
6142  *
6143  *      Transfer operational state from root to device. This is normally
6144  *      called when a stacking relationship exists between the root
6145  *      device and the device (a leaf device).
6146  */
6147 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6148                                         struct net_device *dev)
6149 {
6150         if (rootdev->operstate == IF_OPER_DORMANT)
6151                 netif_dormant_on(dev);
6152         else
6153                 netif_dormant_off(dev);
6154
6155         if (netif_carrier_ok(rootdev)) {
6156                 if (!netif_carrier_ok(dev))
6157                         netif_carrier_on(dev);
6158         } else {
6159                 if (netif_carrier_ok(dev))
6160                         netif_carrier_off(dev);
6161         }
6162 }
6163 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
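
/*
 * Editor's example (illustrative sketch, not part of dev.c): a stacked
 * device mirroring the operstate of its lower device from a netdevice
 * notifier, which is the typical caller of the helper above.  The single
 * foo_upper_dev pointer is a hypothetical stand-in for a real driver's
 * upper-device bookkeeping.
 */
static struct net_device *foo_upper_dev;

static int foo_netdev_event(struct notifier_block *nb,
                            unsigned long event, void *ptr)
{
        struct net_device *lower = netdev_notifier_info_to_dev(ptr);

        if (event == NETDEV_CHANGE && foo_upper_dev &&
            netdev_has_upper_dev(lower, foo_upper_dev))
                netif_stacked_transfer_operstate(lower, foo_upper_dev);

        return NOTIFY_DONE;
}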
6164
6165 #ifdef CONFIG_SYSFS
6166 static int netif_alloc_rx_queues(struct net_device *dev)
6167 {
6168         unsigned int i, count = dev->num_rx_queues;
6169         struct netdev_rx_queue *rx;
6170
6171         BUG_ON(count < 1);
6172
6173         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6174         if (!rx)
6175                 return -ENOMEM;
6176
6177         dev->_rx = rx;
6178
6179         for (i = 0; i < count; i++)
6180                 rx[i].dev = dev;
6181         return 0;
6182 }
6183 #endif
6184
6185 static void netdev_init_one_queue(struct net_device *dev,
6186                                   struct netdev_queue *queue, void *_unused)
6187 {
6188         /* Initialize queue lock */
6189         spin_lock_init(&queue->_xmit_lock);
6190         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6191         queue->xmit_lock_owner = -1;
6192         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6193         queue->dev = dev;
6194 #ifdef CONFIG_BQL
6195         dql_init(&queue->dql, HZ);
6196 #endif
6197 }
6198
6199 static void netif_free_tx_queues(struct net_device *dev)
6200 {
6201         kvfree(dev->_tx);
6202 }
6203
6204 static int netif_alloc_netdev_queues(struct net_device *dev)
6205 {
6206         unsigned int count = dev->num_tx_queues;
6207         struct netdev_queue *tx;
6208         size_t sz = count * sizeof(*tx);
6209
6210         BUG_ON(count < 1 || count > 0xffff);
6211
6212         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6213         if (!tx) {
6214                 tx = vzalloc(sz);
6215                 if (!tx)
6216                         return -ENOMEM;
6217         }
6218         dev->_tx = tx;
6219
6220         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6221         spin_lock_init(&dev->tx_global_lock);
6222
6223         return 0;
6224 }
6225
6226 /**
6227  *      register_netdevice      - register a network device
6228  *      @dev: device to register
6229  *
6230  *      Take a completed network device structure and add it to the kernel
6231  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6232  *      chain. 0 is returned on success. A negative errno code is returned
6233  *      on a failure to set up the device, or if the name is a duplicate.
6234  *
6235  *      Callers must hold the rtnl semaphore. You may want
6236  *      register_netdev() instead of this.
6237  *
6238  *      BUGS:
6239  *      The locking appears insufficient to guarantee two parallel registers
6240  *      will not get the same name.
6241  */
6242
6243 int register_netdevice(struct net_device *dev)
6244 {
6245         int ret;
6246         struct net *net = dev_net(dev);
6247
6248         BUG_ON(dev_boot_phase);
6249         ASSERT_RTNL();
6250
6251         might_sleep();
6252
6253         /* When net_device structures are persistent, this will be fatal. */
6254         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6255         BUG_ON(!net);
6256
6257         spin_lock_init(&dev->addr_list_lock);
6258         netdev_set_addr_lockdep_class(dev);
6259
6260         dev->iflink = -1;
6261
6262         ret = dev_get_valid_name(net, dev, dev->name);
6263         if (ret < 0)
6264                 goto out;
6265
6266         /* Call the driver's ndo_init, if it provides one */
6267         if (dev->netdev_ops->ndo_init) {
6268                 ret = dev->netdev_ops->ndo_init(dev);
6269                 if (ret) {
6270                         if (ret > 0)
6271                                 ret = -EIO;
6272                         goto out;
6273                 }
6274         }
6275
6276         if (((dev->hw_features | dev->features) &
6277              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6278             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6279              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6280                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6281                 ret = -EINVAL;
6282                 goto err_uninit;
6283         }
6284
6285         ret = -EBUSY;
6286         if (!dev->ifindex)
6287                 dev->ifindex = dev_new_index(net);
6288         else if (__dev_get_by_index(net, dev->ifindex))
6289                 goto err_uninit;
6290
6291         if (dev->iflink == -1)
6292                 dev->iflink = dev->ifindex;
6293
6294         /* Transfer changeable features to wanted_features and enable
6295          * software offloads (GSO and GRO).
6296          */
6297         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6298         dev->features |= NETIF_F_SOFT_FEATURES;
6299         dev->wanted_features = dev->features & dev->hw_features;
6300
6301         if (!(dev->flags & IFF_LOOPBACK)) {
6302                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6303         }
6304
6305         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6306          */
6307         dev->vlan_features |= NETIF_F_HIGHDMA;
6308
6309         /* Make NETIF_F_SG inheritable to tunnel devices.
6310          */
6311         dev->hw_enc_features |= NETIF_F_SG;
6312
6313         /* Make NETIF_F_SG inheritable to MPLS.
6314          */
6315         dev->mpls_features |= NETIF_F_SG;
6316
6317         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6318         ret = notifier_to_errno(ret);
6319         if (ret)
6320                 goto err_uninit;
6321
6322         ret = netdev_register_kobject(dev);
6323         if (ret)
6324                 goto err_uninit;
6325         dev->reg_state = NETREG_REGISTERED;
6326
6327         __netdev_update_features(dev);
6328
6329         /*
6330          *      Default initial state at registration is that the
6331          *      device is present.
6332          */
6333
6334         set_bit(__LINK_STATE_PRESENT, &dev->state);
6335
6336         linkwatch_init_dev(dev);
6337
6338         dev_init_scheduler(dev);
6339         dev_hold(dev);
6340         list_netdevice(dev);
6341         add_device_randomness(dev->dev_addr, dev->addr_len);
6342
6343         /* If the device has a permanent device address, the driver should
6344          * set dev_addr and also addr_assign_type should be set to
6345          * NET_ADDR_PERM (default value).
6346          */
6347         if (dev->addr_assign_type == NET_ADDR_PERM)
6348                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6349
6350         /* Notify protocols that a new device appeared. */
6351         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6352         ret = notifier_to_errno(ret);
6353         if (ret) {
6354                 rollback_registered(dev);
6355                 dev->reg_state = NETREG_UNREGISTERED;
6356         }
6357         /*
6358          *      Prevent userspace races by waiting until the network
6359          *      device is fully set up before sending notifications.
6360          */
6361         if (!dev->rtnl_link_ops ||
6362             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6363                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6364
6365 out:
6366         return ret;
6367
6368 err_uninit:
6369         if (dev->netdev_ops->ndo_uninit)
6370                 dev->netdev_ops->ndo_uninit(dev);
6371         goto out;
6372 }
6373 EXPORT_SYMBOL(register_netdevice);
6374
6375 /**
6376  *      init_dummy_netdev       - init a dummy network device for NAPI
6377  *      @dev: device to init
6378  *
6379  *      This takes a network device structure and initializes the minimum
6380  *      number of fields so it can be used to schedule NAPI polls without
6381  *      registering a full-blown interface. This is to be used by drivers
6382  *      that need to tie several hardware interfaces to a single NAPI
6383  *      poll scheduler due to HW limitations.
6384  */
6385 int init_dummy_netdev(struct net_device *dev)
6386 {
6387         /* Clear everything. Note we don't initialize spinlocks
6388          * as they aren't supposed to be taken by any of the
6389          * NAPI code and this dummy netdev is supposed to be
6390          * only ever used for NAPI polls
6391          */
6392         memset(dev, 0, sizeof(struct net_device));
6393
6394         /* make sure we BUG if trying to hit standard
6395          * register/unregister code path
6396          */
6397         dev->reg_state = NETREG_DUMMY;
6398
6399         /* NAPI wants this */
6400         INIT_LIST_HEAD(&dev->napi_list);
6401
6402         /* a dummy interface is started by default */
6403         set_bit(__LINK_STATE_PRESENT, &dev->state);
6404         set_bit(__LINK_STATE_START, &dev->state);
6405
6406         /* Note: We don't allocate pcpu_refcnt for dummy devices,
6407          * because users of this 'device' don't need to change
6408          * its refcount.
6409          */
6410
6411         return 0;
6412 }
6413 EXPORT_SYMBOL_GPL(init_dummy_netdev);
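
/*
 * Editor's example (illustrative sketch, not part of dev.c): the use case
 * described above -- a driver funnels several hardware queues/interfaces
 * into one NAPI context backed by a dummy netdev that is never registered.
 * The foo_* names and the embedding structure are hypothetical.
 */
struct foo_adapter {
        struct net_device napi_dev;     /* dummy, used only to host NAPI */
        struct napi_struct napi;
};

static int foo_poll(struct napi_struct *napi, int budget)
{
        /* ... clean RX/TX rings here, up to @budget packets ... */
        napi_complete(napi);
        return 0;
}

static void foo_init_napi(struct foo_adapter *adapter)
{
        init_dummy_netdev(&adapter->napi_dev);
        netif_napi_add(&adapter->napi_dev, &adapter->napi, foo_poll,
                       NAPI_POLL_WEIGHT);
        napi_enable(&adapter->napi);
}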
6414
6415
6416 /**
6417  *      register_netdev - register a network device
6418  *      @dev: device to register
6419  *
6420  *      Take a completed network device structure and add it to the kernel
6421  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6422  *      chain. 0 is returned on success. A negative errno code is returned
6423  *      on a failure to set up the device, or if the name is a duplicate.
6424  *
6425  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6426  *      and expands the device name if you passed a format string to
6427  *      alloc_netdev.
6428  */
6429 int register_netdev(struct net_device *dev)
6430 {
6431         int err;
6432
6433         rtnl_lock();
6434         err = register_netdevice(dev);
6435         rtnl_unlock();
6436         return err;
6437 }
6438 EXPORT_SYMBOL(register_netdev);
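
/*
 * Editor's example (illustrative sketch, not part of dev.c): the usual
 * probe-time sequence built on the helpers above -- allocate an Ethernet
 * device, fill in the ops, register it, and free it again on failure.
 * struct foo_priv, foo_netdev_ops and foo_probe_netdev() are hypothetical.
 */
struct foo_priv {
        int dummy;
};

static const struct net_device_ops foo_netdev_ops = {
        /* a real driver would at least provide ndo_start_xmit here */
};

static int foo_probe_netdev(struct device *parent)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct foo_priv));
        if (!dev)
                return -ENOMEM;

        SET_NETDEV_DEV(dev, parent);
        dev->netdev_ops = &foo_netdev_ops;
        eth_hw_addr_random(dev);

        err = register_netdev(dev);     /* takes and releases RTNL itself */
        if (err) {
                free_netdev(dev);
                return err;
        }
        return 0;
}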
6439
6440 int netdev_refcnt_read(const struct net_device *dev)
6441 {
6442         int i, refcnt = 0;
6443
6444         for_each_possible_cpu(i)
6445                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6446         return refcnt;
6447 }
6448 EXPORT_SYMBOL(netdev_refcnt_read);
6449
6450 /**
6451  * netdev_wait_allrefs - wait until all references are gone.
6452  * @dev: target net_device
6453  *
6454  * This is called when unregistering network devices.
6455  *
6456  * Any protocol or device that holds a reference should register
6457  * for netdevice notification, and cleanup and put back the
6458  * reference if they receive an UNREGISTER event.
6459  * We can get stuck here if buggy protocols don't correctly
6460  * call dev_put.
6461  */
6462 static void netdev_wait_allrefs(struct net_device *dev)
6463 {
6464         unsigned long rebroadcast_time, warning_time;
6465         int refcnt;
6466
6467         linkwatch_forget_dev(dev);
6468
6469         rebroadcast_time = warning_time = jiffies;
6470         refcnt = netdev_refcnt_read(dev);
6471
6472         while (refcnt != 0) {
6473                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6474                         rtnl_lock();
6475
6476                         /* Rebroadcast unregister notification */
6477                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6478
6479                         __rtnl_unlock();
6480                         rcu_barrier();
6481                         rtnl_lock();
6482
6483                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6484                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6485                                      &dev->state)) {
6486                                 /* We must not have linkwatch events
6487                                  * pending on unregister. If this
6488                                  * happens, we simply run the queue
6489                                  * unscheduled, resulting in a noop
6490                                  * for this device.
6491                                  */
6492                                 linkwatch_run_queue();
6493                         }
6494
6495                         __rtnl_unlock();
6496
6497                         rebroadcast_time = jiffies;
6498                 }
6499
6500                 msleep(250);
6501
6502                 refcnt = netdev_refcnt_read(dev);
6503
6504                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6505                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6506                                  dev->name, refcnt);
6507                         warning_time = jiffies;
6508                 }
6509         }
6510 }
6511
6512 /* The sequence is:
6513  *
6514  *      rtnl_lock();
6515  *      ...
6516  *      register_netdevice(x1);
6517  *      register_netdevice(x2);
6518  *      ...
6519  *      unregister_netdevice(y1);
6520  *      unregister_netdevice(y2);
6521  *      ...
6522  *      rtnl_unlock();
6523  *      free_netdev(y1);
6524  *      free_netdev(y2);
6525  *
6526  * We are invoked by rtnl_unlock().
6527  * This allows us to deal with problems:
6528  * 1) We can delete sysfs objects which invoke hotplug
6529  *    without deadlocking with linkwatch via keventd.
6530  * 2) Since we run with the RTNL semaphore not held, we can sleep
6531  *    safely in order to wait for the netdev refcnt to drop to zero.
6532  *
6533  * We must not return until all unregister events added during
6534  * the interval the lock was held have been completed.
6535  */
6536 void netdev_run_todo(void)
6537 {
6538         struct list_head list;
6539
6540         /* Snapshot list, allow later requests */
6541         list_replace_init(&net_todo_list, &list);
6542
6543         __rtnl_unlock();
6544
6545
6546         /* Wait for rcu callbacks to finish before next phase */
6547         if (!list_empty(&list))
6548                 rcu_barrier();
6549
6550         while (!list_empty(&list)) {
6551                 struct net_device *dev
6552                         = list_first_entry(&list, struct net_device, todo_list);
6553                 list_del(&dev->todo_list);
6554
6555                 rtnl_lock();
6556                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6557                 __rtnl_unlock();
6558
6559                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6560                         pr_err("network todo '%s' but state %d\n",
6561                                dev->name, dev->reg_state);
6562                         dump_stack();
6563                         continue;
6564                 }
6565
6566                 dev->reg_state = NETREG_UNREGISTERED;
6567
6568                 on_each_cpu(flush_backlog, dev, 1);
6569
6570                 netdev_wait_allrefs(dev);
6571
6572                 /* paranoia */
6573                 BUG_ON(netdev_refcnt_read(dev));
6574                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6575                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6576                 WARN_ON(dev->dn_ptr);
6577
6578                 if (dev->destructor)
6579                         dev->destructor(dev);
6580
6581                 /* Report a network device has been unregistered */
6582                 rtnl_lock();
6583                 dev_net(dev)->dev_unreg_count--;
6584                 __rtnl_unlock();
6585                 wake_up(&netdev_unregistering_wq);
6586
6587                 /* Free network device */
6588                 kobject_put(&dev->dev.kobj);
6589         }
6590 }
6591
6592 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6593  * fields in the same order, with only the type differing.
6594  */
6595 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6596                              const struct net_device_stats *netdev_stats)
6597 {
6598 #if BITS_PER_LONG == 64
6599         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6600         memcpy(stats64, netdev_stats, sizeof(*stats64));
6601 #else
6602         size_t i, n = sizeof(*stats64) / sizeof(u64);
6603         const unsigned long *src = (const unsigned long *)netdev_stats;
6604         u64 *dst = (u64 *)stats64;
6605
6606         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6607                      sizeof(*stats64) / sizeof(u64));
6608         for (i = 0; i < n; i++)
6609                 dst[i] = src[i];
6610 #endif
6611 }
6612 EXPORT_SYMBOL(netdev_stats_to_stats64);
6613
6614 /**
6615  *      dev_get_stats   - get network device statistics
6616  *      @dev: device to get statistics from
6617  *      @storage: place to store stats
6618  *
6619  *      Get network statistics from device. Return @storage.
6620  *      The device driver may provide its own method by setting
6621  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6622  *      otherwise the internal statistics structure is used.
6623  */
6624 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6625                                         struct rtnl_link_stats64 *storage)
6626 {
6627         const struct net_device_ops *ops = dev->netdev_ops;
6628
6629         if (ops->ndo_get_stats64) {
6630                 memset(storage, 0, sizeof(*storage));
6631                 ops->ndo_get_stats64(dev, storage);
6632         } else if (ops->ndo_get_stats) {
6633                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6634         } else {
6635                 netdev_stats_to_stats64(storage, &dev->stats);
6636         }
6637         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6638         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6639         return storage;
6640 }
6641 EXPORT_SYMBOL(dev_get_stats);
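
/*
 * Editor's example (illustrative sketch, not part of dev.c): taking a
 * 64-bit snapshot of a device's counters through dev_get_stats().  The
 * caller provides the storage, so a stack variable is enough.
 * foo_log_rx_counters() is hypothetical.
 */
static void foo_log_rx_counters(struct net_device *dev)
{
        struct rtnl_link_stats64 stats;

        dev_get_stats(dev, &stats);
        netdev_info(dev, "rx_packets=%llu rx_dropped=%llu\n",
                    stats.rx_packets, stats.rx_dropped);
}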
6642
6643 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6644 {
6645         struct netdev_queue *queue = dev_ingress_queue(dev);
6646
6647 #ifdef CONFIG_NET_CLS_ACT
6648         if (queue)
6649                 return queue;
6650         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6651         if (!queue)
6652                 return NULL;
6653         netdev_init_one_queue(dev, queue, NULL);
6654         queue->qdisc = &noop_qdisc;
6655         queue->qdisc_sleeping = &noop_qdisc;
6656         rcu_assign_pointer(dev->ingress_queue, queue);
6657 #endif
6658         return queue;
6659 }
6660
6661 static const struct ethtool_ops default_ethtool_ops;
6662
6663 void netdev_set_default_ethtool_ops(struct net_device *dev,
6664                                     const struct ethtool_ops *ops)
6665 {
6666         if (dev->ethtool_ops == &default_ethtool_ops)
6667                 dev->ethtool_ops = ops;
6668 }
6669 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6670
6671 void netdev_freemem(struct net_device *dev)
6672 {
6673         char *addr = (char *)dev - dev->padded;
6674
6675         kvfree(addr);
6676 }
6677
6678 /**
6679  *      alloc_netdev_mqs - allocate network device
6680  *      @sizeof_priv:           size of private data to allocate space for
6681  *      @name:                  device name format string
6682  *      @name_assign_type:      origin of device name
6683  *      @setup:                 callback to initialize device
6684  *      @txqs:                  the number of TX subqueues to allocate
6685  *      @rxqs:                  the number of RX subqueues to allocate
6686  *
6687  *      Allocates a struct net_device with private data area for driver use
6688  *      and performs basic initialization.  Also allocates subqueue structs
6689  *      for each queue on the device.
6690  */
6691 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6692                 unsigned char name_assign_type,
6693                 void (*setup)(struct net_device *),
6694                 unsigned int txqs, unsigned int rxqs)
6695 {
6696         struct net_device *dev;
6697         size_t alloc_size;
6698         struct net_device *p;
6699
6700         BUG_ON(strlen(name) >= sizeof(dev->name));
6701
6702         if (txqs < 1) {
6703                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6704                 return NULL;
6705         }
6706
6707 #ifdef CONFIG_SYSFS
6708         if (rxqs < 1) {
6709                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6710                 return NULL;
6711         }
6712 #endif
6713
6714         alloc_size = sizeof(struct net_device);
6715         if (sizeof_priv) {
6716                 /* ensure 32-byte alignment of private area */
6717                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6718                 alloc_size += sizeof_priv;
6719         }
6720         /* ensure 32-byte alignment of whole construct */
6721         alloc_size += NETDEV_ALIGN - 1;
6722
6723         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6724         if (!p)
6725                 p = vzalloc(alloc_size);
6726         if (!p)
6727                 return NULL;
6728
6729         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6730         dev->padded = (char *)dev - (char *)p;
6731
6732         dev->pcpu_refcnt = alloc_percpu(int);
6733         if (!dev->pcpu_refcnt)
6734                 goto free_dev;
6735
6736         if (dev_addr_init(dev))
6737                 goto free_pcpu;
6738
6739         dev_mc_init(dev);
6740         dev_uc_init(dev);
6741
6742         dev_net_set(dev, &init_net);
6743
6744         dev->gso_max_size = GSO_MAX_SIZE;
6745         dev->gso_max_segs = GSO_MAX_SEGS;
6746         dev->gso_min_segs = 0;
6747
6748         INIT_LIST_HEAD(&dev->napi_list);
6749         INIT_LIST_HEAD(&dev->unreg_list);
6750         INIT_LIST_HEAD(&dev->close_list);
6751         INIT_LIST_HEAD(&dev->link_watch_list);
6752         INIT_LIST_HEAD(&dev->adj_list.upper);
6753         INIT_LIST_HEAD(&dev->adj_list.lower);
6754         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6755         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6756         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6757         setup(dev);
6758
6759         dev->num_tx_queues = txqs;
6760         dev->real_num_tx_queues = txqs;
6761         if (netif_alloc_netdev_queues(dev))
6762                 goto free_all;
6763
6764 #ifdef CONFIG_SYSFS
6765         dev->num_rx_queues = rxqs;
6766         dev->real_num_rx_queues = rxqs;
6767         if (netif_alloc_rx_queues(dev))
6768                 goto free_all;
6769 #endif
6770
6771         strcpy(dev->name, name);
6772         dev->name_assign_type = name_assign_type;
6773         dev->group = INIT_NETDEV_GROUP;
6774         if (!dev->ethtool_ops)
6775                 dev->ethtool_ops = &default_ethtool_ops;
6776         return dev;
6777
6778 free_all:
6779         free_netdev(dev);
6780         return NULL;
6781
6782 free_pcpu:
6783         free_percpu(dev->pcpu_refcnt);
6784 free_dev:
6785         netdev_freemem(dev);
6786         return NULL;
6787 }
6788 EXPORT_SYMBOL(alloc_netdev_mqs);
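
/*
 * Editor's example (illustrative sketch, not part of dev.c): allocating a
 * multiqueue device directly with alloc_netdev_mqs().  The "foo%d" pattern
 * is expanded at registration time; foo_setup() and struct foo_mq_priv are
 * hypothetical.
 */
struct foo_mq_priv {
        int dummy;
};

static void foo_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->flags |= IFF_NOARP;
}

static struct net_device *foo_alloc(unsigned int nqueues)
{
        return alloc_netdev_mqs(sizeof(struct foo_mq_priv), "foo%d",
                                NET_NAME_UNKNOWN, foo_setup,
                                nqueues, nqueues);
}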
6789
6790 /**
6791  *      free_netdev - free network device
6792  *      @dev: device
6793  *
6794  *      This function does the last stage of destroying an allocated device
6795  *      interface. The reference to the device object is released.
6796  *      If this is the last reference then it will be freed.
6797  */
6798 void free_netdev(struct net_device *dev)
6799 {
6800         struct napi_struct *p, *n;
6801
6802         release_net(dev_net(dev));
6803
6804         netif_free_tx_queues(dev);
6805 #ifdef CONFIG_SYSFS
6806         kfree(dev->_rx);
6807 #endif
6808
6809         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6810
6811         /* Flush device addresses */
6812         dev_addr_flush(dev);
6813
6814         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6815                 netif_napi_del(p);
6816
6817         free_percpu(dev->pcpu_refcnt);
6818         dev->pcpu_refcnt = NULL;
6819
6820         /*  Compatibility with error handling in drivers */
6821         if (dev->reg_state == NETREG_UNINITIALIZED) {
6822                 netdev_freemem(dev);
6823                 return;
6824         }
6825
6826         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6827         dev->reg_state = NETREG_RELEASED;
6828
6829         /* will free via device release */
6830         put_device(&dev->dev);
6831 }
6832 EXPORT_SYMBOL(free_netdev);
6833
6834 /**
6835  *      synchronize_net -  Synchronize with packet receive processing
6836  *
6837  *      Wait for packets currently being received to be done.
6838  *      Does not block later packets from starting.
6839  */
6840 void synchronize_net(void)
6841 {
6842         might_sleep();
6843         if (rtnl_is_locked())
6844                 synchronize_rcu_expedited();
6845         else
6846                 synchronize_rcu();
6847 }
6848 EXPORT_SYMBOL(synchronize_net);
6849
6850 /**
6851  *      unregister_netdevice_queue - remove device from the kernel
6852  *      @dev: device
6853  *      @head: list
6854  *
6855  *      This function shuts down a device interface and removes it
6856  *      from the kernel tables.
6857  *      If @head is not NULL, the device is queued to be unregistered later.
6858  *
6859  *      Callers must hold the rtnl semaphore.  You may want
6860  *      unregister_netdev() instead of this.
6861  */
6862
6863 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6864 {
6865         ASSERT_RTNL();
6866
6867         if (head) {
6868                 list_move_tail(&dev->unreg_list, head);
6869         } else {
6870                 rollback_registered(dev);
6871                 /* Finish processing unregister after unlock */
6872                 net_set_todo(dev);
6873         }
6874 }
6875 EXPORT_SYMBOL(unregister_netdevice_queue);
6876
6877 /**
6878  *      unregister_netdevice_many - unregister many devices
6879  *      @head: list of devices
6880  *
6881  *  Note: As most callers use a stack-allocated list_head,
6882  *  we force a list_del() to make sure the stack won't be corrupted later.
6883  */
6884 void unregister_netdevice_many(struct list_head *head)
6885 {
6886         struct net_device *dev;
6887
6888         if (!list_empty(head)) {
6889                 rollback_registered_many(head);
6890                 list_for_each_entry(dev, head, unreg_list)
6891                         net_set_todo(dev);
6892                 list_del(head);
6893         }
6894 }
6895 EXPORT_SYMBOL(unregister_netdevice_many);
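
/*
 * Editor's example (illustrative sketch, not part of dev.c): tearing down a
 * group of devices with one RTNL round trip by queueing each device and then
 * flushing the list, exactly as the helpers above are meant to be used.
 * foo_destroy_all() is hypothetical.
 */
static void foo_destroy_all(struct net_device *devs[], int n)
{
        LIST_HEAD(kill_list);
        int i;

        rtnl_lock();
        for (i = 0; i < n; i++)
                unregister_netdevice_queue(devs[i], &kill_list);
        /* Rolls back the whole batch and detaches the list head */
        unregister_netdevice_many(&kill_list);
        rtnl_unlock();
}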
6896
6897 /**
6898  *      unregister_netdev - remove device from the kernel
6899  *      @dev: device
6900  *
6901  *      This function shuts down a device interface and removes it
6902  *      from the kernel tables.
6903  *
6904  *      This is just a wrapper for unregister_netdevice that takes
6905  *      the rtnl semaphore.  In general you want to use this and not
6906  *      unregister_netdevice.
6907  */
6908 void unregister_netdev(struct net_device *dev)
6909 {
6910         rtnl_lock();
6911         unregister_netdevice(dev);
6912         rtnl_unlock();
6913 }
6914 EXPORT_SYMBOL(unregister_netdev);
6915
6916 /**
6917  *      dev_change_net_namespace - move device to a different network namespace
6918  *      @dev: device
6919  *      @net: network namespace
6920  *      @pat: If not NULL name pattern to try if the current device name
6921  *            is already taken in the destination network namespace.
6922  *
6923  *      This function shuts down a device interface and moves it
6924  *      to a new network namespace. On success 0 is returned, on
6925  *      a failure a negative errno code is returned.
6926  *
6927  *      Callers must hold the rtnl semaphore.
6928  */
6929
6930 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6931 {
6932         int err;
6933
6934         ASSERT_RTNL();
6935
6936         /* Don't allow namespace-local devices to be moved. */
6937         err = -EINVAL;
6938         if (dev->features & NETIF_F_NETNS_LOCAL)
6939                 goto out;
6940
6941         /* Ensure the device has been registered */
6942         if (dev->reg_state != NETREG_REGISTERED)
6943                 goto out;
6944
6945         /* Get out if there is nothing to do */
6946         err = 0;
6947         if (net_eq(dev_net(dev), net))
6948                 goto out;
6949
6950         /* Pick the destination device name, and ensure
6951          * we can use it in the destination network namespace.
6952          */
6953         err = -EEXIST;
6954         if (__dev_get_by_name(net, dev->name)) {
6955                 /* We get here if we can't use the current device name */
6956                 if (!pat)
6957                         goto out;
6958                 if (dev_get_valid_name(net, dev, pat) < 0)
6959                         goto out;
6960         }
6961
6962         /*
6963          * And now a mini version of register_netdevice and unregister_netdevice.
6964          */
6965
6966         /* If device is running close it first. */
6967         dev_close(dev);
6968
6969         /* And unlink it from device chain */
6970         err = -ENODEV;
6971         unlist_netdevice(dev);
6972
6973         synchronize_net();
6974
6975         /* Shutdown queueing discipline. */
6976         /* Shut down the queueing discipline. */
6977
6978         /* Notify protocols that we are about to destroy
6979            this device. They should clean up all of their state.
6980
6981            Note that dev->reg_state stays at NETREG_REGISTERED.
6982            This is intentional so that 8021q and macvlan know
6983            the device is just moving and can keep their slaves up.
6984         */
6985         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6986         rcu_barrier();
6987         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6988         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6989
6990         /*
6991          *      Flush the unicast and multicast chains
6992          */
6993         dev_uc_flush(dev);
6994         dev_mc_flush(dev);
6995
6996         /* Send a netdev-removed uevent to the old namespace */
6997         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6998         netdev_adjacent_del_links(dev);
6999
7000         /* Actually switch the network namespace */
7001         dev_net_set(dev, net);
7002
7003         /* If there is an ifindex conflict assign a new one */
7004         if (__dev_get_by_index(net, dev->ifindex)) {
7005                 int iflink = (dev->iflink == dev->ifindex);
7006                 dev->ifindex = dev_new_index(net);
7007                 if (iflink)
7008                         dev->iflink = dev->ifindex;
7009         }
7010
7011         /* Send a netdev-add uevent to the new namespace */
7012         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7013         netdev_adjacent_add_links(dev);
7014
7015         /* Fixup kobjects */
7016         err = device_rename(&dev->dev, dev->name);
7017         WARN_ON(err);
7018
7019         /* Add the device back in the hashes */
7020         list_netdevice(dev);
7021
7022         /* Notify protocols that a new device appeared. */
7023         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7024
7025         /*
7026          *      Prevent userspace races by waiting until the network
7027          *      device is fully set up before sending notifications.
7028          */
7029         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7030
7031         synchronize_net();
7032         err = 0;
7033 out:
7034         return err;
7035 }
7036 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
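
/*
 * Editor's example (illustrative sketch, not part of dev.c): moving a device
 * into another namespace under RTNL, with a "dev%d" fallback pattern in case
 * its current name is already taken there (the same pattern
 * default_device_exit() below uses).  foo_move_to_ns() is hypothetical.
 */
static int foo_move_to_ns(struct net_device *dev, struct net *net)
{
        int err;

        rtnl_lock();
        err = dev_change_net_namespace(dev, net, "dev%d");
        rtnl_unlock();

        return err;
}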
7037
7038 static int dev_cpu_callback(struct notifier_block *nfb,
7039                             unsigned long action,
7040                             void *ocpu)
7041 {
7042         struct sk_buff **list_skb;
7043         struct sk_buff *skb;
7044         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7045         struct softnet_data *sd, *oldsd;
7046
7047         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7048                 return NOTIFY_OK;
7049
7050         local_irq_disable();
7051         cpu = smp_processor_id();
7052         sd = &per_cpu(softnet_data, cpu);
7053         oldsd = &per_cpu(softnet_data, oldcpu);
7054
7055         /* Find end of our completion_queue. */
7056         list_skb = &sd->completion_queue;
7057         while (*list_skb)
7058                 list_skb = &(*list_skb)->next;
7059         /* Append completion queue from offline CPU. */
7060         *list_skb = oldsd->completion_queue;
7061         oldsd->completion_queue = NULL;
7062
7063         /* Append output queue from offline CPU. */
7064         if (oldsd->output_queue) {
7065                 *sd->output_queue_tailp = oldsd->output_queue;
7066                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7067                 oldsd->output_queue = NULL;
7068                 oldsd->output_queue_tailp = &oldsd->output_queue;
7069         }
7070         /* Append NAPI poll list from offline CPU. */
7071         if (!list_empty(&oldsd->poll_list)) {
7072                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
7073                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
7074         }
7075
7076         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7077         local_irq_enable();
7078
7079         /* Process offline CPU's input_pkt_queue */
7080         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7081                 netif_rx_internal(skb);
7082                 input_queue_head_incr(oldsd);
7083         }
7084         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
7085                 netif_rx_internal(skb);
7086                 input_queue_head_incr(oldsd);
7087         }
7088
7089         return NOTIFY_OK;
7090 }
7091
7092
7093 /**
7094  *      netdev_increment_features - increment feature set by one
7095  *      @all: current feature set
7096  *      @one: new feature set
7097  *      @mask: mask feature set
7098  *
7099  *      Computes a new feature set after adding a device with feature set
7100  *      @one to the master device with current feature set @all.  Will not
7101  *      enable anything that is off in @mask. Returns the new feature set.
7102  */
7103 netdev_features_t netdev_increment_features(netdev_features_t all,
7104         netdev_features_t one, netdev_features_t mask)
7105 {
7106         if (mask & NETIF_F_GEN_CSUM)
7107                 mask |= NETIF_F_ALL_CSUM;
7108         mask |= NETIF_F_VLAN_CHALLENGED;
7109
7110         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7111         all &= one | ~NETIF_F_ALL_FOR_ALL;
7112
7113         /* If one device supports hw checksumming, set for all. */
7114         if (all & NETIF_F_GEN_CSUM)
7115                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7116
7117         return all;
7118 }
7119 EXPORT_SYMBOL(netdev_increment_features);
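
/*
 * Editor's example (illustrative sketch, not part of dev.c): an aggregating
 * device (bond/team-like) recomputing the features it can advertise from its
 * lower devices, which is the intended use of netdev_increment_features().
 * The lower-device array is a hypothetical stand-in for the driver's own
 * slave list.
 */
static netdev_features_t foo_compute_features(struct net_device *lowers[],
                                              int n, netdev_features_t mask)
{
        netdev_features_t features = mask & NETIF_F_ALL_FOR_ALL;
        int i;

        for (i = 0; i < n; i++)
                features = netdev_increment_features(features,
                                                     lowers[i]->features,
                                                     mask);
        return features;
}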
7120
7121 static struct hlist_head * __net_init netdev_create_hash(void)
7122 {
7123         int i;
7124         struct hlist_head *hash;
7125
7126         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7127         if (hash != NULL)
7128                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7129                         INIT_HLIST_HEAD(&hash[i]);
7130
7131         return hash;
7132 }
7133
7134 /* Initialize per network namespace state */
7135 static int __net_init netdev_init(struct net *net)
7136 {
7137         if (net != &init_net)
7138                 INIT_LIST_HEAD(&net->dev_base_head);
7139
7140         net->dev_name_head = netdev_create_hash();
7141         if (net->dev_name_head == NULL)
7142                 goto err_name;
7143
7144         net->dev_index_head = netdev_create_hash();
7145         if (net->dev_index_head == NULL)
7146                 goto err_idx;
7147
7148         return 0;
7149
7150 err_idx:
7151         kfree(net->dev_name_head);
7152 err_name:
7153         return -ENOMEM;
7154 }
7155
7156 /**
7157  *      netdev_drivername - network driver for the device
7158  *      @dev: network device
7159  *
7160  *      Determine network driver for device.
7161  */
7162 const char *netdev_drivername(const struct net_device *dev)
7163 {
7164         const struct device_driver *driver;
7165         const struct device *parent;
7166         const char *empty = "";
7167
7168         parent = dev->dev.parent;
7169         if (!parent)
7170                 return empty;
7171
7172         driver = parent->driver;
7173         if (driver && driver->name)
7174                 return driver->name;
7175         return empty;
7176 }
7177
7178 static void __netdev_printk(const char *level, const struct net_device *dev,
7179                             struct va_format *vaf)
7180 {
7181         if (dev && dev->dev.parent) {
7182                 dev_printk_emit(level[1] - '0',
7183                                 dev->dev.parent,
7184                                 "%s %s %s%s: %pV",
7185                                 dev_driver_string(dev->dev.parent),
7186                                 dev_name(dev->dev.parent),
7187                                 netdev_name(dev), netdev_reg_state(dev),
7188                                 vaf);
7189         } else if (dev) {
7190                 printk("%s%s%s: %pV",
7191                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7192         } else {
7193                 printk("%s(NULL net_device): %pV", level, vaf);
7194         }
7195 }
7196
7197 void netdev_printk(const char *level, const struct net_device *dev,
7198                    const char *format, ...)
7199 {
7200         struct va_format vaf;
7201         va_list args;
7202
7203         va_start(args, format);
7204
7205         vaf.fmt = format;
7206         vaf.va = &args;
7207
7208         __netdev_printk(level, dev, &vaf);
7209
7210         va_end(args);
7211 }
7212 EXPORT_SYMBOL(netdev_printk);
7213
7214 #define define_netdev_printk_level(func, level)                 \
7215 void func(const struct net_device *dev, const char *fmt, ...)   \
7216 {                                                               \
7217         struct va_format vaf;                                   \
7218         va_list args;                                           \
7219                                                                 \
7220         va_start(args, fmt);                                    \
7221                                                                 \
7222         vaf.fmt = fmt;                                          \
7223         vaf.va = &args;                                         \
7224                                                                 \
7225         __netdev_printk(level, dev, &vaf);                      \
7226                                                                 \
7227         va_end(args);                                           \
7228 }                                                               \
7229 EXPORT_SYMBOL(func);
7230
7231 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7232 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7233 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7234 define_netdev_printk_level(netdev_err, KERN_ERR);
7235 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7236 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7237 define_netdev_printk_level(netdev_info, KERN_INFO);
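
/*
 * Editor's example (illustrative sketch, not part of dev.c): the per-level
 * helpers generated above are used like dev_info()/dev_warn(), but also
 * prefix the message with the driver, bus and interface name.
 * foo_report_link() is hypothetical.
 */
static void foo_report_link(struct net_device *dev, bool up, int speed)
{
        if (up)
                netdev_info(dev, "link up, %d Mb/s\n", speed);
        else
                netdev_warn(dev, "link down\n");
}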
7238
7239 static void __net_exit netdev_exit(struct net *net)
7240 {
7241         kfree(net->dev_name_head);
7242         kfree(net->dev_index_head);
7243 }
7244
7245 static struct pernet_operations __net_initdata netdev_net_ops = {
7246         .init = netdev_init,
7247         .exit = netdev_exit,
7248 };
7249
7250 static void __net_exit default_device_exit(struct net *net)
7251 {
7252         struct net_device *dev, *aux;
7253         /*
7254          * Push all migratable network devices back to the
7255          * initial network namespace
7256          */
7257         rtnl_lock();
7258         for_each_netdev_safe(net, dev, aux) {
7259                 int err;
7260                 char fb_name[IFNAMSIZ];
7261
7262                 /* Ignore unmovable devices (i.e., loopback) */
7263                 if (dev->features & NETIF_F_NETNS_LOCAL)
7264                         continue;
7265
7266                 /* Leave virtual devices for the generic cleanup */
7267                 if (dev->rtnl_link_ops)
7268                         continue;
7269
7270                 /* Push remaining network devices to init_net */
7271                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7272                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7273                 if (err) {
7274                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7275                                  __func__, dev->name, err);
7276                         BUG();
7277                 }
7278         }
7279         rtnl_unlock();
7280 }
7281
7282 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7283 {
7284         /* Return with the rtnl_lock held when there are no network
7285          * devices unregistering in any network namespace in net_list.
7286          */
7287         struct net *net;
7288         bool unregistering;
7289         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7290
7291         add_wait_queue(&netdev_unregistering_wq, &wait);
7292         for (;;) {
7293                 unregistering = false;
7294                 rtnl_lock();
7295                 list_for_each_entry(net, net_list, exit_list) {
7296                         if (net->dev_unreg_count > 0) {
7297                                 unregistering = true;
7298                                 break;
7299                         }
7300                 }
7301                 if (!unregistering)
7302                         break;
7303                 __rtnl_unlock();
7304
7305                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7306         }
7307         remove_wait_queue(&netdev_unregistering_wq, &wait);
7308 }
7309
7310 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7311 {
7312         /* At exit, all network devices must be removed from a network
7313          * namespace.  Do this in the reverse order of registration.
7314          * Do this across as many network namespaces as possible to
7315          * improve batching efficiency.
7316          */
7317         struct net_device *dev;
7318         struct net *net;
7319         LIST_HEAD(dev_kill_list);
7320
7321         /* To prevent network device cleanup code from dereferencing
7322          * loopback devices or network devices that have been freed,
7323          * wait here for all pending unregistrations to complete
7324          * before unregistering the loopback device and allowing the
7325          * network namespace to be freed.
7326          *
7327          * The netdev todo list containing all network devices
7328          * unregistrations that happen in default_device_exit_batch
7329          * will run in the rtnl_unlock() at the end of
7330          * default_device_exit_batch.
7331          */
7332         rtnl_lock_unregistering(net_list);
7333         list_for_each_entry(net, net_list, exit_list) {
7334                 for_each_netdev_reverse(net, dev) {
7335                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7336                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7337                         else
7338                                 unregister_netdevice_queue(dev, &dev_kill_list);
7339                 }
7340         }
7341         unregister_netdevice_many(&dev_kill_list);
7342         rtnl_unlock();
7343 }
7344
7345 static struct pernet_operations __net_initdata default_device_ops = {
7346         .exit = default_device_exit,
7347         .exit_batch = default_device_exit_batch,
7348 };
7349
7350 /*
7351  *      Initialize the DEV module. At boot time this walks the device list and
7352  *      unhooks any devices that fail to initialize (normally hardware not
7353  *      present) and leaves us with a valid list of present and active devices.
7354  *
7355  */
7356
7357 /*
7358  *       This is called single threaded during boot, so no need
7359  *       to take the rtnl semaphore.
7360  */
7361 static int __init net_dev_init(void)
7362 {
7363         int i, rc = -ENOMEM;
7364
7365         BUG_ON(!dev_boot_phase);
7366
7367         if (dev_proc_init())
7368                 goto out;
7369
7370         if (netdev_kobject_init())
7371                 goto out;
7372
7373         INIT_LIST_HEAD(&ptype_all);
7374         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7375                 INIT_LIST_HEAD(&ptype_base[i]);
7376
7377         INIT_LIST_HEAD(&offload_base);
7378
7379         if (register_pernet_subsys(&netdev_net_ops))
7380                 goto out;
7381
7382         /*
7383          *      Initialise the packet receive queues.
7384          */
7385
7386         for_each_possible_cpu(i) {
7387                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7388
7389                 skb_queue_head_init(&sd->input_pkt_queue);
7390                 skb_queue_head_init(&sd->process_queue);
7391                 INIT_LIST_HEAD(&sd->poll_list);
7392                 sd->output_queue_tailp = &sd->output_queue;
7393 #ifdef CONFIG_RPS
7394                 sd->csd.func = rps_trigger_softirq;
7395                 sd->csd.info = sd;
7396                 sd->cpu = i;
7397 #endif
7398
7399                 sd->backlog.poll = process_backlog;
7400                 sd->backlog.weight = weight_p;
7401         }
7402
7403         dev_boot_phase = 0;
7404
7405         /* The loopback device is special: if any other network device
7406          * is present in a network namespace, the loopback device must
7407          * be present. Since we now dynamically allocate and free the
7408          * loopback device, ensure this invariant is maintained by
7409          * keeping the loopback device as the first device on the
7410          * list of network devices, ensuring the loopback device
7411          * is the first device that appears and the last network device
7412          * that disappears.
7413          */
7414         if (register_pernet_device(&loopback_net_ops))
7415                 goto out;
7416
7417         if (register_pernet_device(&default_device_ops))
7418                 goto out;
7419
7420         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7421         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7422
7423         hotcpu_notifier(dev_cpu_callback, 0);
7424         dst_init();
7425         rc = 0;
7426 out:
7427         return rc;
7428 }
7429
7430 subsys_initcall(net_dev_init);