net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133 #include <linux/vmalloc.h>
 134 #include <linux/if_macvlan.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
     /*
      * ptype_lock is held by dev_add_pack() and __dev_remove_pack() while
      * they link or unlink an entry on ptype_base[] / ptype_all;
      * offload_lock does the same job for offload_base in
      * dev_add_offload() and __dev_remove_offload().  All of those updates
      * go through the _rcu list helpers.
      */
 144 static DEFINE_SPINLOCK(ptype_lock);
 145 static DEFINE_SPINLOCK(offload_lock);
     /* Per-protocol handler lists; bucket = ntohs(type) & PTYPE_HASH_MASK. */
 146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 147 struct list_head ptype_all __read_mostly;       /* Taps: handlers registered for ETH_P_ALL */
     /* Offload handlers added with dev_add_offload(). */
 148 static struct list_head offload_base __read_mostly;
 149
     /*
      * Forward declarations of static helpers.
      * NOTE(review): their definitions are not part of this excerpt.
      */
 150 static int netif_rx_internal(struct sk_buff *skb);
 151 static int call_netdevice_notifiers_info(unsigned long val,
 152                                          struct net_device *dev,
 153                                          struct netdev_notifier_info *info);
 154
 155 /*
 156  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 157  * semaphore.
 158  *
 159  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 160  *
 161  * Writers must hold the rtnl semaphore while they loop through the
 162  * dev_base_head list, and hold dev_base_lock for writing when they do the
 163  * actual updates.  This allows pure readers to access the list even
 164  * while a writer is preparing to update it.
 165  *
 166  * To put it another way, dev_base_lock is held for writing only to
 167  * protect against pure readers; the rtnl semaphore provides the
 168  * protection against other writers.
 169  *
 170  * See, for example usages, register_netdevice() and
 171  * unregister_netdevice(), which must be called with the rtnl
 172  * semaphore held.
 173  */
     /*
      * Taken for writing (with bottom halves disabled) by list_netdevice()
      * and unlist_netdevice() around the actual list/hash updates.
      */
 174 DEFINE_RWLOCK(dev_base_lock);
 175 EXPORT_SYMBOL(dev_base_lock);
 176
 177 /* protects napi_hash addition/deletion and napi_gen_id */
 178 static DEFINE_SPINLOCK(napi_hash_lock);
 179
     /*
      * NOTE(review): the users of napi_gen_id and napi_hash are outside this
      * excerpt; per the comment above, both are covered by napi_hash_lock.
      */
 180 static unsigned int napi_gen_id;
 181 static DEFINE_HASHTABLE(napi_hash, 8);
 182
     /*
      * Sequence counter sampled by netdev_get_name(), which copies the name
      * again if the counter moved while it was reading.
      * NOTE(review): the write side is not in this excerpt - presumably the
      * device rename path; confirm.
      */
 183 static seqcount_t devnet_rename_seq;
 184
 185 static inline void dev_base_seq_inc(struct net *net)
 186 {
 187         while (++net->dev_base_seq == 0);
 188 }
 189
 190 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 191 {
 192         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 193
 194         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 195 }
 196
 197 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 198 {
 199         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 200 }
 201
 /*
  * Take the spinlock of @sd's input_pkt_queue.  Compiles to nothing when
  * CONFIG_RPS is not set.
  */
 static inline void rps_lock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
         spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

         spin_lock(queue_lock);
 #endif
 }
 208
 /*
  * Release the spinlock of @sd's input_pkt_queue (counterpart of
  * rps_lock()).  Compiles to nothing when CONFIG_RPS is not set.
  */
 static inline void rps_unlock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
         spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

         spin_unlock(queue_lock);
 #endif
 }
 215
 216 /* Device list insertion */
 217 static void list_netdevice(struct net_device *dev)
 218 {
 219         struct net *net = dev_net(dev);
 220
 221         ASSERT_RTNL();
 222
 223         write_lock_bh(&dev_base_lock);
 224         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 225         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 226         hlist_add_head_rcu(&dev->index_hlist,
 227                            dev_index_hash(net, dev->ifindex));
 228         write_unlock_bh(&dev_base_lock);
 229
 230         dev_base_seq_inc(net);
 231 }
 232
 233 /* Device list removal
 234  * caller must respect a RCU grace period before freeing/reusing dev
 235  */
 236 static void unlist_netdevice(struct net_device *dev)
 237 {
 238         ASSERT_RTNL();
 239
 240         /* Unlink dev from the device chain */
 241         write_lock_bh(&dev_base_lock);
 242         list_del_rcu(&dev->dev_list);
 243         hlist_del_rcu(&dev->name_hlist);
 244         hlist_del_rcu(&dev->index_hlist);
 245         write_unlock_bh(&dev_base_lock);
 246
 247         dev_base_seq_inc(dev_net(dev));
 248 }
 249
 250 /*
 251  *      Our notifier list: a raw notifier chain head.
      *      NOTE(review): the code that registers on and walks this chain is
      *      outside this excerpt (call_netdevice_notifiers_info() is only
      *      forward-declared above).
 252  */
 253
 254 static RAW_NOTIFIER_HEAD(netdev_chain);
 255
 256 /*
 257  *      Device drivers call our routines to queue packets here. We empty the
 258  *      queue in the local softnet handler.
      *
      *      One struct softnet_data is defined per CPU and the per-CPU symbol
      *      is exported.
 259  */
 260
 261 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 262 EXPORT_PER_CPU_SYMBOL(softnet_data);
 263
 264 #ifdef CONFIG_LOCKDEP
 265 /*
 266  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 267  * according to dev->type
      *
      * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
      * index that netdev_lock_pos() finds for a device type in the first is
      * used to pick the class name from the second and the key from the
      * lock_class_key arrays below, so the entries must stay in the same
      * order.  The last entry (ARPHRD_NONE / "_xmit_NONE") doubles as the
      * fallback for device types that are not listed.
 268  */
 269 static const unsigned short netdev_lock_type[] =
 270         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 271          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 272          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 273          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 274          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 275          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 276          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 277          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 278          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 279          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 280          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 281          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 282          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 283          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 284          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 285
     /* Lock class names, one per netdev_lock_type[] entry, same order. */
 286 static const char *const netdev_lock_name[] =
 287         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 288          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 289          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 290          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 291          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 292          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 293          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 294          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 295          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 296          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 297          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 298          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 299          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 300          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 301          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 302
     /*
      * One lockdep key per table entry: the first array is used for the xmit
      * lock, the second for dev->addr_list_lock.
      */
 303 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 304 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 305
 306 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 307 {
 308         int i;
 309
 310         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 311                 if (netdev_lock_type[i] == dev_type)
 312                         return i;
 313         /* the last key is used by default */
 314         return ARRAY_SIZE(netdev_lock_type) - 1;
 315 }
 316
 317 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 318                                                  unsigned short dev_type)
 319 {
 320         int i;
 321
 322         i = netdev_lock_pos(dev_type);
 323         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 324                                    netdev_lock_name[i]);
 325 }
 326
 327 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev->type);
 332         lockdep_set_class_and_name(&dev->addr_list_lock,
 333                                    &netdev_addr_lock_key[i],
 334                                    netdev_lock_name[i]);
 335 }
 336 #else
 337 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338                                                  unsigned short dev_type)
 339 {
 340 }
 /* Built without CONFIG_LOCKDEP: there is no lock class to set, do nothing. */
 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 {
 }
 344 #endif
 345
 346 /*******************************************************************************
 347
 348                 Protocol management and registration routines
 349
 350 *******************************************************************************/
 351
 352 /*
 353  *      Add a protocol ID to the list. Now that the input handler is
 354  *      smarter we can dispense with all the messy stuff that used to be
 355  *      here.
 356  *
 357  *      BEWARE!!! Protocol handlers, mangling input packets,
 358  *      MUST BE last in hash buckets and checking protocol handlers
 359  *      MUST start from promiscuous ptype_all chain in net_bh.
 360  *      It is true now, do not change it.
 361  *      Explanation follows: if protocol handler, mangling packet, will
 362  *      be the first on list, it is not able to sense, that packet
 363  *      is cloned and should be copied-on-write, so that it will
 364  *      change it and subsequent readers will get broken packet.
 365  *                                                      --ANK (980803)
 366  */
 367
 368 static inline struct list_head *ptype_head(const struct packet_type *pt)
 369 {
 370         if (pt->type == htons(ETH_P_ALL))
 371                 return &ptype_all;
 372         else
 373                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 374 }
 375
 376 /**
 377  *      dev_add_pack - add packet handler
 378  *      @pt: packet type declaration
 379  *
 380  *      Add a protocol handler to the networking stack. The passed &packet_type
 381  *      is linked into kernel lists and may not be freed until it has been
 382  *      removed from the kernel lists.
 383  *
 384  *      This call does not sleep therefore it can not
 385  *      guarantee all CPU's that are in middle of receiving packets
 386  *      will see the new packet type (until the next received packet).
 387  */
 388
 389 void dev_add_pack(struct packet_type *pt)
 390 {
 391         struct list_head *head = ptype_head(pt);
 392
 393         spin_lock(&ptype_lock);
 394         list_add_rcu(&pt->list, head);
 395         spin_unlock(&ptype_lock);
 396 }
 397 EXPORT_SYMBOL(dev_add_pack);
 398
 399 /**
 400  *      __dev_remove_pack        - remove packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Remove a protocol handler that was previously added to the kernel
 404  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 405  *      from the kernel lists and can be freed or reused once this function
 406  *      returns.
 407  *
 408  *      The packet type might still be in use by receivers
 409  *      and must not be freed until after all the CPU's have gone
 410  *      through a quiescent state.
 411  */
 412 void __dev_remove_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415         struct packet_type *pt1;
 416
 417         spin_lock(&ptype_lock);
 418
 419         list_for_each_entry(pt1, head, list) {
 420                 if (pt == pt1) {
 421                         list_del_rcu(&pt->list);
 422                         goto out;
 423                 }
 424         }
 425
 426         pr_warn("dev_remove_pack: %p not found\n", pt);
 427 out:
 428         spin_unlock(&ptype_lock);
 429 }
 430 EXPORT_SYMBOL(__dev_remove_pack);
 431
 /**
  *      dev_remove_pack  - remove packet handler
  *      @pt: packet type declaration
  *
  *      Unlink a protocol handler that was previously registered with
  *      dev_add_pack() and then wait with synchronize_net().  Once this
  *      function returns, @pt can be freed or reused.
  *
  *      This call sleeps to guarantee that no CPU is looking at the packet
  *      type after return.
  */
 void dev_remove_pack(struct packet_type *pt)
 {
         /* Unlink first ... */
         __dev_remove_pack(pt);

         /* ... then wait until no receiver can still hold a reference. */
         synchronize_net();
 }
 EXPORT_SYMBOL(dev_remove_pack);
 451
 452
 453 /**
 454  *      dev_add_offload - register offload handlers
 455  *      @po: protocol offload declaration
 456  *
 457  *      Add protocol offload handlers to the networking stack. The passed
 458  *      &proto_offload is linked into kernel lists and may not be freed until
 459  *      it has been removed from the kernel lists.
 460  *
 461  *      This call does not sleep therefore it can not
 462  *      guarantee all CPU's that are in middle of receiving packets
 463  *      will see the new offload handlers (until the next received packet).
 464  */
 465 void dev_add_offload(struct packet_offload *po)
 466 {
 467         struct list_head *head = &offload_base;
 468
 469         spin_lock(&offload_lock);
 470         list_add_rcu(&po->list, head);
 471         spin_unlock(&offload_lock);
 472 }
 473 EXPORT_SYMBOL(dev_add_offload);
 474
 475 /**
 476  *      __dev_remove_offload     - remove offload handler
 477  *      @po: packet offload declaration
 478  *
 479  *      Remove a protocol offload handler that was previously added to the
 480  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 481  *      is removed from the kernel lists and can be freed or reused once this
 482  *      function returns.
 483  *
 484  *      The packet type might still be in use by receivers
 485  *      and must not be freed until after all the CPU's have gone
 486  *      through a quiescent state.
 487  */
 488 static void __dev_remove_offload(struct packet_offload *po)
 489 {
 490         struct list_head *head = &offload_base;
 491         struct packet_offload *po1;
 492
 493         spin_lock(&offload_lock);
 494
 495         list_for_each_entry(po1, head, list) {
 496                 if (po == po1) {
 497                         list_del_rcu(&po->list);
 498                         goto out;
 499                 }
 500         }
 501
 502         pr_warn("dev_remove_offload: %p not found\n", po);
 503 out:
 504         spin_unlock(&offload_lock);
 505 }
 506
 /**
  *      dev_remove_offload       - remove packet offload handler
  *      @po: packet offload declaration
  *
  *      Unlink a packet offload handler that was previously registered with
  *      dev_add_offload() and then wait with synchronize_net().  Once this
  *      function returns, @po can be freed or reused.
  *
  *      This call sleeps to guarantee that no CPU is looking at the
  *      handler after return.
  */
 void dev_remove_offload(struct packet_offload *po)
 {
         /* Unlink first ... */
         __dev_remove_offload(po);

         /* ... then wait until no receiver can still hold a reference. */
         synchronize_net();
 }
 EXPORT_SYMBOL(dev_remove_offload);
 526
 527 /******************************************************************************
 528
 529                       Device Boot-time Settings Routines
 530
 531 *******************************************************************************/
 532
 533 /* Boot time configuration table */
     /*
      * A slot whose name starts with '\0' or ' ' counts as free:
      * netdev_boot_setup_add() fills the first such slot and
      * netdev_boot_setup_check() skips them when matching device names.
      */
 534 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 535
 536 /**
 537  *      netdev_boot_setup_add   - add new setup entry
 538  *      @name: name of the device
 539  *      @map: configured settings for the device
 540  *
 541  *      Adds new setup entry to the dev_boot_setup list.  The function
 542  *      returns 0 on error and 1 on success.  This is a generic routine to
 543  *      all netdevices.
 544  */
 545 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 546 {
 547         struct netdev_boot_setup *s;
 548         int i;
 549
 550         s = dev_boot_setup;
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 552                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 553                         memset(s[i].name, 0, sizeof(s[i].name));
 554                         strlcpy(s[i].name, name, IFNAMSIZ);
 555                         memcpy(&s[i].map, map, sizeof(s[i].map));
 556                         break;
 557                 }
 558         }
 559
 560         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 561 }
 562
 563 /**
 564  *      netdev_boot_setup_check - check boot time settings
 565  *      @dev: the netdevice
 566  *
 567  *      Check boot time settings for the device.
 568  *      The found settings are set for the device to be used
 569  *      later in the device probing.
 570  *      Returns 0 if no settings found, 1 if they are.
 571  */
 572 int netdev_boot_setup_check(struct net_device *dev)
 573 {
 574         struct netdev_boot_setup *s = dev_boot_setup;
 575         int i;
 576
 577         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 578                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 579                     !strcmp(dev->name, s[i].name)) {
 580                         dev->irq        = s[i].map.irq;
 581                         dev->base_addr  = s[i].map.base_addr;
 582                         dev->mem_start  = s[i].map.mem_start;
 583                         dev->mem_end    = s[i].map.mem_end;
 584                         return 1;
 585                 }
 586         }
 587         return 0;
 588 }
 589 EXPORT_SYMBOL(netdev_boot_setup_check);
 590
 591
 592 /**
 593  *      netdev_boot_base        - get address from boot time settings
 594  *      @prefix: prefix for network device
 595  *      @unit: id for network device
 596  *
 597  *      Check boot time settings for the base address of device.
 598  *      The found settings are set for the device to be used
 599  *      later in the device probing.
 600  *      Returns 0 if no settings found.
 601  */
 602 unsigned long netdev_boot_base(const char *prefix, int unit)
 603 {
 604         const struct netdev_boot_setup *s = dev_boot_setup;
 605         char name[IFNAMSIZ];
 606         int i;
 607
 608         sprintf(name, "%s%d", prefix, unit);
 609
 610         /*
 611          * If device already registered then return base of 1
 612          * to indicate not to probe for this interface
 613          */
 614         if (__dev_get_by_name(&init_net, name))
 615                 return 1;
 616
 617         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 618                 if (!strcmp(name, s[i].name))
 619                         return s[i].map.base_addr;
 620         return 0;
 621 }
 622
 623 /*
 624  * Saves at boot time configured settings for any netdevice.
 625  */
 626 int __init netdev_boot_setup(char *str)
 627 {
 628         int ints[5];
 629         struct ifmap map;
 630
 631         str = get_options(str, ARRAY_SIZE(ints), ints);
 632         if (!str || !*str)
 633                 return 0;
 634
 635         /* Save settings */
 636         memset(&map, 0, sizeof(map));
 637         if (ints[0] > 0)
 638                 map.irq = ints[1];
 639         if (ints[0] > 1)
 640                 map.base_addr = ints[2];
 641         if (ints[0] > 2)
 642                 map.mem_start = ints[3];
 643         if (ints[0] > 3)
 644                 map.mem_end = ints[4];
 645
 646         /* Add new entry to the list */
 647         return netdev_boot_setup_add(str, &map);
 648 }
 649
 650 __setup("netdev=", netdev_boot_setup);
 651
 652 /*******************************************************************************
 653
 654                             Device Interface Subroutines
 655
 656 *******************************************************************************/
 657
 658 /**
 659  *      __dev_get_by_name       - find a device by its name
 660  *      @net: the applicable net namespace
 661  *      @name: name to find
 662  *
 663  *      Find an interface by name. Must be called under RTNL semaphore
 664  *      or @dev_base_lock. If the name is found a pointer to the device
 665  *      is returned. If the name is not found then %NULL is returned. The
 666  *      reference counters are not incremented so the caller must be
 667  *      careful with locks.
 668  */
 669
 670 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 671 {
 672         struct net_device *dev;
 673         struct hlist_head *head = dev_name_hash(net, name);
 674
 675         hlist_for_each_entry(dev, head, name_hlist)
 676                 if (!strncmp(dev->name, name, IFNAMSIZ))
 677                         return dev;
 678
 679         return NULL;
 680 }
 681 EXPORT_SYMBOL(__dev_get_by_name);
 682
 683 /**
 684  *      dev_get_by_name_rcu     - find a device by its name
 685  *      @net: the applicable net namespace
 686  *      @name: name to find
 687  *
 688  *      Find an interface by name.
 689  *      If the name is found a pointer to the device is returned.
 690  *      If the name is not found then %NULL is returned.
 691  *      The reference counters are not incremented so the caller must be
 692  *      careful with locks. The caller must hold RCU lock.
 693  */
 694
 695 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 696 {
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry_rcu(dev, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(dev_get_by_name_rcu);
 707
 708 /**
 709  *      dev_get_by_name         - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name. This can be called from any
 714  *      context and does its own locking. The returned handle has
 715  *      the usage count incremented and the caller must use dev_put() to
 716  *      release it when it is no longer needed. %NULL is returned if no
 717  *      matching device is found.
 718  */
 719
 720 struct net_device *dev_get_by_name(struct net *net, const char *name)
 721 {
 722         struct net_device *dev;
 723
 724         rcu_read_lock();
 725         dev = dev_get_by_name_rcu(net, name);
 726         if (dev)
 727                 dev_hold(dev);
 728         rcu_read_unlock();
 729         return dev;
 730 }
 731 EXPORT_SYMBOL(dev_get_by_name);
 732
 733 /**
 734  *      __dev_get_by_index - find a device by its ifindex
 735  *      @net: the applicable net namespace
 736  *      @ifindex: index of device
 737  *
 738  *      Search for an interface by index. Returns %NULL if the device
 739  *      is not found or a pointer to the device. The device has not
 740  *      had its reference counter increased so the caller must be careful
 741  *      about locking. The caller must hold either the RTNL semaphore
 742  *      or @dev_base_lock.
 743  */
 744
 745 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 746 {
 747         struct net_device *dev;
 748         struct hlist_head *head = dev_index_hash(net, ifindex);
 749
 750         hlist_for_each_entry(dev, head, index_hlist)
 751                 if (dev->ifindex == ifindex)
 752                         return dev;
 753
 754         return NULL;
 755 }
 756 EXPORT_SYMBOL(__dev_get_by_index);
 757
 758 /**
 759  *      dev_get_by_index_rcu - find a device by its ifindex
 760  *      @net: the applicable net namespace
 761  *      @ifindex: index of device
 762  *
 763  *      Search for an interface by index. Returns %NULL if the device
 764  *      is not found or a pointer to the device. The device has not
 765  *      had its reference counter increased so the caller must be careful
 766  *      about locking. The caller must hold RCU lock.
 767  */
 768
 769 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 770 {
 771         struct net_device *dev;
 772         struct hlist_head *head = dev_index_hash(net, ifindex);
 773
 774         hlist_for_each_entry_rcu(dev, head, index_hlist)
 775                 if (dev->ifindex == ifindex)
 776                         return dev;
 777
 778         return NULL;
 779 }
 780 EXPORT_SYMBOL(dev_get_by_index_rcu);
 781
 782
 783 /**
 784  *      dev_get_by_index - find a device by its ifindex
 785  *      @net: the applicable net namespace
 786  *      @ifindex: index of device
 787  *
 788  *      Search for an interface by index. Returns NULL if the device
 789  *      is not found or a pointer to the device. The device returned has
 790  *      had a reference added and the pointer is safe until the user calls
 791  *      dev_put to indicate they have finished with it.
 792  */
 793
 794 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 795 {
 796         struct net_device *dev;
 797
 798         rcu_read_lock();
 799         dev = dev_get_by_index_rcu(net, ifindex);
 800         if (dev)
 801                 dev_hold(dev);
 802         rcu_read_unlock();
 803         return dev;
 804 }
 805 EXPORT_SYMBOL(dev_get_by_index);
 806
 807 /**
 808  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 809  *      @net: network namespace
 810  *      @name: a pointer to the buffer where the name will be stored.
 811  *      @ifindex: the ifindex of the interface to get the name from.
 812  *
 813  *      The use of raw_seqcount_begin() and cond_resched() before
 814  *      retrying is required as we want to give the writers a chance
 815  *      to complete when CONFIG_PREEMPT is not set.
 816  */
 817 int netdev_get_name(struct net *net, char *name, int ifindex)
 818 {
 819         struct net_device *dev;
 820         unsigned int seq;
 821
 822 retry:
 823         seq = raw_seqcount_begin(&devnet_rename_seq);
 824         rcu_read_lock();
 825         dev = dev_get_by_index_rcu(net, ifindex);
 826         if (!dev) {
 827                 rcu_read_unlock();
 828                 return -ENODEV;
 829         }
 830
 831         strcpy(name, dev->name);
 832         rcu_read_unlock();
 833         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 834                 cond_resched();
 835                 goto retry;
 836         }
 837
 838         return 0;
 839 }
 840
 841 /**
 842  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 843  *      @net: the applicable net namespace
 844  *      @type: media type of device
 845  *      @ha: hardware address
 846  *
 847  *      Search for an interface by MAC address. Returns NULL if the device
 848  *      is not found or a pointer to the device.
 849  *      The caller must hold RCU or RTNL.
 850  *      The returned device has not had its ref count increased
 851  *      and the caller must therefore be careful about locking
 852  *
 853  */
 854
 855 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 856                                        const char *ha)
 857 {
 858         struct net_device *dev;
 859
 860         for_each_netdev_rcu(net, dev)
 861                 if (dev->type == type &&
 862                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 863                         return dev;
 864
 865         return NULL;
 866 }
 867 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 868
 869 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 870 {
 871         struct net_device *dev;
 872
 873         ASSERT_RTNL();
 874         for_each_netdev(net, dev)
 875                 if (dev->type == type)
 876                         return dev;
 877
 878         return NULL;
 879 }
 880 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 881
 882 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 883 {
 884         struct net_device *dev, *ret = NULL;
 885
 886         rcu_read_lock();
 887         for_each_netdev_rcu(net, dev)
 888                 if (dev->type == type) {
 889                         dev_hold(dev);
 890                         ret = dev;
 891                         break;
 892                 }
 893         rcu_read_unlock();
 894         return ret;
 895 }
 896 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 897
 898 /**
 899  *      dev_get_by_flags_rcu - find any device with given flags
 900  *      @net: the applicable net namespace
 901  *      @if_flags: IFF_* values
 902  *      @mask: bitmask of bits in if_flags to check
 903  *
 904  *      Search for any interface with the given flags. Returns NULL if a device
 905  *      is not found or a pointer to the device. Must be called inside
 906  *      rcu_read_lock(), and result refcount is unchanged.
 907  */
 908
 909 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 910                                     unsigned short mask)
 911 {
 912         struct net_device *dev, *ret;
 913
 914         ret = NULL;
 915         for_each_netdev_rcu(net, dev) {
 916                 if (((dev->flags ^ if_flags) & mask) == 0) {
 917                         ret = dev;
 918                         break;
 919                 }
 920         }
 921         return ret;
 922 }
 923 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 924
 925 /**
 926  *      dev_valid_name - check if name is okay for network device
 927  *      @name: name string
 928  *
 929  *      Network device names need to be valid file names to
 930  *      to allow sysfs to work.  We also disallow any kind of
 931  *      whitespace.
 932  */
 933 bool dev_valid_name(const char *name)
 934 {
 935         if (*name == '\0')
 936                 return false;
 937         if (strlen(name) >= IFNAMSIZ)
 938                 return false;
 939         if (!strcmp(name, ".") || !strcmp(name, ".."))
 940                 return false;
 941
 942         while (*name) {
 943                 if (*name == '/' || isspace(*name))
 944                         return false;
 945                 name++;
 946         }
 947         return true;
 948 }
 949 EXPORT_SYMBOL(dev_valid_name);
 950
 951 /**
 952  *      __dev_alloc_name - allocate a name for a device
 953  *      @net: network namespace to allocate the device name in
 954  *      @name: name format string
 955  *      @buf:  scratch buffer and result name string
 956  *
 957  *      Passed a format string - eg "lt%d" it will try and find a suitable
 958  *      id. It scans list of devices to build up a free map, then chooses
 959  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 960  *      while allocating the name and adding the device in order to avoid
 961  *      duplicates.
 962  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 963  *      Returns the number of the unit assigned or a negative errno code.
 964  */
 965
 966 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 967 {
 968         int i = 0;
 969         const char *p;
 970         const int max_netdevices = 8*PAGE_SIZE;
 971         unsigned long *inuse;
 972         struct net_device *d;
 973
 974         p = strnchr(name, IFNAMSIZ-1, '%');
 975         if (p) {
 976                 /*
 977                  * Verify the string as this thing may have come from
 978                  * the user.  There must be either one "%d" and no other "%"
 979                  * characters.
 980                  */
 981                 if (p[1] != 'd' || strchr(p + 2, '%'))
 982                         return -EINVAL;
 983
 984                 /* Use one page as a bit array of possible slots */
 985                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 986                 if (!inuse)
 987                         return -ENOMEM;
 988
 989                 for_each_netdev(net, d) {
 990                         if (!sscanf(d->name, name, &i))
 991                                 continue;
 992                         if (i < 0 || i >= max_netdevices)
 993                                 continue;
 994
 995                         /*  avoid cases where sscanf is not exact inverse of printf */
 996                         snprintf(buf, IFNAMSIZ, name, i);
 997                         if (!strncmp(buf, d->name, IFNAMSIZ))
 998                                 set_bit(i, inuse);
 999                 }
1000
1001                 i = find_first_zero_bit(inuse, max_netdevices);
1002                 free_page((unsigned long) inuse);
1003         }
1004
1005         if (buf != name)
1006                 snprintf(buf, IFNAMSIZ, name, i);
1007         if (!__dev_get_by_name(net, buf))
1008                 return i;
1009
1010         /* It is possible to run out of possible slots
1011          * when the name is long and there isn't enough space left
1012          * for the digits, or if all bits are used.
1013          */
1014         return -ENFILE;
1015 }
1016
1017 /**
1018  *      dev_alloc_name - allocate a name for a device
1019  *      @dev: device
1020  *      @name: name format string
1021  *
1022  *      Passed a format string - eg "lt%d" it will try and find a suitable
1023  *      id. It scans list of devices to build up a free map, then chooses
1024  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1025  *      while allocating the name and adding the device in order to avoid
1026  *      duplicates.
1027  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1028  *      Returns the number of the unit assigned or a negative errno code.
1029  */
1030
1031 int dev_alloc_name(struct net_device *dev, const char *name)
1032 {
1033         char buf[IFNAMSIZ];
1034         struct net *net;
1035         int ret;
1036
1037         BUG_ON(!dev_net(dev));
1038         net = dev_net(dev);
1039         ret = __dev_alloc_name(net, name, buf);
1040         if (ret >= 0)
1041                 strlcpy(dev->name, buf, IFNAMSIZ);
1042         return ret;
1043 }
1044 EXPORT_SYMBOL(dev_alloc_name);
1045
1046 static int dev_alloc_name_ns(struct net *net,
1047                              struct net_device *dev,
1048                              const char *name)
1049 {
1050         char buf[IFNAMSIZ];
1051         int ret;
1052
1053         ret = __dev_alloc_name(net, name, buf);
1054         if (ret >= 0)
1055                 strlcpy(dev->name, buf, IFNAMSIZ);
1056         return ret;
1057 }
1058
1059 static int dev_get_valid_name(struct net *net,
1060                               struct net_device *dev,
1061                               const char *name)
1062 {
1063         BUG_ON(!net);
1064
1065         if (!dev_valid_name(name))
1066                 return -EINVAL;
1067
1068         if (strchr(name, '%'))
1069                 return dev_alloc_name_ns(net, dev, name);
1070         else if (__dev_get_by_name(net, name))
1071                 return -EEXIST;
1072         else if (dev->name != name)
1073                 strlcpy(dev->name, name, IFNAMSIZ);
1074
1075         return 0;
1076 }
1077
1078 /**
1079  *      dev_change_name - change name of a device
1080  *      @dev: device
1081  *      @newname: name (or format string) must be at least IFNAMSIZ
1082  *
1083  *      Change name of a device, can pass format strings "eth%d".
1084  *      for wildcarding.
1085  */
1086 int dev_change_name(struct net_device *dev, const char *newname)
1087 {
1088         char oldname[IFNAMSIZ];
1089         int err = 0;
1090         int ret;
1091         struct net *net;
1092
1093         ASSERT_RTNL();
1094         BUG_ON(!dev_net(dev));
1095
1096         net = dev_net(dev);
1097         if (dev->flags & IFF_UP)
1098                 return -EBUSY;
1099
1100         write_seqcount_begin(&devnet_rename_seq);
1101
1102         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1103                 write_seqcount_end(&devnet_rename_seq);
1104                 return 0;
1105         }
1106
1107         memcpy(oldname, dev->name, IFNAMSIZ);
1108
1109         err = dev_get_valid_name(net, dev, newname);
1110         if (err < 0) {
1111                 write_seqcount_end(&devnet_rename_seq);
1112                 return err;
1113         }
1114
1115 rollback:
1116         ret = device_rename(&dev->dev, dev->name);
1117         if (ret) {
1118                 memcpy(dev->name, oldname, IFNAMSIZ);
1119                 write_seqcount_end(&devnet_rename_seq);
1120                 return ret;
1121         }
1122
1123         write_seqcount_end(&devnet_rename_seq);
1124
1125         netdev_adjacent_rename_links(dev, oldname);
1126
1127         write_lock_bh(&dev_base_lock);
1128         hlist_del_rcu(&dev->name_hlist);
1129         write_unlock_bh(&dev_base_lock);
1130
1131         synchronize_rcu();
1132
1133         write_lock_bh(&dev_base_lock);
1134         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1135         write_unlock_bh(&dev_base_lock);
1136
1137         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1138         ret = notifier_to_errno(ret);
1139
1140         if (ret) {
1141                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1142                 if (err >= 0) {
1143                         err = ret;
1144                         write_seqcount_begin(&devnet_rename_seq);
1145                         memcpy(dev->name, oldname, IFNAMSIZ);
1146                         memcpy(oldname, newname, IFNAMSIZ);
1147                         goto rollback;
1148                 } else {
1149                         pr_err("%s: name change rollback failed: %d\n",
1150                                dev->name, ret);
1151                 }
1152         }
1153
1154         return err;
1155 }
1156
1157 /**
1158  *      dev_set_alias - change ifalias of a device
1159  *      @dev: device
1160  *      @alias: name up to IFALIASZ
1161  *      @len: limit of bytes to copy from info
1162  *
1163  *      Set ifalias for a device,
1164  */
1165 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1166 {
1167         char *new_ifalias;
1168
1169         ASSERT_RTNL();
1170
1171         if (len >= IFALIASZ)
1172                 return -EINVAL;
1173
1174         if (!len) {
1175                 kfree(dev->ifalias);
1176                 dev->ifalias = NULL;
1177                 return 0;
1178         }
1179
1180         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1181         if (!new_ifalias)
1182                 return -ENOMEM;
1183         dev->ifalias = new_ifalias;
1184
1185         strlcpy(dev->ifalias, alias, len+1);
1186         return len;
1187 }
1188
1189
1190 /**
1191  *      netdev_features_change - device changes features
1192  *      @dev: device to cause notification
1193  *
1194  *      Called to indicate a device has changed features.
1195  */
1196 void netdev_features_change(struct net_device *dev)
1197 {
1198         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1199 }
1200 EXPORT_SYMBOL(netdev_features_change);
1201
1202 /**
1203  *      netdev_state_change - device changes state
1204  *      @dev: device to cause notification
1205  *
1206  *      Called to indicate a device has changed state. This function calls
1207  *      the notifier chains for netdev_chain and sends a NEWLINK message
1208  *      to the routing socket.
1209  */
1210 void netdev_state_change(struct net_device *dev)
1211 {
1212         if (dev->flags & IFF_UP) {
1213                 struct netdev_notifier_change_info change_info;
1214
1215                 change_info.flags_changed = 0;
1216                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1217                                               &change_info.info);
1218                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1219         }
1220 }
1221 EXPORT_SYMBOL(netdev_state_change);
1222
1223 /**
1224  *      netdev_notify_peers - notify network peers about existence of @dev
1225  *      @dev: network device
1226  *
1227  * Generate traffic such that interested network peers are aware of
1228  * @dev, such as by generating a gratuitous ARP. This may be used when
1229  * a device wants to inform the rest of the network about some sort of
1230  * reconfiguration such as a failover event or virtual machine
1231  * migration.
1232  */
1233 void netdev_notify_peers(struct net_device *dev)
1234 {
1235         rtnl_lock();
1236         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1237         rtnl_unlock();
1238 }
1239 EXPORT_SYMBOL(netdev_notify_peers);
1240
1241 static int __dev_open(struct net_device *dev)
1242 {
1243         const struct net_device_ops *ops = dev->netdev_ops;
1244         int ret;
1245
1246         ASSERT_RTNL();
1247
1248         if (!netif_device_present(dev))
1249                 return -ENODEV;
1250
1251         /* Block netpoll from trying to do any rx path servicing.
1252          * If we don't do this there is a chance ndo_poll_controller
1253          * or ndo_poll may be running while we open the device
1254          */
1255         netpoll_poll_disable(dev);
1256
1257         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1258         ret = notifier_to_errno(ret);
1259         if (ret)
1260                 return ret;
1261
1262         set_bit(__LINK_STATE_START, &dev->state);
1263
1264         if (ops->ndo_validate_addr)
1265                 ret = ops->ndo_validate_addr(dev);
1266
1267         if (!ret && ops->ndo_open)
1268                 ret = ops->ndo_open(dev);
1269
1270         netpoll_poll_enable(dev);
1271
1272         if (ret)
1273                 clear_bit(__LINK_STATE_START, &dev->state);
1274         else {
1275                 dev->flags |= IFF_UP;
1276                 net_dmaengine_get();
1277                 dev_set_rx_mode(dev);
1278                 dev_activate(dev);
1279                 add_device_randomness(dev->dev_addr, dev->addr_len);
1280         }
1281
1282         return ret;
1283 }
1284
1285 /**
1286  *      dev_open        - prepare an interface for use.
1287  *      @dev:   device to open
1288  *
1289  *      Takes a device from down to up state. The device's private open
1290  *      function is invoked and then the multicast lists are loaded. Finally
1291  *      the device is moved into the up state and a %NETDEV_UP message is
1292  *      sent to the netdev notifier chain.
1293  *
1294  *      Calling this function on an active interface is a nop. On a failure
1295  *      a negative errno code is returned.
1296  */
1297 int dev_open(struct net_device *dev)
1298 {
1299         int ret;
1300
1301         if (dev->flags & IFF_UP)
1302                 return 0;
1303
1304         ret = __dev_open(dev);
1305         if (ret < 0)
1306                 return ret;
1307
1308         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1309         call_netdevice_notifiers(NETDEV_UP, dev);
1310
1311         return ret;
1312 }
1313 EXPORT_SYMBOL(dev_open);
1314
1315 static int __dev_close_many(struct list_head *head)
1316 {
1317         struct net_device *dev;
1318
1319         ASSERT_RTNL();
1320         might_sleep();
1321
1322         list_for_each_entry(dev, head, close_list) {
1323                 /* Temporarily disable netpoll until the interface is down */
1324                 netpoll_poll_disable(dev);
1325
1326                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1327
1328                 clear_bit(__LINK_STATE_START, &dev->state);
1329
1330                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1331                  * can be even on different cpu. So just clear netif_running().
1332                  *
1333                  * dev->stop() will invoke napi_disable() on all of it's
1334                  * napi_struct instances on this device.
1335                  */
1336                 smp_mb__after_atomic(); /* Commit netif_running(). */
1337         }
1338
1339         dev_deactivate_many(head);
1340
1341         list_for_each_entry(dev, head, close_list) {
1342                 const struct net_device_ops *ops = dev->netdev_ops;
1343
1344                 /*
1345                  *      Call the device specific close. This cannot fail.
1346                  *      Only if device is UP
1347                  *
1348                  *      We allow it to be called even after a DETACH hot-plug
1349                  *      event.
1350                  */
1351                 if (ops->ndo_stop)
1352                         ops->ndo_stop(dev);
1353
1354                 dev->flags &= ~IFF_UP;
1355                 net_dmaengine_put();
1356                 netpoll_poll_enable(dev);
1357         }
1358
1359         return 0;
1360 }
1361
1362 static int __dev_close(struct net_device *dev)
1363 {
1364         int retval;
1365         LIST_HEAD(single);
1366
1367         list_add(&dev->close_list, &single);
1368         retval = __dev_close_many(&single);
1369         list_del(&single);
1370
1371         return retval;
1372 }
1373
1374 static int dev_close_many(struct list_head *head)
1375 {
1376         struct net_device *dev, *tmp;
1377
1378         /* Remove the devices that don't need to be closed */
1379         list_for_each_entry_safe(dev, tmp, head, close_list)
1380                 if (!(dev->flags & IFF_UP))
1381                         list_del_init(&dev->close_list);
1382
1383         __dev_close_many(head);
1384
1385         list_for_each_entry_safe(dev, tmp, head, close_list) {
1386                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1387                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1388                 list_del_init(&dev->close_list);
1389         }
1390
1391         return 0;
1392 }
1393
1394 /**
1395  *      dev_close - shutdown an interface.
1396  *      @dev: device to shutdown
1397  *
1398  *      This function moves an active device into down state. A
1399  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1400  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1401  *      chain.
1402  */
1403 int dev_close(struct net_device *dev)
1404 {
1405         if (dev->flags & IFF_UP) {
1406                 LIST_HEAD(single);
1407
1408                 list_add(&dev->close_list, &single);
1409                 dev_close_many(&single);
1410                 list_del(&single);
1411         }
1412         return 0;
1413 }
1414 EXPORT_SYMBOL(dev_close);
1415
1416
1417 /**
1418  *      dev_disable_lro - disable Large Receive Offload on a device
1419  *      @dev: device
1420  *
1421  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1422  *      called under RTNL.  This is needed if received packets may be
1423  *      forwarded to another interface.
1424  */
1425 void dev_disable_lro(struct net_device *dev)
1426 {
1427         /*
1428          * If we're trying to disable lro on a vlan device
1429          * use the underlying physical device instead
1430          */
1431         if (is_vlan_dev(dev))
1432                 dev = vlan_dev_real_dev(dev);
1433
1434         /* the same for macvlan devices */
1435         if (netif_is_macvlan(dev))
1436                 dev = macvlan_dev_real_dev(dev);
1437
1438         dev->wanted_features &= ~NETIF_F_LRO;
1439         netdev_update_features(dev);
1440
1441         if (unlikely(dev->features & NETIF_F_LRO))
1442                 netdev_WARN(dev, "failed to disable LRO!\n");
1443 }
1444 EXPORT_SYMBOL(dev_disable_lro);
1445
1446 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1447                                    struct net_device *dev)
1448 {
1449         struct netdev_notifier_info info;
1450
1451         netdev_notifier_info_init(&info, dev);
1452         return nb->notifier_call(nb, val, &info);
1453 }
1454
1455 static int dev_boot_phase = 1;
1456
1457 /**
1458  *      register_netdevice_notifier - register a network notifier block
1459  *      @nb: notifier
1460  *
1461  *      Register a notifier to be called when network device events occur.
1462  *      The notifier passed is linked into the kernel structures and must
1463  *      not be reused until it has been unregistered. A negative errno code
1464  *      is returned on a failure.
1465  *
1466  *      When registered all registration and up events are replayed
1467  *      to the new notifier to allow device to have a race free
1468  *      view of the network device list.
1469  */
1470
1471 int register_netdevice_notifier(struct notifier_block *nb)
1472 {
1473         struct net_device *dev;
1474         struct net_device *last;
1475         struct net *net;
1476         int err;
1477
1478         rtnl_lock();
1479         err = raw_notifier_chain_register(&netdev_chain, nb);
1480         if (err)
1481                 goto unlock;
1482         if (dev_boot_phase)
1483                 goto unlock;
1484         for_each_net(net) {
1485                 for_each_netdev(net, dev) {
1486                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1487                         err = notifier_to_errno(err);
1488                         if (err)
1489                                 goto rollback;
1490
1491                         if (!(dev->flags & IFF_UP))
1492                                 continue;
1493
1494                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1495                 }
1496         }
1497
1498 unlock:
1499         rtnl_unlock();
1500         return err;
1501
1502 rollback:
1503         last = dev;
1504         for_each_net(net) {
1505                 for_each_netdev(net, dev) {
1506                         if (dev == last)
1507                                 goto outroll;
1508
1509                         if (dev->flags & IFF_UP) {
1510                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1511                                                         dev);
1512                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1513                         }
1514                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1515                 }
1516         }
1517
1518 outroll:
1519         raw_notifier_chain_unregister(&netdev_chain, nb);
1520         goto unlock;
1521 }
1522 EXPORT_SYMBOL(register_netdevice_notifier);
1523
1524 /**
1525  *      unregister_netdevice_notifier - unregister a network notifier block
1526  *      @nb: notifier
1527  *
1528  *      Unregister a notifier previously registered by
1529  *      register_netdevice_notifier(). The notifier is unlinked into the
1530  *      kernel structures and may then be reused. A negative errno code
1531  *      is returned on a failure.
1532  *
1533  *      After unregistering unregister and down device events are synthesized
1534  *      for all devices on the device list to the removed notifier to remove
1535  *      the need for special case cleanup code.
1536  */
1537
1538 int unregister_netdevice_notifier(struct notifier_block *nb)
1539 {
1540         struct net_device *dev;
1541         struct net *net;
1542         int err;
1543
1544         rtnl_lock();
1545         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1546         if (err)
1547                 goto unlock;
1548
1549         for_each_net(net) {
1550                 for_each_netdev(net, dev) {
1551                         if (dev->flags & IFF_UP) {
1552                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1553                                                         dev);
1554                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1555                         }
1556                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1557                 }
1558         }
1559 unlock:
1560         rtnl_unlock();
1561         return err;
1562 }
1563 EXPORT_SYMBOL(unregister_netdevice_notifier);
1564
1565 /**
1566  *      call_netdevice_notifiers_info - call all network notifier blocks
1567  *      @val: value passed unmodified to notifier function
1568  *      @dev: net_device pointer passed unmodified to notifier function
1569  *      @info: notifier information data
1570  *
1571  *      Call all network notifier blocks.  Parameters and return value
1572  *      are as for raw_notifier_call_chain().
1573  */
1574
1575 static int call_netdevice_notifiers_info(unsigned long val,
1576                                          struct net_device *dev,
1577                                          struct netdev_notifier_info *info)
1578 {
1579         ASSERT_RTNL();
1580         netdev_notifier_info_init(info, dev);
1581         return raw_notifier_call_chain(&netdev_chain, val, info);
1582 }
1583
1584 /**
1585  *      call_netdevice_notifiers - call all network notifier blocks
1586  *      @val: value passed unmodified to notifier function
1587  *      @dev: net_device pointer passed unmodified to notifier function
1588  *
1589  *      Call all network notifier blocks.  Parameters and return value
1590  *      are as for raw_notifier_call_chain().
1591  */
1592
1593 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1594 {
1595         struct netdev_notifier_info info;
1596
1597         return call_netdevice_notifiers_info(val, dev, &info);
1598 }
1599 EXPORT_SYMBOL(call_netdevice_notifiers);
1600
1601 static struct static_key netstamp_needed __read_mostly;
1602 #ifdef HAVE_JUMP_LABEL
1603 /* We are not allowed to call static_key_slow_dec() from irq context
1604  * If net_disable_timestamp() is called from irq context, defer the
1605  * static_key_slow_dec() calls.
1606  */
1607 static atomic_t netstamp_needed_deferred;
1608 #endif
1609
1610 void net_enable_timestamp(void)
1611 {
1612 #ifdef HAVE_JUMP_LABEL
1613         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1614
1615         if (deferred) {
1616                 while (--deferred)
1617                         static_key_slow_dec(&netstamp_needed);
1618                 return;
1619         }
1620 #endif
1621         static_key_slow_inc(&netstamp_needed);
1622 }
1623 EXPORT_SYMBOL(net_enable_timestamp);
1624
1625 void net_disable_timestamp(void)
1626 {
1627 #ifdef HAVE_JUMP_LABEL
1628         if (in_interrupt()) {
1629                 atomic_inc(&netstamp_needed_deferred);
1630                 return;
1631         }
1632 #endif
1633         static_key_slow_dec(&netstamp_needed);
1634 }
1635 EXPORT_SYMBOL(net_disable_timestamp);
1636
1637 static inline void net_timestamp_set(struct sk_buff *skb)
1638 {
1639         skb->tstamp.tv64 = 0;
1640         if (static_key_false(&netstamp_needed))
1641                 __net_timestamp(skb);
1642 }
1643
/*
 * Stamp SKB if the netstamp_needed key is enabled, COND is true and the
 * skb carries no timestamp yet. SKB is evaluated more than once.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and can be used safely as the body of an if/else.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1649
1650 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1651 {
1652         unsigned int len;
1653
1654         if (!(dev->flags & IFF_UP))
1655                 return false;
1656
1657         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1658         if (skb->len <= len)
1659                 return true;
1660
1661         /* if TSO is enabled, we don't care about the length as the packet
1662          * could be forwarded without being segmented before
1663          */
1664         if (skb_is_gso(skb))
1665                 return true;
1666
1667         return false;
1668 }
1669 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1670
1671 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1672 {
1673         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1674                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1675                         atomic_long_inc(&dev->rx_dropped);
1676                         kfree_skb(skb);
1677                         return NET_RX_DROP;
1678                 }
1679         }
1680
1681         if (unlikely(!is_skb_forwardable(dev, skb))) {
1682                 atomic_long_inc(&dev->rx_dropped);
1683                 kfree_skb(skb);
1684                 return NET_RX_DROP;
1685         }
1686
1687         skb_scrub_packet(skb, true);
1688         skb->protocol = eth_type_trans(skb, dev);
1689
1690         return 0;
1691 }
1692 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1693
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation. That is done by __dev_forward_skb();
 * when it succeeds the skb is handed to netif_rx_internal().
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc = __dev_forward_skb(dev, skb);

        if (rc)
                return rc;

        return netif_rx_internal(skb);
}
1716 EXPORT_SYMBOL_GPL(dev_forward_skb);
1717
1718 static inline int deliver_skb(struct sk_buff *skb,
1719                               struct packet_type *pt_prev,
1720                               struct net_device *orig_dev)
1721 {
1722         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1723                 return -ENOMEM;
1724         atomic_inc(&skb->users);
1725         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1726 }
1727
1728 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1729 {
1730         if (!ptype->af_packet_priv || !skb->sk)
1731                 return false;
1732
1733         if (ptype->id_match)
1734                 return ptype->id_match(ptype, skb->sk);
1735         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1736                 return true;
1737
1738         return false;
1739 }
1740
1741 /*
1742  *      Support routine. Sends outgoing frames to any network
1743  *      taps currently in use.
1744  */
1745
1746 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1747 {
1748         struct packet_type *ptype;
1749         struct sk_buff *skb2 = NULL;
1750         struct packet_type *pt_prev = NULL;
1751
1752         rcu_read_lock();
1753         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1754                 /* Never send packets back to the socket
1755                  * they originated from - MvS (miquels@drinkel.ow.org)
1756                  */
1757                 if ((ptype->dev == dev || !ptype->dev) &&
1758                     (!skb_loop_sk(ptype, skb))) {
1759                         if (pt_prev) {
1760                                 deliver_skb(skb2, pt_prev, skb->dev);
1761                                 pt_prev = ptype;
1762                                 continue;
1763                         }
1764
1765                         skb2 = skb_clone(skb, GFP_ATOMIC);
1766                         if (!skb2)
1767                                 break;
1768
1769                         net_timestamp_set(skb2);
1770
1771                         /* skb->nh should be correctly
1772                            set by sender, so that the second statement is
1773                            just protection against buggy protocols.
1774                          */
1775                         skb_reset_mac_header(skb2);
1776
1777                         if (skb_network_header(skb2) < skb2->data ||
1778                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1779                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1780                                                      ntohs(skb2->protocol),
1781                                                      dev->name);
1782                                 skb_reset_network_header(skb2);
1783                         }
1784
1785                         skb2->transport_header = skb2->network_header;
1786                         skb2->pkt_type = PACKET_OUTGOING;
1787                         pt_prev = ptype;
1788                 }
1789         }
1790         if (pt_prev)
1791                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1792         rcu_read_unlock();
1793 }
1794
1795 /**
1796  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1797  * @dev: Network device
1798  * @txq: number of queues available
1799  *
1800  * If real_num_tx_queues is changed the tc mappings may no longer be
1801  * valid. To resolve this verify the tc mapping remains valid and if
1802  * not NULL the mapping. With no priorities mapping to this
1803  * offset/count pair it will no longer be used. In the worst case TC0
1804  * is invalid nothing can be done so disable priority mappings. If is
1805  * expected that drivers will fix this mapping if they can before
1806  * calling netif_set_real_num_tx_queues.
1807  */
1808 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1809 {
1810         int i;
1811         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1812
1813         /* If TC0 is invalidated disable TC mapping */
1814         if (tc->offset + tc->count > txq) {
1815                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1816                 dev->num_tc = 0;
1817                 return;
1818         }
1819
1820         /* Invalidated prio to tc mappings set to TC0 */
1821         for (i = 1; i < TC_BITMASK + 1; i++) {
1822                 int q = netdev_get_prio_tc_map(dev, i);
1823
1824                 tc = &dev->tc_to_txq[q];
1825                 if (tc->offset + tc->count > txq) {
1826                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1827                                 i, q);
1828                         netdev_set_prio_tc_map(dev, i, 0);
1829                 }
1830         }
1831 }
1832
1833 #ifdef CONFIG_XPS
1834 static DEFINE_MUTEX(xps_map_mutex);
1835 #define xmap_dereference(P)             \
1836         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1837
1838 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1839                                         int cpu, u16 index)
1840 {
1841         struct xps_map *map = NULL;
1842         int pos;
1843
1844         if (dev_maps)
1845                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1846
1847         for (pos = 0; map && pos < map->len; pos++) {
1848                 if (map->queues[pos] == index) {
1849                         if (map->len > 1) {
1850                                 map->queues[pos] = map->queues[--map->len];
1851                         } else {
1852                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1853                                 kfree_rcu(map, rcu);
1854                                 map = NULL;
1855                         }
1856                         break;
1857                 }
1858         }
1859
1860         return map;
1861 }
1862
1863 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1864 {
1865         struct xps_dev_maps *dev_maps;
1866         int cpu, i;
1867         bool active = false;
1868
1869         mutex_lock(&xps_map_mutex);
1870         dev_maps = xmap_dereference(dev->xps_maps);
1871
1872         if (!dev_maps)
1873                 goto out_no_maps;
1874
1875         for_each_possible_cpu(cpu) {
1876                 for (i = index; i < dev->num_tx_queues; i++) {
1877                         if (!remove_xps_queue(dev_maps, cpu, i))
1878                                 break;
1879                 }
1880                 if (i == dev->num_tx_queues)
1881                         active = true;
1882         }
1883
1884         if (!active) {
1885                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1886                 kfree_rcu(dev_maps, rcu);
1887         }
1888
1889         for (i = index; i < dev->num_tx_queues; i++)
1890                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1891                                              NUMA_NO_NODE);
1892
1893 out_no_maps:
1894         mutex_unlock(&xps_map_mutex);
1895 }
1896
1897 static struct xps_map *expand_xps_map(struct xps_map *map,
1898                                       int cpu, u16 index)
1899 {
1900         struct xps_map *new_map;
1901         int alloc_len = XPS_MIN_MAP_ALLOC;
1902         int i, pos;
1903
1904         for (pos = 0; map && pos < map->len; pos++) {
1905                 if (map->queues[pos] != index)
1906                         continue;
1907                 return map;
1908         }
1909
1910         /* Need to add queue to this CPU's existing map */
1911         if (map) {
1912                 if (pos < map->alloc_len)
1913                         return map;
1914
1915                 alloc_len = map->alloc_len * 2;
1916         }
1917
1918         /* Need to allocate new map to store queue on this CPU's map */
1919         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1920                                cpu_to_node(cpu));
1921         if (!new_map)
1922                 return NULL;
1923
1924         for (i = 0; i < pos; i++)
1925                 new_map->queues[i] = map->queues[i];
1926         new_map->alloc_len = alloc_len;
1927         new_map->len = pos;
1928
1929         return new_map;
1930 }
1931
1932 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1933                         u16 index)
1934 {
1935         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1936         struct xps_map *map, *new_map;
1937         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1938         int cpu, numa_node_id = -2;
1939         bool active = false;
1940
1941         mutex_lock(&xps_map_mutex);
1942
1943         dev_maps = xmap_dereference(dev->xps_maps);
1944
1945         /* allocate memory for queue storage */
1946         for_each_online_cpu(cpu) {
1947                 if (!cpumask_test_cpu(cpu, mask))
1948                         continue;
1949
1950                 if (!new_dev_maps)
1951                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1952                 if (!new_dev_maps) {
1953                         mutex_unlock(&xps_map_mutex);
1954                         return -ENOMEM;
1955                 }
1956
1957                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1958                                  NULL;
1959
1960                 map = expand_xps_map(map, cpu, index);
1961                 if (!map)
1962                         goto error;
1963
1964                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1965         }
1966
1967         if (!new_dev_maps)
1968                 goto out_no_new_maps;
1969
1970         for_each_possible_cpu(cpu) {
1971                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1972                         /* add queue to CPU maps */
1973                         int pos = 0;
1974
1975                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1976                         while ((pos < map->len) && (map->queues[pos] != index))
1977                                 pos++;
1978
1979                         if (pos == map->len)
1980                                 map->queues[map->len++] = index;
1981 #ifdef CONFIG_NUMA
1982                         if (numa_node_id == -2)
1983                                 numa_node_id = cpu_to_node(cpu);
1984                         else if (numa_node_id != cpu_to_node(cpu))
1985                                 numa_node_id = -1;
1986 #endif
1987                 } else if (dev_maps) {
1988                         /* fill in the new device map from the old device map */
1989                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1990                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1991                 }
1992
1993         }
1994
1995         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1996
1997         /* Cleanup old maps */
1998         if (dev_maps) {
1999                 for_each_possible_cpu(cpu) {
2000                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2001                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2002                         if (map && map != new_map)
2003                                 kfree_rcu(map, rcu);
2004                 }
2005
2006                 kfree_rcu(dev_maps, rcu);
2007         }
2008
2009         dev_maps = new_dev_maps;
2010         active = true;
2011
2012 out_no_new_maps:
2013         /* update Tx queue numa node */
2014         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2015                                      (numa_node_id >= 0) ? numa_node_id :
2016                                      NUMA_NO_NODE);
2017
2018         if (!dev_maps)
2019                 goto out_no_maps;
2020
2021         /* removes queue from unused CPUs */
2022         for_each_possible_cpu(cpu) {
2023                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2024                         continue;
2025
2026                 if (remove_xps_queue(dev_maps, cpu, index))
2027                         active = true;
2028         }
2029
2030         /* free map if not active */
2031         if (!active) {
2032                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2033                 kfree_rcu(dev_maps, rcu);
2034         }
2035
2036 out_no_maps:
2037         mutex_unlock(&xps_map_mutex);
2038
2039         return 0;
2040 error:
2041         /* remove any maps that we added */
2042         for_each_possible_cpu(cpu) {
2043                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2044                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2045                                  NULL;
2046                 if (new_map && new_map != map)
2047                         kfree(new_map);
2048         }
2049
2050         mutex_unlock(&xps_map_mutex);
2051
2052         kfree(new_dev_maps);
2053         return -ENOMEM;
2054 }
2055 EXPORT_SYMBOL(netif_set_xps_queue);
2056
2057 #endif
2058 /*
2059  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2060  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2061  */
2062 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2063 {
2064         int rc;
2065
2066         if (txq < 1 || txq > dev->num_tx_queues)
2067                 return -EINVAL;
2068
2069         if (dev->reg_state == NETREG_REGISTERED ||
2070             dev->reg_state == NETREG_UNREGISTERING) {
2071                 ASSERT_RTNL();
2072
2073                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2074                                                   txq);
2075                 if (rc)
2076                         return rc;
2077
2078                 if (dev->num_tc)
2079                         netif_setup_tc(dev, txq);
2080
2081                 if (txq < dev->real_num_tx_queues) {
2082                         qdisc_reset_all_tx_gt(dev, txq);
2083 #ifdef CONFIG_XPS
2084                         netif_reset_xps_queues_gt(dev, txq);
2085 #endif
2086                 }
2087         }
2088
2089         dev->real_num_tx_queues = txq;
2090         return 0;
2091 }
2092 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2093
2094 #ifdef CONFIG_SYSFS
2095 /**
2096  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2097  *      @dev: Network device
2098  *      @rxq: Actual number of RX queues
2099  *
2100  *      This must be called either with the rtnl_lock held or before
2101  *      registration of the net device.  Returns 0 on success, or a
2102  *      negative error code.  If called before registration, it always
2103  *      succeeds.
2104  */
2105 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2106 {
2107         int rc;
2108
2109         if (rxq < 1 || rxq > dev->num_rx_queues)
2110                 return -EINVAL;
2111
2112         if (dev->reg_state == NETREG_REGISTERED) {
2113                 ASSERT_RTNL();
2114
2115                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2116                                                   rxq);
2117                 if (rc)
2118                         return rc;
2119         }
2120
2121         dev->real_num_rx_queues = rxq;
2122         return 0;
2123 }
2124 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2125 #endif
2126
2127 /**
2128  * netif_get_num_default_rss_queues - default number of RSS queues
2129  *
2130  * This routine should set an upper limit on the number of RSS queues
2131  * used by default by multiqueue devices.
2132  */
2133 int netif_get_num_default_rss_queues(void)
2134 {
2135         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2136 }
2137 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2138
2139 static inline void __netif_reschedule(struct Qdisc *q)
2140 {
2141         struct softnet_data *sd;
2142         unsigned long flags;
2143
2144         local_irq_save(flags);
2145         sd = &__get_cpu_var(softnet_data);
2146         q->next_sched = NULL;
2147         *sd->output_queue_tailp = q;
2148         sd->output_queue_tailp = &q->next_sched;
2149         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2150         local_irq_restore(flags);
2151 }
2152
2153 void __netif_schedule(struct Qdisc *q)
2154 {
2155         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2156                 __netif_reschedule(q);
2157 }
2158 EXPORT_SYMBOL(__netif_schedule);
2159
2160 struct dev_kfree_skb_cb {
2161         enum skb_free_reason reason;
2162 };
2163
2164 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2165 {
2166         return (struct dev_kfree_skb_cb *)skb->cb;
2167 }
2168
2169 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2170 {
2171         unsigned long flags;
2172
2173         if (likely(atomic_read(&skb->users) == 1)) {
2174                 smp_rmb();
2175                 atomic_set(&skb->users, 0);
2176         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2177                 return;
2178         }
2179         get_kfree_skb_cb(skb)->reason = reason;
2180         local_irq_save(flags);
2181         skb->next = __this_cpu_read(softnet_data.completion_queue);
2182         __this_cpu_write(softnet_data.completion_queue, skb);
2183         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2184         local_irq_restore(flags);
2185 }
2186 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2187
2188 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2189 {
2190         if (in_irq() || irqs_disabled())
2191                 __dev_kfree_skb_irq(skb, reason);
2192         else
2193                 dev_kfree_skb(skb);
2194 }
2195 EXPORT_SYMBOL(__dev_kfree_skb_any);
2196
2197
2198 /**
2199  * netif_device_detach - mark device as removed
2200  * @dev: network device
2201  *
2202  * Mark device as removed from system and therefore no longer available.
2203  */
2204 void netif_device_detach(struct net_device *dev)
2205 {
2206         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2207             netif_running(dev)) {
2208                 netif_tx_stop_all_queues(dev);
2209         }
2210 }
2211 EXPORT_SYMBOL(netif_device_detach);
2212
2213 /**
2214  * netif_device_attach - mark device as attached
2215  * @dev: network device
2216  *
2217  * Mark device as attached from system and restart if needed.
2218  */
2219 void netif_device_attach(struct net_device *dev)
2220 {
2221         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2222             netif_running(dev)) {
2223                 netif_tx_wake_all_queues(dev);
2224                 __netdev_watchdog_up(dev);
2225         }
2226 }
2227 EXPORT_SYMBOL(netif_device_attach);
2228
2229 static void skb_warn_bad_offload(const struct sk_buff *skb)
2230 {
2231         static const netdev_features_t null_features = 0;
2232         struct net_device *dev = skb->dev;
2233         const char *driver = "";
2234
2235         if (!net_ratelimit())
2236                 return;
2237
2238         if (dev && dev->dev.parent)
2239                 driver = dev_driver_string(dev->dev.parent);
2240
2241         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2242              "gso_type=%d ip_summed=%d\n",
2243              driver, dev ? &dev->features : &null_features,
2244              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2245              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2246              skb_shinfo(skb)->gso_type, skb->ip_summed);
2247 }
2248
2249 /*
2250  * Invalidate hardware checksum when packet is to be mangled, and
2251  * complete checksum manually on outgoing path.
2252  */
2253 int skb_checksum_help(struct sk_buff *skb)
2254 {
2255         __wsum csum;
2256         int ret = 0, offset;
2257
2258         if (skb->ip_summed == CHECKSUM_COMPLETE)
2259                 goto out_set_summed;
2260
2261         if (unlikely(skb_shinfo(skb)->gso_size)) {
2262                 skb_warn_bad_offload(skb);
2263                 return -EINVAL;
2264         }
2265
2266         /* Before computing a checksum, we should make sure no frag could
2267          * be modified by an external entity : checksum could be wrong.
2268          */
2269         if (skb_has_shared_frag(skb)) {
2270                 ret = __skb_linearize(skb);
2271                 if (ret)
2272                         goto out;
2273         }
2274
2275         offset = skb_checksum_start_offset(skb);
2276         BUG_ON(offset >= skb_headlen(skb));
2277         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2278
2279         offset += skb->csum_offset;
2280         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2281
2282         if (skb_cloned(skb) &&
2283             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2284                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2285                 if (ret)
2286                         goto out;
2287         }
2288
2289         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2290 out_set_summed:
2291         skb->ip_summed = CHECKSUM_NONE;
2292 out:
2293         return ret;
2294 }
2295 EXPORT_SYMBOL(skb_checksum_help);
2296
2297 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2298 {
2299         unsigned int vlan_depth = skb->mac_len;
2300         __be16 type = skb->protocol;
2301
2302         /* Tunnel gso handlers can set protocol to ethernet. */
2303         if (type == htons(ETH_P_TEB)) {
2304                 struct ethhdr *eth;
2305
2306                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2307                         return 0;
2308
2309                 eth = (struct ethhdr *)skb_mac_header(skb);
2310                 type = eth->h_proto;
2311         }
2312
2313         /* if skb->protocol is 802.1Q/AD then the header should already be
2314          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2315          * ETH_HLEN otherwise
2316          */
2317         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2318                 if (vlan_depth) {
2319                         if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
2320                                 return 0;
2321                         vlan_depth -= VLAN_HLEN;
2322                 } else {
2323                         vlan_depth = ETH_HLEN;
2324                 }
2325                 do {
2326                         struct vlan_hdr *vh;
2327
2328                         if (unlikely(!pskb_may_pull(skb,
2329                                                     vlan_depth + VLAN_HLEN)))
2330                                 return 0;
2331
2332                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2333                         type = vh->h_vlan_encapsulated_proto;
2334                         vlan_depth += VLAN_HLEN;
2335                 } while (type == htons(ETH_P_8021Q) ||
2336                          type == htons(ETH_P_8021AD));
2337         }
2338
2339         *depth = vlan_depth;
2340
2341         return type;
2342 }
2343
2344 /**
2345  *      skb_mac_gso_segment - mac layer segmentation handler.
2346  *      @skb: buffer to segment
2347  *      @features: features for the output path (see dev->features)
2348  */
2349 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2350                                     netdev_features_t features)
2351 {
2352         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2353         struct packet_offload *ptype;
2354         int vlan_depth = skb->mac_len;
2355         __be16 type = skb_network_protocol(skb, &vlan_depth);
2356
2357         if (unlikely(!type))
2358                 return ERR_PTR(-EINVAL);
2359
2360         __skb_pull(skb, vlan_depth);
2361
2362         rcu_read_lock();
2363         list_for_each_entry_rcu(ptype, &offload_base, list) {
2364                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2365                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2366                                 int err;
2367
2368                                 err = ptype->callbacks.gso_send_check(skb);
2369                                 segs = ERR_PTR(err);
2370                                 if (err || skb_gso_ok(skb, features))
2371                                         break;
2372                                 __skb_push(skb, (skb->data -
2373                                                  skb_network_header(skb)));
2374                         }
2375                         segs = ptype->callbacks.gso_segment(skb, features);
2376                         break;
2377                 }
2378         }
2379         rcu_read_unlock();
2380
2381         __skb_push(skb, skb->data - skb_mac_header(skb));
2382
2383         return segs;
2384 }
2385 EXPORT_SYMBOL(skb_mac_gso_segment);
2386
2387
2388 /* openvswitch calls this on rx path, so we need a different check.
2389  */
2390 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2391 {
2392         if (tx_path)
2393                 return skb->ip_summed != CHECKSUM_PARTIAL;
2394         else
2395                 return skb->ip_summed == CHECKSUM_NONE;
2396 }
2397
2398 /**
2399  *      __skb_gso_segment - Perform segmentation on skb.
2400  *      @skb: buffer to segment
2401  *      @features: features for the output path (see dev->features)
2402  *      @tx_path: whether it is called in TX path
2403  *
2404  *      This function segments the given skb and returns a list of segments.
2405  *
2406  *      It may return NULL if the skb requires no segmentation.  This is
2407  *      only possible when GSO is used for verifying header integrity.
2408  */
2409 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2410                                   netdev_features_t features, bool tx_path)
2411 {
2412         if (unlikely(skb_needs_check(skb, tx_path))) {
2413                 int err;
2414
2415                 skb_warn_bad_offload(skb);
2416
2417                 if (skb_header_cloned(skb) &&
2418                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2419                         return ERR_PTR(err);
2420         }
2421
2422         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2423         SKB_GSO_CB(skb)->encap_level = 0;
2424
2425         skb_reset_mac_header(skb);
2426         skb_reset_mac_len(skb);
2427
2428         return skb_mac_gso_segment(skb, features);
2429 }
2430 EXPORT_SYMBOL(__skb_gso_segment);
2431
2432 /* Take action when hardware reception checksum errors are detected. */
2433 #ifdef CONFIG_BUG
2434 void netdev_rx_csum_fault(struct net_device *dev)
2435 {
2436         if (net_ratelimit()) {
2437                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2438                 dump_stack();
2439         }
2440 }
2441 EXPORT_SYMBOL(netdev_rx_csum_fault);
2442 #endif
2443
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if some page fragment of @skb is one that @dev cannot DMA
 * from, 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	/* Device without HIGHDMA: any highmem fragment is illegal. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[i])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		/* Each fragment's page must lie within the DMA mask. */
		for (i = 0; i < shinfo->nr_frags; i++) {
			dma_addr_t addr;

			addr = page_to_phys(skb_frag_page(&shinfo->frags[i]));
			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2476
2477 struct dev_gso_cb {
2478         void (*destructor)(struct sk_buff *skb);
2479 };
2480
2481 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2482
2483 static void dev_gso_skb_destructor(struct sk_buff *skb)
2484 {
2485         struct dev_gso_cb *cb;
2486
2487         kfree_skb_list(skb->next);
2488         skb->next = NULL;
2489
2490         cb = DEV_GSO_CB(skb);
2491         if (cb->destructor)
2492                 cb->destructor(skb);
2493 }
2494
2495 /**
2496  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2497  *      @skb: buffer to segment
2498  *      @features: device features as applicable to this skb
2499  *
2500  *      This function segments the given skb and stores the list of segments
2501  *      in skb->next.
2502  */
2503 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2504 {
2505         struct sk_buff *segs;
2506
2507         segs = skb_gso_segment(skb, features);
2508
2509         /* Verifying header integrity only. */
2510         if (!segs)
2511                 return 0;
2512
2513         if (IS_ERR(segs))
2514                 return PTR_ERR(segs);
2515
2516         skb->next = segs;
2517         DEV_GSO_CB(skb)->destructor = skb->destructor;
2518         skb->destructor = dev_gso_skb_destructor;
2519
2520         return 0;
2521 }
2522
2523 /* If MPLS offload request, verify we are testing hardware MPLS features
2524  * instead of standard features for the netdev.
2525  */
2526 #ifdef CONFIG_NET_MPLS_GSO
2527 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2528                                            netdev_features_t features,
2529                                            __be16 type)
2530 {
2531         if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2532                 features &= skb->dev->mpls_features;
2533
2534         return features;
2535 }
2536 #else
2537 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2538                                            netdev_features_t features,
2539                                            __be16 type)
2540 {
2541         return features;
2542 }
2543 #endif
2544
2545 static netdev_features_t harmonize_features(struct sk_buff *skb,
2546         netdev_features_t features)
2547 {
2548         int tmp;
2549         __be16 type;
2550
2551         type = skb_network_protocol(skb, &tmp);
2552         features = net_mpls_features(skb, features, type);
2553
2554         if (skb->ip_summed != CHECKSUM_NONE &&
2555             !can_checksum_protocol(features, type)) {
2556                 features &= ~NETIF_F_ALL_CSUM;
2557         } else if (illegal_highdma(skb->dev, skb)) {
2558                 features &= ~NETIF_F_SG;
2559         }
2560
2561         return features;
2562 }
2563
2564 netdev_features_t netif_skb_features(struct sk_buff *skb)
2565 {
2566         __be16 protocol = skb->protocol;
2567         netdev_features_t features = skb->dev->features;
2568
2569         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2570                 features &= ~NETIF_F_GSO_MASK;
2571
2572         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2573                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2574                 protocol = veh->h_vlan_encapsulated_proto;
2575         } else if (!vlan_tx_tag_present(skb)) {
2576                 return harmonize_features(skb, features);
2577         }
2578
2579         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2580                                                NETIF_F_HW_VLAN_STAG_TX);
2581
2582         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2583                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2584                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2585                                 NETIF_F_HW_VLAN_STAG_TX;
2586
2587         return harmonize_features(skb, features);
2588 }
2589 EXPORT_SYMBOL(netif_skb_features);
2590
/**
 *	dev_hard_start_xmit - hand an skb, or a list of segments, to the driver
 *	@skb: buffer to transmit; when skb->next is already set on entry the
 *	      function goes straight to the segment loop
 *	@dev: device whose ndo_start_xmit() is called
 *	@txq: transmit queue used for trans-start updates and stop checks
 *
 *	Single skb (skb->next == NULL): optionally drops the dst, computes the
 *	usable features, inserts the VLAN tag in software when the device
 *	cannot offload it, then either segments the skb (GSO) or linearizes it
 *	and completes the checksum in software, and finally calls the driver.
 *
 *	Segment list: each segment is unlinked from skb->next and sent on its
 *	own; see the comments in the loop for the stop conditions.
 *
 *	Returns the value of ndo_start_xmit(), NETDEV_TX_BUSY when the queue
 *	stopped while segments remain, or NETDEV_TX_OK (the initial value of
 *	rc) when the skb was disposed of before reaching the driver.
 */
2591 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2592                         struct netdev_queue *txq)
2593 {
2594         const struct net_device_ops *ops = dev->netdev_ops;
2595         int rc = NETDEV_TX_OK;
2596         unsigned int skb_len;
2597
2598         if (likely(!skb->next)) {
2599                 netdev_features_t features;
2600
2601                 /*
2602                  * If device doesn't need skb->dst, release it right now while
2603                  * it's hot in this cpu cache
2604                  */
2605                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2606                         skb_dst_drop(skb);
2607
2608                 features = netif_skb_features(skb);
2609
                      /* Tag held out of band but not offloadable for this
                       * skb: put it into the frame in software. On failure
                       * this function returns without freeing anything itself.
                       */
2610                 if (vlan_tx_tag_present(skb) &&
2611                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2612                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2613                                              vlan_tx_tag_get(skb));
2614                         if (unlikely(!skb))
2615                                 goto out;
2616
2617                         skb->vlan_tci = 0;
2618                 }
2619
2620                 /* If encapsulation offload request, verify we are testing
2621                  * hardware encapsulation features instead of standard
2622                  * features for the netdev
2623                  */
2624                 if (skb->encapsulation)
2625                         features &= dev->hw_enc_features;
2626
2627                 if (netif_needs_gso(skb, features)) {
2628                         if (unlikely(dev_gso_segment(skb, features)))
2629                                 goto out_kfree_skb;
                              /* Segments were attached to skb->next: send
                               * them one by one below.
                               */
2630                         if (skb->next)
2631                                 goto gso;
2632                 } else {
2633                         if (skb_needs_linearize(skb, features) &&
2634                             __skb_linearize(skb))
2635                                 goto out_kfree_skb;
2636
2637                         /* If the checksum is still pending (CHECKSUM_PARTIAL)
2638                          * and the usable features include no checksum
2639                          * offload, complete checksumming here.
2640                          */
2641                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2642                                 if (skb->encapsulation)
2643                                         skb_set_inner_transport_header(skb,
2644                                                 skb_checksum_start_offset(skb));
2645                                 else
2646                                         skb_set_transport_header(skb,
2647                                                 skb_checksum_start_offset(skb));
2648                                 if (!(features & NETIF_F_ALL_CSUM) &&
2649                                      skb_checksum_help(skb))
2650                                         goto out_kfree_skb;
2651                         }
2652                 }
2653
                      /* Only call dev_queue_xmit_nit() when ptype_all has entries. */
2654                 if (!list_empty(&ptype_all))
2655                         dev_queue_xmit_nit(skb, dev);
2656
                      /* Length is sampled before the driver call and used by
                       * the trace point afterwards.
                       */
2657                 skb_len = skb->len;
2658                 trace_net_dev_start_xmit(skb, dev);
2659                 rc = ops->ndo_start_xmit(skb, dev);
2660                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2661                 if (rc == NETDEV_TX_OK)
2662                         txq_trans_update(txq);
2663                 return rc;
2664         }
2665
2666 gso:
2667         do {
                      /* Detach the first segment from the list headed by skb. */
2668                 struct sk_buff *nskb = skb->next;
2669
2670                 skb->next = nskb->next;
2671                 nskb->next = NULL;
2672
2673                 if (!list_empty(&ptype_all))
2674                         dev_queue_xmit_nit(nskb, dev);
2675
2676                 skb_len = nskb->len;
2677                 trace_net_dev_start_xmit(nskb, dev);
2678                 rc = ops->ndo_start_xmit(nskb, dev);
2679                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2680                 if (unlikely(rc != NETDEV_TX_OK)) {
                              /* A code with bits outside NETDEV_TX_MASK ends
                               * the loop via the cleanup path; any other
                               * non-OK code puts the segment back at the head
                               * of the list and is returned to the caller.
                               */
2681                         if (rc & ~NETDEV_TX_MASK)
2682                                 goto out_kfree_gso_skb;
2683                         nskb->next = skb->next;
2684                         skb->next = nskb;
2685                         return rc;
2686                 }
2687                 txq_trans_update(txq);
                      /* Queue stopped with segments still pending: report busy. */
2688                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2689                         return NETDEV_TX_BUSY;
2690         } while (skb->next);
2691
2692 out_kfree_gso_skb:
              /* All segments gone: put back the destructor kept in DEV_GSO_CB
               * (presumably stored there by dev_gso_segment() - confirm) and
               * release the head skb. Otherwise fall through to kfree_skb().
               */
2693         if (likely(skb->next == NULL)) {
2694                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2695                 consume_skb(skb);
2696                 return rc;
2697         }
2698 out_kfree_skb:
2699         kfree_skb(skb);
2700 out:
2701         return rc;
2702 }
2703 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2704
2705 static void qdisc_pkt_len_init(struct sk_buff *skb)
2706 {
2707         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2708
2709         qdisc_skb_cb(skb)->pkt_len = skb->len;
2710
2711         /* To get more precise estimation of bytes sent on wire,
2712          * we add to pkt_len the headers size of all segments
2713          */
2714         if (shinfo->gso_size)  {
2715                 unsigned int hdr_len;
2716                 u16 gso_segs = shinfo->gso_segs;
2717
2718                 /* mac layer + network layer */
2719                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2720
2721                 /* + transport layer */
2722                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2723                         hdr_len += tcp_hdrlen(skb);
2724                 else
2725                         hdr_len += sizeof(struct udphdr);
2726
2727                 if (shinfo->gso_type & SKB_GSO_DODGY)
2728                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2729                                                 shinfo->gso_size);
2730
2731                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2732         }
2733 }
2734
2735 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2736                                  struct net_device *dev,
2737                                  struct netdev_queue *txq)
2738 {
2739         spinlock_t *root_lock = qdisc_lock(q);
2740         bool contended;
2741         int rc;
2742
2743         qdisc_pkt_len_init(skb);
2744         qdisc_calculate_pkt_len(skb, q);
2745         /*
2746          * Heuristic to force contended enqueues to serialize on a
2747          * separate lock before trying to get qdisc main lock.
2748          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2749          * and dequeue packets faster.
2750          */
2751         contended = qdisc_is_running(q);
2752         if (unlikely(contended))
2753                 spin_lock(&q->busylock);
2754
2755         spin_lock(root_lock);
2756         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2757                 kfree_skb(skb);
2758                 rc = NET_XMIT_DROP;
2759         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2760                    qdisc_run_begin(q)) {
2761                 /*
2762                  * This is a work-conserving queue; there are no old skbs
2763                  * waiting to be sent out; and the qdisc is not running -
2764                  * xmit the skb directly.
2765                  */
2766                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2767                         skb_dst_force(skb);
2768
2769                 qdisc_bstats_update(q, skb);
2770
2771                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2772                         if (unlikely(contended)) {
2773                                 spin_unlock(&q->busylock);
2774                                 contended = false;
2775                         }
2776                         __qdisc_run(q);
2777                 } else
2778                         qdisc_run_end(q);
2779
2780                 rc = NET_XMIT_SUCCESS;
2781         } else {
2782                 skb_dst_force(skb);
2783                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2784                 if (qdisc_run_begin(q)) {
2785                         if (unlikely(contended)) {
2786                                 spin_unlock(&q->busylock);
2787                                 contended = false;
2788                         }
2789                         __qdisc_run(q);
2790                 }
2791         }
2792         spin_unlock(root_lock);
2793         if (unlikely(contended))
2794                 spin_unlock(&q->busylock);
2795         return rc;
2796 }
2797
2798 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2799 static void skb_update_prio(struct sk_buff *skb)
2800 {
2801         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2802
2803         if (!skb->priority && skb->sk && map) {
2804                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2805
2806                 if (prioidx < map->priomap_len)
2807                         skb->priority = map->priomap[prioidx];
2808         }
2809 }
2810 #else
2811 #define skb_update_prio(skb)
2812 #endif
2813
/* Per-CPU counter incremented around the dev_hard_start_xmit() call made on
 * the queue-less path of __dev_queue_xmit() and decremented right after it.
 */
2814 static DEFINE_PER_CPU(int, xmit_recursion);
/* __dev_queue_xmit() refuses to transmit (and logs a "Dead loop" message)
 * once xmit_recursion on this CPU is greater than this value.
 */
2815 #define RECURSION_LIMIT 10
2816
2817 /**
2818  *      dev_loopback_xmit - loop back @skb
2819  *      @skb: buffer to transmit
2820  */
2821 int dev_loopback_xmit(struct sk_buff *skb)
2822 {
2823         skb_reset_mac_header(skb);
2824         __skb_pull(skb, skb_network_offset(skb));
2825         skb->pkt_type = PACKET_LOOPBACK;
2826         skb->ip_summed = CHECKSUM_UNNECESSARY;
2827         WARN_ON(!skb_dst(skb));
2828         skb_dst_force(skb);
2829         netif_rx_ni(skb);
2830         return 0;
2831 }
2832 EXPORT_SYMBOL(dev_loopback_xmit);
2833
2834 /**
2835  *      __dev_queue_xmit - transmit a buffer
2836  *      @skb: buffer to transmit
2837  *      @accel_priv: private data used for L2 forwarding offload
2838  *
2839  *      Queue a buffer for transmission to a network device. The caller must
2840  *      have set the device and priority and built the buffer before calling
2841  *      this function. The function can be called from an interrupt.
2842  *
2843  *      A negative errno code is returned on a failure. A success does not
2844  *      guarantee the frame will be transmitted as it may be dropped due
2845  *      to congestion or traffic shaping.
2846  *
2847  * -----------------------------------------------------------------------------------
2848  *      I notice this method can also return errors from the queue disciplines,
2849  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2850  *      be positive.
2851  *
2852  *      Regardless of the return value, the skb is consumed, so it is currently
2853  *      difficult to retry a send to this method.  (You can bump the ref count
2854  *      before sending to hold a reference for retry if you are careful.)
2855  *
2856  *      When calling this method, interrupts MUST be enabled.  This is because
2857  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2858  *          --BLG
2859  */
2860 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2861 {
2862         struct net_device *dev = skb->dev;
2863         struct netdev_queue *txq;
2864         struct Qdisc *q;
2865         int rc = -ENOMEM;
2866
2867         skb_reset_mac_header(skb);
2868
2869         /* Disable soft irqs for various locks below. Also
2870          * stops preemption for RCU.
2871          */
2872         rcu_read_lock_bh();
2873
2874         skb_update_prio(skb);
2875
2876         txq = netdev_pick_tx(dev, skb, accel_priv);
2877         q = rcu_dereference_bh(txq->qdisc);
2878
2879 #ifdef CONFIG_NET_CLS_ACT
2880         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2881 #endif
2882         trace_net_dev_queue(skb);
2883         if (q->enqueue) {
2884                 rc = __dev_xmit_skb(skb, q, dev, txq);
2885                 goto out;
2886         }
2887
2888         /* The device has no queue. Common case for software devices:
2889            loopback, all the sorts of tunnels...
2890
2891            Really, it is unlikely that netif_tx_lock protection is necessary
2892            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2893            counters.)
2894            However, it is possible, that they rely on protection
2895            made by us here.
2896
2897            Check this and shot the lock. It is not prone from deadlocks.
2898            Either shot noqueue qdisc, it is even simpler 8)
2899          */
2900         if (dev->flags & IFF_UP) {
2901                 int cpu = smp_processor_id(); /* ok because BHs are off */
2902
2903                 if (txq->xmit_lock_owner != cpu) {
2904
2905                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2906                                 goto recursion_alert;
2907
2908                         HARD_TX_LOCK(dev, txq, cpu);
2909
2910                         if (!netif_xmit_stopped(txq)) {
2911                                 __this_cpu_inc(xmit_recursion);
2912                                 rc = dev_hard_start_xmit(skb, dev, txq);
2913                                 __this_cpu_dec(xmit_recursion);
2914                                 if (dev_xmit_complete(rc)) {
2915                                         HARD_TX_UNLOCK(dev, txq);
2916                                         goto out;
2917                                 }
2918                         }
2919                         HARD_TX_UNLOCK(dev, txq);
2920                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2921                                              dev->name);
2922                 } else {
2923                         /* Recursion is detected! It is possible,
2924                          * unfortunately
2925                          */
2926 recursion_alert:
2927                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2928                                              dev->name);
2929                 }
2930         }
2931
2932         rc = -ENETDOWN;
2933         rcu_read_unlock_bh();
2934
2935         atomic_long_inc(&dev->tx_dropped);
2936         kfree_skb(skb);
2937         return rc;
2938 out:
2939         rcu_read_unlock_bh();
2940         return rc;
2941 }
2942
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Calls __dev_queue_xmit() with a NULL accel_priv and returns its
 *	result unchanged.
 */
2943 int dev_queue_xmit(struct sk_buff *skb)
2944 {
2945         return __dev_queue_xmit(skb, NULL);
2946 }
2947 EXPORT_SYMBOL(dev_queue_xmit);
2948
/**
 *	dev_queue_xmit_accel - transmit a buffer with caller-supplied private data
 *	@skb: buffer to transmit
 *	@accel_priv: passed through to __dev_queue_xmit(), which hands it to
 *		     netdev_pick_tx() when selecting the transmit queue
 *
 *	Returns the result of __dev_queue_xmit() unchanged.
 */
2949 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2950 {
2951         return __dev_queue_xmit(skb, accel_priv);
2952 }
2953 EXPORT_SYMBOL(dev_queue_xmit_accel);
2954
2955
2956 /*=======================================================================
2957                         Receiver routines
2958   =======================================================================*/
2959
/* Bound on the per-CPU input_pkt_queue: enqueue_to_backlog() drops a packet
 * when the queue already holds more than this many skbs, and
 * skb_flow_limit() only does its accounting once the queue length has
 * reached half of this value.
 */
2960 int netdev_max_backlog __read_mostly = 1000;
2961 EXPORT_SYMBOL(netdev_max_backlog);
2962
/* Condition passed to net_timestamp_check() at the start of
 * netif_rx_internal(), i.e. before the skb is put on a backlog queue.
 */
2963 int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): netdev_budget and weight_p are not referenced in this part
 * of the file; their consumers need to be confirmed elsewhere.
 */
2964 int netdev_budget __read_mostly = 300;
2965 int weight_p __read_mostly = 64;            /* old backlog weight */
2966
2967 /* Called with irq disabled.
      *
      * Appends @napi to the tail of @sd's poll_list and raises NET_RX_SOFTIRQ
      * using the irq-off variant of the raise helper.
      */
2968 static inline void ____napi_schedule(struct softnet_data *sd,
2969                                      struct napi_struct *napi)
2970 {
2971         list_add_tail(&napi->poll_list, &sd->poll_list);
2972         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2973 }
2974
2975 #ifdef CONFIG_RPS
2976
2977 /* One global table that all flow-based protocols share.
      * get_rps_cpu() reads it under RCU and indexes ents[] with
      * (hash & mask) to obtain the CPU desired for a flow.
      */
2978 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2979 EXPORT_SYMBOL(rps_sock_flow_table);
2980
/* Static key tested in netif_rx_internal(): only when it is enabled is
 * get_rps_cpu() consulted to choose the CPU whose backlog receives the skb.
 */
2981 struct static_key rps_needed __read_mostly;
2982
2983 static struct rps_dev_flow *
2984 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2985             struct rps_dev_flow *rflow, u16 next_cpu)
2986 {
2987         if (next_cpu != RPS_NO_CPU) {
2988 #ifdef CONFIG_RFS_ACCEL
2989                 struct netdev_rx_queue *rxqueue;
2990                 struct rps_dev_flow_table *flow_table;
2991                 struct rps_dev_flow *old_rflow;
2992                 u32 flow_id;
2993                 u16 rxq_index;
2994                 int rc;
2995
2996                 /* Should we steer this flow to a different hardware queue? */
2997                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2998                     !(dev->features & NETIF_F_NTUPLE))
2999                         goto out;
3000                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3001                 if (rxq_index == skb_get_rx_queue(skb))
3002                         goto out;
3003
3004                 rxqueue = dev->_rx + rxq_index;
3005                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3006                 if (!flow_table)
3007                         goto out;
3008                 flow_id = skb_get_hash(skb) & flow_table->mask;
3009                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3010                                                         rxq_index, flow_id);
3011                 if (rc < 0)
3012                         goto out;
3013                 old_rflow = rflow;
3014                 rflow = &flow_table->flows[flow_id];
3015                 rflow->filter = rc;
3016                 if (old_rflow->filter == rflow->filter)
3017                         old_rflow->filter = RPS_NO_FILTER;
3018         out:
3019 #endif
3020                 rflow->last_qtail =
3021                         per_cpu(softnet_data, next_cpu).input_queue_head;
3022         }
3023
3024         rflow->cpu = next_cpu;
3025         return rflow;
3026 }
3027
3028 /*
3029  * get_rps_cpu is called from netif_receive_skb and returns the target
3030  * CPU from the RPS map of the receiving queue for a given skb.
3031  * rcu_read_lock must be held on entry.
3032  */
3033 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3034                        struct rps_dev_flow **rflowp)
3035 {
3036         struct netdev_rx_queue *rxqueue;
3037         struct rps_map *map;
3038         struct rps_dev_flow_table *flow_table;
3039         struct rps_sock_flow_table *sock_flow_table;
3040         int cpu = -1;
3041         u16 tcpu;
3042         u32 hash;
3043
3044         if (skb_rx_queue_recorded(skb)) {
3045                 u16 index = skb_get_rx_queue(skb);
3046                 if (unlikely(index >= dev->real_num_rx_queues)) {
3047                         WARN_ONCE(dev->real_num_rx_queues > 1,
3048                                   "%s received packet on queue %u, but number "
3049                                   "of RX queues is %u\n",
3050                                   dev->name, index, dev->real_num_rx_queues);
3051                         goto done;
3052                 }
3053                 rxqueue = dev->_rx + index;
3054         } else
3055                 rxqueue = dev->_rx;
3056
3057         map = rcu_dereference(rxqueue->rps_map);
3058         if (map) {
3059                 if (map->len == 1 &&
3060                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3061                         tcpu = map->cpus[0];
3062                         if (cpu_online(tcpu))
3063                                 cpu = tcpu;
3064                         goto done;
3065                 }
3066         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3067                 goto done;
3068         }
3069
3070         skb_reset_network_header(skb);
3071         hash = skb_get_hash(skb);
3072         if (!hash)
3073                 goto done;
3074
3075         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3076         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3077         if (flow_table && sock_flow_table) {
3078                 u16 next_cpu;
3079                 struct rps_dev_flow *rflow;
3080
3081                 rflow = &flow_table->flows[hash & flow_table->mask];
3082                 tcpu = rflow->cpu;
3083
3084                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3085
3086                 /*
3087                  * If the desired CPU (where last recvmsg was done) is
3088                  * different from current CPU (one in the rx-queue flow
3089                  * table entry), switch if one of the following holds:
3090                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3091                  *   - Current CPU is offline.
3092                  *   - The current CPU's queue tail has advanced beyond the
3093                  *     last packet that was enqueued using this table entry.
3094                  *     This guarantees that all previous packets for the flow
3095                  *     have been dequeued, thus preserving in order delivery.
3096                  */
3097                 if (unlikely(tcpu != next_cpu) &&
3098                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3099                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3100                       rflow->last_qtail)) >= 0)) {
3101                         tcpu = next_cpu;
3102                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3103                 }
3104
3105                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3106                         *rflowp = rflow;
3107                         cpu = tcpu;
3108                         goto done;
3109                 }
3110         }
3111
3112         if (map) {
3113                 tcpu = map->cpus[((u64) hash * map->len) >> 32];
3114
3115                 if (cpu_online(tcpu)) {
3116                         cpu = tcpu;
3117                         goto done;
3118                 }
3119         }
3120
3121 done:
3122         return cpu;
3123 }
3124
3125 #ifdef CONFIG_RFS_ACCEL
3126
3127 /**
3128  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3129  * @dev: Device on which the filter was set
3130  * @rxq_index: RX queue index
3131  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3132  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3133  *
3134  * Drivers that implement ndo_rx_flow_steer() should periodically call
3135  * this function for each installed filter and remove the filters for
3136  * which it returns %true.
3137  */
3138 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3139                          u32 flow_id, u16 filter_id)
3140 {
3141         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3142         struct rps_dev_flow_table *flow_table;
3143         struct rps_dev_flow *rflow;
3144         bool expire = true;
3145         int cpu;
3146
3147         rcu_read_lock();
3148         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3149         if (flow_table && flow_id <= flow_table->mask) {
3150                 rflow = &flow_table->flows[flow_id];
3151                 cpu = ACCESS_ONCE(rflow->cpu);
3152                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3153                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3154                            rflow->last_qtail) <
3155                      (int)(10 * flow_table->mask)))
3156                         expire = false;
3157         }
3158         rcu_read_unlock();
3159         return expire;
3160 }
3161 EXPORT_SYMBOL(rps_may_expire_flow);
3162
3163 #endif /* CONFIG_RFS_ACCEL */
3164
3165 /* Called from hardirq (IPI) context.
      *
      * @data is the softnet_data to act on: its backlog NAPI is put on its
      * poll list (raising NET_RX_SOFTIRQ) and its received_rps counter is
      * incremented.
      */
3166 static void rps_trigger_softirq(void *data)
3167 {
3168         struct softnet_data *sd = data;
3169
3170         ____napi_schedule(sd, &sd->backlog);
3171         sd->received_rps++;
3172 }
3173
3174 #endif /* CONFIG_RPS */
3175
/*
 * Check whether @sd belongs to another CPU.
 * If it does, link it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1. If it is this CPU's own softnet_data (or CONFIG_RPS is
 * off), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push @sd on the front of the local IPI list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3196
3197 #ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).
 * NOTE(review): not referenced in this part of the file; presumably the
 * size of the flow-limit bucket table - confirm against its users.
 */
3198 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3199 #endif
3200
3201 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3202 {
3203 #ifdef CONFIG_NET_FLOW_LIMIT
3204         struct sd_flow_limit *fl;
3205         struct softnet_data *sd;
3206         unsigned int old_flow, new_flow;
3207
3208         if (qlen < (netdev_max_backlog >> 1))
3209                 return false;
3210
3211         sd = &__get_cpu_var(softnet_data);
3212
3213         rcu_read_lock();
3214         fl = rcu_dereference(sd->flow_limit);
3215         if (fl) {
3216                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3217                 old_flow = fl->history[fl->history_head];
3218                 fl->history[fl->history_head] = new_flow;
3219
3220                 fl->history_head++;
3221                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3222
3223                 if (likely(fl->buckets[old_flow]))
3224                         fl->buckets[old_flow]--;
3225
3226                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3227                         fl->count++;
3228                         rcu_read_unlock();
3229                         return true;
3230                 }
3231         }
3232         rcu_read_unlock();
3233 #endif
3234         return false;
3235 }
3236
3237 /*
3238  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3239  * queue (may be a remote CPU queue).
3240  */
3241 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3242                               unsigned int *qtail)
3243 {
3244         struct softnet_data *sd;
3245         unsigned long flags;
3246         unsigned int qlen;
3247
3248         sd = &per_cpu(softnet_data, cpu);
3249
3250         local_irq_save(flags);
3251
3252         rps_lock(sd);
3253         qlen = skb_queue_len(&sd->input_pkt_queue);
3254         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3255                 if (skb_queue_len(&sd->input_pkt_queue)) {
3256 enqueue:
3257                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3258                         input_queue_tail_incr_save(sd, qtail);
3259                         rps_unlock(sd);
3260                         local_irq_restore(flags);
3261                         return NET_RX_SUCCESS;
3262                 }
3263
3264                 /* Schedule NAPI for backlog device
3265                  * We can use non atomic operation since we own the queue lock
3266                  */
3267                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3268                         if (!rps_ipi_queued(sd))
3269                                 ____napi_schedule(sd, &sd->backlog);
3270                 }
3271                 goto enqueue;
3272         }
3273
3274         sd->dropped++;
3275         rps_unlock(sd);
3276
3277         local_irq_restore(flags);
3278
3279         atomic_long_inc(&skb->dev->rx_dropped);
3280         kfree_skb(skb);
3281         return NET_RX_DROP;
3282 }
3283
3284 static int netif_rx_internal(struct sk_buff *skb)
3285 {
3286         int ret;
3287
3288         net_timestamp_check(netdev_tstamp_prequeue, skb);
3289
3290         trace_netif_rx(skb);
3291 #ifdef CONFIG_RPS
3292         if (static_key_false(&rps_needed)) {
3293                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3294                 int cpu;
3295
3296                 preempt_disable();
3297                 rcu_read_lock();
3298
3299                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3300                 if (cpu < 0)
3301                         cpu = smp_processor_id();
3302
3303                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3304
3305                 rcu_read_unlock();
3306                 preempt_enable();
3307         } else
3308 #endif
3309         {
3310                 unsigned int qtail;
3311                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3312                 put_cpu();
3313         }
3314         return ret;
3315 }
3316
3317 /**
3318  *      netif_rx        -       post buffer to the network code
3319  *      @skb: buffer to post
3320  *
3321  *      This function receives a packet from a device driver and queues it for
3322  *      the upper (protocol) levels to process.  The call never reports an
3323  *      error code, but the buffer may be dropped during processing for
3324  *      congestion control or by the protocol layers.
3325  *
3326  *      return values:
3327  *      NET_RX_SUCCESS  (no congestion)
3328  *      NET_RX_DROP     (packet was dropped)
3329  *
      *      Emits the netif_rx_entry trace event and then defers to
      *      netif_rx_internal().
3330  */
3331
3332 int netif_rx(struct sk_buff *skb)
3333 {
3334         trace_netif_rx_entry(skb);
3335
3336         return netif_rx_internal(skb);
3337 }
3338 EXPORT_SYMBOL(netif_rx);
3339
/**
 *	netif_rx_ni - post buffer to the network code and run pending softirqs
 *	@skb: buffer to post
 *
 *	Same queueing as netif_rx(), done with preemption disabled; any
 *	softirq pending afterwards is processed via do_softirq() before
 *	preemption is re-enabled. Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3355
3356 static void net_tx_action(struct softirq_action *h)
3357 {
3358         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3359
3360         if (sd->completion_queue) {
3361                 struct sk_buff *clist;
3362
3363                 local_irq_disable();
3364                 clist = sd->completion_queue;
3365                 sd->completion_queue = NULL;
3366                 local_irq_enable();
3367
3368                 while (clist) {
3369                         struct sk_buff *skb = clist;
3370                         clist = clist->next;
3371
3372                         WARN_ON(atomic_read(&skb->users));
3373                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3374                                 trace_consume_skb(skb);
3375                         else
3376                                 trace_kfree_skb(skb, net_tx_action);
3377                         __kfree_skb(skb);
3378                 }
3379         }
3380
3381         if (sd->output_queue) {
3382                 struct Qdisc *head;
3383
3384                 local_irq_disable();
3385                 head = sd->output_queue;
3386                 sd->output_queue = NULL;
3387                 sd->output_queue_tailp = &sd->output_queue;
3388                 local_irq_enable();
3389
3390                 while (head) {
3391                         struct Qdisc *q = head;
3392                         spinlock_t *root_lock;
3393
3394                         head = head->next_sched;
3395
3396                         root_lock = qdisc_lock(q);
3397                         if (spin_trylock(root_lock)) {
3398                                 smp_mb__before_atomic();
3399                                 clear_bit(__QDISC_STATE_SCHED,
3400                                           &q->state);
3401                                 qdisc_run(q);
3402                                 spin_unlock(root_lock);
3403                         } else {
3404                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3405                                               &q->state)) {
3406                                         __netif_reschedule(q);
3407                                 } else {
3408                                         smp_mb__before_atomic();
3409                                         clear_bit(__QDISC_STATE_SCHED,
3410                                                   &q->state);
3411                                 }
3412                         }
3413                 }
3414         }
3415 }
3416
#if (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) && \
    (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE))
/*
 * Function-pointer hook taking a device and an address.  It is defined
 * and exported (GPL-only) here for the benefit of ATM LANE, and exists
 * only when both the bridge and ATM LANE are configured, either
 * built-in or as modules.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3424
3425 #ifdef CONFIG_NET_CLS_ACT
3426 /* TODO: Maybe we should just force sch_ingress to be compiled in
3427  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3428  * a compare and 2 stores extra right now if we dont have it on
3429  * but have CONFIG_NET_CLS_ACT
3430  * NOTE: This doesn't stop any functionality; if you dont have
3431  * the ingress scheduler, you just can't add policies on ingress.
3432  *
3433  */
3434 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3435 {
3436         struct net_device *dev = skb->dev;
3437         u32 ttl = G_TC_RTTL(skb->tc_verd);
3438         int result = TC_ACT_OK;
3439         struct Qdisc *q;
3440
3441         if (unlikely(MAX_RED_LOOP < ttl++)) {
3442                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3443                                      skb->skb_iif, dev->ifindex);
3444                 return TC_ACT_SHOT;
3445         }
3446
3447         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3448         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3449
3450         q = rxq->qdisc;
3451         if (q != &noop_qdisc) {
3452                 spin_lock(qdisc_lock(q));
3453                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3454                         result = qdisc_enqueue_root(skb, q);
3455                 spin_unlock(qdisc_lock(q));
3456         }
3457
3458         return result;
3459 }
3460
3461 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3462                                          struct packet_type **pt_prev,
3463                                          int *ret, struct net_device *orig_dev)
3464 {
3465         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3466
3467         if (!rxq || rxq->qdisc == &noop_qdisc)
3468                 goto out;
3469
3470         if (*pt_prev) {
3471                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3472                 *pt_prev = NULL;
3473         }
3474
3475         switch (ing_filter(skb, rxq)) {
3476         case TC_ACT_SHOT:
3477         case TC_ACT_STOLEN:
3478                 kfree_skb(skb);
3479                 return NULL;
3480         }
3481
3482 out:
3483         skb->tc_verd = 0;
3484         return skb;
3485 }
3486 #endif
3487
3488 /**
3489  *      netdev_rx_handler_register - register receive handler
3490  *      @dev: device to register a handler for
3491  *      @rx_handler: receive handler to register
3492  *      @rx_handler_data: data pointer that is used by rx handler
3493  *
3494  *      Register a receive handler for a device. This handler will then be
3495  *      called from __netif_receive_skb. A negative errno code is returned
3496  *      on a failure.
3497  *
3498  *      The caller must hold the rtnl_mutex.
3499  *
3500  *      For a general description of rx_handler, see enum rx_handler_result.
3501  */
3502 int netdev_rx_handler_register(struct net_device *dev,
3503                                rx_handler_func_t *rx_handler,
3504                                void *rx_handler_data)
3505 {
3506         ASSERT_RTNL();
3507
3508         if (dev->rx_handler)
3509                 return -EBUSY;
3510
3511         /* Note: rx_handler_data must be set before rx_handler */
3512         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3513         rcu_assign_pointer(dev->rx_handler, rx_handler);
3514
3515         return 0;
3516 }
3517 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3518
3519 /**
3520  *      netdev_rx_handler_unregister - unregister receive handler
3521  *      @dev: device to unregister a handler from
3522  *
3523  *      Unregister a receive handler from a device.
3524  *
3525  *      The caller must hold the rtnl_mutex.
3526  */
3527 void netdev_rx_handler_unregister(struct net_device *dev)
3528 {
3529
3530         ASSERT_RTNL();
3531         RCU_INIT_POINTER(dev->rx_handler, NULL);
3532         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3533          * section has a guarantee to see a non NULL rx_handler_data
3534          * as well.
3535          */
3536         synchronize_net();
3537         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3538 }
3539 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3540
3541 /*
3542  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3543  * the special handling of PFMEMALLOC skbs.
3544  */
3545 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3546 {
3547         switch (skb->protocol) {
3548         case htons(ETH_P_ARP):
3549         case htons(ETH_P_IP):
3550         case htons(ETH_P_IPV6):
3551         case htons(ETH_P_8021Q):
3552         case htons(ETH_P_8021AD):
3553                 return true;
3554         default:
3555                 return false;
3556         }
3557 }
3558
3559 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3560 {
3561         struct packet_type *ptype, *pt_prev;
3562         rx_handler_func_t *rx_handler;
3563         struct net_device *orig_dev;
3564         struct net_device *null_or_dev;
3565         bool deliver_exact = false;
3566         int ret = NET_RX_DROP;
3567         __be16 type;
3568
3569         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3570
3571         trace_netif_receive_skb(skb);
3572
3573         orig_dev = skb->dev;
3574
3575         skb_reset_network_header(skb);
3576         if (!skb_transport_header_was_set(skb))
3577                 skb_reset_transport_header(skb);
3578         skb_reset_mac_len(skb);
3579
3580         pt_prev = NULL;
3581
3582         rcu_read_lock();
3583
3584 another_round:
3585         skb->skb_iif = skb->dev->ifindex;
3586
3587         __this_cpu_inc(softnet_data.processed);
3588
3589         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3590             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3591                 skb = vlan_untag(skb);
3592                 if (unlikely(!skb))
3593                         goto unlock;
3594         }
3595
3596 #ifdef CONFIG_NET_CLS_ACT
3597         if (skb->tc_verd & TC_NCLS) {
3598                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3599                 goto ncls;
3600         }
3601 #endif
3602
3603         if (pfmemalloc)
3604                 goto skip_taps;
3605
3606         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3607                 if (!ptype->dev || ptype->dev == skb->dev) {
3608                         if (pt_prev)
3609                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3610                         pt_prev = ptype;
3611                 }
3612         }
3613
3614 skip_taps:
3615 #ifdef CONFIG_NET_CLS_ACT
3616         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3617         if (!skb)
3618                 goto unlock;
3619 ncls:
3620 #endif
3621
3622         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3623                 goto drop;
3624
3625         if (vlan_tx_tag_present(skb)) {
3626                 if (pt_prev) {
3627                         ret = deliver_skb(skb, pt_prev, orig_dev);
3628                         pt_prev = NULL;
3629                 }
3630                 if (vlan_do_receive(&skb))
3631                         goto another_round;
3632                 else if (unlikely(!skb))
3633                         goto unlock;
3634         }
3635
3636         rx_handler = rcu_dereference(skb->dev->rx_handler);
3637         if (rx_handler) {
3638                 if (pt_prev) {
3639                         ret = deliver_skb(skb, pt_prev, orig_dev);
3640                         pt_prev = NULL;
3641                 }
3642                 switch (rx_handler(&skb)) {
3643                 case RX_HANDLER_CONSUMED:
3644                         ret = NET_RX_SUCCESS;
3645                         goto unlock;
3646                 case RX_HANDLER_ANOTHER:
3647                         goto another_round;
3648                 case RX_HANDLER_EXACT:
3649                         deliver_exact = true;
3650                 case RX_HANDLER_PASS:
3651                         break;
3652                 default:
3653                         BUG();
3654                 }
3655         }
3656
3657         if (unlikely(vlan_tx_tag_present(skb))) {
3658                 if (vlan_tx_tag_get_id(skb))
3659                         skb->pkt_type = PACKET_OTHERHOST;
3660                 /* Note: we might in the future use prio bits
3661                  * and set skb->priority like in vlan_do_receive()
3662                  * For the time being, just ignore Priority Code Point
3663                  */
3664                 skb->vlan_tci = 0;
3665         }
3666
3667         /* deliver only exact match when indicated */
3668         null_or_dev = deliver_exact ? skb->dev : NULL;
3669
3670         type = skb->protocol;
3671         list_for_each_entry_rcu(ptype,
3672                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3673                 if (ptype->type == type &&
3674                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3675                      ptype->dev == orig_dev)) {
3676                         if (pt_prev)
3677                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3678                         pt_prev = ptype;
3679                 }
3680         }
3681
3682         if (pt_prev) {
3683                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3684                         goto drop;
3685                 else
3686                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3687         } else {
3688 drop:
3689                 atomic_long_inc(&skb->dev->rx_dropped);
3690                 kfree_skb(skb);
3691                 /* Jamal, now you will not able to escape explaining
3692                  * me how you were going to use this. :-)
3693                  */
3694                 ret = NET_RX_DROP;
3695         }
3696
3697 unlock:
3698         rcu_read_unlock();
3699         return ret;
3700 }
3701
3702 static int __netif_receive_skb(struct sk_buff *skb)
3703 {
3704         int ret;
3705
3706         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3707                 unsigned long pflags = current->flags;
3708
3709                 /*
3710                  * PFMEMALLOC skbs are special, they should
3711                  * - be delivered to SOCK_MEMALLOC sockets only
3712                  * - stay away from userspace
3713                  * - have bounded memory usage
3714                  *
3715                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3716                  * context down to all allocation sites.
3717                  */
3718                 current->flags |= PF_MEMALLOC;
3719                 ret = __netif_receive_skb_core(skb, true);
3720                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3721         } else
3722                 ret = __netif_receive_skb_core(skb, false);
3723
3724         return ret;
3725 }
3726
3727 static int netif_receive_skb_internal(struct sk_buff *skb)
3728 {
3729         net_timestamp_check(netdev_tstamp_prequeue, skb);
3730
3731         if (skb_defer_rx_timestamp(skb))
3732                 return NET_RX_SUCCESS;
3733
3734 #ifdef CONFIG_RPS
3735         if (static_key_false(&rps_needed)) {
3736                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3737                 int cpu, ret;
3738
3739                 rcu_read_lock();
3740
3741                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3742
3743                 if (cpu >= 0) {
3744                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3745                         rcu_read_unlock();
3746                         return ret;
3747                 }
3748                 rcu_read_unlock();
3749         }
3750 #endif
3751         return __netif_receive_skb(skb);
3752 }
3753
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for receive data processing; after firing the
 *	entry tracepoint it defers to netif_receive_skb_internal().
 *	The call itself always succeeds, although the buffer may be
 *	dropped along the way, either for congestion control or by the
 *	protocol layers.
 *
 *	This function may only be called from softirq context and
 *	interrupts should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);

	ret = netif_receive_skb_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
3776
3777 /* Network device is going away, flush any packets still pending
3778  * Called with irqs disabled.
3779  */
3780 static void flush_backlog(void *arg)
3781 {
3782         struct net_device *dev = arg;
3783         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3784         struct sk_buff *skb, *tmp;
3785
3786         rps_lock(sd);
3787         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3788                 if (skb->dev == dev) {
3789                         __skb_unlink(skb, &sd->input_pkt_queue);
3790                         kfree_skb(skb);
3791                         input_queue_head_incr(sd);
3792                 }
3793         }
3794         rps_unlock(sd);
3795
3796         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3797                 if (skb->dev == dev) {
3798                         __skb_unlink(skb, &sd->process_queue);
3799                         kfree_skb(skb);
3800                         input_queue_head_incr(sd);
3801                 }
3802         }
3803 }
3804
3805 static int napi_gro_complete(struct sk_buff *skb)
3806 {
3807         struct packet_offload *ptype;
3808         __be16 type = skb->protocol;
3809         struct list_head *head = &offload_base;
3810         int err = -ENOENT;
3811
3812         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3813
3814         if (NAPI_GRO_CB(skb)->count == 1) {
3815                 skb_shinfo(skb)->gso_size = 0;
3816                 goto out;
3817         }
3818
3819         rcu_read_lock();
3820         list_for_each_entry_rcu(ptype, head, list) {
3821                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3822                         continue;
3823
3824                 err = ptype->callbacks.gro_complete(skb, 0);
3825                 break;
3826         }
3827         rcu_read_unlock();
3828
3829         if (err) {
3830                 WARN_ON(&ptype->list == head);
3831                 kfree_skb(skb);
3832                 return NET_RX_SUCCESS;
3833         }
3834
3835 out:
3836         return netif_receive_skb_internal(skb);
3837 }
3838
3839 /* napi->gro_list contains packets ordered by age.
3840  * youngest packets at the head of it.
3841  * Complete skbs in reverse order to reduce latencies.
3842  */
3843 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3844 {
3845         struct sk_buff *skb, *prev = NULL;
3846
3847         /* scan list and build reverse chain */
3848         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3849                 skb->prev = prev;
3850                 prev = skb;
3851         }
3852
3853         for (skb = prev; skb; skb = prev) {
3854                 skb->next = NULL;
3855
3856                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3857                         return;
3858
3859                 prev = skb->prev;
3860                 napi_gro_complete(skb);
3861                 napi->gro_count--;
3862         }
3863
3864         napi->gro_list = NULL;
3865 }
3866 EXPORT_SYMBOL(napi_gro_flush);
3867
3868 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3869 {
3870         struct sk_buff *p;
3871         unsigned int maclen = skb->dev->hard_header_len;
3872         u32 hash = skb_get_hash_raw(skb);
3873
3874         for (p = napi->gro_list; p; p = p->next) {
3875                 unsigned long diffs;
3876
3877                 NAPI_GRO_CB(p)->flush = 0;
3878
3879                 if (hash != skb_get_hash_raw(p)) {
3880                         NAPI_GRO_CB(p)->same_flow = 0;
3881                         continue;
3882                 }
3883
3884                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3885                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3886                 if (maclen == ETH_HLEN)
3887                         diffs |= compare_ether_header(skb_mac_header(p),
3888                                                       skb_mac_header(skb));
3889                 else if (!diffs)
3890                         diffs = memcmp(skb_mac_header(p),
3891                                        skb_mac_header(skb),
3892                                        maclen);
3893                 NAPI_GRO_CB(p)->same_flow = !diffs;
3894         }
3895 }
3896
3897 static void skb_gro_reset_offset(struct sk_buff *skb)
3898 {
3899         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3900         const skb_frag_t *frag0 = &pinfo->frags[0];
3901
3902         NAPI_GRO_CB(skb)->data_offset = 0;
3903         NAPI_GRO_CB(skb)->frag0 = NULL;
3904         NAPI_GRO_CB(skb)->frag0_len = 0;
3905
3906         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3907             pinfo->nr_frags &&
3908             !PageHighMem(skb_frag_page(frag0))) {
3909                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3910                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3911         }
3912 }
3913
3914 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3915 {
3916         struct skb_shared_info *pinfo = skb_shinfo(skb);
3917
3918         BUG_ON(skb->end - skb->tail < grow);
3919
3920         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3921
3922         skb->data_len -= grow;
3923         skb->tail += grow;
3924
3925         pinfo->frags[0].page_offset += grow;
3926         skb_frag_size_sub(&pinfo->frags[0], grow);
3927
3928         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3929                 skb_frag_unref(skb, 0);
3930                 memmove(pinfo->frags, pinfo->frags + 1,
3931                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3932         }
3933 }
3934
3935 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3936 {
3937         struct sk_buff **pp = NULL;
3938         struct packet_offload *ptype;
3939         __be16 type = skb->protocol;
3940         struct list_head *head = &offload_base;
3941         int same_flow;
3942         enum gro_result ret;
3943         int grow;
3944
3945         if (!(skb->dev->features & NETIF_F_GRO))
3946                 goto normal;
3947
3948         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3949                 goto normal;
3950
3951         gro_list_prepare(napi, skb);
3952         NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3953
3954         rcu_read_lock();
3955         list_for_each_entry_rcu(ptype, head, list) {
3956                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3957                         continue;
3958
3959                 skb_set_network_header(skb, skb_gro_offset(skb));
3960                 skb_reset_mac_len(skb);
3961                 NAPI_GRO_CB(skb)->same_flow = 0;
3962                 NAPI_GRO_CB(skb)->flush = 0;
3963                 NAPI_GRO_CB(skb)->free = 0;
3964                 NAPI_GRO_CB(skb)->udp_mark = 0;
3965
3966                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3967                 break;
3968         }
3969         rcu_read_unlock();
3970
3971         if (&ptype->list == head)
3972                 goto normal;
3973
3974         same_flow = NAPI_GRO_CB(skb)->same_flow;
3975         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3976
3977         if (pp) {
3978                 struct sk_buff *nskb = *pp;
3979
3980                 *pp = nskb->next;
3981                 nskb->next = NULL;
3982                 napi_gro_complete(nskb);
3983                 napi->gro_count--;
3984         }
3985
3986         if (same_flow)
3987                 goto ok;
3988
3989         if (NAPI_GRO_CB(skb)->flush)
3990                 goto normal;
3991
3992         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
3993                 struct sk_buff *nskb = napi->gro_list;
3994
3995                 /* locate the end of the list to select the 'oldest' flow */
3996                 while (nskb->next) {
3997                         pp = &nskb->next;
3998                         nskb = *pp;
3999                 }
4000                 *pp = NULL;
4001                 nskb->next = NULL;
4002                 napi_gro_complete(nskb);
4003         } else {
4004                 napi->gro_count++;
4005         }
4006         NAPI_GRO_CB(skb)->count = 1;
4007         NAPI_GRO_CB(skb)->age = jiffies;
4008         NAPI_GRO_CB(skb)->last = skb;
4009         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4010         skb->next = napi->gro_list;
4011         napi->gro_list = skb;
4012         ret = GRO_HELD;
4013
4014 pull:
4015         grow = skb_gro_offset(skb) - skb_headlen(skb);
4016         if (grow > 0)
4017                 gro_pull_from_frag0(skb, grow);
4018 ok:
4019         return ret;
4020
4021 normal:
4022         ret = GRO_NORMAL;
4023         goto pull;
4024 }
4025
4026 struct packet_offload *gro_find_receive_by_type(__be16 type)
4027 {
4028         struct list_head *offload_head = &offload_base;
4029         struct packet_offload *ptype;
4030
4031         list_for_each_entry_rcu(ptype, offload_head, list) {
4032                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4033                         continue;
4034                 return ptype;
4035         }
4036         return NULL;
4037 }
4038 EXPORT_SYMBOL(gro_find_receive_by_type);
4039
4040 struct packet_offload *gro_find_complete_by_type(__be16 type)
4041 {
4042         struct list_head *offload_head = &offload_base;
4043         struct packet_offload *ptype;
4044
4045         list_for_each_entry_rcu(ptype, offload_head, list) {
4046                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4047                         continue;
4048                 return ptype;
4049         }
4050         return NULL;
4051 }
4052 EXPORT_SYMBOL(gro_find_complete_by_type);
4053
4054 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4055 {
4056         switch (ret) {
4057         case GRO_NORMAL:
4058                 if (netif_receive_skb_internal(skb))
4059                         ret = GRO_DROP;
4060                 break;
4061
4062         case GRO_DROP:
4063                 kfree_skb(skb);
4064                 break;
4065
4066         case GRO_MERGED_FREE:
4067                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4068                         kmem_cache_free(skbuff_head_cache, skb);
4069                 else
4070                         __kfree_skb(skb);
4071                 break;
4072
4073         case GRO_HELD:
4074         case GRO_MERGED:
4075                 break;
4076         }
4077
4078         return ret;
4079 }
4080
4081 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4082 {
4083         trace_napi_gro_receive_entry(skb);
4084
4085         skb_gro_reset_offset(skb);
4086
4087         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4088 }
4089 EXPORT_SYMBOL(napi_gro_receive);
4090
4091 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4092 {
4093         __skb_pull(skb, skb_headlen(skb));
4094         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4095         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4096         skb->vlan_tci = 0;
4097         skb->dev = napi->dev;
4098         skb->skb_iif = 0;
4099         skb->encapsulation = 0;
4100         skb_shinfo(skb)->gso_type = 0;
4101         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4102
4103         napi->skb = skb;
4104 }
4105
4106 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4107 {
4108         struct sk_buff *skb = napi->skb;
4109
4110         if (!skb) {
4111                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4112                 napi->skb = skb;
4113         }
4114         return skb;
4115 }
4116 EXPORT_SYMBOL(napi_get_frags);
4117
4118 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4119                                       struct sk_buff *skb,
4120                                       gro_result_t ret)
4121 {
4122         switch (ret) {
4123         case GRO_NORMAL:
4124         case GRO_HELD:
4125                 __skb_push(skb, ETH_HLEN);
4126                 skb->protocol = eth_type_trans(skb, skb->dev);
4127                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4128                         ret = GRO_DROP;
4129                 break;
4130
4131         case GRO_DROP:
4132         case GRO_MERGED_FREE:
4133                 napi_reuse_skb(napi, skb);
4134                 break;
4135
4136         case GRO_MERGED:
4137                 break;
4138         }
4139
4140         return ret;
4141 }
4142
4143 /* Upper GRO stack assumes network header starts at gro_offset=0
4144  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4145  * We copy ethernet header into skb->data to have a common layout.
4146  */
4147 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4148 {
4149         struct sk_buff *skb = napi->skb;
4150         const struct ethhdr *eth;
4151         unsigned int hlen = sizeof(*eth);
4152
4153         napi->skb = NULL;
4154
4155         skb_reset_mac_header(skb);
4156         skb_gro_reset_offset(skb);
4157
4158         eth = skb_gro_header_fast(skb, 0);
4159         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4160                 eth = skb_gro_header_slow(skb, hlen, 0);
4161                 if (unlikely(!eth)) {
4162                         napi_reuse_skb(napi, skb);
4163                         return NULL;
4164                 }
4165         } else {
4166                 gro_pull_from_frag0(skb, hlen);
4167                 NAPI_GRO_CB(skb)->frag0 += hlen;
4168                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4169         }
4170         __skb_pull(skb, hlen);
4171
4172         /*
4173          * This works because the only protocols we care about don't require
4174          * special handling.
4175          * We'll fix it up properly in napi_frags_finish()
4176          */
4177         skb->protocol = eth->h_proto;
4178
4179         return skb;
4180 }
4181
4182 gro_result_t napi_gro_frags(struct napi_struct *napi)
4183 {
4184         struct sk_buff *skb = napi_frags_skb(napi);
4185
4186         if (!skb)
4187                 return GRO_DROP;
4188
4189         trace_napi_gro_frags_entry(skb);
4190
4191         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4192 }
4193 EXPORT_SYMBOL(napi_gro_frags);
4194
/*
 * net_rps_action_and_irq_enable - send any IPIs pending for RPS.
 *
 * Note: must be entered with local irqs disabled; always returns with
 * local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		struct softnet_data *next;

		/* Claim the list while irqs are still off. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Kick RPS processing on each remote cpu that is online. */
		for (; remsd; remsd = next) {
			next = remsd->rps_ipi_next;

			if (cpu_online(remsd->cpu))
				smp_call_function_single_async(remsd->cpu,
							   &remsd->csd);
		}
		return;
	}
#endif
	local_irq_enable();
}
4222
4223 static int process_backlog(struct napi_struct *napi, int quota)
4224 {
4225         int work = 0;
4226         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4227
4228 #ifdef CONFIG_RPS
4229         /* Check if we have pending ipi, its better to send them now,
4230          * not waiting net_rx_action() end.
4231          */
4232         if (sd->rps_ipi_list) {
4233                 local_irq_disable();
4234                 net_rps_action_and_irq_enable(sd);
4235         }
4236 #endif
4237         napi->weight = weight_p;
4238         local_irq_disable();
4239         while (1) {
4240                 struct sk_buff *skb;
4241
4242                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4243                         local_irq_enable();
4244                         __netif_receive_skb(skb);
4245                         local_irq_disable();
4246                         input_queue_head_incr(sd);
4247                         if (++work >= quota) {
4248                                 local_irq_enable();
4249                                 return work;
4250                         }
4251                 }
4252
4253                 rps_lock(sd);
4254                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4255                         /*
4256                          * Inline a custom version of __napi_complete().
4257                          * only current cpu owns and manipulates this napi,
4258                          * and NAPI_STATE_SCHED is the only possible flag set
4259                          * on backlog.
4260                          * We can use a plain write instead of clear_bit(),
4261                          * and we dont need an smp_mb() memory barrier.
4262                          */
4263                         list_del(&napi->poll_list);
4264                         napi->state = 0;
4265                         rps_unlock(sd);
4266
4267                         break;
4268                 }
4269
4270                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4271                                            &sd->process_queue);
4272                 rps_unlock(sd);
4273         }
4274         local_irq_enable();
4275
4276         return work;
4277 }
4278
4279 /**
4280  * __napi_schedule - schedule for receive
4281  * @n: entry to schedule
4282  *
4283  * The entry's receive function will be scheduled to run
4284  */
4285 void __napi_schedule(struct napi_struct *n)
4286 {
4287         unsigned long flags;
4288
4289         local_irq_save(flags);
4290         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4291         local_irq_restore(flags);
4292 }
4293 EXPORT_SYMBOL(__napi_schedule);
4294
4295 void __napi_complete(struct napi_struct *n)
4296 {
4297         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4298         BUG_ON(n->gro_list);
4299
4300         list_del(&n->poll_list);
4301         smp_mb__before_atomic();
4302         clear_bit(NAPI_STATE_SCHED, &n->state);
4303 }
4304 EXPORT_SYMBOL(__napi_complete);
4305
4306 void napi_complete(struct napi_struct *n)
4307 {
4308         unsigned long flags;
4309
4310         /*
4311          * don't let napi dequeue from the cpu poll list
4312          * just in case its running on a different cpu
4313          */
4314         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4315                 return;
4316
4317         napi_gro_flush(n, false);
4318         local_irq_save(flags);
4319         __napi_complete(n);
4320         local_irq_restore(flags);
4321 }
4322 EXPORT_SYMBOL(napi_complete);
4323
4324 /* must be called under rcu_read_lock(), as we dont take a reference */
4325 struct napi_struct *napi_by_id(unsigned int napi_id)
4326 {
4327         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4328         struct napi_struct *napi;
4329
4330         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4331                 if (napi->napi_id == napi_id)
4332                         return napi;
4333
4334         return NULL;
4335 }
4336 EXPORT_SYMBOL_GPL(napi_by_id);
4337
4338 void napi_hash_add(struct napi_struct *napi)
4339 {
4340         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4341
4342                 spin_lock(&napi_hash_lock);
4343
4344                 /* 0 is not a valid id, we also skip an id that is taken
4345                  * we expect both events to be extremely rare
4346                  */
4347                 napi->napi_id = 0;
4348                 while (!napi->napi_id) {
4349                         napi->napi_id = ++napi_gen_id;
4350                         if (napi_by_id(napi->napi_id))
4351                                 napi->napi_id = 0;
4352                 }
4353
4354                 hlist_add_head_rcu(&napi->napi_hash_node,
4355                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4356
4357                 spin_unlock(&napi_hash_lock);
4358         }
4359 }
4360 EXPORT_SYMBOL_GPL(napi_hash_add);
4361
4362 /* Warning : caller is responsible to make sure rcu grace period
4363  * is respected before freeing memory containing @napi
4364  */
4365 void napi_hash_del(struct napi_struct *napi)
4366 {
4367         spin_lock(&napi_hash_lock);
4368
4369         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4370                 hlist_del_rcu(&napi->napi_hash_node);
4371
4372         spin_unlock(&napi_hash_lock);
4373 }
4374 EXPORT_SYMBOL_GPL(napi_hash_del);
4375
4376 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4377                     int (*poll)(struct napi_struct *, int), int weight)
4378 {
4379         INIT_LIST_HEAD(&napi->poll_list);
4380         napi->gro_count = 0;
4381         napi->gro_list = NULL;
4382         napi->skb = NULL;
4383         napi->poll = poll;
4384         if (weight > NAPI_POLL_WEIGHT)
4385                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4386                             weight, dev->name);
4387         napi->weight = weight;
4388         list_add(&napi->dev_list, &dev->napi_list);
4389         napi->dev = dev;
4390 #ifdef CONFIG_NETPOLL
4391         spin_lock_init(&napi->poll_lock);
4392         napi->poll_owner = -1;
4393 #endif
4394         set_bit(NAPI_STATE_SCHED, &napi->state);
4395 }
4396 EXPORT_SYMBOL(netif_napi_add);
4397
4398 void netif_napi_del(struct napi_struct *napi)
4399 {
4400         list_del_init(&napi->dev_list);
4401         napi_free_frags(napi);
4402
4403         kfree_skb_list(napi->gro_list);
4404         napi->gro_list = NULL;
4405         napi->gro_count = 0;
4406 }
4407 EXPORT_SYMBOL(netif_napi_del);
4408
4409 static void net_rx_action(struct softirq_action *h)
4410 {
4411         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4412         unsigned long time_limit = jiffies + 2;
4413         int budget = netdev_budget;
4414         void *have;
4415
4416         local_irq_disable();
4417
4418         while (!list_empty(&sd->poll_list)) {
4419                 struct napi_struct *n;
4420                 int work, weight;
4421
4422                 /* If softirq window is exhuasted then punt.
4423                  * Allow this to run for 2 jiffies since which will allow
4424                  * an average latency of 1.5/HZ.
4425                  */
4426                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4427                         goto softnet_break;
4428
4429                 local_irq_enable();
4430
4431                 /* Even though interrupts have been re-enabled, this
4432                  * access is safe because interrupts can only add new
4433                  * entries to the tail of this list, and only ->poll()
4434                  * calls can remove this head entry from the list.
4435                  */
4436                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4437
4438                 have = netpoll_poll_lock(n);
4439
4440                 weight = n->weight;
4441
4442                 /* This NAPI_STATE_SCHED test is for avoiding a race
4443                  * with netpoll's poll_napi().  Only the entity which
4444                  * obtains the lock and sees NAPI_STATE_SCHED set will
4445                  * actually make the ->poll() call.  Therefore we avoid
4446                  * accidentally calling ->poll() when NAPI is not scheduled.
4447                  */
4448                 work = 0;
4449                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4450                         work = n->poll(n, weight);
4451                         trace_napi_poll(n);
4452                 }
4453
4454                 WARN_ON_ONCE(work > weight);
4455
4456                 budget -= work;
4457
4458                 local_irq_disable();
4459
4460                 /* Drivers must not modify the NAPI state if they
4461                  * consume the entire weight.  In such cases this code
4462                  * still "owns" the NAPI instance and therefore can
4463                  * move the instance around on the list at-will.
4464                  */
4465                 if (unlikely(work == weight)) {
4466                         if (unlikely(napi_disable_pending(n))) {
4467                                 local_irq_enable();
4468                                 napi_complete(n);
4469                                 local_irq_disable();
4470                         } else {
4471                                 if (n->gro_list) {
4472                                         /* flush too old packets
4473                                          * If HZ < 1000, flush all packets.
4474                                          */
4475                                         local_irq_enable();
4476                                         napi_gro_flush(n, HZ >= 1000);
4477                                         local_irq_disable();
4478                                 }
4479                                 list_move_tail(&n->poll_list, &sd->poll_list);
4480                         }
4481                 }
4482
4483                 netpoll_poll_unlock(have);
4484         }
4485 out:
4486         net_rps_action_and_irq_enable(sd);
4487
4488 #ifdef CONFIG_NET_DMA
4489         /*
4490          * There may not be any more sk_buffs coming right now, so push
4491          * any pending DMA copies to hardware
4492          */
4493         dma_issue_pending_all();
4494 #endif
4495
4496         return;
4497
4498 softnet_break:
4499         sd->time_squeeze++;
4500         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4501         goto out;
4502 }
4503
4504 struct netdev_adjacent {
4505         struct net_device *dev;
4506
4507         /* upper master flag, there can only be one master device per list */
4508         bool master;
4509
4510         /* counter for the number of times this device was added to us */
4511         u16 ref_nr;
4512
4513         /* private field for the users */
4514         void *private;
4515
4516         struct list_head list;
4517         struct rcu_head rcu;
4518 };
4519
4520 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4521                                                  struct net_device *adj_dev,
4522                                                  struct list_head *adj_list)
4523 {
4524         struct netdev_adjacent *adj;
4525
4526         list_for_each_entry(adj, adj_list, list) {
4527                 if (adj->dev == adj_dev)
4528                         return adj;
4529         }
4530         return NULL;
4531 }
4532
4533 /**
4534  * netdev_has_upper_dev - Check if device is linked to an upper device
4535  * @dev: device
4536  * @upper_dev: upper device to check
4537  *
4538  * Find out if a device is linked to specified upper device and return true
4539  * in case it is. Note that this checks only immediate upper device,
4540  * not through a complete stack of devices. The caller must hold the RTNL lock.
4541  */
4542 bool netdev_has_upper_dev(struct net_device *dev,
4543                           struct net_device *upper_dev)
4544 {
4545         ASSERT_RTNL();
4546
4547         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4548 }
4549 EXPORT_SYMBOL(netdev_has_upper_dev);
4550
4551 /**
4552  * netdev_has_any_upper_dev - Check if device is linked to some device
4553  * @dev: device
4554  *
4555  * Find out if a device is linked to an upper device and return true in case
4556  * it is. The caller must hold the RTNL lock.
4557  */
4558 static bool netdev_has_any_upper_dev(struct net_device *dev)
4559 {
4560         ASSERT_RTNL();
4561
4562         return !list_empty(&dev->all_adj_list.upper);
4563 }
4564
4565 /**
4566  * netdev_master_upper_dev_get - Get master upper device
4567  * @dev: device
4568  *
4569  * Find a master upper device and return pointer to it or NULL in case
4570  * it's not there. The caller must hold the RTNL lock.
4571  */
4572 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4573 {
4574         struct netdev_adjacent *upper;
4575
4576         ASSERT_RTNL();
4577
4578         if (list_empty(&dev->adj_list.upper))
4579                 return NULL;
4580
4581         upper = list_first_entry(&dev->adj_list.upper,
4582                                  struct netdev_adjacent, list);
4583         if (likely(upper->master))
4584                 return upper->dev;
4585         return NULL;
4586 }
4587 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4588
4589 void *netdev_adjacent_get_private(struct list_head *adj_list)
4590 {
4591         struct netdev_adjacent *adj;
4592
4593         adj = list_entry(adj_list, struct netdev_adjacent, list);
4594
4595         return adj->private;
4596 }
4597 EXPORT_SYMBOL(netdev_adjacent_get_private);
4598
4599 /**
4600  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4601  * @dev: device
4602  * @iter: list_head ** of the current position
4603  *
4604  * Gets the next device from the dev's upper list, starting from iter
4605  * position. The caller must hold RCU read lock.
4606  */
4607 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4608                                                  struct list_head **iter)
4609 {
4610         struct netdev_adjacent *upper;
4611
4612         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4613
4614         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4615
4616         if (&upper->list == &dev->adj_list.upper)
4617                 return NULL;
4618
4619         *iter = &upper->list;
4620
4621         return upper->dev;
4622 }
4623 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4624
4625 /**
4626  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4627  * @dev: device
4628  * @iter: list_head ** of the current position
4629  *
4630  * Gets the next device from the dev's upper list, starting from iter
4631  * position. The caller must hold RCU read lock.
4632  */
4633 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4634                                                      struct list_head **iter)
4635 {
4636         struct netdev_adjacent *upper;
4637
4638         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4639
4640         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4641
4642         if (&upper->list == &dev->all_adj_list.upper)
4643                 return NULL;
4644
4645         *iter = &upper->list;
4646
4647         return upper->dev;
4648 }
4649 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4650
4651 /**
4652  * netdev_lower_get_next_private - Get the next ->private from the
4653  *                                 lower neighbour list
4654  * @dev: device
4655  * @iter: list_head ** of the current position
4656  *
4657  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4658  * list, starting from iter position. The caller must hold either hold the
4659  * RTNL lock or its own locking that guarantees that the neighbour lower
4660  * list will remain unchainged.
4661  */
4662 void *netdev_lower_get_next_private(struct net_device *dev,
4663                                     struct list_head **iter)
4664 {
4665         struct netdev_adjacent *lower;
4666
4667         lower = list_entry(*iter, struct netdev_adjacent, list);
4668
4669         if (&lower->list == &dev->adj_list.lower)
4670                 return NULL;
4671
4672         *iter = lower->list.next;
4673
4674         return lower->private;
4675 }
4676 EXPORT_SYMBOL(netdev_lower_get_next_private);
4677
4678 /**
4679  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4680  *                                     lower neighbour list, RCU
4681  *                                     variant
4682  * @dev: device
4683  * @iter: list_head ** of the current position
4684  *
4685  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4686  * list, starting from iter position. The caller must hold RCU read lock.
4687  */
4688 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4689                                         struct list_head **iter)
4690 {
4691         struct netdev_adjacent *lower;
4692
4693         WARN_ON_ONCE(!rcu_read_lock_held());
4694
4695         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4696
4697         if (&lower->list == &dev->adj_list.lower)
4698                 return NULL;
4699
4700         *iter = &lower->list;
4701
4702         return lower->private;
4703 }
4704 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4705
4706 /**
4707  * netdev_lower_get_next - Get the next device from the lower neighbour
4708  *                         list
4709  * @dev: device
4710  * @iter: list_head ** of the current position
4711  *
4712  * Gets the next netdev_adjacent from the dev's lower neighbour
4713  * list, starting from iter position. The caller must hold RTNL lock or
4714  * its own locking that guarantees that the neighbour lower
4715  * list will remain unchainged.
4716  */
4717 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4718 {
4719         struct netdev_adjacent *lower;
4720
4721         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4722
4723         if (&lower->list == &dev->adj_list.lower)
4724                 return NULL;
4725
4726         *iter = &lower->list;
4727
4728         return lower->dev;
4729 }
4730 EXPORT_SYMBOL(netdev_lower_get_next);
4731
4732 /**
4733  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4734  *                                     lower neighbour list, RCU
4735  *                                     variant
4736  * @dev: device
4737  *
4738  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4739  * list. The caller must hold RCU read lock.
4740  */
4741 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4742 {
4743         struct netdev_adjacent *lower;
4744
4745         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4746                         struct netdev_adjacent, list);
4747         if (lower)
4748                 return lower->private;
4749         return NULL;
4750 }
4751 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4752
4753 /**
4754  * netdev_master_upper_dev_get_rcu - Get master upper device
4755  * @dev: device
4756  *
4757  * Find a master upper device and return pointer to it or NULL in case
4758  * it's not there. The caller must hold the RCU read lock.
4759  */
4760 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4761 {
4762         struct netdev_adjacent *upper;
4763
4764         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4765                                        struct netdev_adjacent, list);
4766         if (upper && likely(upper->master))
4767                 return upper->dev;
4768         return NULL;
4769 }
4770 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4771
4772 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4773                               struct net_device *adj_dev,
4774                               struct list_head *dev_list)
4775 {
4776         char linkname[IFNAMSIZ+7];
4777         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4778                 "upper_%s" : "lower_%s", adj_dev->name);
4779         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4780                                  linkname);
4781 }
4782 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4783                                char *name,
4784                                struct list_head *dev_list)
4785 {
4786         char linkname[IFNAMSIZ+7];
4787         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4788                 "upper_%s" : "lower_%s", name);
4789         sysfs_remove_link(&(dev->dev.kobj), linkname);
4790 }
4791
/*
 * True when @dev_list is one of @dev's immediate neighbour lists
 * (adj_list.upper or adj_list.lower). Both arguments are parenthesised
 * so that arbitrary expressions can be passed; @dev_list is evaluated up
 * to twice.
 */
#define netdev_adjacent_is_neigh_list(dev, dev_list) \
                ((dev_list) == &(dev)->adj_list.upper || \
                 (dev_list) == &(dev)->adj_list.lower)
4795
4796 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4797                                         struct net_device *adj_dev,
4798                                         struct list_head *dev_list,
4799                                         void *private, bool master)
4800 {
4801         struct netdev_adjacent *adj;
4802         int ret;
4803
4804         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4805
4806         if (adj) {
4807                 adj->ref_nr++;
4808                 return 0;
4809         }
4810
4811         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4812         if (!adj)
4813                 return -ENOMEM;
4814
4815         adj->dev = adj_dev;
4816         adj->master = master;
4817         adj->ref_nr = 1;
4818         adj->private = private;
4819         dev_hold(adj_dev);
4820
4821         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4822                  adj_dev->name, dev->name, adj_dev->name);
4823
4824         if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4825                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4826                 if (ret)
4827                         goto free_adj;
4828         }
4829
4830         /* Ensure that master link is always the first item in list. */
4831         if (master) {
4832                 ret = sysfs_create_link(&(dev->dev.kobj),
4833                                         &(adj_dev->dev.kobj), "master");
4834                 if (ret)
4835                         goto remove_symlinks;
4836
4837                 list_add_rcu(&adj->list, dev_list);
4838         } else {
4839                 list_add_tail_rcu(&adj->list, dev_list);
4840         }
4841
4842         return 0;
4843
4844 remove_symlinks:
4845         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4846                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4847 free_adj:
4848         kfree(adj);
4849         dev_put(adj_dev);
4850
4851         return ret;
4852 }
4853
4854 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4855                                          struct net_device *adj_dev,
4856                                          struct list_head *dev_list)
4857 {
4858         struct netdev_adjacent *adj;
4859
4860         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4861
4862         if (!adj) {
4863                 pr_err("tried to remove device %s from %s\n",
4864                        dev->name, adj_dev->name);
4865                 BUG();
4866         }
4867
4868         if (adj->ref_nr > 1) {
4869                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4870                          adj->ref_nr-1);
4871                 adj->ref_nr--;
4872                 return;
4873         }
4874
4875         if (adj->master)
4876                 sysfs_remove_link(&(dev->dev.kobj), "master");
4877
4878         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4879                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4880
4881         list_del_rcu(&adj->list);
4882         pr_debug("dev_put for %s, because link removed from %s to %s\n",
4883                  adj_dev->name, dev->name, adj_dev->name);
4884         dev_put(adj_dev);
4885         kfree_rcu(adj, rcu);
4886 }
4887
4888 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4889                                             struct net_device *upper_dev,
4890                                             struct list_head *up_list,
4891                                             struct list_head *down_list,
4892                                             void *private, bool master)
4893 {
4894         int ret;
4895
4896         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4897                                            master);
4898         if (ret)
4899                 return ret;
4900
4901         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4902                                            false);
4903         if (ret) {
4904                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4905                 return ret;
4906         }
4907
4908         return 0;
4909 }
4910
4911 static int __netdev_adjacent_dev_link(struct net_device *dev,
4912                                       struct net_device *upper_dev)
4913 {
4914         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4915                                                 &dev->all_adj_list.upper,
4916                                                 &upper_dev->all_adj_list.lower,
4917                                                 NULL, false);
4918 }
4919
/*
 * __netdev_adjacent_dev_unlink_lists - remove a link in both directions
 * @dev: lower device
 * @upper_dev: upper device
 * @up_list: list of @dev that holds @upper_dev
 * @down_list: list of @upper_dev that holds @dev
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
                                               struct net_device *upper_dev,
                                               struct list_head *up_list,
                                               struct list_head *down_list)
{
        /* upward entry first, then the matching downward entry */
        __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
        __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
4928
4929 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4930                                          struct net_device *upper_dev)
4931 {
4932         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4933                                            &dev->all_adj_list.upper,
4934                                            &upper_dev->all_adj_list.lower);
4935 }
4936
4937 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4938                                                 struct net_device *upper_dev,
4939                                                 void *private, bool master)
4940 {
4941         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4942
4943         if (ret)
4944                 return ret;
4945
4946         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4947                                                &dev->adj_list.upper,
4948                                                &upper_dev->adj_list.lower,
4949                                                private, master);
4950         if (ret) {
4951                 __netdev_adjacent_dev_unlink(dev, upper_dev);
4952                 return ret;
4953         }
4954
4955         return 0;
4956 }
4957
4958 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4959                                                    struct net_device *upper_dev)
4960 {
4961         __netdev_adjacent_dev_unlink(dev, upper_dev);
4962         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4963                                            &dev->adj_list.upper,
4964                                            &upper_dev->adj_list.lower);
4965 }
4966
4967 static int __netdev_upper_dev_link(struct net_device *dev,
4968                                    struct net_device *upper_dev, bool master,
4969                                    void *private)
4970 {
4971         struct netdev_adjacent *i, *j, *to_i, *to_j;
4972         int ret = 0;
4973
4974         ASSERT_RTNL();
4975
4976         if (dev == upper_dev)
4977                 return -EBUSY;
4978
4979         /* To prevent loops, check if dev is not upper device to upper_dev. */
4980         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
4981                 return -EBUSY;
4982
4983         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
4984                 return -EEXIST;
4985
4986         if (master && netdev_master_upper_dev_get(dev))
4987                 return -EBUSY;
4988
4989         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
4990                                                    master);
4991         if (ret)
4992                 return ret;
4993
4994         /* Now that we linked these devs, make all the upper_dev's
4995          * all_adj_list.upper visible to every dev's all_adj_list.lower an
4996          * versa, and don't forget the devices itself. All of these
4997          * links are non-neighbours.
4998          */
4999         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5000                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5001                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5002                                  i->dev->name, j->dev->name);
5003                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5004                         if (ret)
5005                                 goto rollback_mesh;
5006                 }
5007         }
5008
5009         /* add dev to every upper_dev's upper device */
5010         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5011                 pr_debug("linking %s's upper device %s with %s\n",
5012                          upper_dev->name, i->dev->name, dev->name);
5013                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5014                 if (ret)
5015                         goto rollback_upper_mesh;
5016         }
5017
5018         /* add upper_dev to every dev's lower device */
5019         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5020                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5021                          i->dev->name, upper_dev->name);
5022                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5023                 if (ret)
5024                         goto rollback_lower_mesh;
5025         }
5026
5027         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5028         return 0;
5029
5030 rollback_lower_mesh:
5031         to_i = i;
5032         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5033                 if (i == to_i)
5034                         break;
5035                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5036         }
5037
5038         i = NULL;
5039
5040 rollback_upper_mesh:
5041         to_i = i;
5042         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5043                 if (i == to_i)
5044                         break;
5045                 __netdev_adjacent_dev_unlink(dev, i->dev);
5046         }
5047
5048         i = j = NULL;
5049
5050 rollback_mesh:
5051         to_i = i;
5052         to_j = j;
5053         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5054                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5055                         if (i == to_i && j == to_j)
5056                                 break;
5057                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5058                 }
5059                 if (i == to_i)
5060                         break;
5061         }
5062
5063         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5064
5065         return ret;
5066 }
5067
5068 /**
5069  * netdev_upper_dev_link - Add a link to the upper device
5070  * @dev: device
5071  * @upper_dev: new upper device
5072  *
5073  * Adds a link to device which is upper to this one. The caller must hold
5074  * the RTNL lock. On a failure a negative errno code is returned.
5075  * On success the reference counts are adjusted and the function
5076  * returns zero.
5077  */
5078 int netdev_upper_dev_link(struct net_device *dev,
5079                           struct net_device *upper_dev)
5080 {
5081         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5082 }
5083 EXPORT_SYMBOL(netdev_upper_dev_link);
5084
5085 /**
5086  * netdev_master_upper_dev_link - Add a master link to the upper device
5087  * @dev: device
5088  * @upper_dev: new upper device
5089  *
5090  * Adds a link to device which is upper to this one. In this case, only
5091  * one master upper device can be linked, although other non-master devices
5092  * might be linked as well. The caller must hold the RTNL lock.
5093  * On a failure a negative errno code is returned. On success the reference
5094  * counts are adjusted and the function returns zero.
5095  */
5096 int netdev_master_upper_dev_link(struct net_device *dev,
5097                                  struct net_device *upper_dev)
5098 {
5099         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5100 }
5101 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5102
5103 int netdev_master_upper_dev_link_private(struct net_device *dev,
5104                                          struct net_device *upper_dev,
5105                                          void *private)
5106 {
5107         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5108 }
5109 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5110
5111 /**
5112  * netdev_upper_dev_unlink - Removes a link to upper device
5113  * @dev: device
5114  * @upper_dev: new upper device
5115  *
5116  * Removes a link to device which is upper to this one. The caller must hold
5117  * the RTNL lock.
5118  */
5119 void netdev_upper_dev_unlink(struct net_device *dev,
5120                              struct net_device *upper_dev)
5121 {
5122         struct netdev_adjacent *i, *j;
5123         ASSERT_RTNL();
5124
5125         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5126
5127         /* Here is the tricky part. We must remove all dev's lower
5128          * devices from all upper_dev's upper devices and vice
5129          * versa, to maintain the graph relationship.
5130          */
5131         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5132                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5133                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5134
5135         /* remove also the devices itself from lower/upper device
5136          * list
5137          */
5138         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5139                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5140
5141         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5142                 __netdev_adjacent_dev_unlink(dev, i->dev);
5143
5144         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5145 }
5146 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5147
5148 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5149 {
5150         struct netdev_adjacent *iter;
5151
5152         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5153                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5154                                           &iter->dev->adj_list.lower);
5155                 netdev_adjacent_sysfs_add(iter->dev, dev,
5156                                           &iter->dev->adj_list.lower);
5157         }
5158
5159         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5160                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5161                                           &iter->dev->adj_list.upper);
5162                 netdev_adjacent_sysfs_add(iter->dev, dev,
5163                                           &iter->dev->adj_list.upper);
5164         }
5165 }
5166
5167 void *netdev_lower_dev_get_private(struct net_device *dev,
5168                                    struct net_device *lower_dev)
5169 {
5170         struct netdev_adjacent *lower;
5171
5172         if (!lower_dev)
5173                 return NULL;
5174         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5175         if (!lower)
5176                 return NULL;
5177
5178         return lower->private;
5179 }
5180 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5181
5182
5183 int dev_get_nest_level(struct net_device *dev,
5184                        bool (*type_check)(struct net_device *dev))
5185 {
5186         struct net_device *lower = NULL;
5187         struct list_head *iter;
5188         int max_nest = -1;
5189         int nest;
5190
5191         ASSERT_RTNL();
5192
5193         netdev_for_each_lower_dev(dev, lower, iter) {
5194                 nest = dev_get_nest_level(lower, type_check);
5195                 if (max_nest < nest)
5196                         max_nest = nest;
5197         }
5198
5199         if (type_check(dev))
5200                 max_nest++;
5201
5202         return max_nest;
5203 }
5204 EXPORT_SYMBOL(dev_get_nest_level);
5205
5206 static void dev_change_rx_flags(struct net_device *dev, int flags)
5207 {
5208         const struct net_device_ops *ops = dev->netdev_ops;
5209
5210         if (ops->ndo_change_rx_flags)
5211                 ops->ndo_change_rx_flags(dev, flags);
5212 }
5213
5214 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5215 {
5216         unsigned int old_flags = dev->flags;
5217         kuid_t uid;
5218         kgid_t gid;
5219
5220         ASSERT_RTNL();
5221
5222         dev->flags |= IFF_PROMISC;
5223         dev->promiscuity += inc;
5224         if (dev->promiscuity == 0) {
5225                 /*
5226                  * Avoid overflow.
5227                  * If inc causes overflow, untouch promisc and return error.
5228                  */
5229                 if (inc < 0)
5230                         dev->flags &= ~IFF_PROMISC;
5231                 else {
5232                         dev->promiscuity -= inc;
5233                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5234                                 dev->name);
5235                         return -EOVERFLOW;
5236                 }
5237         }
5238         if (dev->flags != old_flags) {
5239                 pr_info("device %s %s promiscuous mode\n",
5240                         dev->name,
5241                         dev->flags & IFF_PROMISC ? "entered" : "left");
5242                 if (audit_enabled) {
5243                         current_uid_gid(&uid, &gid);
5244                         audit_log(current->audit_context, GFP_ATOMIC,
5245                                 AUDIT_ANOM_PROMISCUOUS,
5246                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5247                                 dev->name, (dev->flags & IFF_PROMISC),
5248                                 (old_flags & IFF_PROMISC),
5249                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5250                                 from_kuid(&init_user_ns, uid),
5251                                 from_kgid(&init_user_ns, gid),
5252                                 audit_get_sessionid(current));
5253                 }
5254
5255                 dev_change_rx_flags(dev, IFF_PROMISC);
5256         }
5257         if (notify)
5258                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5259         return 0;
5260 }
5261
5262 /**
5263  *      dev_set_promiscuity     - update promiscuity count on a device
5264  *      @dev: device
5265  *      @inc: modifier
5266  *
5267  *      Add or remove promiscuity from a device. While the count in the device
5268  *      remains above zero the interface remains promiscuous. Once it hits zero
5269  *      the device reverts back to normal filtering operation. A negative inc
5270  *      value is used to drop promiscuity on the device.
5271  *      Return 0 if successful or a negative errno code on error.
5272  */
5273 int dev_set_promiscuity(struct net_device *dev, int inc)
5274 {
5275         unsigned int old_flags = dev->flags;
5276         int err;
5277
5278         err = __dev_set_promiscuity(dev, inc, true);
5279         if (err < 0)
5280                 return err;
5281         if (dev->flags != old_flags)
5282                 dev_set_rx_mode(dev);
5283         return err;
5284 }
5285 EXPORT_SYMBOL(dev_set_promiscuity);
5286
5287 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5288 {
5289         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5290
5291         ASSERT_RTNL();
5292
5293         dev->flags |= IFF_ALLMULTI;
5294         dev->allmulti += inc;
5295         if (dev->allmulti == 0) {
5296                 /*
5297                  * Avoid overflow.
5298                  * If inc causes overflow, untouch allmulti and return error.
5299                  */
5300                 if (inc < 0)
5301                         dev->flags &= ~IFF_ALLMULTI;
5302                 else {
5303                         dev->allmulti -= inc;
5304                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5305                                 dev->name);
5306                         return -EOVERFLOW;
5307                 }
5308         }
5309         if (dev->flags ^ old_flags) {
5310                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5311                 dev_set_rx_mode(dev);
5312                 if (notify)
5313                         __dev_notify_flags(dev, old_flags,
5314                                            dev->gflags ^ old_gflags);
5315         }
5316         return 0;
5317 }
5318
5319 /**
5320  *      dev_set_allmulti        - update allmulti count on a device
5321  *      @dev: device
5322  *      @inc: modifier
5323  *
5324  *      Add or remove reception of all multicast frames to a device. While the
5325  *      count in the device remains above zero the interface remains listening
5326  *      to all interfaces. Once it hits zero the device reverts back to normal
5327  *      filtering operation. A negative @inc value is used to drop the counter
5328  *      when releasing a resource needing all multicasts.
5329  *      Return 0 if successful or a negative errno code on error.
5330  */
5331
5332 int dev_set_allmulti(struct net_device *dev, int inc)
5333 {
5334         return __dev_set_allmulti(dev, inc, true);
5335 }
5336 EXPORT_SYMBOL(dev_set_allmulti);
5337
5338 /*
5339  *      Upload unicast and multicast address lists to device and
5340  *      configure RX filtering. When the device doesn't support unicast
5341  *      filtering it is put in promiscuous mode while unicast addresses
5342  *      are present.
5343  */
5344 void __dev_set_rx_mode(struct net_device *dev)
5345 {
5346         const struct net_device_ops *ops = dev->netdev_ops;
5347
5348         /* dev_open will call this function so the list will stay sane. */
5349         if (!(dev->flags&IFF_UP))
5350                 return;
5351
5352         if (!netif_device_present(dev))
5353                 return;
5354
5355         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5356                 /* Unicast addresses changes may only happen under the rtnl,
5357                  * therefore calling __dev_set_promiscuity here is safe.
5358                  */
5359                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5360                         __dev_set_promiscuity(dev, 1, false);
5361                         dev->uc_promisc = true;
5362                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5363                         __dev_set_promiscuity(dev, -1, false);
5364                         dev->uc_promisc = false;
5365                 }
5366         }
5367
5368         if (ops->ndo_set_rx_mode)
5369                 ops->ndo_set_rx_mode(dev);
5370 }
5371
/*
 *      Run __dev_set_rx_mode() on @dev between netif_addr_lock_bh() and
 *      netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);

        __dev_set_rx_mode(dev);

        netif_addr_unlock_bh(dev);
}
5378
5379 /**
5380  *      dev_get_flags - get flags reported to userspace
5381  *      @dev: device
5382  *
5383  *      Get the combination of flag bits exported through APIs to userspace.
5384  */
5385 unsigned int dev_get_flags(const struct net_device *dev)
5386 {
5387         unsigned int flags;
5388
5389         flags = (dev->flags & ~(IFF_PROMISC |
5390                                 IFF_ALLMULTI |
5391                                 IFF_RUNNING |
5392                                 IFF_LOWER_UP |
5393                                 IFF_DORMANT)) |
5394                 (dev->gflags & (IFF_PROMISC |
5395                                 IFF_ALLMULTI));
5396
5397         if (netif_running(dev)) {
5398                 if (netif_oper_up(dev))
5399                         flags |= IFF_RUNNING;
5400                 if (netif_carrier_ok(dev))
5401                         flags |= IFF_LOWER_UP;
5402                 if (netif_dormant(dev))
5403                         flags |= IFF_DORMANT;
5404         }
5405
5406         return flags;
5407 }
5408 EXPORT_SYMBOL(dev_get_flags);
5409
5410 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5411 {
5412         unsigned int old_flags = dev->flags;
5413         int ret;
5414
5415         ASSERT_RTNL();
5416
5417         /*
5418          *      Set the flags on our device.
5419          */
5420
5421         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5422                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5423                                IFF_AUTOMEDIA)) |
5424                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5425                                     IFF_ALLMULTI));
5426
5427         /*
5428          *      Load in the correct multicast list now the flags have changed.
5429          */
5430
5431         if ((old_flags ^ flags) & IFF_MULTICAST)
5432                 dev_change_rx_flags(dev, IFF_MULTICAST);
5433
5434         dev_set_rx_mode(dev);
5435
5436         /*
5437          *      Have we downed the interface. We handle IFF_UP ourselves
5438          *      according to user attempts to set it, rather than blindly
5439          *      setting it.
5440          */
5441
5442         ret = 0;
5443         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
5444                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5445
5446                 if (!ret)
5447                         dev_set_rx_mode(dev);
5448         }
5449
5450         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5451                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5452                 unsigned int old_flags = dev->flags;
5453
5454                 dev->gflags ^= IFF_PROMISC;
5455
5456                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5457                         if (dev->flags != old_flags)
5458                                 dev_set_rx_mode(dev);
5459         }
5460
5461         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5462            is important. Some (broken) drivers set IFF_PROMISC, when
5463            IFF_ALLMULTI is requested not asking us and not reporting.
5464          */
5465         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5466                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5467
5468                 dev->gflags ^= IFF_ALLMULTI;
5469                 __dev_set_allmulti(dev, inc, false);
5470         }
5471
5472         return ret;
5473 }
5474
5475 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5476                         unsigned int gchanges)
5477 {
5478         unsigned int changes = dev->flags ^ old_flags;
5479
5480         if (gchanges)
5481                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5482
5483         if (changes & IFF_UP) {
5484                 if (dev->flags & IFF_UP)
5485                         call_netdevice_notifiers(NETDEV_UP, dev);
5486                 else
5487                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5488         }
5489
5490         if (dev->flags & IFF_UP &&
5491             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5492                 struct netdev_notifier_change_info change_info;
5493
5494                 change_info.flags_changed = changes;
5495                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5496                                               &change_info.info);
5497         }
5498 }
5499
5500 /**
5501  *      dev_change_flags - change device settings
5502  *      @dev: device
5503  *      @flags: device state flags
5504  *
5505  *      Change settings on device based state flags. The flags are
5506  *      in the userspace exported format.
5507  */
5508 int dev_change_flags(struct net_device *dev, unsigned int flags)
5509 {
5510         int ret;
5511         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5512
5513         ret = __dev_change_flags(dev, flags);
5514         if (ret < 0)
5515                 return ret;
5516
5517         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5518         __dev_notify_flags(dev, old_flags, changes);
5519         return ret;
5520 }
5521 EXPORT_SYMBOL(dev_change_flags);
5522
5523 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5524 {
5525         const struct net_device_ops *ops = dev->netdev_ops;
5526
5527         if (ops->ndo_change_mtu)
5528                 return ops->ndo_change_mtu(dev, new_mtu);
5529
5530         dev->mtu = new_mtu;
5531         return 0;
5532 }
5533
5534 /**
5535  *      dev_set_mtu - Change maximum transfer unit
5536  *      @dev: device
5537  *      @new_mtu: new transfer unit
5538  *
5539  *      Change the maximum transfer size of the network device.
5540  */
5541 int dev_set_mtu(struct net_device *dev, int new_mtu)
5542 {
5543         int err, orig_mtu;
5544
5545         if (new_mtu == dev->mtu)
5546                 return 0;
5547
5548         /*      MTU must be positive.    */
5549         if (new_mtu < 0)
5550                 return -EINVAL;
5551
5552         if (!netif_device_present(dev))
5553                 return -ENODEV;
5554
5555         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5556         err = notifier_to_errno(err);
5557         if (err)
5558                 return err;
5559
5560         orig_mtu = dev->mtu;
5561         err = __dev_set_mtu(dev, new_mtu);
5562
5563         if (!err) {
5564                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5565                 err = notifier_to_errno(err);
5566                 if (err) {
5567                         /* setting mtu back and notifying everyone again,
5568                          * so that they have a chance to revert changes.
5569                          */
5570                         __dev_set_mtu(dev, orig_mtu);
5571                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5572                 }
5573         }
5574         return err;
5575 }
5576 EXPORT_SYMBOL(dev_set_mtu);
5577
5578 /**
5579  *      dev_set_group - Change group this device belongs to
5580  *      @dev: device
5581  *      @new_group: group this device should belong to
5582  */
5583 void dev_set_group(struct net_device *dev, int new_group)
5584 {
5585         dev->group = new_group;
5586 }
5587 EXPORT_SYMBOL(dev_set_group);
5588
5589 /**
5590  *      dev_set_mac_address - Change Media Access Control Address
5591  *      @dev: device
5592  *      @sa: new address
5593  *
5594  *      Change the hardware (MAC) address of the device
5595  */
5596 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5597 {
5598         const struct net_device_ops *ops = dev->netdev_ops;
5599         int err;
5600
5601         if (!ops->ndo_set_mac_address)
5602                 return -EOPNOTSUPP;
5603         if (sa->sa_family != dev->type)
5604                 return -EINVAL;
5605         if (!netif_device_present(dev))
5606                 return -ENODEV;
5607         err = ops->ndo_set_mac_address(dev, sa);
5608         if (err)
5609                 return err;
5610         dev->addr_assign_type = NET_ADDR_SET;
5611         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5612         add_device_randomness(dev->dev_addr, dev->addr_len);
5613         return 0;
5614 }
5615 EXPORT_SYMBOL(dev_set_mac_address);
5616
5617 /**
5618  *      dev_change_carrier - Change device carrier
5619  *      @dev: device
5620  *      @new_carrier: new value
5621  *
5622  *      Change device carrier
5623  */
5624 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5625 {
5626         const struct net_device_ops *ops = dev->netdev_ops;
5627
5628         if (!ops->ndo_change_carrier)
5629                 return -EOPNOTSUPP;
5630         if (!netif_device_present(dev))
5631                 return -ENODEV;
5632         return ops->ndo_change_carrier(dev, new_carrier);
5633 }
5634 EXPORT_SYMBOL(dev_change_carrier);
5635
5636 /**
5637  *      dev_get_phys_port_id - Get device physical port ID
5638  *      @dev: device
5639  *      @ppid: port ID
5640  *
5641  *      Get device physical port ID
5642  */
5643 int dev_get_phys_port_id(struct net_device *dev,
5644                          struct netdev_phys_port_id *ppid)
5645 {
5646         const struct net_device_ops *ops = dev->netdev_ops;
5647
5648         if (!ops->ndo_get_phys_port_id)
5649                 return -EOPNOTSUPP;
5650         return ops->ndo_get_phys_port_id(dev, ppid);
5651 }
5652 EXPORT_SYMBOL(dev_get_phys_port_id);
5653
5654 /**
5655  *      dev_new_index   -       allocate an ifindex
5656  *      @net: the applicable net namespace
5657  *
5658  *      Returns a suitable unique value for a new device interface
5659  *      number.  The caller must hold the rtnl semaphore or the
5660  *      dev_base_lock to be sure it remains unique.
5661  */
5662 static int dev_new_index(struct net *net)
5663 {
5664         int ifindex = net->ifindex;
5665         for (;;) {
5666                 if (++ifindex <= 0)
5667                         ifindex = 1;
5668                 if (!__dev_get_by_index(net, ifindex))
5669                         return net->ifindex = ifindex;
5670         }
5671 }
5672
5673 /* Delayed registration/unregisteration */
5674 static LIST_HEAD(net_todo_list);
5675 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5676
5677 static void net_set_todo(struct net_device *dev)
5678 {
5679         list_add_tail(&dev->todo_list, &net_todo_list);
5680         dev_net(dev)->dev_unreg_count++;
5681 }
5682
5683 static void rollback_registered_many(struct list_head *head)
5684 {
5685         struct net_device *dev, *tmp;
5686         LIST_HEAD(close_head);
5687
5688         BUG_ON(dev_boot_phase);
5689         ASSERT_RTNL();
5690
5691         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5692                 /* Some devices call without registering
5693                  * for initialization unwind. Remove those
5694                  * devices and proceed with the remaining.
5695                  */
5696                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5697                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5698                                  dev->name, dev);
5699
5700                         WARN_ON(1);
5701                         list_del(&dev->unreg_list);
5702                         continue;
5703                 }
5704                 dev->dismantle = true;
5705                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5706         }
5707
5708         /* If device is running, close it first. */
5709         list_for_each_entry(dev, head, unreg_list)
5710                 list_add_tail(&dev->close_list, &close_head);
5711         dev_close_many(&close_head);
5712
5713         list_for_each_entry(dev, head, unreg_list) {
5714                 /* And unlink it from device chain. */
5715                 unlist_netdevice(dev);
5716
5717                 dev->reg_state = NETREG_UNREGISTERING;
5718         }
5719
5720         synchronize_net();
5721
5722         list_for_each_entry(dev, head, unreg_list) {
5723                 /* Shutdown queueing discipline. */
5724                 dev_shutdown(dev);
5725
5726
5727                 /* Notify protocols, that we are about to destroy
5728                    this device. They should clean all the things.
5729                 */
5730                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5731
5732                 /*
5733                  *      Flush the unicast and multicast chains
5734                  */
5735                 dev_uc_flush(dev);
5736                 dev_mc_flush(dev);
5737
5738                 if (dev->netdev_ops->ndo_uninit)
5739                         dev->netdev_ops->ndo_uninit(dev);
5740
5741                 if (!dev->rtnl_link_ops ||
5742                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5743                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5744
5745                 /* Notifier chain MUST detach us all upper devices. */
5746                 WARN_ON(netdev_has_any_upper_dev(dev));
5747
5748                 /* Remove entries from kobject tree */
5749                 netdev_unregister_kobject(dev);
5750 #ifdef CONFIG_XPS
5751                 /* Remove XPS queueing entries */
5752                 netif_reset_xps_queues_gt(dev, 0);
5753 #endif
5754         }
5755
5756         synchronize_net();
5757
5758         list_for_each_entry(dev, head, unreg_list)
5759                 dev_put(dev);
5760 }
5761
5762 static void rollback_registered(struct net_device *dev)
5763 {
5764         LIST_HEAD(single);
5765
5766         list_add(&dev->unreg_list, &single);
5767         rollback_registered_many(&single);
5768         list_del(&single);
5769 }
5770
5771 static netdev_features_t netdev_fix_features(struct net_device *dev,
5772         netdev_features_t features)
5773 {
5774         /* Fix illegal checksum combinations */
5775         if ((features & NETIF_F_HW_CSUM) &&
5776             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5777                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5778                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5779         }
5780
5781         /* TSO requires that SG is present as well. */
5782         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5783                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5784                 features &= ~NETIF_F_ALL_TSO;
5785         }
5786
5787         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5788                                         !(features & NETIF_F_IP_CSUM)) {
5789                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5790                 features &= ~NETIF_F_TSO;
5791                 features &= ~NETIF_F_TSO_ECN;
5792         }
5793
5794         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5795                                          !(features & NETIF_F_IPV6_CSUM)) {
5796                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5797                 features &= ~NETIF_F_TSO6;
5798         }
5799
5800         /* TSO ECN requires that TSO is present as well. */
5801         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5802                 features &= ~NETIF_F_TSO_ECN;
5803
5804         /* Software GSO depends on SG. */
5805         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5806                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5807                 features &= ~NETIF_F_GSO;
5808         }
5809
5810         /* UFO needs SG and checksumming */
5811         if (features & NETIF_F_UFO) {
5812                 /* maybe split UFO into V4 and V6? */
5813                 if (!((features & NETIF_F_GEN_CSUM) ||
5814                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5815                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5816                         netdev_dbg(dev,
5817                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5818                         features &= ~NETIF_F_UFO;
5819                 }
5820
5821                 if (!(features & NETIF_F_SG)) {
5822                         netdev_dbg(dev,
5823                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5824                         features &= ~NETIF_F_UFO;
5825                 }
5826         }
5827
5828 #ifdef CONFIG_NET_RX_BUSY_POLL
5829         if (dev->netdev_ops->ndo_busy_poll)
5830                 features |= NETIF_F_BUSY_POLL;
5831         else
5832 #endif
5833                 features &= ~NETIF_F_BUSY_POLL;
5834
5835         return features;
5836 }
5837
5838 int __netdev_update_features(struct net_device *dev)
5839 {
5840         netdev_features_t features;
5841         int err = 0;
5842
5843         ASSERT_RTNL();
5844
5845         features = netdev_get_wanted_features(dev);
5846
5847         if (dev->netdev_ops->ndo_fix_features)
5848                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5849
5850         /* driver might be less strict about feature dependencies */
5851         features = netdev_fix_features(dev, features);
5852
5853         if (dev->features == features)
5854                 return 0;
5855
5856         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5857                 &dev->features, &features);
5858
5859         if (dev->netdev_ops->ndo_set_features)
5860                 err = dev->netdev_ops->ndo_set_features(dev, features);
5861
5862         if (unlikely(err < 0)) {
5863                 netdev_err(dev,
5864                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5865                         err, &features, &dev->features);
5866                 return -1;
5867         }
5868
5869         if (!err)
5870                 dev->features = features;
5871
5872         return 1;
5873 }
5874
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and send notifications if it
 *      has changed. To be called after driver or hardware dependent
 *      conditions that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        if (!__netdev_update_features(dev))
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5889
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and send notifications even
 *      if it has not changed. To be used instead of
 *      netdev_update_features() when dev->vlan_features might have
 *      changed as well, so that the changes get propagated to stacked
 *      VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The result is ignored on purpose: notify unconditionally. */
        __netdev_update_features(dev);

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5906
5907 /**
5908  *      netif_stacked_transfer_operstate -      transfer operstate
5909  *      @rootdev: the root or lower level device to transfer state from
5910  *      @dev: the device to transfer operstate to
5911  *
5912  *      Transfer operational state from root to device. This is normally
5913  *      called when a stacking relationship exists between the root
5914  *      device and the device(a leaf device).
5915  */
5916 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5917                                         struct net_device *dev)
5918 {
5919         if (rootdev->operstate == IF_OPER_DORMANT)
5920                 netif_dormant_on(dev);
5921         else
5922                 netif_dormant_off(dev);
5923
5924         if (netif_carrier_ok(rootdev)) {
5925                 if (!netif_carrier_ok(dev))
5926                         netif_carrier_on(dev);
5927         } else {
5928                 if (netif_carrier_ok(dev))
5929                         netif_carrier_off(dev);
5930         }
5931 }
5932 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5933
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures
 * for @dev, store it in dev->_rx and point every entry back at @dev.
 *
 * Returns 0 on success or -ENOMEM. A queue count of zero is a BUG.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nr_queues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int q;

        BUG_ON(nr_queues < 1);

        queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
        if (!queues)
                return -ENOMEM;

        dev->_rx = queues;

        for (q = 0; q < nr_queues; q++)
                queues[q].dev = dev;

        return 0;
}
#endif
5953
5954 static void netdev_init_one_queue(struct net_device *dev,
5955                                   struct netdev_queue *queue, void *_unused)
5956 {
5957         /* Initialize queue lock */
5958         spin_lock_init(&queue->_xmit_lock);
5959         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5960         queue->xmit_lock_owner = -1;
5961         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5962         queue->dev = dev;
5963 #ifdef CONFIG_BQL
5964         dql_init(&queue->dql, HZ);
5965 #endif
5966 }
5967
5968 static void netif_free_tx_queues(struct net_device *dev)
5969 {
5970         kvfree(dev->_tx);
5971 }
5972
5973 static int netif_alloc_netdev_queues(struct net_device *dev)
5974 {
5975         unsigned int count = dev->num_tx_queues;
5976         struct netdev_queue *tx;
5977         size_t sz = count * sizeof(*tx);
5978
5979         BUG_ON(count < 1 || count > 0xffff);
5980
5981         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5982         if (!tx) {
5983                 tx = vzalloc(sz);
5984                 if (!tx)
5985                         return -ENOMEM;
5986         }
5987         dev->_tx = tx;
5988
5989         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5990         spin_lock_init(&dev->tx_global_lock);
5991
5992         return 0;
5993 }
5994
5995 /**
5996  *      register_netdevice      - register a network device
5997  *      @dev: device to register
5998  *
5999  *      Take a completed network device structure and add it to the kernel
6000  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6001  *      chain. 0 is returned on success. A negative errno code is returned
6002  *      on a failure to set up the device, or if the name is a duplicate.
6003  *
6004  *      Callers must hold the rtnl semaphore. You may want
6005  *      register_netdev() instead of this.
6006  *
6007  *      BUGS:
6008  *      The locking appears insufficient to guarantee two parallel registers
6009  *      will not get the same name.
6010  */
6011
6012 int register_netdevice(struct net_device *dev)
6013 {
6014         int ret;
6015         struct net *net = dev_net(dev);
6016
6017         BUG_ON(dev_boot_phase);
6018         ASSERT_RTNL();
6019
6020         might_sleep();
6021
6022         /* When net_device's are persistent, this will be fatal. */
6023         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6024         BUG_ON(!net);
6025
6026         spin_lock_init(&dev->addr_list_lock);
6027         netdev_set_addr_lockdep_class(dev);
6028
6029         dev->iflink = -1;
6030
6031         ret = dev_get_valid_name(net, dev, dev->name);
6032         if (ret < 0)
6033                 goto out;
6034
6035         /* Init, if this function is available */
6036         if (dev->netdev_ops->ndo_init) {
6037                 ret = dev->netdev_ops->ndo_init(dev);
6038                 if (ret) {
6039                         if (ret > 0)
6040                                 ret = -EIO;
6041                         goto out;
6042                 }
6043         }
6044
6045         if (((dev->hw_features | dev->features) &
6046              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6047             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6048              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6049                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6050                 ret = -EINVAL;
6051                 goto err_uninit;
6052         }
6053
6054         ret = -EBUSY;
6055         if (!dev->ifindex)
6056                 dev->ifindex = dev_new_index(net);
6057         else if (__dev_get_by_index(net, dev->ifindex))
6058                 goto err_uninit;
6059
6060         if (dev->iflink == -1)
6061                 dev->iflink = dev->ifindex;
6062
6063         /* Transfer changeable features to wanted_features and enable
6064          * software offloads (GSO and GRO).
6065          */
6066         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6067         dev->features |= NETIF_F_SOFT_FEATURES;
6068         dev->wanted_features = dev->features & dev->hw_features;
6069
6070         if (!(dev->flags & IFF_LOOPBACK)) {
6071                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6072         }
6073
6074         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6075          */
6076         dev->vlan_features |= NETIF_F_HIGHDMA;
6077
6078         /* Make NETIF_F_SG inheritable to tunnel devices.
6079          */
6080         dev->hw_enc_features |= NETIF_F_SG;
6081
6082         /* Make NETIF_F_SG inheritable to MPLS.
6083          */
6084         dev->mpls_features |= NETIF_F_SG;
6085
6086         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6087         ret = notifier_to_errno(ret);
6088         if (ret)
6089                 goto err_uninit;
6090
6091         ret = netdev_register_kobject(dev);
6092         if (ret)
6093                 goto err_uninit;
6094         dev->reg_state = NETREG_REGISTERED;
6095
6096         __netdev_update_features(dev);
6097
6098         /*
6099          *      Default initial state at registry is that the
6100          *      device is present.
6101          */
6102
6103         set_bit(__LINK_STATE_PRESENT, &dev->state);
6104
6105         linkwatch_init_dev(dev);
6106
6107         dev_init_scheduler(dev);
6108         dev_hold(dev);
6109         list_netdevice(dev);
6110         add_device_randomness(dev->dev_addr, dev->addr_len);
6111
6112         /* If the device has permanent device address, driver should
6113          * set dev_addr and also addr_assign_type should be set to
6114          * NET_ADDR_PERM (default value).
6115          */
6116         if (dev->addr_assign_type == NET_ADDR_PERM)
6117                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6118
6119         /* Notify protocols, that a new device appeared. */
6120         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6121         ret = notifier_to_errno(ret);
6122         if (ret) {
6123                 rollback_registered(dev);
6124                 dev->reg_state = NETREG_UNREGISTERED;
6125         }
6126         /*
6127          *      Prevent userspace races by waiting until the network
6128          *      device is fully setup before sending notifications.
6129          */
6130         if (!dev->rtnl_link_ops ||
6131             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6132                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6133
6134 out:
6135         return ret;
6136
6137 err_uninit:
6138         if (dev->netdev_ops->ndo_uninit)
6139                 dev->netdev_ops->ndo_uninit(dev);
6140         goto out;
6141 }
6142 EXPORT_SYMBOL(register_netdevice);
6143
6144 /**
6145  *      init_dummy_netdev       - init a dummy network device for NAPI
6146  *      @dev: device to init
6147  *
6148  *      This takes a network device structure and initialize the minimum
6149  *      amount of fields so it can be used to schedule NAPI polls without
6150  *      registering a full blown interface. This is to be used by drivers
6151  *      that need to tie several hardware interfaces to a single NAPI
6152  *      poll scheduler due to HW limitations.
6153  */
6154 int init_dummy_netdev(struct net_device *dev)
6155 {
6156         /* Clear everything. Note we don't initialize spinlocks
6157          * are they aren't supposed to be taken by any of the
6158          * NAPI code and this dummy netdev is supposed to be
6159          * only ever used for NAPI polls
6160          */
6161         memset(dev, 0, sizeof(struct net_device));
6162
6163         /* make sure we BUG if trying to hit standard
6164          * register/unregister code path
6165          */
6166         dev->reg_state = NETREG_DUMMY;
6167
6168         /* NAPI wants this */
6169         INIT_LIST_HEAD(&dev->napi_list);
6170
6171         /* a dummy interface is started by default */
6172         set_bit(__LINK_STATE_PRESENT, &dev->state);
6173         set_bit(__LINK_STATE_START, &dev->state);
6174
6175         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6176          * because users of this 'device' dont need to change
6177          * its refcount.
6178          */
6179
6180         return 0;
6181 }
6182 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6183
6184
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore for the duration of the call. The device name is resolved
 *	by register_netdevice() through dev_get_valid_name().
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
6208
6209 int netdev_refcnt_read(const struct net_device *dev)
6210 {
6211         int i, refcnt = 0;
6212
6213         for_each_possible_cpu(i)
6214                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6215         return refcnt;
6216 }
6217 EXPORT_SYMBOL(netdev_refcnt_read);
6218
6219 /**
6220  * netdev_wait_allrefs - wait until all references are gone.
6221  * @dev: target net_device
6222  *
6223  * This is called when unregistering network devices.
6224  *
6225  * Any protocol or device that holds a reference should register
6226  * for netdevice notification, and cleanup and put back the
6227  * reference if they receive an UNREGISTER event.
6228  * We can get stuck here if buggy protocols don't correctly
6229  * call dev_put.
6230  */
6231 static void netdev_wait_allrefs(struct net_device *dev)
6232 {
6233         unsigned long rebroadcast_time, warning_time;
6234         int refcnt;
6235
6236         linkwatch_forget_dev(dev);
6237
6238         rebroadcast_time = warning_time = jiffies;
6239         refcnt = netdev_refcnt_read(dev);
6240
6241         while (refcnt != 0) {
6242                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6243                         rtnl_lock();
6244
6245                         /* Rebroadcast unregister notification */
6246                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6247
6248                         __rtnl_unlock();
6249                         rcu_barrier();
6250                         rtnl_lock();
6251
6252                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6253                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6254                                      &dev->state)) {
6255                                 /* We must not have linkwatch events
6256                                  * pending on unregister. If this
6257                                  * happens, we simply run the queue
6258                                  * unscheduled, resulting in a noop
6259                                  * for this device.
6260                                  */
6261                                 linkwatch_run_queue();
6262                         }
6263
6264                         __rtnl_unlock();
6265
6266                         rebroadcast_time = jiffies;
6267                 }
6268
6269                 msleep(250);
6270
6271                 refcnt = netdev_refcnt_read(dev);
6272
6273                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6274                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6275                                  dev->name, refcnt);
6276                         warning_time = jiffies;
6277                 }
6278         }
6279 }
6280
6281 /* The sequence is:
6282  *
6283  *      rtnl_lock();
6284  *      ...
6285  *      register_netdevice(x1);
6286  *      register_netdevice(x2);
6287  *      ...
6288  *      unregister_netdevice(y1);
6289  *      unregister_netdevice(y2);
6290  *      ...
6291  *      rtnl_unlock();
6292  *      free_netdev(y1);
6293  *      free_netdev(y2);
6294  *
6295  * We are invoked by rtnl_unlock().
6296  * This allows us to deal with problems:
6297  * 1) We can delete sysfs objects which invoke hotplug
6298  *    without deadlocking with linkwatch via keventd.
6299  * 2) Since we run with the RTNL semaphore not held, we can sleep
6300  *    safely in order to wait for the netdev refcnt to drop to zero.
6301  *
6302  * We must not return until all unregister events added during
6303  * the interval the lock was held have been completed.
6304  */
6305 void netdev_run_todo(void)
6306 {
6307         struct list_head list;
6308
6309         /* Snapshot list, allow later requests */
6310         list_replace_init(&net_todo_list, &list);
6311
6312         __rtnl_unlock();
6313
6314
6315         /* Wait for rcu callbacks to finish before next phase */
6316         if (!list_empty(&list))
6317                 rcu_barrier();
6318
6319         while (!list_empty(&list)) {
6320                 struct net_device *dev
6321                         = list_first_entry(&list, struct net_device, todo_list);
6322                 list_del(&dev->todo_list);
6323
6324                 rtnl_lock();
6325                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6326                 __rtnl_unlock();
6327
6328                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6329                         pr_err("network todo '%s' but state %d\n",
6330                                dev->name, dev->reg_state);
6331                         dump_stack();
6332                         continue;
6333                 }
6334
6335                 dev->reg_state = NETREG_UNREGISTERED;
6336
6337                 on_each_cpu(flush_backlog, dev, 1);
6338
6339                 netdev_wait_allrefs(dev);
6340
6341                 /* paranoia */
6342                 BUG_ON(netdev_refcnt_read(dev));
6343                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6344                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6345                 WARN_ON(dev->dn_ptr);
6346
6347                 if (dev->destructor)
6348                         dev->destructor(dev);
6349
6350                 /* Report a network device has been unregistered */
6351                 rtnl_lock();
6352                 dev_net(dev)->dev_unreg_count--;
6353                 __rtnl_unlock();
6354                 wake_up(&netdev_unregistering_wq);
6355
6356                 /* Free network device */
6357                 kobject_put(&dev->dev.kobj);
6358         }
6359 }
6360
6361 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6362  * fields in the same order, with only the type differing.
6363  */
6364 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6365                              const struct net_device_stats *netdev_stats)
6366 {
6367 #if BITS_PER_LONG == 64
6368         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6369         memcpy(stats64, netdev_stats, sizeof(*stats64));
6370 #else
6371         size_t i, n = sizeof(*stats64) / sizeof(u64);
6372         const unsigned long *src = (const unsigned long *)netdev_stats;
6373         u64 *dst = (u64 *)stats64;
6374
6375         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6376                      sizeof(*stats64) / sizeof(u64));
6377         for (i = 0; i < n; i++)
6378                 dst[i] = src[i];
6379 #endif
6380 }
6381 EXPORT_SYMBOL(netdev_stats_to_stats64);
6382
6383 /**
6384  *      dev_get_stats   - get network device statistics
6385  *      @dev: device to get statistics from
6386  *      @storage: place to store stats
6387  *
6388  *      Get network statistics from device. Return @storage.
6389  *      The device driver may provide its own method by setting
6390  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6391  *      otherwise the internal statistics structure is used.
6392  */
6393 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6394                                         struct rtnl_link_stats64 *storage)
6395 {
6396         const struct net_device_ops *ops = dev->netdev_ops;
6397
6398         if (ops->ndo_get_stats64) {
6399                 memset(storage, 0, sizeof(*storage));
6400                 ops->ndo_get_stats64(dev, storage);
6401         } else if (ops->ndo_get_stats) {
6402                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6403         } else {
6404                 netdev_stats_to_stats64(storage, &dev->stats);
6405         }
6406         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6407         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6408         return storage;
6409 }
6410 EXPORT_SYMBOL(dev_get_stats);
6411
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with the noop qdisc and published in
 * dev->ingress_queue first; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is returned
 * unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
6429
6430 static const struct ethtool_ops default_ethtool_ops;
6431
6432 void netdev_set_default_ethtool_ops(struct net_device *dev,
6433                                     const struct ethtool_ops *ops)
6434 {
6435         if (dev->ethtool_ops == &default_ethtool_ops)
6436                 dev->ethtool_ops = ops;
6437 }
6438 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6439
6440 void netdev_freemem(struct net_device *dev)
6441 {
6442         char *addr = (char *)dev - dev->padded;
6443
6444         kvfree(addr);
6445 }
6446
6447 /**
6448  *      alloc_netdev_mqs - allocate network device
6449  *      @sizeof_priv:   size of private data to allocate space for
6450  *      @name:          device name format string
6451  *      @setup:         callback to initialize device
6452  *      @txqs:          the number of TX subqueues to allocate
6453  *      @rxqs:          the number of RX subqueues to allocate
6454  *
6455  *      Allocates a struct net_device with private data area for driver use
6456  *      and performs basic initialization.  Also allocates subqueue structs
6457  *      for each queue on the device.
6458  */
6459 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6460                 void (*setup)(struct net_device *),
6461                 unsigned int txqs, unsigned int rxqs)
6462 {
6463         struct net_device *dev;
6464         size_t alloc_size;
6465         struct net_device *p;
6466
6467         BUG_ON(strlen(name) >= sizeof(dev->name));
6468
6469         if (txqs < 1) {
6470                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6471                 return NULL;
6472         }
6473
6474 #ifdef CONFIG_SYSFS
6475         if (rxqs < 1) {
6476                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6477                 return NULL;
6478         }
6479 #endif
6480
6481         alloc_size = sizeof(struct net_device);
6482         if (sizeof_priv) {
6483                 /* ensure 32-byte alignment of private area */
6484                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6485                 alloc_size += sizeof_priv;
6486         }
6487         /* ensure 32-byte alignment of whole construct */
6488         alloc_size += NETDEV_ALIGN - 1;
6489
6490         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6491         if (!p)
6492                 p = vzalloc(alloc_size);
6493         if (!p)
6494                 return NULL;
6495
6496         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6497         dev->padded = (char *)dev - (char *)p;
6498
6499         dev->pcpu_refcnt = alloc_percpu(int);
6500         if (!dev->pcpu_refcnt)
6501                 goto free_dev;
6502
6503         if (dev_addr_init(dev))
6504                 goto free_pcpu;
6505
6506         dev_mc_init(dev);
6507         dev_uc_init(dev);
6508
6509         dev_net_set(dev, &init_net);
6510
6511         dev->gso_max_size = GSO_MAX_SIZE;
6512         dev->gso_max_segs = GSO_MAX_SEGS;
6513
6514         INIT_LIST_HEAD(&dev->napi_list);
6515         INIT_LIST_HEAD(&dev->unreg_list);
6516         INIT_LIST_HEAD(&dev->close_list);
6517         INIT_LIST_HEAD(&dev->link_watch_list);
6518         INIT_LIST_HEAD(&dev->adj_list.upper);
6519         INIT_LIST_HEAD(&dev->adj_list.lower);
6520         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6521         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6522         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6523         setup(dev);
6524
6525         dev->num_tx_queues = txqs;
6526         dev->real_num_tx_queues = txqs;
6527         if (netif_alloc_netdev_queues(dev))
6528                 goto free_all;
6529
6530 #ifdef CONFIG_SYSFS
6531         dev->num_rx_queues = rxqs;
6532         dev->real_num_rx_queues = rxqs;
6533         if (netif_alloc_rx_queues(dev))
6534                 goto free_all;
6535 #endif
6536
6537         strcpy(dev->name, name);
6538         dev->group = INIT_NETDEV_GROUP;
6539         if (!dev->ethtool_ops)
6540                 dev->ethtool_ops = &default_ethtool_ops;
6541         return dev;
6542
6543 free_all:
6544         free_netdev(dev);
6545         return NULL;
6546
6547 free_pcpu:
6548         free_percpu(dev->pcpu_refcnt);
6549 free_dev:
6550         netdev_freemem(dev);
6551         return NULL;
6552 }
6553 EXPORT_SYMBOL(alloc_netdev_mqs);
6554
6555 /**
6556  *      free_netdev - free network device
6557  *      @dev: device
6558  *
6559  *      This function does the last stage of destroying an allocated device
6560  *      interface. The reference to the device object is released.
6561  *      If this is the last reference then it will be freed.
6562  */
6563 void free_netdev(struct net_device *dev)
6564 {
6565         struct napi_struct *p, *n;
6566
6567         release_net(dev_net(dev));
6568
6569         netif_free_tx_queues(dev);
6570 #ifdef CONFIG_SYSFS
6571         kfree(dev->_rx);
6572 #endif
6573
6574         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6575
6576         /* Flush device addresses */
6577         dev_addr_flush(dev);
6578
6579         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6580                 netif_napi_del(p);
6581
6582         free_percpu(dev->pcpu_refcnt);
6583         dev->pcpu_refcnt = NULL;
6584
6585         /*  Compatibility with error handling in drivers */
6586         if (dev->reg_state == NETREG_UNINITIALIZED) {
6587                 netdev_freemem(dev);
6588                 return;
6589         }
6590
6591         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6592         dev->reg_state = NETREG_RELEASED;
6593
6594         /* will free via device release */
6595         put_device(&dev->dev);
6596 }
6597 EXPORT_SYMBOL(free_netdev);
6598
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used when the rtnl lock is held,
 *	the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6614
6615 /**
6616  *      unregister_netdevice_queue - remove device from the kernel
6617  *      @dev: device
6618  *      @head: list
6619  *
6620  *      This function shuts down a device interface and removes it
6621  *      from the kernel tables.
6622  *      If head not NULL, device is queued to be unregistered later.
6623  *
6624  *      Callers must hold the rtnl semaphore.  You may want
6625  *      unregister_netdev() instead of this.
6626  */
6627
6628 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6629 {
6630         ASSERT_RTNL();
6631
6632         if (head) {
6633                 list_move_tail(&dev->unreg_list, head);
6634         } else {
6635                 rollback_registered(dev);
6636                 /* Finish processing unregister after unlock */
6637                 net_set_todo(dev);
6638         }
6639 }
6640 EXPORT_SYMBOL(unregister_netdevice_queue);
6641
6642 /**
6643  *      unregister_netdevice_many - unregister many devices
6644  *      @head: list of devices
6645  *
6646  *  Note: As most callers use a stack allocated list_head,
6647  *  we force a list_del() to make sure stack wont be corrupted later.
6648  */
6649 void unregister_netdevice_many(struct list_head *head)
6650 {
6651         struct net_device *dev;
6652
6653         if (!list_empty(head)) {
6654                 rollback_registered_many(head);
6655                 list_for_each_entry(dev, head, unreg_list)
6656                         net_set_todo(dev);
6657                 list_del(head);
6658         }
6659 }
6660 EXPORT_SYMBOL(unregister_netdevice_many);
6661
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It is a thin wrapper that calls unregister_netdevice() with the
 *	rtnl semaphore held.  In general you want to use this and not
 *	unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6680
6681 /**
6682  *      dev_change_net_namespace - move device to different nethost namespace
6683  *      @dev: device
6684  *      @net: network namespace
6685  *      @pat: If not NULL name pattern to try if the current device name
6686  *            is already taken in the destination network namespace.
6687  *
6688  *      This function shuts down a device interface and moves it
6689  *      to a new network namespace. On success 0 is returned, on
6690  *      a failure a netagive errno code is returned.
6691  *
6692  *      Callers must hold the rtnl semaphore.
6693  */
6694
6695 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6696 {
6697         int err;
6698
6699         ASSERT_RTNL();
6700
6701         /* Don't allow namespace local devices to be moved. */
6702         err = -EINVAL;
6703         if (dev->features & NETIF_F_NETNS_LOCAL)
6704                 goto out;
6705
6706         /* Ensure the device has been registrered */
6707         if (dev->reg_state != NETREG_REGISTERED)
6708                 goto out;
6709
6710         /* Get out if there is nothing todo */
6711         err = 0;
6712         if (net_eq(dev_net(dev), net))
6713                 goto out;
6714
6715         /* Pick the destination device name, and ensure
6716          * we can use it in the destination network namespace.
6717          */
6718         err = -EEXIST;
6719         if (__dev_get_by_name(net, dev->name)) {
6720                 /* We get here if we can't use the current device name */
6721                 if (!pat)
6722                         goto out;
6723                 if (dev_get_valid_name(net, dev, pat) < 0)
6724                         goto out;
6725         }
6726
6727         /*
6728          * And now a mini version of register_netdevice unregister_netdevice.
6729          */
6730
6731         /* If device is running close it first. */
6732         dev_close(dev);
6733
6734         /* And unlink it from device chain */
6735         err = -ENODEV;
6736         unlist_netdevice(dev);
6737
6738         synchronize_net();
6739
6740         /* Shutdown queueing discipline. */
6741         dev_shutdown(dev);
6742
6743         /* Notify protocols, that we are about to destroy
6744            this device. They should clean all the things.
6745
6746            Note that dev->reg_state stays at NETREG_REGISTERED.
6747            This is wanted because this way 8021q and macvlan know
6748            the device is just moving and can keep their slaves up.
6749         */
6750         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6751         rcu_barrier();
6752         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6753         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6754
6755         /*
6756          *      Flush the unicast and multicast chains
6757          */
6758         dev_uc_flush(dev);
6759         dev_mc_flush(dev);
6760
6761         /* Send a netdev-removed uevent to the old namespace */
6762         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6763
6764         /* Actually switch the network namespace */
6765         dev_net_set(dev, net);
6766
6767         /* If there is an ifindex conflict assign a new one */
6768         if (__dev_get_by_index(net, dev->ifindex)) {
6769                 int iflink = (dev->iflink == dev->ifindex);
6770                 dev->ifindex = dev_new_index(net);
6771                 if (iflink)
6772                         dev->iflink = dev->ifindex;
6773         }
6774
6775         /* Send a netdev-add uevent to the new namespace */
6776         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6777
6778         /* Fixup kobjects */
6779         err = device_rename(&dev->dev, dev->name);
6780         WARN_ON(err);
6781
6782         /* Add the device back in the hashes */
6783         list_netdevice(dev);
6784
6785         /* Notify protocols, that a new device appeared. */
6786         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6787
6788         /*
6789          *      Prevent userspace races by waiting until the network
6790          *      device is fully setup before sending notifications.
6791          */
6792         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6793
6794         synchronize_net();
6795         err = 0;
6796 out:
6797         return err;
6798 }
6799 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6800
6801 static int dev_cpu_callback(struct notifier_block *nfb,
6802                             unsigned long action,
6803                             void *ocpu)
6804 {
6805         struct sk_buff **list_skb;
6806         struct sk_buff *skb;
6807         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6808         struct softnet_data *sd, *oldsd;
6809
6810         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6811                 return NOTIFY_OK;
6812
6813         local_irq_disable();
6814         cpu = smp_processor_id();
6815         sd = &per_cpu(softnet_data, cpu);
6816         oldsd = &per_cpu(softnet_data, oldcpu);
6817
6818         /* Find end of our completion_queue. */
6819         list_skb = &sd->completion_queue;
6820         while (*list_skb)
6821                 list_skb = &(*list_skb)->next;
6822         /* Append completion queue from offline CPU. */
6823         *list_skb = oldsd->completion_queue;
6824         oldsd->completion_queue = NULL;
6825
6826         /* Append output queue from offline CPU. */
6827         if (oldsd->output_queue) {
6828                 *sd->output_queue_tailp = oldsd->output_queue;
6829                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6830                 oldsd->output_queue = NULL;
6831                 oldsd->output_queue_tailp = &oldsd->output_queue;
6832         }
6833         /* Append NAPI poll list from offline CPU. */
6834         if (!list_empty(&oldsd->poll_list)) {
6835                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6836                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6837         }
6838
6839         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6840         local_irq_enable();
6841
6842         /* Process offline CPU's input_pkt_queue */
6843         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6844                 netif_rx_internal(skb);
6845                 input_queue_head_incr(oldsd);
6846         }
6847         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6848                 netif_rx_internal(skb);
6849                 input_queue_head_incr(oldsd);
6850         }
6851
6852         return NOTIFY_OK;
6853 }
6854
6855
6856 /**
6857  *      netdev_increment_features - increment feature set by one
6858  *      @all: current feature set
6859  *      @one: new feature set
6860  *      @mask: mask feature set
6861  *
6862  *      Computes a new feature set after adding a device with feature set
6863  *      @one to the master device with current feature set @all.  Will not
6864  *      enable anything that is off in @mask. Returns the new feature set.
6865  */
6866 netdev_features_t netdev_increment_features(netdev_features_t all,
6867         netdev_features_t one, netdev_features_t mask)
6868 {
6869         if (mask & NETIF_F_GEN_CSUM)
6870                 mask |= NETIF_F_ALL_CSUM;
6871         mask |= NETIF_F_VLAN_CHALLENGED;
6872
6873         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6874         all &= one | ~NETIF_F_ALL_FOR_ALL;
6875
6876         /* If one device supports hw checksumming, set for all. */
6877         if (all & NETIF_F_GEN_CSUM)
6878                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6879
6880         return all;
6881 }
6882 EXPORT_SYMBOL(netdev_increment_features);
6883
6884 static struct hlist_head * __net_init netdev_create_hash(void)
6885 {
6886         int i;
6887         struct hlist_head *hash;
6888
6889         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6890         if (hash != NULL)
6891                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6892                         INIT_HLIST_HEAD(&hash[i]);
6893
6894         return hash;
6895 }
6896
6897 /* Initialize per network namespace state */
6898 static int __net_init netdev_init(struct net *net)
6899 {
6900         if (net != &init_net)
6901                 INIT_LIST_HEAD(&net->dev_base_head);
6902
6903         net->dev_name_head = netdev_create_hash();
6904         if (net->dev_name_head == NULL)
6905                 goto err_name;
6906
6907         net->dev_index_head = netdev_create_hash();
6908         if (net->dev_index_head == NULL)
6909                 goto err_idx;
6910
6911         return 0;
6912
6913 err_idx:
6914         kfree(net->dev_name_head);
6915 err_name:
6916         return -ENOMEM;
6917 }
6918
6919 /**
6920  *      netdev_drivername - network driver for the device
6921  *      @dev: network device
6922  *
6923  *      Determine network driver for device.
6924  */
6925 const char *netdev_drivername(const struct net_device *dev)
6926 {
6927         const struct device_driver *driver;
6928         const struct device *parent;
6929         const char *empty = "";
6930
6931         parent = dev->dev.parent;
6932         if (!parent)
6933                 return empty;
6934
6935         driver = parent->driver;
6936         if (driver && driver->name)
6937                 return driver->name;
6938         return empty;
6939 }
6940
6941 static int __netdev_printk(const char *level, const struct net_device *dev,
6942                            struct va_format *vaf)
6943 {
6944         int r;
6945
6946         if (dev && dev->dev.parent) {
6947                 r = dev_printk_emit(level[1] - '0',
6948                                     dev->dev.parent,
6949                                     "%s %s %s: %pV",
6950                                     dev_driver_string(dev->dev.parent),
6951                                     dev_name(dev->dev.parent),
6952                                     netdev_name(dev), vaf);
6953         } else if (dev) {
6954                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6955         } else {
6956                 r = printk("%s(NULL net_device): %pV", level, vaf);
6957         }
6958
6959         return r;
6960 }
6961
6962 int netdev_printk(const char *level, const struct net_device *dev,
6963                   const char *format, ...)
6964 {
6965         struct va_format vaf;
6966         va_list args;
6967         int r;
6968
6969         va_start(args, format);
6970
6971         vaf.fmt = format;
6972         vaf.va = &args;
6973
6974         r = __netdev_printk(level, dev, &vaf);
6975
6976         va_end(args);
6977
6978         return r;
6979 }
6980 EXPORT_SYMBOL(netdev_printk);
6981
6982 #define define_netdev_printk_level(func, level)                 \
6983 int func(const struct net_device *dev, const char *fmt, ...)    \
6984 {                                                               \
6985         int r;                                                  \
6986         struct va_format vaf;                                   \
6987         va_list args;                                           \
6988                                                                 \
6989         va_start(args, fmt);                                    \
6990                                                                 \
6991         vaf.fmt = fmt;                                          \
6992         vaf.va = &args;                                         \
6993                                                                 \
6994         r = __netdev_printk(level, dev, &vaf);                  \
6995                                                                 \
6996         va_end(args);                                           \
6997                                                                 \
6998         return r;                                               \
6999 }                                                               \
7000 EXPORT_SYMBOL(func);
7001
7002 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7003 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7004 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7005 define_netdev_printk_level(netdev_err, KERN_ERR);
7006 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7007 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7008 define_netdev_printk_level(netdev_info, KERN_INFO);
7009
7010 static void __net_exit netdev_exit(struct net *net)
7011 {
7012         kfree(net->dev_name_head);
7013         kfree(net->dev_index_head);
7014 }
7015
7016 static struct pernet_operations __net_initdata netdev_net_ops = {
7017         .init = netdev_init,
7018         .exit = netdev_exit,
7019 };
7020
7021 static void __net_exit default_device_exit(struct net *net)
7022 {
7023         struct net_device *dev, *aux;
7024         /*
7025          * Push all migratable network devices back to the
7026          * initial network namespace
7027          */
7028         rtnl_lock();
7029         for_each_netdev_safe(net, dev, aux) {
7030                 int err;
7031                 char fb_name[IFNAMSIZ];
7032
7033                 /* Ignore unmoveable devices (i.e. loopback) */
7034                 if (dev->features & NETIF_F_NETNS_LOCAL)
7035                         continue;
7036
7037                 /* Leave virtual devices for the generic cleanup */
7038                 if (dev->rtnl_link_ops)
7039                         continue;
7040
7041                 /* Push remaining network devices to init_net */
7042                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7043                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7044                 if (err) {
7045                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7046                                  __func__, dev->name, err);
7047                         BUG();
7048                 }
7049         }
7050         rtnl_unlock();
7051 }
7052
7053 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7054 {
7055         /* Return with the rtnl_lock held when there are no network
7056          * devices unregistering in any network namespace in net_list.
7057          */
7058         struct net *net;
7059         bool unregistering;
7060         DEFINE_WAIT(wait);
7061
7062         for (;;) {
7063                 prepare_to_wait(&netdev_unregistering_wq, &wait,
7064                                 TASK_UNINTERRUPTIBLE);
7065                 unregistering = false;
7066                 rtnl_lock();
7067                 list_for_each_entry(net, net_list, exit_list) {
7068                         if (net->dev_unreg_count > 0) {
7069                                 unregistering = true;
7070                                 break;
7071                         }
7072                 }
7073                 if (!unregistering)
7074                         break;
7075                 __rtnl_unlock();
7076                 schedule();
7077         }
7078         finish_wait(&netdev_unregistering_wq, &wait);
7079 }
7080
7081 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7082 {
7083         /* At exit all network devices most be removed from a network
7084          * namespace.  Do this in the reverse order of registration.
7085          * Do this across as many network namespaces as possible to
7086          * improve batching efficiency.
7087          */
7088         struct net_device *dev;
7089         struct net *net;
7090         LIST_HEAD(dev_kill_list);
7091
7092         /* To prevent network device cleanup code from dereferencing
7093          * loopback devices or network devices that have been freed
7094          * wait here for all pending unregistrations to complete,
7095          * before unregistring the loopback device and allowing the
7096          * network namespace be freed.
7097          *
7098          * The netdev todo list containing all network devices
7099          * unregistrations that happen in default_device_exit_batch
7100          * will run in the rtnl_unlock() at the end of
7101          * default_device_exit_batch.
7102          */
7103         rtnl_lock_unregistering(net_list);
7104         list_for_each_entry(net, net_list, exit_list) {
7105                 for_each_netdev_reverse(net, dev) {
7106                         if (dev->rtnl_link_ops)
7107                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7108                         else
7109                                 unregister_netdevice_queue(dev, &dev_kill_list);
7110                 }
7111         }
7112         unregister_netdevice_many(&dev_kill_list);
7113         rtnl_unlock();
7114 }
7115
7116 static struct pernet_operations __net_initdata default_device_ops = {
7117         .exit = default_device_exit,
7118         .exit_batch = default_device_exit_batch,
7119 };
7120
7121 /*
7122  *      Initialize the DEV module. At boot time this walks the device list and
7123  *      unhooks any devices that fail to initialise (normally hardware not
7124  *      present) and leaves us with a valid list of present and active devices.
7125  *
7126  */
7127
7128 /*
7129  *       This is called single threaded during boot, so no need
7130  *       to take the rtnl semaphore.
7131  */
7132 static int __init net_dev_init(void)
7133 {
7134         int i, rc = -ENOMEM;
7135
7136         BUG_ON(!dev_boot_phase);
7137
7138         if (dev_proc_init())
7139                 goto out;
7140
7141         if (netdev_kobject_init())
7142                 goto out;
7143
7144         INIT_LIST_HEAD(&ptype_all);
7145         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7146                 INIT_LIST_HEAD(&ptype_base[i]);
7147
7148         INIT_LIST_HEAD(&offload_base);
7149
7150         if (register_pernet_subsys(&netdev_net_ops))
7151                 goto out;
7152
7153         /*
7154          *      Initialise the packet receive queues.
7155          */
7156
7157         for_each_possible_cpu(i) {
7158                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7159
7160                 skb_queue_head_init(&sd->input_pkt_queue);
7161                 skb_queue_head_init(&sd->process_queue);
7162                 INIT_LIST_HEAD(&sd->poll_list);
7163                 sd->output_queue_tailp = &sd->output_queue;
7164 #ifdef CONFIG_RPS
7165                 sd->csd.func = rps_trigger_softirq;
7166                 sd->csd.info = sd;
7167                 sd->cpu = i;
7168 #endif
7169
7170                 sd->backlog.poll = process_backlog;
7171                 sd->backlog.weight = weight_p;
7172         }
7173
7174         dev_boot_phase = 0;
7175
7176         /* The loopback device is special if any other network devices
7177          * is present in a network namespace the loopback device must
7178          * be present. Since we now dynamically allocate and free the
7179          * loopback device ensure this invariant is maintained by
7180          * keeping the loopback device as the first device on the
7181          * list of network devices.  Ensuring the loopback devices
7182          * is the first device that appears and the last network device
7183          * that disappears.
7184          */
7185         if (register_pernet_device(&loopback_net_ops))
7186                 goto out;
7187
7188         if (register_pernet_device(&default_device_ops))
7189                 goto out;
7190
7191         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7192         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7193
7194         hotcpu_notifier(dev_cpu_callback, 0);
7195         dst_init();
7196         rc = 0;
7197 out:
7198         return rc;
7199 }
7200
7201 subsys_initcall(net_dev_init);