net: keep original skb which only needs header checking during software GSO
net/core/dev.c (cascardo/linux.git)
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 #include <linux/hashtable.h>
133 #include <linux/vmalloc.h>
134 #include <linux/if_macvlan.h>
135 #include <linux/errqueue.h>
136
137 #include "net-sysfs.h"
138
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145 static DEFINE_SPINLOCK(ptype_lock);
146 static DEFINE_SPINLOCK(offload_lock);
147 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
148 struct list_head ptype_all __read_mostly;       /* Taps */
149 static struct list_head offload_base __read_mostly;
150
151 static int netif_rx_internal(struct sk_buff *skb);
152 static int call_netdevice_notifiers_info(unsigned long val,
153                                          struct net_device *dev,
154                                          struct netdev_notifier_info *info);
155
156 /*
157  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
158  * semaphore.
159  *
160  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
161  *
162  * Writers must hold the rtnl semaphore while they loop through the
163  * dev_base_head list, and hold dev_base_lock for writing when they do the
164  * actual updates.  This allows pure readers to access the list even
165  * while a writer is preparing to update it.
166  *
167  * To put it another way, dev_base_lock is held for writing only to
168  * protect against pure readers; the rtnl semaphore provides the
169  * protection against other writers.
170  *
171  * See, for example usages, register_netdevice() and
172  * unregister_netdevice(), which must be called with the rtnl
173  * semaphore held.
174  */
175 DEFINE_RWLOCK(dev_base_lock);
176 EXPORT_SYMBOL(dev_base_lock);
177
178 /* protects napi_hash addition/deletion and napi_gen_id */
179 static DEFINE_SPINLOCK(napi_hash_lock);
180
181 static unsigned int napi_gen_id;
182 static DEFINE_HASHTABLE(napi_hash, 8);
183
184 static seqcount_t devnet_rename_seq;
185
186 static inline void dev_base_seq_inc(struct net *net)
187 {
188         while (++net->dev_base_seq == 0);
189 }
190
191 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
192 {
193         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
194
195         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
196 }
197
198 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
199 {
200         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
201 }
202
203 static inline void rps_lock(struct softnet_data *sd)
204 {
205 #ifdef CONFIG_RPS
206         spin_lock(&sd->input_pkt_queue.lock);
207 #endif
208 }
209
210 static inline void rps_unlock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213         spin_unlock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216
217 /* Device list insertion */
218 static void list_netdevice(struct net_device *dev)
219 {
220         struct net *net = dev_net(dev);
221
222         ASSERT_RTNL();
223
224         write_lock_bh(&dev_base_lock);
225         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
226         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
227         hlist_add_head_rcu(&dev->index_hlist,
228                            dev_index_hash(net, dev->ifindex));
229         write_unlock_bh(&dev_base_lock);
230
231         dev_base_seq_inc(net);
232 }
233
234 /* Device list removal
235  * caller must respect a RCU grace period before freeing/reusing dev
236  */
237 static void unlist_netdevice(struct net_device *dev)
238 {
239         ASSERT_RTNL();
240
241         /* Unlink dev from the device chain */
242         write_lock_bh(&dev_base_lock);
243         list_del_rcu(&dev->dev_list);
244         hlist_del_rcu(&dev->name_hlist);
245         hlist_del_rcu(&dev->index_hlist);
246         write_unlock_bh(&dev_base_lock);
247
248         dev_base_seq_inc(dev_net(dev));
249 }
250
251 /*
252  *      Our notifier list
253  */
254
255 static RAW_NOTIFIER_HEAD(netdev_chain);
256
257 /*
258  *      Device drivers call our routines to queue packets here. We empty the
259  *      queue in the local softnet handler.
260  */
261
262 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
263 EXPORT_PER_CPU_SYMBOL(softnet_data);
264
265 #ifdef CONFIG_LOCKDEP
266 /*
267  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
268  * according to dev->type
269  */
270 static const unsigned short netdev_lock_type[] =
271         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
272          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
273          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
274          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
275          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
276          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
277          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
278          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
279          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
280          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
281          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
282          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
283          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
284          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
285          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
286
287 static const char *const netdev_lock_name[] =
288         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
289          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
290          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
291          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
292          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
293          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
294          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
295          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
296          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
297          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
298          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
299          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
300          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
301          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
302          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
303
304 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
305 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
306
307 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
308 {
309         int i;
310
311         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
312                 if (netdev_lock_type[i] == dev_type)
313                         return i;
314         /* the last key is used by default */
315         return ARRAY_SIZE(netdev_lock_type) - 1;
316 }
317
318 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
319                                                  unsigned short dev_type)
320 {
321         int i;
322
323         i = netdev_lock_pos(dev_type);
324         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
325                                    netdev_lock_name[i]);
326 }
327
328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
329 {
330         int i;
331
332         i = netdev_lock_pos(dev->type);
333         lockdep_set_class_and_name(&dev->addr_list_lock,
334                                    &netdev_addr_lock_key[i],
335                                    netdev_lock_name[i]);
336 }
337 #else
338 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339                                                  unsigned short dev_type)
340 {
341 }
342 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
343 {
344 }
345 #endif
346
347 /*******************************************************************************
348
349                 Protocol management and registration routines
350
351 *******************************************************************************/
352
353 /*
354  *      Add a protocol ID to the list. Now that the input handler is
355  *      smarter we can dispense with all the messy stuff that used to be
356  *      here.
357  *
358  *      BEWARE!!! Protocol handlers, mangling input packets,
359  *      MUST BE last in hash buckets and checking protocol handlers
360  *      MUST start from promiscuous ptype_all chain in net_bh.
361  *      It is true now, do not change it.
362  *      Explanation: if a protocol handler that mangles packets were
363  *      first on the list, it could not sense that the packet is cloned
364  *      and should be copied-on-write; it would change it, and subsequent
365  *      readers would get a broken packet.
366  *                                                      --ANK (980803)
367  */
368
369 static inline struct list_head *ptype_head(const struct packet_type *pt)
370 {
371         if (pt->type == htons(ETH_P_ALL))
372                 return &ptype_all;
373         else
374                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
375 }
376
377 /**
378  *      dev_add_pack - add packet handler
379  *      @pt: packet type declaration
380  *
381  *      Add a protocol handler to the networking stack. The passed &packet_type
382  *      is linked into kernel lists and may not be freed until it has been
383  *      removed from the kernel lists.
384  *
385  *      This call does not sleep, therefore it cannot
386  *      guarantee that all CPUs that are in the middle of receiving packets
387  *      will see the new packet type (until the next received packet).
388  */
389
390 void dev_add_pack(struct packet_type *pt)
391 {
392         struct list_head *head = ptype_head(pt);
393
394         spin_lock(&ptype_lock);
395         list_add_rcu(&pt->list, head);
396         spin_unlock(&ptype_lock);
397 }
398 EXPORT_SYMBOL(dev_add_pack);
399
400 /**
401  *      __dev_remove_pack        - remove packet handler
402  *      @pt: packet type declaration
403  *
404  *      Remove a protocol handler that was previously added to the kernel
405  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
406  *      from the kernel lists and can be freed or reused once this function
407  *      returns.
408  *
409  *      The packet type might still be in use by receivers
410  *      and must not be freed until after all the CPUs have gone
411  *      through a quiescent state.
412  */
413 void __dev_remove_pack(struct packet_type *pt)
414 {
415         struct list_head *head = ptype_head(pt);
416         struct packet_type *pt1;
417
418         spin_lock(&ptype_lock);
419
420         list_for_each_entry(pt1, head, list) {
421                 if (pt == pt1) {
422                         list_del_rcu(&pt->list);
423                         goto out;
424                 }
425         }
426
427         pr_warn("dev_remove_pack: %p not found\n", pt);
428 out:
429         spin_unlock(&ptype_lock);
430 }
431 EXPORT_SYMBOL(__dev_remove_pack);
432
433 /**
434  *      dev_remove_pack  - remove packet handler
435  *      @pt: packet type declaration
436  *
437  *      Remove a protocol handler that was previously added to the kernel
438  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
439  *      from the kernel lists and can be freed or reused once this function
440  *      returns.
441  *
442  *      This call sleeps to guarantee that no CPU is looking at the packet
443  *      type after return.
444  */
445 void dev_remove_pack(struct packet_type *pt)
446 {
447         __dev_remove_pack(pt);
448
449         synchronize_net();
450 }
451 EXPORT_SYMBOL(dev_remove_pack);
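
/* Editorial example (a minimal sketch, not part of the original file; the
 * handler and module names are made up): a module that taps every protocol
 * with dev_add_pack() and removes the handler again on unload. All headers
 * it needs are already included above.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	pr_info("example_tap: %u byte packet on %s\n", skb->len, dev->name);
	kfree_skb(skb);		/* we own this reference */
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap_pt __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* goes on the ptype_all tap list */
	.func = example_tap_rcv,
};

static int __init example_tap_init(void)
{
	dev_add_pack(&example_tap_pt);
	return 0;
}

static void __exit example_tap_exit(void)
{
	/* dev_remove_pack() sleeps until no CPU can still see the handler */
	dev_remove_pack(&example_tap_pt);
}

module_init(example_tap_init);
module_exit(example_tap_exit);
MODULE_LICENSE("GPL");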
452
453
454 /**
455  *      dev_add_offload - register offload handlers
456  *      @po: protocol offload declaration
457  *
458  *      Add protocol offload handlers to the networking stack. The passed
459  *      &proto_offload is linked into kernel lists and may not be freed until
460  *      it has been removed from the kernel lists.
461  *
462  *      This call does not sleep, therefore it cannot
463  *      guarantee that all CPUs that are in the middle of receiving packets
464  *      will see the new offload handlers (until the next received packet).
465  */
466 void dev_add_offload(struct packet_offload *po)
467 {
468         struct list_head *head = &offload_base;
469
470         spin_lock(&offload_lock);
471         list_add_rcu(&po->list, head);
472         spin_unlock(&offload_lock);
473 }
474 EXPORT_SYMBOL(dev_add_offload);
475
476 /**
477  *      __dev_remove_offload     - remove offload handler
478  *      @po: packet offload declaration
479  *
480  *      Remove a protocol offload handler that was previously added to the
481  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
482  *      is removed from the kernel lists and can be freed or reused once this
483  *      function returns.
484  *
485  *      The packet type might still be in use by receivers
486  *      and must not be freed until after all the CPUs have gone
487  *      through a quiescent state.
488  */
489 static void __dev_remove_offload(struct packet_offload *po)
490 {
491         struct list_head *head = &offload_base;
492         struct packet_offload *po1;
493
494         spin_lock(&offload_lock);
495
496         list_for_each_entry(po1, head, list) {
497                 if (po == po1) {
498                         list_del_rcu(&po->list);
499                         goto out;
500                 }
501         }
502
503         pr_warn("dev_remove_offload: %p not found\n", po);
504 out:
505         spin_unlock(&offload_lock);
506 }
507
508 /**
509  *      dev_remove_offload       - remove packet offload handler
510  *      @po: packet offload declaration
511  *
512  *      Remove a packet offload handler that was previously added to the kernel
513  *      offload handlers by dev_add_offload(). The passed &offload_type is
514  *      removed from the kernel lists and can be freed or reused once this
515  *      function returns.
516  *
517  *      This call sleeps to guarantee that no CPU is looking at the packet
518  *      type after return.
519  */
520 void dev_remove_offload(struct packet_offload *po)
521 {
522         __dev_remove_offload(po);
523
524         synchronize_net();
525 }
526 EXPORT_SYMBOL(dev_remove_offload);
527
528 /******************************************************************************
529
530                       Device Boot-time Settings Routines
531
532 *******************************************************************************/
533
534 /* Boot time configuration table */
535 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
536
537 /**
538  *      netdev_boot_setup_add   - add new setup entry
539  *      @name: name of the device
540  *      @map: configured settings for the device
541  *
542  *      Adds a new setup entry to the dev_boot_setup list.  The function
543  *      returns 0 on error and 1 on success.  This is a generic routine for
544  *      all netdevices.
545  */
546 static int netdev_boot_setup_add(char *name, struct ifmap *map)
547 {
548         struct netdev_boot_setup *s;
549         int i;
550
551         s = dev_boot_setup;
552         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
553                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
554                         memset(s[i].name, 0, sizeof(s[i].name));
555                         strlcpy(s[i].name, name, IFNAMSIZ);
556                         memcpy(&s[i].map, map, sizeof(s[i].map));
557                         break;
558                 }
559         }
560
561         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
562 }
563
564 /**
565  *      netdev_boot_setup_check - check boot time settings
566  *      @dev: the netdevice
567  *
568  *      Check boot time settings for the device.
569  *      The found settings are applied to the device to be used
570  *      later during device probing.
571  *      Returns 0 if no settings are found, 1 if they are.
572  */
573 int netdev_boot_setup_check(struct net_device *dev)
574 {
575         struct netdev_boot_setup *s = dev_boot_setup;
576         int i;
577
578         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
579                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
580                     !strcmp(dev->name, s[i].name)) {
581                         dev->irq        = s[i].map.irq;
582                         dev->base_addr  = s[i].map.base_addr;
583                         dev->mem_start  = s[i].map.mem_start;
584                         dev->mem_end    = s[i].map.mem_end;
585                         return 1;
586                 }
587         }
588         return 0;
589 }
590 EXPORT_SYMBOL(netdev_boot_setup_check);
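
/* Editorial example (sketch only; the function name is made up): an
 * old-style ISA driver probe can let a "netdev=..." boot option override
 * its compiled-in defaults.
 */
static void example_apply_boot_settings(struct net_device *dev)
{
	if (netdev_boot_setup_check(dev))
		pr_info("%s: boot-time settings: irq %d, base 0x%lx\n",
			dev->name, dev->irq, dev->base_addr);
}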
591
592
593 /**
594  *      netdev_boot_base        - get address from boot time settings
595  *      @prefix: prefix for network device
596  *      @unit: id for network device
597  *
598  *      Check boot time settings for the base address of the device.
599  *      The found settings are applied to the device to be used
600  *      later during device probing.
601  *      Returns 0 if no settings are found.
602  */
603 unsigned long netdev_boot_base(const char *prefix, int unit)
604 {
605         const struct netdev_boot_setup *s = dev_boot_setup;
606         char name[IFNAMSIZ];
607         int i;
608
609         sprintf(name, "%s%d", prefix, unit);
610
611         /*
612          * If device already registered then return base of 1
613          * to indicate not to probe for this interface
614          */
615         if (__dev_get_by_name(&init_net, name))
616                 return 1;
617
618         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
619                 if (!strcmp(name, s[i].name))
620                         return s[i].map.base_addr;
621         return 0;
622 }
623
624 /*
625  * Saves the settings configured at boot time for any netdevice.
626  */
627 int __init netdev_boot_setup(char *str)
628 {
629         int ints[5];
630         struct ifmap map;
631
632         str = get_options(str, ARRAY_SIZE(ints), ints);
633         if (!str || !*str)
634                 return 0;
635
636         /* Save settings */
637         memset(&map, 0, sizeof(map));
638         if (ints[0] > 0)
639                 map.irq = ints[1];
640         if (ints[0] > 1)
641                 map.base_addr = ints[2];
642         if (ints[0] > 2)
643                 map.mem_start = ints[3];
644         if (ints[0] > 3)
645                 map.mem_end = ints[4];
646
647         /* Add new entry to the list */
648         return netdev_boot_setup_add(str, &map);
649 }
650
651 __setup("netdev=", netdev_boot_setup);
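
/* Editorial note (derived from the parsing above, not from separate
 * documentation): the option takes up to four integers followed by the
 * device name, i.e.
 *
 *	netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *
 * for example "netdev=9,0x300,0,0,eth0".
 */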
652
653 /*******************************************************************************
654
655                             Device Interface Subroutines
656
657 *******************************************************************************/
658
659 /**
660  *      __dev_get_by_name       - find a device by its name
661  *      @net: the applicable net namespace
662  *      @name: name to find
663  *
664  *      Find an interface by name. Must be called under RTNL semaphore
665  *      or @dev_base_lock. If the name is found a pointer to the device
666  *      is returned. If the name is not found then %NULL is returned. The
667  *      reference counters are not incremented so the caller must be
668  *      careful with locks.
669  */
670
671 struct net_device *__dev_get_by_name(struct net *net, const char *name)
672 {
673         struct net_device *dev;
674         struct hlist_head *head = dev_name_hash(net, name);
675
676         hlist_for_each_entry(dev, head, name_hlist)
677                 if (!strncmp(dev->name, name, IFNAMSIZ))
678                         return dev;
679
680         return NULL;
681 }
682 EXPORT_SYMBOL(__dev_get_by_name);
683
684 /**
685  *      dev_get_by_name_rcu     - find a device by its name
686  *      @net: the applicable net namespace
687  *      @name: name to find
688  *
689  *      Find an interface by name.
690  *      If the name is found a pointer to the device is returned.
691  *      If the name is not found then %NULL is returned.
692  *      The reference counters are not incremented so the caller must be
693  *      careful with locks. The caller must hold RCU lock.
694  */
695
696 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
697 {
698         struct net_device *dev;
699         struct hlist_head *head = dev_name_hash(net, name);
700
701         hlist_for_each_entry_rcu(dev, head, name_hlist)
702                 if (!strncmp(dev->name, name, IFNAMSIZ))
703                         return dev;
704
705         return NULL;
706 }
707 EXPORT_SYMBOL(dev_get_by_name_rcu);
708
709 /**
710  *      dev_get_by_name         - find a device by its name
711  *      @net: the applicable net namespace
712  *      @name: name to find
713  *
714  *      Find an interface by name. This can be called from any
715  *      context and does its own locking. The returned handle has
716  *      the usage count incremented and the caller must use dev_put() to
717  *      release it when it is no longer needed. %NULL is returned if no
718  *      matching device is found.
719  */
720
721 struct net_device *dev_get_by_name(struct net *net, const char *name)
722 {
723         struct net_device *dev;
724
725         rcu_read_lock();
726         dev = dev_get_by_name_rcu(net, name);
727         if (dev)
728                 dev_hold(dev);
729         rcu_read_unlock();
730         return dev;
731 }
732 EXPORT_SYMBOL(dev_get_by_name);
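
/* Editorial example (sketch; the function name and the "eth0" name are made
 * up): the refcounted lookup must be balanced with dev_put(), while the
 * _rcu variant is only valid inside an RCU read-side critical section.
 */
static void example_lookup_by_name(void)
{
	struct net_device *dev;

	dev = dev_get_by_name(&init_net, "eth0");
	if (dev) {
		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
		dev_put(dev);		/* drop the reference we were given */
	}

	rcu_read_lock();
	dev = dev_get_by_name_rcu(&init_net, "eth0");
	if (dev)
		pr_info("%s is %s\n", dev->name,
			netif_running(dev) ? "running" : "down");
	rcu_read_unlock();		/* dev must not be used past this point */
}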
733
734 /**
735  *      __dev_get_by_index - find a device by its ifindex
736  *      @net: the applicable net namespace
737  *      @ifindex: index of device
738  *
739  *      Search for an interface by index. Returns a pointer to the device,
740  *      or %NULL if the device is not found. The device has not
741  *      had its reference counter increased, so the caller must be careful
742  *      about locking. The caller must hold either the RTNL semaphore
743  *      or @dev_base_lock.
744  */
745
746 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
747 {
748         struct net_device *dev;
749         struct hlist_head *head = dev_index_hash(net, ifindex);
750
751         hlist_for_each_entry(dev, head, index_hlist)
752                 if (dev->ifindex == ifindex)
753                         return dev;
754
755         return NULL;
756 }
757 EXPORT_SYMBOL(__dev_get_by_index);
758
759 /**
760  *      dev_get_by_index_rcu - find a device by its ifindex
761  *      @net: the applicable net namespace
762  *      @ifindex: index of device
763  *
764  *      Search for an interface by index. Returns a pointer to the device,
765  *      or %NULL if the device is not found. The device has not
766  *      had its reference counter increased, so the caller must be careful
767  *      about locking. The caller must hold RCU lock.
768  */
769
770 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
771 {
772         struct net_device *dev;
773         struct hlist_head *head = dev_index_hash(net, ifindex);
774
775         hlist_for_each_entry_rcu(dev, head, index_hlist)
776                 if (dev->ifindex == ifindex)
777                         return dev;
778
779         return NULL;
780 }
781 EXPORT_SYMBOL(dev_get_by_index_rcu);
782
783
784 /**
785  *      dev_get_by_index - find a device by its ifindex
786  *      @net: the applicable net namespace
787  *      @ifindex: index of device
788  *
789  *      Search for an interface by index. Returns a pointer to the device,
790  *      or %NULL if the device is not found. The device returned has
791  *      had a reference added and the pointer is safe until the user calls
792  *      dev_put to indicate they have finished with it.
793  */
794
795 struct net_device *dev_get_by_index(struct net *net, int ifindex)
796 {
797         struct net_device *dev;
798
799         rcu_read_lock();
800         dev = dev_get_by_index_rcu(net, ifindex);
801         if (dev)
802                 dev_hold(dev);
803         rcu_read_unlock();
804         return dev;
805 }
806 EXPORT_SYMBOL(dev_get_by_index);
807
808 /**
809  *      netdev_get_name - get a netdevice name, knowing its ifindex.
810  *      @net: network namespace
811  *      @name: a pointer to the buffer where the name will be stored.
812  *      @ifindex: the ifindex of the interface to get the name from.
813  *
814  *      The use of raw_seqcount_begin() and cond_resched() before
815  *      retrying is required as we want to give the writers a chance
816  *      to complete when CONFIG_PREEMPT is not set.
817  */
818 int netdev_get_name(struct net *net, char *name, int ifindex)
819 {
820         struct net_device *dev;
821         unsigned int seq;
822
823 retry:
824         seq = raw_seqcount_begin(&devnet_rename_seq);
825         rcu_read_lock();
826         dev = dev_get_by_index_rcu(net, ifindex);
827         if (!dev) {
828                 rcu_read_unlock();
829                 return -ENODEV;
830         }
831
832         strcpy(name, dev->name);
833         rcu_read_unlock();
834         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
835                 cond_resched();
836                 goto retry;
837         }
838
839         return 0;
840 }
841
842 /**
843  *      dev_getbyhwaddr_rcu - find a device by its hardware address
844  *      @net: the applicable net namespace
845  *      @type: media type of device
846  *      @ha: hardware address
847  *
848  *      Search for an interface by MAC address. Returns a pointer to the
849  *      device, or %NULL if the device is not found.
850  *      The caller must hold RCU or RTNL.
851  *      The returned device has not had its ref count increased
852  *      and the caller must therefore be careful about locking.
853  *
854  */
855
856 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
857                                        const char *ha)
858 {
859         struct net_device *dev;
860
861         for_each_netdev_rcu(net, dev)
862                 if (dev->type == type &&
863                     !memcmp(dev->dev_addr, ha, dev->addr_len))
864                         return dev;
865
866         return NULL;
867 }
868 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
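
/* Editorial example (sketch; the function name and the address are made
 * up): look up an Ethernet device by MAC address under rcu_read_lock(),
 * as required above.
 */
static void example_lookup_by_mac(void)
{
	static const char mac[ETH_ALEN] = { 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01 };
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(&init_net, ARPHRD_ETHER, mac);
	if (dev)
		pr_info("MAC belongs to %s\n", dev->name);
	rcu_read_unlock();
}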
869
870 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
871 {
872         struct net_device *dev;
873
874         ASSERT_RTNL();
875         for_each_netdev(net, dev)
876                 if (dev->type == type)
877                         return dev;
878
879         return NULL;
880 }
881 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
882
883 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
884 {
885         struct net_device *dev, *ret = NULL;
886
887         rcu_read_lock();
888         for_each_netdev_rcu(net, dev)
889                 if (dev->type == type) {
890                         dev_hold(dev);
891                         ret = dev;
892                         break;
893                 }
894         rcu_read_unlock();
895         return ret;
896 }
897 EXPORT_SYMBOL(dev_getfirstbyhwtype);
898
899 /**
900  *      __dev_get_by_flags - find any device with given flags
901  *      @net: the applicable net namespace
902  *      @if_flags: IFF_* values
903  *      @mask: bitmask of bits in if_flags to check
904  *
905  *      Search for any interface with the given flags. Returns a pointer to the
906  *      device, or %NULL if no such device is found. Must be called inside
907  *      rtnl_lock(), and result refcount is unchanged.
908  */
909
910 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
911                                       unsigned short mask)
912 {
913         struct net_device *dev, *ret;
914
915         ASSERT_RTNL();
916
917         ret = NULL;
918         for_each_netdev(net, dev) {
919                 if (((dev->flags ^ if_flags) & mask) == 0) {
920                         ret = dev;
921                         break;
922                 }
923         }
924         return ret;
925 }
926 EXPORT_SYMBOL(__dev_get_by_flags);
927
928 /**
929  *      dev_valid_name - check if name is okay for network device
930  *      @name: name string
931  *
932  *      Network device names need to be valid file names to
933  *      allow sysfs to work.  We also disallow any kind of
934  *      whitespace.
935  */
936 bool dev_valid_name(const char *name)
937 {
938         if (*name == '\0')
939                 return false;
940         if (strlen(name) >= IFNAMSIZ)
941                 return false;
942         if (!strcmp(name, ".") || !strcmp(name, ".."))
943                 return false;
944
945         while (*name) {
946                 if (*name == '/' || isspace(*name))
947                         return false;
948                 name++;
949         }
950         return true;
951 }
952 EXPORT_SYMBOL(dev_valid_name);
953
954 /**
955  *      __dev_alloc_name - allocate a name for a device
956  *      @net: network namespace to allocate the device name in
957  *      @name: name format string
958  *      @buf:  scratch buffer and result name string
959  *
960  *      Passed a format string - eg "lt%d" - it will try to find a suitable
961  *      id. It scans the list of devices to build up a free map, then chooses
962  *      the first empty slot. The caller must hold the dev_base or rtnl lock
963  *      while allocating the name and adding the device in order to avoid
964  *      duplicates.
965  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
966  *      Returns the number of the unit assigned or a negative errno code.
967  */
968
969 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
970 {
971         int i = 0;
972         const char *p;
973         const int max_netdevices = 8*PAGE_SIZE;
974         unsigned long *inuse;
975         struct net_device *d;
976
977         p = strnchr(name, IFNAMSIZ-1, '%');
978         if (p) {
979                 /*
980                  * Verify the string as this thing may have come from
981                  * the user.  There must be either one "%d" and no other "%"
982                  * characters.
983                  */
984                 if (p[1] != 'd' || strchr(p + 2, '%'))
985                         return -EINVAL;
986
987                 /* Use one page as a bit array of possible slots */
988                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
989                 if (!inuse)
990                         return -ENOMEM;
991
992                 for_each_netdev(net, d) {
993                         if (!sscanf(d->name, name, &i))
994                                 continue;
995                         if (i < 0 || i >= max_netdevices)
996                                 continue;
997
998                         /*  avoid cases where sscanf is not exact inverse of printf */
999                         snprintf(buf, IFNAMSIZ, name, i);
1000                         if (!strncmp(buf, d->name, IFNAMSIZ))
1001                                 set_bit(i, inuse);
1002                 }
1003
1004                 i = find_first_zero_bit(inuse, max_netdevices);
1005                 free_page((unsigned long) inuse);
1006         }
1007
1008         if (buf != name)
1009                 snprintf(buf, IFNAMSIZ, name, i);
1010         if (!__dev_get_by_name(net, buf))
1011                 return i;
1012
1013         /* It is possible to run out of possible slots
1014          * when the name is long and there isn't enough space left
1015          * for the digits, or if all bits are used.
1016          */
1017         return -ENFILE;
1018 }
1019
1020 /**
1021  *      dev_alloc_name - allocate a name for a device
1022  *      @dev: device
1023  *      @name: name format string
1024  *
1025  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1026  *      id. It scans the list of devices to build up a free map, then chooses
1027  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1028  *      while allocating the name and adding the device in order to avoid
1029  *      duplicates.
1030  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1031  *      Returns the number of the unit assigned or a negative errno code.
1032  */
1033
1034 int dev_alloc_name(struct net_device *dev, const char *name)
1035 {
1036         char buf[IFNAMSIZ];
1037         struct net *net;
1038         int ret;
1039
1040         BUG_ON(!dev_net(dev));
1041         net = dev_net(dev);
1042         ret = __dev_alloc_name(net, name, buf);
1043         if (ret >= 0)
1044                 strlcpy(dev->name, buf, IFNAMSIZ);
1045         return ret;
1046 }
1047 EXPORT_SYMBOL(dev_alloc_name);
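
/* Editorial example (sketch; the function name and the "example%d" template
 * are made up): a driver that wants auto-numbered interface names resolves
 * the template before registering the device.
 */
static int example_pick_name(struct net_device *dev)
{
	int unit = dev_alloc_name(dev, "example%d");	/* example0, example1, ... */

	if (unit < 0)		/* -EINVAL for a bad template, -ENFILE if exhausted */
		return unit;
	/* dev->name now holds the chosen name, e.g. "example0" */
	return 0;
}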
1048
1049 static int dev_alloc_name_ns(struct net *net,
1050                              struct net_device *dev,
1051                              const char *name)
1052 {
1053         char buf[IFNAMSIZ];
1054         int ret;
1055
1056         ret = __dev_alloc_name(net, name, buf);
1057         if (ret >= 0)
1058                 strlcpy(dev->name, buf, IFNAMSIZ);
1059         return ret;
1060 }
1061
1062 static int dev_get_valid_name(struct net *net,
1063                               struct net_device *dev,
1064                               const char *name)
1065 {
1066         BUG_ON(!net);
1067
1068         if (!dev_valid_name(name))
1069                 return -EINVAL;
1070
1071         if (strchr(name, '%'))
1072                 return dev_alloc_name_ns(net, dev, name);
1073         else if (__dev_get_by_name(net, name))
1074                 return -EEXIST;
1075         else if (dev->name != name)
1076                 strlcpy(dev->name, name, IFNAMSIZ);
1077
1078         return 0;
1079 }
1080
1081 /**
1082  *      dev_change_name - change name of a device
1083  *      @dev: device
1084  *      @newname: name (or format string) must be at least IFNAMSIZ
1085  *
1086  *      Change name of a device; can pass format strings "eth%d"
1087  *      for wildcarding.
1088  */
1089 int dev_change_name(struct net_device *dev, const char *newname)
1090 {
1091         unsigned char old_assign_type;
1092         char oldname[IFNAMSIZ];
1093         int err = 0;
1094         int ret;
1095         struct net *net;
1096
1097         ASSERT_RTNL();
1098         BUG_ON(!dev_net(dev));
1099
1100         net = dev_net(dev);
1101         if (dev->flags & IFF_UP)
1102                 return -EBUSY;
1103
1104         write_seqcount_begin(&devnet_rename_seq);
1105
1106         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1107                 write_seqcount_end(&devnet_rename_seq);
1108                 return 0;
1109         }
1110
1111         memcpy(oldname, dev->name, IFNAMSIZ);
1112
1113         err = dev_get_valid_name(net, dev, newname);
1114         if (err < 0) {
1115                 write_seqcount_end(&devnet_rename_seq);
1116                 return err;
1117         }
1118
1119         if (oldname[0] && !strchr(oldname, '%'))
1120                 netdev_info(dev, "renamed from %s\n", oldname);
1121
1122         old_assign_type = dev->name_assign_type;
1123         dev->name_assign_type = NET_NAME_RENAMED;
1124
1125 rollback:
1126         ret = device_rename(&dev->dev, dev->name);
1127         if (ret) {
1128                 memcpy(dev->name, oldname, IFNAMSIZ);
1129                 dev->name_assign_type = old_assign_type;
1130                 write_seqcount_end(&devnet_rename_seq);
1131                 return ret;
1132         }
1133
1134         write_seqcount_end(&devnet_rename_seq);
1135
1136         netdev_adjacent_rename_links(dev, oldname);
1137
1138         write_lock_bh(&dev_base_lock);
1139         hlist_del_rcu(&dev->name_hlist);
1140         write_unlock_bh(&dev_base_lock);
1141
1142         synchronize_rcu();
1143
1144         write_lock_bh(&dev_base_lock);
1145         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1146         write_unlock_bh(&dev_base_lock);
1147
1148         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1149         ret = notifier_to_errno(ret);
1150
1151         if (ret) {
1152                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1153                 if (err >= 0) {
1154                         err = ret;
1155                         write_seqcount_begin(&devnet_rename_seq);
1156                         memcpy(dev->name, oldname, IFNAMSIZ);
1157                         memcpy(oldname, newname, IFNAMSIZ);
1158                         dev->name_assign_type = old_assign_type;
1159                         old_assign_type = NET_NAME_RENAMED;
1160                         goto rollback;
1161                 } else {
1162                         pr_err("%s: name change rollback failed: %d\n",
1163                                dev->name, ret);
1164                 }
1165         }
1166
1167         return err;
1168 }
1169
1170 /**
1171  *      dev_set_alias - change ifalias of a device
1172  *      @dev: device
1173  *      @alias: name up to IFALIASZ
1174  *      @len: limit of bytes to copy from @alias
1175  *
1176  *      Set ifalias for a device.
1177  */
1178 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1179 {
1180         char *new_ifalias;
1181
1182         ASSERT_RTNL();
1183
1184         if (len >= IFALIASZ)
1185                 return -EINVAL;
1186
1187         if (!len) {
1188                 kfree(dev->ifalias);
1189                 dev->ifalias = NULL;
1190                 return 0;
1191         }
1192
1193         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1194         if (!new_ifalias)
1195                 return -ENOMEM;
1196         dev->ifalias = new_ifalias;
1197
1198         strlcpy(dev->ifalias, alias, len+1);
1199         return len;
1200 }
1201
1202
1203 /**
1204  *      netdev_features_change - device changes features
1205  *      @dev: device to cause notification
1206  *
1207  *      Called to indicate a device has changed features.
1208  */
1209 void netdev_features_change(struct net_device *dev)
1210 {
1211         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1212 }
1213 EXPORT_SYMBOL(netdev_features_change);
1214
1215 /**
1216  *      netdev_state_change - device changes state
1217  *      @dev: device to cause notification
1218  *
1219  *      Called to indicate a device has changed state. This function calls
1220  *      the notifier chains for netdev_chain and sends a NEWLINK message
1221  *      to the routing socket.
1222  */
1223 void netdev_state_change(struct net_device *dev)
1224 {
1225         if (dev->flags & IFF_UP) {
1226                 struct netdev_notifier_change_info change_info;
1227
1228                 change_info.flags_changed = 0;
1229                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1230                                               &change_info.info);
1231                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1232         }
1233 }
1234 EXPORT_SYMBOL(netdev_state_change);
1235
1236 /**
1237  *      netdev_notify_peers - notify network peers about existence of @dev
1238  *      @dev: network device
1239  *
1240  * Generate traffic such that interested network peers are aware of
1241  * @dev, such as by generating a gratuitous ARP. This may be used when
1242  * a device wants to inform the rest of the network about some sort of
1243  * reconfiguration such as a failover event or virtual machine
1244  * migration.
1245  */
1246 void netdev_notify_peers(struct net_device *dev)
1247 {
1248         rtnl_lock();
1249         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1250         rtnl_unlock();
1251 }
1252 EXPORT_SYMBOL(netdev_notify_peers);
1253
1254 static int __dev_open(struct net_device *dev)
1255 {
1256         const struct net_device_ops *ops = dev->netdev_ops;
1257         int ret;
1258
1259         ASSERT_RTNL();
1260
1261         if (!netif_device_present(dev))
1262                 return -ENODEV;
1263
1264         /* Block netpoll from trying to do any rx path servicing.
1265          * If we don't do this, there is a chance ndo_poll_controller
1266          * or ndo_poll may be running while we open the device.
1267          */
1268         netpoll_poll_disable(dev);
1269
1270         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1271         ret = notifier_to_errno(ret);
1272         if (ret)
1273                 return ret;
1274
1275         set_bit(__LINK_STATE_START, &dev->state);
1276
1277         if (ops->ndo_validate_addr)
1278                 ret = ops->ndo_validate_addr(dev);
1279
1280         if (!ret && ops->ndo_open)
1281                 ret = ops->ndo_open(dev);
1282
1283         netpoll_poll_enable(dev);
1284
1285         if (ret)
1286                 clear_bit(__LINK_STATE_START, &dev->state);
1287         else {
1288                 dev->flags |= IFF_UP;
1289                 net_dmaengine_get();
1290                 dev_set_rx_mode(dev);
1291                 dev_activate(dev);
1292                 add_device_randomness(dev->dev_addr, dev->addr_len);
1293         }
1294
1295         return ret;
1296 }
1297
1298 /**
1299  *      dev_open        - prepare an interface for use.
1300  *      @dev:   device to open
1301  *
1302  *      Takes a device from down to up state. The device's private open
1303  *      function is invoked and then the multicast lists are loaded. Finally
1304  *      the device is moved into the up state and a %NETDEV_UP message is
1305  *      sent to the netdev notifier chain.
1306  *
1307  *      Calling this function on an active interface is a nop. On a failure
1308  *      a negative errno code is returned.
1309  */
1310 int dev_open(struct net_device *dev)
1311 {
1312         int ret;
1313
1314         if (dev->flags & IFF_UP)
1315                 return 0;
1316
1317         ret = __dev_open(dev);
1318         if (ret < 0)
1319                 return ret;
1320
1321         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1322         call_netdevice_notifiers(NETDEV_UP, dev);
1323
1324         return ret;
1325 }
1326 EXPORT_SYMBOL(dev_open);
1327
1328 static int __dev_close_many(struct list_head *head)
1329 {
1330         struct net_device *dev;
1331
1332         ASSERT_RTNL();
1333         might_sleep();
1334
1335         list_for_each_entry(dev, head, close_list) {
1336                 /* Temporarily disable netpoll until the interface is down */
1337                 netpoll_poll_disable(dev);
1338
1339                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1340
1341                 clear_bit(__LINK_STATE_START, &dev->state);
1342
1343                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1344                  * may even be on a different CPU. So just clear netif_running().
1345                  *
1346                  * dev->stop() will invoke napi_disable() on all of its
1347                  * napi_struct instances on this device.
1348                  */
1349                 smp_mb__after_atomic(); /* Commit netif_running(). */
1350         }
1351
1352         dev_deactivate_many(head);
1353
1354         list_for_each_entry(dev, head, close_list) {
1355                 const struct net_device_ops *ops = dev->netdev_ops;
1356
1357                 /*
1358                  *      Call the device specific close. This cannot fail.
1359                  *      It is only called if the device is UP.
1360                  *
1361                  *      We allow it to be called even after a DETACH hot-plug
1362                  *      event.
1363                  */
1364                 if (ops->ndo_stop)
1365                         ops->ndo_stop(dev);
1366
1367                 dev->flags &= ~IFF_UP;
1368                 net_dmaengine_put();
1369                 netpoll_poll_enable(dev);
1370         }
1371
1372         return 0;
1373 }
1374
1375 static int __dev_close(struct net_device *dev)
1376 {
1377         int retval;
1378         LIST_HEAD(single);
1379
1380         list_add(&dev->close_list, &single);
1381         retval = __dev_close_many(&single);
1382         list_del(&single);
1383
1384         return retval;
1385 }
1386
1387 static int dev_close_many(struct list_head *head)
1388 {
1389         struct net_device *dev, *tmp;
1390
1391         /* Remove the devices that don't need to be closed */
1392         list_for_each_entry_safe(dev, tmp, head, close_list)
1393                 if (!(dev->flags & IFF_UP))
1394                         list_del_init(&dev->close_list);
1395
1396         __dev_close_many(head);
1397
1398         list_for_each_entry_safe(dev, tmp, head, close_list) {
1399                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1401                 list_del_init(&dev->close_list);
1402         }
1403
1404         return 0;
1405 }
1406
1407 /**
1408  *      dev_close - shutdown an interface.
1409  *      @dev: device to shutdown
1410  *
1411  *      This function moves an active device into down state. A
1412  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *      chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418         if (dev->flags & IFF_UP) {
1419                 LIST_HEAD(single);
1420
1421                 list_add(&dev->close_list, &single);
1422                 dev_close_many(&single);
1423                 list_del(&single);
1424         }
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
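
/* Editorial example (sketch; the function name is made up): both dev_open()
 * and dev_close() expect the caller to hold the RTNL lock.
 */
static void example_toggle(struct net_device *dev)
{
	rtnl_lock();
	if (dev->flags & IFF_UP) {
		dev_close(dev);			/* always returns 0 */
	} else {
		int err = dev_open(dev);

		if (err)
			pr_warn("%s: dev_open failed: %d\n", dev->name, err);
	}
	rtnl_unlock();
}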
1428
1429
1430 /**
1431  *      dev_disable_lro - disable Large Receive Offload on a device
1432  *      @dev: device
1433  *
1434  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *      called under RTNL.  This is needed if received packets may be
1436  *      forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440         /*
1441          * If we're trying to disable LRO on a vlan device,
1442          * use the underlying physical device instead.
1443          */
1444         if (is_vlan_dev(dev))
1445                 dev = vlan_dev_real_dev(dev);
1446
1447         /* the same for macvlan devices */
1448         if (netif_is_macvlan(dev))
1449                 dev = macvlan_dev_real_dev(dev);
1450
1451         dev->wanted_features &= ~NETIF_F_LRO;
1452         netdev_update_features(dev);
1453
1454         if (unlikely(dev->features & NETIF_F_LRO))
1455                 netdev_WARN(dev, "failed to disable LRO!\n");
1456 }
1457 EXPORT_SYMBOL(dev_disable_lro);
1458
1459 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1460                                    struct net_device *dev)
1461 {
1462         struct netdev_notifier_info info;
1463
1464         netdev_notifier_info_init(&info, dev);
1465         return nb->notifier_call(nb, val, &info);
1466 }
1467
1468 static int dev_boot_phase = 1;
1469
1470 /**
1471  *      register_netdevice_notifier - register a network notifier block
1472  *      @nb: notifier
1473  *
1474  *      Register a notifier to be called when network device events occur.
1475  *      The notifier passed is linked into the kernel structures and must
1476  *      not be reused until it has been unregistered. A negative errno code
1477  *      is returned on a failure.
1478  *
1479  *      When registered, all registration and up events are replayed
1480  *      to the new notifier to allow the notifier to have a race-free
1481  *      view of the network device list.
1482  */
1483
1484 int register_netdevice_notifier(struct notifier_block *nb)
1485 {
1486         struct net_device *dev;
1487         struct net_device *last;
1488         struct net *net;
1489         int err;
1490
1491         rtnl_lock();
1492         err = raw_notifier_chain_register(&netdev_chain, nb);
1493         if (err)
1494                 goto unlock;
1495         if (dev_boot_phase)
1496                 goto unlock;
1497         for_each_net(net) {
1498                 for_each_netdev(net, dev) {
1499                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1500                         err = notifier_to_errno(err);
1501                         if (err)
1502                                 goto rollback;
1503
1504                         if (!(dev->flags & IFF_UP))
1505                                 continue;
1506
1507                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1508                 }
1509         }
1510
1511 unlock:
1512         rtnl_unlock();
1513         return err;
1514
1515 rollback:
1516         last = dev;
1517         for_each_net(net) {
1518                 for_each_netdev(net, dev) {
1519                         if (dev == last)
1520                                 goto outroll;
1521
1522                         if (dev->flags & IFF_UP) {
1523                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1524                                                         dev);
1525                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1526                         }
1527                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1528                 }
1529         }
1530
1531 outroll:
1532         raw_notifier_chain_unregister(&netdev_chain, nb);
1533         goto unlock;
1534 }
1535 EXPORT_SYMBOL(register_netdevice_notifier);
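
/* Editorial example (sketch; the handler name and the events it reacts to
 * are made up): register_netdevice_notifier() replays NETDEV_REGISTER and
 * NETDEV_UP for already-existing devices, so the handler sees a complete
 * picture from the moment it is registered.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/* in module init: register_netdevice_notifier(&example_netdev_nb);
 * in module exit: unregister_netdevice_notifier(&example_netdev_nb);
 */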
1536
1537 /**
1538  *      unregister_netdevice_notifier - unregister a network notifier block
1539  *      @nb: notifier
1540  *
1541  *      Unregister a notifier previously registered by
1542  *      register_netdevice_notifier(). The notifier is unlinked from the
1543  *      kernel structures and may then be reused. A negative errno code
1544  *      is returned on a failure.
1545  *
1546  *      After unregistering, unregister and down device events are synthesized
1547  *      for all devices on the device list to the removed notifier to remove
1548  *      the need for special case cleanup code.
1549  */
1550
1551 int unregister_netdevice_notifier(struct notifier_block *nb)
1552 {
1553         struct net_device *dev;
1554         struct net *net;
1555         int err;
1556
1557         rtnl_lock();
1558         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1559         if (err)
1560                 goto unlock;
1561
1562         for_each_net(net) {
1563                 for_each_netdev(net, dev) {
1564                         if (dev->flags & IFF_UP) {
1565                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1566                                                         dev);
1567                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1568                         }
1569                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1570                 }
1571         }
1572 unlock:
1573         rtnl_unlock();
1574         return err;
1575 }
1576 EXPORT_SYMBOL(unregister_netdevice_notifier);
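
/*
 * Illustrative example (not part of dev.c): a minimal sketch of how an
 * out-of-tree module might use the notifier API above.  The names
 * my_netdev_event, my_nb, my_init and my_exit are hypothetical.
 */
#if 0	/* example only */
static int my_netdev_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_UNREGISTER:
		pr_info("%s is going away\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_nb = {
	.notifier_call = my_netdev_event,
};

static int __init my_init(void)
{
	/* Existing devices are replayed as NETDEV_REGISTER/NETDEV_UP. */
	return register_netdevice_notifier(&my_nb);
}

static void __exit my_exit(void)
{
	/* Synthesizes NETDEV_DOWN/NETDEV_UNREGISTER for running devices. */
	unregister_netdevice_notifier(&my_nb);
}
#endif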
1577
1578 /**
1579  *      call_netdevice_notifiers_info - call all network notifier blocks
1580  *      @val: value passed unmodified to notifier function
1581  *      @dev: net_device pointer passed unmodified to notifier function
1582  *      @info: notifier information data
1583  *
1584  *      Call all network notifier blocks.  Parameters and return value
1585  *      are as for raw_notifier_call_chain().
1586  */
1587
1588 static int call_netdevice_notifiers_info(unsigned long val,
1589                                          struct net_device *dev,
1590                                          struct netdev_notifier_info *info)
1591 {
1592         ASSERT_RTNL();
1593         netdev_notifier_info_init(info, dev);
1594         return raw_notifier_call_chain(&netdev_chain, val, info);
1595 }
1596
1597 /**
1598  *      call_netdevice_notifiers - call all network notifier blocks
1599  *      @val: value passed unmodified to notifier function
1600  *      @dev: net_device pointer passed unmodified to notifier function
1601  *
1602  *      Call all network notifier blocks.  Parameters and return value
1603  *      are as for raw_notifier_call_chain().
1604  */
1605
1606 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1607 {
1608         struct netdev_notifier_info info;
1609
1610         return call_netdevice_notifiers_info(val, dev, &info);
1611 }
1612 EXPORT_SYMBOL(call_netdevice_notifiers);
1613
1614 static struct static_key netstamp_needed __read_mostly;
1615 #ifdef HAVE_JUMP_LABEL
1616 /* We are not allowed to call static_key_slow_dec() from irq context
1617  * If net_disable_timestamp() is called from irq context, defer the
1618  * static_key_slow_dec() calls.
1619  */
1620 static atomic_t netstamp_needed_deferred;
1621 #endif
1622
1623 void net_enable_timestamp(void)
1624 {
1625 #ifdef HAVE_JUMP_LABEL
1626         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1627
1628         if (deferred) {
1629                 while (--deferred)
1630                         static_key_slow_dec(&netstamp_needed);
1631                 return;
1632         }
1633 #endif
1634         static_key_slow_inc(&netstamp_needed);
1635 }
1636 EXPORT_SYMBOL(net_enable_timestamp);
1637
1638 void net_disable_timestamp(void)
1639 {
1640 #ifdef HAVE_JUMP_LABEL
1641         if (in_interrupt()) {
1642                 atomic_inc(&netstamp_needed_deferred);
1643                 return;
1644         }
1645 #endif
1646         static_key_slow_dec(&netstamp_needed);
1647 }
1648 EXPORT_SYMBOL(net_disable_timestamp);
1649
1650 static inline void net_timestamp_set(struct sk_buff *skb)
1651 {
1652         skb->tstamp.tv64 = 0;
1653         if (static_key_false(&netstamp_needed))
1654                 __net_timestamp(skb);
1655 }
1656
1657 #define net_timestamp_check(COND, SKB)                  \
1658         if (static_key_false(&netstamp_needed)) {               \
1659                 if ((COND) && !(SKB)->tstamp.tv64)      \
1660                         __net_timestamp(SKB);           \
1661         }                                               \
1662
1663 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1664 {
1665         unsigned int len;
1666
1667         if (!(dev->flags & IFF_UP))
1668                 return false;
1669
1670         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1671         if (skb->len <= len)
1672                 return true;
1673
1674         /* if TSO is enabled, we don't care about the length as the packet
1675          * could be forwarded without being segmented beforehand
1676          */
1677         if (skb_is_gso(skb))
1678                 return true;
1679
1680         return false;
1681 }
1682 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1683
1684 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1685 {
1686         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1687                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1688                         atomic_long_inc(&dev->rx_dropped);
1689                         kfree_skb(skb);
1690                         return NET_RX_DROP;
1691                 }
1692         }
1693
1694         if (unlikely(!is_skb_forwardable(dev, skb))) {
1695                 atomic_long_inc(&dev->rx_dropped);
1696                 kfree_skb(skb);
1697                 return NET_RX_DROP;
1698         }
1699
1700         skb_scrub_packet(skb, true);
1701         skb->protocol = eth_type_trans(skb, dev);
1702
1703         return 0;
1704 }
1705 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1706
1707 /**
1708  * dev_forward_skb - loopback an skb to another netif
1709  *
1710  * @dev: destination network device
1711  * @skb: buffer to forward
1712  *
1713  * return values:
1714  *      NET_RX_SUCCESS  (no congestion)
1715  *      NET_RX_DROP     (packet was dropped, but freed)
1716  *
1717  * dev_forward_skb can be used for injecting an skb from the
1718  * start_xmit function of one device into the receive queue
1719  * of another device.
1720  *
1721  * The receiving device may be in another namespace, so
1722  * we have to clear all information in the skb that could
1723  * impact namespace isolation.
1724  */
1725 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1726 {
1727         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1728 }
1729 EXPORT_SYMBOL_GPL(dev_forward_skb);
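
/*
 * Illustrative example (not part of dev.c): a veth-style ndo_start_xmit
 * sketch that injects a frame into a peer device via dev_forward_skb().
 * struct my_priv and its peer field are hypothetical.
 */
#if 0	/* example only */
struct my_priv {
	struct net_device *peer;
};

static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);
	unsigned int len = skb->len;	/* skb is consumed below */

	if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	} else {
		dev->stats.tx_dropped++;
	}
	return NETDEV_TX_OK;
}
#endif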
1730
1731 static inline int deliver_skb(struct sk_buff *skb,
1732                               struct packet_type *pt_prev,
1733                               struct net_device *orig_dev)
1734 {
1735         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1736                 return -ENOMEM;
1737         atomic_inc(&skb->users);
1738         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1739 }
1740
1741 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1742 {
1743         if (!ptype->af_packet_priv || !skb->sk)
1744                 return false;
1745
1746         if (ptype->id_match)
1747                 return ptype->id_match(ptype, skb->sk);
1748         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1749                 return true;
1750
1751         return false;
1752 }
1753
1754 /*
1755  *      Support routine. Sends outgoing frames to any network
1756  *      taps currently in use.
1757  */
1758
1759 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1760 {
1761         struct packet_type *ptype;
1762         struct sk_buff *skb2 = NULL;
1763         struct packet_type *pt_prev = NULL;
1764
1765         rcu_read_lock();
1766         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1767                 /* Never send packets back to the socket
1768                  * they originated from - MvS (miquels@drinkel.ow.org)
1769                  */
1770                 if ((ptype->dev == dev || !ptype->dev) &&
1771                     (!skb_loop_sk(ptype, skb))) {
1772                         if (pt_prev) {
1773                                 deliver_skb(skb2, pt_prev, skb->dev);
1774                                 pt_prev = ptype;
1775                                 continue;
1776                         }
1777
1778                         skb2 = skb_clone(skb, GFP_ATOMIC);
1779                         if (!skb2)
1780                                 break;
1781
1782                         net_timestamp_set(skb2);
1783
1784                         /* skb->nh should be correctly
1785                          * set by the sender, so that the second statement
1786                          * is just protection against buggy protocols.
1787                          */
1788                         skb_reset_mac_header(skb2);
1789
1790                         if (skb_network_header(skb2) < skb2->data ||
1791                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1792                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1793                                                      ntohs(skb2->protocol),
1794                                                      dev->name);
1795                                 skb_reset_network_header(skb2);
1796                         }
1797
1798                         skb2->transport_header = skb2->network_header;
1799                         skb2->pkt_type = PACKET_OUTGOING;
1800                         pt_prev = ptype;
1801                 }
1802         }
1803         if (pt_prev)
1804                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1805         rcu_read_unlock();
1806 }
1807
1808 /**
1809  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1810  * @dev: Network device
1811  * @txq: number of queues available
1812  *
1813  * If real_num_tx_queues is changed the tc mappings may no longer be
1814  * valid. To resolve this verify that each tc mapping remains valid and,
1815  * if it is not, zero the mapping. With no priorities mapping to an
1816  * offset/count pair, that pair will no longer be used. In the worst
1817  * case, if TC0 is invalid, nothing can be done, so priority mappings
1818  * are disabled entirely. It is expected that drivers will fix this
1819  * mapping, if they can, before calling netif_set_real_num_tx_queues.
1820  */
1821 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1822 {
1823         int i;
1824         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1825
1826         /* If TC0 is invalidated disable TC mapping */
1827         if (tc->offset + tc->count > txq) {
1828                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1829                 dev->num_tc = 0;
1830                 return;
1831         }
1832
1833         /* Invalidated prio to tc mappings set to TC0 */
1834         for (i = 1; i < TC_BITMASK + 1; i++) {
1835                 int q = netdev_get_prio_tc_map(dev, i);
1836
1837                 tc = &dev->tc_to_txq[q];
1838                 if (tc->offset + tc->count > txq) {
1839                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1840                                 i, q);
1841                         netdev_set_prio_tc_map(dev, i, 0);
1842                 }
1843         }
1844 }
1845
1846 #ifdef CONFIG_XPS
1847 static DEFINE_MUTEX(xps_map_mutex);
1848 #define xmap_dereference(P)             \
1849         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1850
1851 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1852                                         int cpu, u16 index)
1853 {
1854         struct xps_map *map = NULL;
1855         int pos;
1856
1857         if (dev_maps)
1858                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1859
1860         for (pos = 0; map && pos < map->len; pos++) {
1861                 if (map->queues[pos] == index) {
1862                         if (map->len > 1) {
1863                                 map->queues[pos] = map->queues[--map->len];
1864                         } else {
1865                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1866                                 kfree_rcu(map, rcu);
1867                                 map = NULL;
1868                         }
1869                         break;
1870                 }
1871         }
1872
1873         return map;
1874 }
1875
1876 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1877 {
1878         struct xps_dev_maps *dev_maps;
1879         int cpu, i;
1880         bool active = false;
1881
1882         mutex_lock(&xps_map_mutex);
1883         dev_maps = xmap_dereference(dev->xps_maps);
1884
1885         if (!dev_maps)
1886                 goto out_no_maps;
1887
1888         for_each_possible_cpu(cpu) {
1889                 for (i = index; i < dev->num_tx_queues; i++) {
1890                         if (!remove_xps_queue(dev_maps, cpu, i))
1891                                 break;
1892                 }
1893                 if (i == dev->num_tx_queues)
1894                         active = true;
1895         }
1896
1897         if (!active) {
1898                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1899                 kfree_rcu(dev_maps, rcu);
1900         }
1901
1902         for (i = index; i < dev->num_tx_queues; i++)
1903                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1904                                              NUMA_NO_NODE);
1905
1906 out_no_maps:
1907         mutex_unlock(&xps_map_mutex);
1908 }
1909
1910 static struct xps_map *expand_xps_map(struct xps_map *map,
1911                                       int cpu, u16 index)
1912 {
1913         struct xps_map *new_map;
1914         int alloc_len = XPS_MIN_MAP_ALLOC;
1915         int i, pos;
1916
1917         for (pos = 0; map && pos < map->len; pos++) {
1918                 if (map->queues[pos] != index)
1919                         continue;
1920                 return map;
1921         }
1922
1923         /* Need to add queue to this CPU's existing map */
1924         if (map) {
1925                 if (pos < map->alloc_len)
1926                         return map;
1927
1928                 alloc_len = map->alloc_len * 2;
1929         }
1930
1931         /* Need to allocate a new map to store the queue for this CPU */
1932         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1933                                cpu_to_node(cpu));
1934         if (!new_map)
1935                 return NULL;
1936
1937         for (i = 0; i < pos; i++)
1938                 new_map->queues[i] = map->queues[i];
1939         new_map->alloc_len = alloc_len;
1940         new_map->len = pos;
1941
1942         return new_map;
1943 }
1944
1945 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1946                         u16 index)
1947 {
1948         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1949         struct xps_map *map, *new_map;
1950         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1951         int cpu, numa_node_id = -2;
1952         bool active = false;
1953
1954         mutex_lock(&xps_map_mutex);
1955
1956         dev_maps = xmap_dereference(dev->xps_maps);
1957
1958         /* allocate memory for queue storage */
1959         for_each_online_cpu(cpu) {
1960                 if (!cpumask_test_cpu(cpu, mask))
1961                         continue;
1962
1963                 if (!new_dev_maps)
1964                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1965                 if (!new_dev_maps) {
1966                         mutex_unlock(&xps_map_mutex);
1967                         return -ENOMEM;
1968                 }
1969
1970                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1971                                  NULL;
1972
1973                 map = expand_xps_map(map, cpu, index);
1974                 if (!map)
1975                         goto error;
1976
1977                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1978         }
1979
1980         if (!new_dev_maps)
1981                 goto out_no_new_maps;
1982
1983         for_each_possible_cpu(cpu) {
1984                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1985                         /* add queue to CPU maps */
1986                         int pos = 0;
1987
1988                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1989                         while ((pos < map->len) && (map->queues[pos] != index))
1990                                 pos++;
1991
1992                         if (pos == map->len)
1993                                 map->queues[map->len++] = index;
1994 #ifdef CONFIG_NUMA
1995                         if (numa_node_id == -2)
1996                                 numa_node_id = cpu_to_node(cpu);
1997                         else if (numa_node_id != cpu_to_node(cpu))
1998                                 numa_node_id = -1;
1999 #endif
2000                 } else if (dev_maps) {
2001                         /* fill in the new device map from the old device map */
2002                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2003                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2004                 }
2005
2006         }
2007
2008         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2009
2010         /* Cleanup old maps */
2011         if (dev_maps) {
2012                 for_each_possible_cpu(cpu) {
2013                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2014                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2015                         if (map && map != new_map)
2016                                 kfree_rcu(map, rcu);
2017                 }
2018
2019                 kfree_rcu(dev_maps, rcu);
2020         }
2021
2022         dev_maps = new_dev_maps;
2023         active = true;
2024
2025 out_no_new_maps:
2026         /* update Tx queue numa node */
2027         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2028                                      (numa_node_id >= 0) ? numa_node_id :
2029                                      NUMA_NO_NODE);
2030
2031         if (!dev_maps)
2032                 goto out_no_maps;
2033
2034         /* remove the queue from any CPU that is no longer in the mask */
2035         for_each_possible_cpu(cpu) {
2036                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2037                         continue;
2038
2039                 if (remove_xps_queue(dev_maps, cpu, index))
2040                         active = true;
2041         }
2042
2043         /* free map if not active */
2044         if (!active) {
2045                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2046                 kfree_rcu(dev_maps, rcu);
2047         }
2048
2049 out_no_maps:
2050         mutex_unlock(&xps_map_mutex);
2051
2052         return 0;
2053 error:
2054         /* remove any maps that we added */
2055         for_each_possible_cpu(cpu) {
2056                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2057                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2058                                  NULL;
2059                 if (new_map && new_map != map)
2060                         kfree(new_map);
2061         }
2062
2063         mutex_unlock(&xps_map_mutex);
2064
2065         kfree(new_dev_maps);
2066         return -ENOMEM;
2067 }
2068 EXPORT_SYMBOL(netif_set_xps_queue);
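
/*
 * Illustrative example (not part of dev.c): a driver sketch that pins one
 * TX queue to each online CPU via netif_set_xps_queue().  It assumes the
 * device has at least as many TX queues as CPUs; my_setup_xps is
 * hypothetical and return values are ignored for brevity.
 */
#if 0	/* example only */
static void my_setup_xps(struct net_device *dev)
{
	cpumask_var_t mask;
	int cpu;
	u16 q = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	for_each_online_cpu(cpu) {
		if (q >= dev->real_num_tx_queues)
			break;
		cpumask_clear(mask);
		cpumask_set_cpu(cpu, mask);
		netif_set_xps_queue(dev, mask, q++);
	}
	free_cpumask_var(mask);
}
#endif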
2069
2070 #endif
2071 /*
2072  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2073  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2074  */
2075 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2076 {
2077         int rc;
2078
2079         if (txq < 1 || txq > dev->num_tx_queues)
2080                 return -EINVAL;
2081
2082         if (dev->reg_state == NETREG_REGISTERED ||
2083             dev->reg_state == NETREG_UNREGISTERING) {
2084                 ASSERT_RTNL();
2085
2086                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2087                                                   txq);
2088                 if (rc)
2089                         return rc;
2090
2091                 if (dev->num_tc)
2092                         netif_setup_tc(dev, txq);
2093
2094                 if (txq < dev->real_num_tx_queues) {
2095                         qdisc_reset_all_tx_gt(dev, txq);
2096 #ifdef CONFIG_XPS
2097                         netif_reset_xps_queues_gt(dev, txq);
2098 #endif
2099                 }
2100         }
2101
2102         dev->real_num_tx_queues = txq;
2103         return 0;
2104 }
2105 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2106
2107 #ifdef CONFIG_SYSFS
2108 /**
2109  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2110  *      @dev: Network device
2111  *      @rxq: Actual number of RX queues
2112  *
2113  *      This must be called either with the rtnl_lock held or before
2114  *      registration of the net device.  Returns 0 on success, or a
2115  *      negative error code.  If called before registration, it always
2116  *      succeeds.
2117  */
2118 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2119 {
2120         int rc;
2121
2122         if (rxq < 1 || rxq > dev->num_rx_queues)
2123                 return -EINVAL;
2124
2125         if (dev->reg_state == NETREG_REGISTERED) {
2126                 ASSERT_RTNL();
2127
2128                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2129                                                   rxq);
2130                 if (rc)
2131                         return rc;
2132         }
2133
2134         dev->real_num_rx_queues = rxq;
2135         return 0;
2136 }
2137 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2138 #endif
2139
2140 /**
2141  * netif_get_num_default_rss_queues - default number of RSS queues
2142  *
2143  * This routine should set an upper limit on the number of RSS queues
2144  * used by default by multiqueue devices.
2145  */
2146 int netif_get_num_default_rss_queues(void)
2147 {
2148         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2149 }
2150 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
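
/*
 * Illustrative example (not part of dev.c): a probe-time sketch that caps
 * the active queue count to the default RSS limit using the helpers above.
 * my_probe_queues is hypothetical; called before registration (or under
 * rtnl), both calls below are valid.
 */
#if 0	/* example only */
static int my_probe_queues(struct net_device *dev)
{
	unsigned int nq = min_t(unsigned int, dev->num_tx_queues,
				netif_get_num_default_rss_queues());
	int err;

	err = netif_set_real_num_tx_queues(dev, nq);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, nq);
}
#endif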
2151
2152 static inline void __netif_reschedule(struct Qdisc *q)
2153 {
2154         struct softnet_data *sd;
2155         unsigned long flags;
2156
2157         local_irq_save(flags);
2158         sd = &__get_cpu_var(softnet_data);
2159         q->next_sched = NULL;
2160         *sd->output_queue_tailp = q;
2161         sd->output_queue_tailp = &q->next_sched;
2162         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2163         local_irq_restore(flags);
2164 }
2165
2166 void __netif_schedule(struct Qdisc *q)
2167 {
2168         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2169                 __netif_reschedule(q);
2170 }
2171 EXPORT_SYMBOL(__netif_schedule);
2172
2173 struct dev_kfree_skb_cb {
2174         enum skb_free_reason reason;
2175 };
2176
2177 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2178 {
2179         return (struct dev_kfree_skb_cb *)skb->cb;
2180 }
2181
2182 void netif_schedule_queue(struct netdev_queue *txq)
2183 {
2184         rcu_read_lock();
2185         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2186                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2187
2188                 __netif_schedule(q);
2189         }
2190         rcu_read_unlock();
2191 }
2192 EXPORT_SYMBOL(netif_schedule_queue);
2193
2194 /**
2195  *      netif_wake_subqueue - allow sending packets on subqueue
2196  *      @dev: network device
2197  *      @queue_index: sub queue index
2198  *
2199  * Resume individual transmit queue of a device with multiple transmit queues.
2200  */
2201 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2202 {
2203         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2204
2205         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2206                 struct Qdisc *q;
2207
2208                 rcu_read_lock();
2209                 q = rcu_dereference(txq->qdisc);
2210                 __netif_schedule(q);
2211                 rcu_read_unlock();
2212         }
2213 }
2214 EXPORT_SYMBOL(netif_wake_subqueue);
2215
2216 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2217 {
2218         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2219                 struct Qdisc *q;
2220
2221                 rcu_read_lock();
2222                 q = rcu_dereference(dev_queue->qdisc);
2223                 __netif_schedule(q);
2224                 rcu_read_unlock();
2225         }
2226 }
2227 EXPORT_SYMBOL(netif_tx_wake_queue);
2228
2229 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2230 {
2231         unsigned long flags;
2232
2233         if (likely(atomic_read(&skb->users) == 1)) {
2234                 smp_rmb();
2235                 atomic_set(&skb->users, 0);
2236         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2237                 return;
2238         }
2239         get_kfree_skb_cb(skb)->reason = reason;
2240         local_irq_save(flags);
2241         skb->next = __this_cpu_read(softnet_data.completion_queue);
2242         __this_cpu_write(softnet_data.completion_queue, skb);
2243         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2244         local_irq_restore(flags);
2245 }
2246 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2247
2248 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2249 {
2250         if (in_irq() || irqs_disabled())
2251                 __dev_kfree_skb_irq(skb, reason);
2252         else
2253                 dev_kfree_skb(skb);
2254 }
2255 EXPORT_SYMBOL(__dev_kfree_skb_any);
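
/*
 * Illustrative example (not part of dev.c): a TX-completion sketch that may
 * run in hard-IRQ context, so it must use the *_any() helpers above rather
 * than plain kfree_skb()/consume_skb().  my_tx_complete is hypothetical.
 */
#if 0	/* example only */
static void my_tx_complete(struct sk_buff *skb, bool sent_ok)
{
	if (sent_ok)
		dev_consume_skb_any(skb);	/* successful TX, not a drop */
	else
		dev_kfree_skb_any(skb);		/* visible to drop monitors */
}
#endif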
2256
2257
2258 /**
2259  * netif_device_detach - mark device as removed
2260  * @dev: network device
2261  *
2262  * Mark the device as removed from the system and therefore no longer available.
2263  */
2264 void netif_device_detach(struct net_device *dev)
2265 {
2266         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2267             netif_running(dev)) {
2268                 netif_tx_stop_all_queues(dev);
2269         }
2270 }
2271 EXPORT_SYMBOL(netif_device_detach);
2272
2273 /**
2274  * netif_device_attach - mark device as attached
2275  * @dev: network device
2276  *
2277  * Mark the device as attached to the system and restart it if needed.
2278  */
2279 void netif_device_attach(struct net_device *dev)
2280 {
2281         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2282             netif_running(dev)) {
2283                 netif_tx_wake_all_queues(dev);
2284                 __netdev_watchdog_up(dev);
2285         }
2286 }
2287 EXPORT_SYMBOL(netif_device_attach);
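
/*
 * Illustrative example (not part of dev.c): suspend/resume callbacks using
 * netif_device_detach()/netif_device_attach().  my_suspend/my_resume are
 * hypothetical and any hardware quiescing is omitted.
 */
#if 0	/* example only */
static int my_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);	/* stops all TX queues if running */
	return 0;
}

static int my_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_attach(dev);	/* wakes queues and re-arms watchdog */
	return 0;
}
#endif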
2288
2289 static void skb_warn_bad_offload(const struct sk_buff *skb)
2290 {
2291         static const netdev_features_t null_features = 0;
2292         struct net_device *dev = skb->dev;
2293         const char *driver = "";
2294
2295         if (!net_ratelimit())
2296                 return;
2297
2298         if (dev && dev->dev.parent)
2299                 driver = dev_driver_string(dev->dev.parent);
2300
2301         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2302              "gso_type=%d ip_summed=%d\n",
2303              driver, dev ? &dev->features : &null_features,
2304              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2305              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2306              skb_shinfo(skb)->gso_type, skb->ip_summed);
2307 }
2308
2309 /*
2310  * Invalidate the hardware checksum when a packet is to be mangled, and
2311  * complete the checksum manually on the outgoing path.
2312  */
2313 int skb_checksum_help(struct sk_buff *skb)
2314 {
2315         __wsum csum;
2316         int ret = 0, offset;
2317
2318         if (skb->ip_summed == CHECKSUM_COMPLETE)
2319                 goto out_set_summed;
2320
2321         if (unlikely(skb_shinfo(skb)->gso_size)) {
2322                 skb_warn_bad_offload(skb);
2323                 return -EINVAL;
2324         }
2325
2326         /* Before computing a checksum, we should make sure no frag could
2327          * be modified by an external entity: the checksum could be wrong.
2328          */
2329         if (skb_has_shared_frag(skb)) {
2330                 ret = __skb_linearize(skb);
2331                 if (ret)
2332                         goto out;
2333         }
2334
2335         offset = skb_checksum_start_offset(skb);
2336         BUG_ON(offset >= skb_headlen(skb));
2337         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2338
2339         offset += skb->csum_offset;
2340         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2341
2342         if (skb_cloned(skb) &&
2343             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2344                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2345                 if (ret)
2346                         goto out;
2347         }
2348
2349         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2350 out_set_summed:
2351         skb->ip_summed = CHECKSUM_NONE;
2352 out:
2353         return ret;
2354 }
2355 EXPORT_SYMBOL(skb_checksum_help);
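
/*
 * Illustrative example (not part of dev.c): a transmit-path sketch showing
 * the usual software fallback when hardware cannot checksum a given skb.
 * my_hw_can_csum() is a hypothetical capability check.
 */
#if 0	/* example only */
static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !my_hw_can_csum(skb) && skb_checksum_help(skb)) {
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* ... queue the skb to the hardware ... */
	return NETDEV_TX_OK;
}
#endif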
2356
2357 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2358 {
2359         unsigned int vlan_depth = skb->mac_len;
2360         __be16 type = skb->protocol;
2361
2362         /* Tunnel gso handlers can set protocol to ethernet. */
2363         if (type == htons(ETH_P_TEB)) {
2364                 struct ethhdr *eth;
2365
2366                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2367                         return 0;
2368
2369                 eth = (struct ethhdr *)skb_mac_header(skb);
2370                 type = eth->h_proto;
2371         }
2372
2373         /* if skb->protocol is 802.1Q/AD then the header should already be
2374          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2375          * ETH_HLEN otherwise
2376          */
2377         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2378                 if (vlan_depth) {
2379                         if (WARN_ON(vlan_depth < VLAN_HLEN))
2380                                 return 0;
2381                         vlan_depth -= VLAN_HLEN;
2382                 } else {
2383                         vlan_depth = ETH_HLEN;
2384                 }
2385                 do {
2386                         struct vlan_hdr *vh;
2387
2388                         if (unlikely(!pskb_may_pull(skb,
2389                                                     vlan_depth + VLAN_HLEN)))
2390                                 return 0;
2391
2392                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2393                         type = vh->h_vlan_encapsulated_proto;
2394                         vlan_depth += VLAN_HLEN;
2395                 } while (type == htons(ETH_P_8021Q) ||
2396                          type == htons(ETH_P_8021AD));
2397         }
2398
2399         *depth = vlan_depth;
2400
2401         return type;
2402 }
2403
2404 /**
2405  *      skb_mac_gso_segment - mac layer segmentation handler.
2406  *      @skb: buffer to segment
2407  *      @features: features for the output path (see dev->features)
2408  */
2409 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2410                                     netdev_features_t features)
2411 {
2412         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2413         struct packet_offload *ptype;
2414         int vlan_depth = skb->mac_len;
2415         __be16 type = skb_network_protocol(skb, &vlan_depth);
2416
2417         if (unlikely(!type))
2418                 return ERR_PTR(-EINVAL);
2419
2420         __skb_pull(skb, vlan_depth);
2421
2422         rcu_read_lock();
2423         list_for_each_entry_rcu(ptype, &offload_base, list) {
2424                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2425                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2426                                 int err;
2427
2428                                 err = ptype->callbacks.gso_send_check(skb);
2429                                 segs = ERR_PTR(err);
2430                                 if (err || skb_gso_ok(skb, features))
2431                                         break;
2432                                 __skb_push(skb, (skb->data -
2433                                                  skb_network_header(skb)));
2434                         }
2435                         segs = ptype->callbacks.gso_segment(skb, features);
2436                         break;
2437                 }
2438         }
2439         rcu_read_unlock();
2440
2441         __skb_push(skb, skb->data - skb_mac_header(skb));
2442
2443         return segs;
2444 }
2445 EXPORT_SYMBOL(skb_mac_gso_segment);
2446
2447
2448 /* openvswitch calls this on rx path, so we need a different check.
2449  */
2450 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2451 {
2452         if (tx_path)
2453                 return skb->ip_summed != CHECKSUM_PARTIAL;
2454         else
2455                 return skb->ip_summed == CHECKSUM_NONE;
2456 }
2457
2458 /**
2459  *      __skb_gso_segment - Perform segmentation on skb.
2460  *      @skb: buffer to segment
2461  *      @features: features for the output path (see dev->features)
2462  *      @tx_path: whether it is called in TX path
2463  *
2464  *      This function segments the given skb and returns a list of segments.
2465  *
2466  *      It may return NULL if the skb requires no segmentation.  This is
2467  *      only possible when GSO is used for verifying header integrity.
2468  */
2469 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2470                                   netdev_features_t features, bool tx_path)
2471 {
2472         if (unlikely(skb_needs_check(skb, tx_path))) {
2473                 int err;
2474
2475                 skb_warn_bad_offload(skb);
2476
2477                 err = skb_cow_head(skb, 0);
2478                 if (err < 0)
2479                         return ERR_PTR(err);
2480         }
2481
2482         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2483         SKB_GSO_CB(skb)->encap_level = 0;
2484
2485         skb_reset_mac_header(skb);
2486         skb_reset_mac_len(skb);
2487
2488         return skb_mac_gso_segment(skb, features);
2489 }
2490 EXPORT_SYMBOL(__skb_gso_segment);
2491
2492 /* Take action when hardware reception checksum errors are detected. */
2493 #ifdef CONFIG_BUG
2494 void netdev_rx_csum_fault(struct net_device *dev)
2495 {
2496         if (net_ratelimit()) {
2497                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2498                 dump_stack();
2499         }
2500 }
2501 EXPORT_SYMBOL(netdev_rx_csum_fault);
2502 #endif
2503
2504 /* Actually, we should eliminate this check as soon as we know that:
2505  * 1. An IOMMU is present and can map all of the memory.
2506  * 2. No high memory really exists on this machine.
2507  */
2508
2509 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2510 {
2511 #ifdef CONFIG_HIGHMEM
2512         int i;
2513         if (!(dev->features & NETIF_F_HIGHDMA)) {
2514                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2515                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2516                         if (PageHighMem(skb_frag_page(frag)))
2517                                 return 1;
2518                 }
2519         }
2520
2521         if (PCI_DMA_BUS_IS_PHYS) {
2522                 struct device *pdev = dev->dev.parent;
2523
2524                 if (!pdev)
2525                         return 0;
2526                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2527                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2528                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2529                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2530                                 return 1;
2531                 }
2532         }
2533 #endif
2534         return 0;
2535 }
2536
2537 /* For an MPLS offload request, verify we are testing hardware MPLS features
2538  * instead of the standard features for the netdev.
2539  */
2540 #ifdef CONFIG_NET_MPLS_GSO
2541 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2542                                            netdev_features_t features,
2543                                            __be16 type)
2544 {
2545         if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2546                 features &= skb->dev->mpls_features;
2547
2548         return features;
2549 }
2550 #else
2551 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2552                                            netdev_features_t features,
2553                                            __be16 type)
2554 {
2555         return features;
2556 }
2557 #endif
2558
2559 static netdev_features_t harmonize_features(struct sk_buff *skb,
2560         netdev_features_t features)
2561 {
2562         int tmp;
2563         __be16 type;
2564
2565         type = skb_network_protocol(skb, &tmp);
2566         features = net_mpls_features(skb, features, type);
2567
2568         if (skb->ip_summed != CHECKSUM_NONE &&
2569             !can_checksum_protocol(features, type)) {
2570                 features &= ~NETIF_F_ALL_CSUM;
2571         } else if (illegal_highdma(skb->dev, skb)) {
2572                 features &= ~NETIF_F_SG;
2573         }
2574
2575         return features;
2576 }
2577
2578 netdev_features_t netif_skb_features(struct sk_buff *skb)
2579 {
2580         __be16 protocol = skb->protocol;
2581         netdev_features_t features = skb->dev->features;
2582
2583         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2584                 features &= ~NETIF_F_GSO_MASK;
2585
2586         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2587                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2588                 protocol = veh->h_vlan_encapsulated_proto;
2589         } else if (!vlan_tx_tag_present(skb)) {
2590                 return harmonize_features(skb, features);
2591         }
2592
2593         features = netdev_intersect_features(features,
2594                                              skb->dev->vlan_features |
2595                                              NETIF_F_HW_VLAN_CTAG_TX |
2596                                              NETIF_F_HW_VLAN_STAG_TX);
2597
2598         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2599                 features = netdev_intersect_features(features,
2600                                                      NETIF_F_SG |
2601                                                      NETIF_F_HIGHDMA |
2602                                                      NETIF_F_FRAGLIST |
2603                                                      NETIF_F_GEN_CSUM |
2604                                                      NETIF_F_HW_VLAN_CTAG_TX |
2605                                                      NETIF_F_HW_VLAN_STAG_TX);
2606
2607         return harmonize_features(skb, features);
2608 }
2609 EXPORT_SYMBOL(netif_skb_features);
2610
2611 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2612                     struct netdev_queue *txq, bool more)
2613 {
2614         unsigned int len;
2615         int rc;
2616
2617         if (!list_empty(&ptype_all))
2618                 dev_queue_xmit_nit(skb, dev);
2619
2620         len = skb->len;
2621         trace_net_dev_start_xmit(skb, dev);
2622         rc = netdev_start_xmit(skb, dev, txq, more);
2623         trace_net_dev_xmit(skb, rc, dev, len);
2624
2625         return rc;
2626 }
2627
2628 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2629                                     struct netdev_queue *txq, int *ret)
2630 {
2631         struct sk_buff *skb = first;
2632         int rc = NETDEV_TX_OK;
2633
2634         while (skb) {
2635                 struct sk_buff *next = skb->next;
2636
2637                 skb->next = NULL;
2638                 rc = xmit_one(skb, dev, txq, next != NULL);
2639                 if (unlikely(!dev_xmit_complete(rc))) {
2640                         skb->next = next;
2641                         goto out;
2642                 }
2643
2644                 skb = next;
2645                 if (netif_xmit_stopped(txq) && skb) {
2646                         rc = NETDEV_TX_BUSY;
2647                         break;
2648                 }
2649         }
2650
2651 out:
2652         *ret = rc;
2653         return skb;
2654 }
2655
2656 struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features)
2657 {
2658         if (vlan_tx_tag_present(skb) &&
2659             !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2660                 skb = __vlan_put_tag(skb, skb->vlan_proto,
2661                                      vlan_tx_tag_get(skb));
2662                 if (skb)
2663                         skb->vlan_tci = 0;
2664         }
2665         return skb;
2666 }
2667
2668 struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2669 {
2670         netdev_features_t features;
2671
2672         if (skb->next)
2673                 return skb;
2674
2675         /* If the device doesn't need skb->dst, release it right now while
2676          * it's hot in this cpu's cache
2677          */
2678         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2679                 skb_dst_drop(skb);
2680
2681         features = netif_skb_features(skb);
2682         skb = validate_xmit_vlan(skb, features);
2683         if (unlikely(!skb))
2684                 goto out_null;
2685
2686         /* For an encapsulation offload request, verify we are testing
2687          * hardware encapsulation features instead of the standard
2688          * features for the netdev.
2689          */
2690         if (skb->encapsulation)
2691                 features &= dev->hw_enc_features;
2692
2693         if (netif_needs_gso(skb, features)) {
2694                 struct sk_buff *segs;
2695
2696                 segs = skb_gso_segment(skb, features);
2697                 if (IS_ERR(segs)) {
2698                         segs = NULL;
2699                 } else if (segs) {
2700                         consume_skb(skb);
2701                         skb = segs;
2702                 }
2703         } else {
2704                 if (skb_needs_linearize(skb, features) &&
2705                     __skb_linearize(skb))
2706                         goto out_kfree_skb;
2707
2708                 /* If the packet is not checksummed and the device does not
2709                  * support checksumming for this protocol, complete the
2710                  * checksumming here.
2711                  */
2712                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2713                         if (skb->encapsulation)
2714                                 skb_set_inner_transport_header(skb,
2715                                                                skb_checksum_start_offset(skb));
2716                         else
2717                                 skb_set_transport_header(skb,
2718                                                          skb_checksum_start_offset(skb));
2719                         if (!(features & NETIF_F_ALL_CSUM) &&
2720                             skb_checksum_help(skb))
2721                                 goto out_kfree_skb;
2722                 }
2723         }
2724
2725         return skb;
2726
2727 out_kfree_skb:
2728         kfree_skb(skb);
2729 out_null:
2730         return NULL;
2731 }
2732
2733 static void qdisc_pkt_len_init(struct sk_buff *skb)
2734 {
2735         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2736
2737         qdisc_skb_cb(skb)->pkt_len = skb->len;
2738
2739         /* To get a more precise estimate of the bytes sent on the wire,
2740          * we add the header size of every segment to pkt_len.
2741          */
2742         if (shinfo->gso_size)  {
2743                 unsigned int hdr_len;
2744                 u16 gso_segs = shinfo->gso_segs;
2745
2746                 /* mac layer + network layer */
2747                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2748
2749                 /* + transport layer */
2750                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2751                         hdr_len += tcp_hdrlen(skb);
2752                 else
2753                         hdr_len += sizeof(struct udphdr);
2754
2755                 if (shinfo->gso_type & SKB_GSO_DODGY)
2756                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2757                                                 shinfo->gso_size);
2758
2759                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2760         }
2761 }
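
/*
 * Worked example with illustrative numbers: a TSO skb with skb->len = 65226,
 * gso_size = 1448 and 66 bytes of headers (14 Ethernet + 20 IPv4 + 32 TCP
 * with timestamps) carries gso_segs = 45.  qdisc_pkt_len_init() then sets
 * pkt_len = 65226 + (45 - 1) * 66 = 68130, roughly the bytes that will go
 * on the wire once the skb is segmented.
 */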
2762
2763 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2764                                  struct net_device *dev,
2765                                  struct netdev_queue *txq)
2766 {
2767         spinlock_t *root_lock = qdisc_lock(q);
2768         bool contended;
2769         int rc;
2770
2771         qdisc_pkt_len_init(skb);
2772         qdisc_calculate_pkt_len(skb, q);
2773         /*
2774          * Heuristic to force contended enqueues to serialize on a
2775          * separate lock before trying to get qdisc main lock.
2776          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2777          * often and dequeue packets faster.
2778          */
2779         contended = qdisc_is_running(q);
2780         if (unlikely(contended))
2781                 spin_lock(&q->busylock);
2782
2783         spin_lock(root_lock);
2784         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2785                 kfree_skb(skb);
2786                 rc = NET_XMIT_DROP;
2787         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2788                    qdisc_run_begin(q)) {
2789                 /*
2790                  * This is a work-conserving queue; there are no old skbs
2791                  * waiting to be sent out; and the qdisc is not running -
2792                  * xmit the skb directly.
2793                  */
2794                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2795                         skb_dst_force(skb);
2796
2797                 qdisc_bstats_update(q, skb);
2798
2799                 skb = validate_xmit_skb(skb, dev);
2800                 if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2801                         if (unlikely(contended)) {
2802                                 spin_unlock(&q->busylock);
2803                                 contended = false;
2804                         }
2805                         __qdisc_run(q);
2806                 } else
2807                         qdisc_run_end(q);
2808
2809                 rc = NET_XMIT_SUCCESS;
2810         } else {
2811                 skb_dst_force(skb);
2812                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2813                 if (qdisc_run_begin(q)) {
2814                         if (unlikely(contended)) {
2815                                 spin_unlock(&q->busylock);
2816                                 contended = false;
2817                         }
2818                         __qdisc_run(q);
2819                 }
2820         }
2821         spin_unlock(root_lock);
2822         if (unlikely(contended))
2823                 spin_unlock(&q->busylock);
2824         return rc;
2825 }
2826
2827 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2828 static void skb_update_prio(struct sk_buff *skb)
2829 {
2830         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2831
2832         if (!skb->priority && skb->sk && map) {
2833                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2834
2835                 if (prioidx < map->priomap_len)
2836                         skb->priority = map->priomap[prioidx];
2837         }
2838 }
2839 #else
2840 #define skb_update_prio(skb)
2841 #endif
2842
2843 static DEFINE_PER_CPU(int, xmit_recursion);
2844 #define RECURSION_LIMIT 10
2845
2846 /**
2847  *      dev_loopback_xmit - loop back @skb
2848  *      @skb: buffer to transmit
2849  */
2850 int dev_loopback_xmit(struct sk_buff *skb)
2851 {
2852         skb_reset_mac_header(skb);
2853         __skb_pull(skb, skb_network_offset(skb));
2854         skb->pkt_type = PACKET_LOOPBACK;
2855         skb->ip_summed = CHECKSUM_UNNECESSARY;
2856         WARN_ON(!skb_dst(skb));
2857         skb_dst_force(skb);
2858         netif_rx_ni(skb);
2859         return 0;
2860 }
2861 EXPORT_SYMBOL(dev_loopback_xmit);
2862
2863 /**
2864  *      __dev_queue_xmit - transmit a buffer
2865  *      @skb: buffer to transmit
2866  *      @accel_priv: private data used for L2 forwarding offload
2867  *
2868  *      Queue a buffer for transmission to a network device. The caller must
2869  *      have set the device and priority and built the buffer before calling
2870  *      this function. The function can be called from an interrupt.
2871  *
2872  *      A negative errno code is returned on a failure. A success does not
2873  *      guarantee the frame will be transmitted as it may be dropped due
2874  *      to congestion or traffic shaping.
2875  *
2876  * -----------------------------------------------------------------------------------
2877  *      I notice this method can also return errors from the queue disciplines,
2878  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2879  *      be positive.
2880  *
2881  *      Regardless of the return value, the skb is consumed, so it is currently
2882  *      difficult to retry a send to this method.  (You can bump the ref count
2883  *      before sending to hold a reference for retry if you are careful.)
2884  *
2885  *      When calling this method, interrupts MUST be enabled.  This is because
2886  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2887  *          --BLG
2888  */
2889 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2890 {
2891         struct net_device *dev = skb->dev;
2892         struct netdev_queue *txq;
2893         struct Qdisc *q;
2894         int rc = -ENOMEM;
2895
2896         skb_reset_mac_header(skb);
2897
2898         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2899                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2900
2901         /* Disable soft irqs for various locks below. Also
2902          * stops preemption for RCU.
2903          */
2904         rcu_read_lock_bh();
2905
2906         skb_update_prio(skb);
2907
2908         txq = netdev_pick_tx(dev, skb, accel_priv);
2909         q = rcu_dereference_bh(txq->qdisc);
2910
2911 #ifdef CONFIG_NET_CLS_ACT
2912         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2913 #endif
2914         trace_net_dev_queue(skb);
2915         if (q->enqueue) {
2916                 rc = __dev_xmit_skb(skb, q, dev, txq);
2917                 goto out;
2918         }
2919
2920         /* The device has no queue. Common case for software devices:
2921          * loopback, all sorts of tunnels...
2922          *
2923          * Really, it is unlikely that netif_tx_lock protection is necessary
2924          * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2925          * counters.)
2926          * However, it is possible that they rely on the protection
2927          * made by us here.
2928          *
2929          * Check this and take the lock. It is not prone to deadlocks.
2930          * Or shoot the noqueue qdisc entirely; it is even simpler 8)
2931          */
2932         if (dev->flags & IFF_UP) {
2933                 int cpu = smp_processor_id(); /* ok because BHs are off */
2934
2935                 if (txq->xmit_lock_owner != cpu) {
2936
2937                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2938                                 goto recursion_alert;
2939
2940                         skb = validate_xmit_skb(skb, dev);
2941                         if (!skb)
2942                                 goto drop;
2943
2944                         HARD_TX_LOCK(dev, txq, cpu);
2945
2946                         if (!netif_xmit_stopped(txq)) {
2947                                 __this_cpu_inc(xmit_recursion);
2948                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2949                                 __this_cpu_dec(xmit_recursion);
2950                                 if (dev_xmit_complete(rc)) {
2951                                         HARD_TX_UNLOCK(dev, txq);
2952                                         goto out;
2953                                 }
2954                         }
2955                         HARD_TX_UNLOCK(dev, txq);
2956                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2957                                              dev->name);
2958                 } else {
2959                         /* Recursion is detected! It is possible,
2960                          * unfortunately
2961                          */
2962 recursion_alert:
2963                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2964                                              dev->name);
2965                 }
2966         }
2967
2968         rc = -ENETDOWN;
2969 drop:
2970         rcu_read_unlock_bh();
2971
2972         atomic_long_inc(&dev->tx_dropped);
2973         kfree_skb_list(skb);
2974         return rc;
2975 out:
2976         rcu_read_unlock_bh();
2977         return rc;
2978 }
2979
2980 int dev_queue_xmit(struct sk_buff *skb)
2981 {
2982         return __dev_queue_xmit(skb, NULL);
2983 }
2984 EXPORT_SYMBOL(dev_queue_xmit);
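
/*
 * Illustrative example (not part of dev.c): sending a pre-built Ethernet
 * frame from kernel code.  my_send_frame is hypothetical and assumes
 * "frame" already contains the full Ethernet header and payload.
 */
#if 0	/* example only */
static int my_send_frame(struct net_device *dev, const void *frame,
			 unsigned int len)
{
	struct sk_buff *skb = alloc_skb(LL_RESERVED_SPACE(dev) + len,
					GFP_ATOMIC);

	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), frame, len);
	skb->dev = dev;
	skb->protocol = ((struct ethhdr *)skb->data)->h_proto;

	return dev_queue_xmit(skb);	/* consumes the skb in all cases */
}
#endif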
2985
2986 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2987 {
2988         return __dev_queue_xmit(skb, accel_priv);
2989 }
2990 EXPORT_SYMBOL(dev_queue_xmit_accel);
2991
2992
2993 /*=======================================================================
2994                         Receiver routines
2995   =======================================================================*/
2996
2997 int netdev_max_backlog __read_mostly = 1000;
2998 EXPORT_SYMBOL(netdev_max_backlog);
2999
3000 int netdev_tstamp_prequeue __read_mostly = 1;
3001 int netdev_budget __read_mostly = 300;
3002 int weight_p __read_mostly = 64;            /* old backlog weight */
3003
3004 /* Called with irq disabled */
3005 static inline void ____napi_schedule(struct softnet_data *sd,
3006                                      struct napi_struct *napi)
3007 {
3008         list_add_tail(&napi->poll_list, &sd->poll_list);
3009         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3010 }
3011
3012 #ifdef CONFIG_RPS
3013
3014 /* One global table that all flow-based protocols share. */
3015 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3016 EXPORT_SYMBOL(rps_sock_flow_table);
3017
3018 struct static_key rps_needed __read_mostly;
3019
3020 static struct rps_dev_flow *
3021 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3022             struct rps_dev_flow *rflow, u16 next_cpu)
3023 {
3024         if (next_cpu != RPS_NO_CPU) {
3025 #ifdef CONFIG_RFS_ACCEL
3026                 struct netdev_rx_queue *rxqueue;
3027                 struct rps_dev_flow_table *flow_table;
3028                 struct rps_dev_flow *old_rflow;
3029                 u32 flow_id;
3030                 u16 rxq_index;
3031                 int rc;
3032
3033                 /* Should we steer this flow to a different hardware queue? */
3034                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3035                     !(dev->features & NETIF_F_NTUPLE))
3036                         goto out;
3037                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3038                 if (rxq_index == skb_get_rx_queue(skb))
3039                         goto out;
3040
3041                 rxqueue = dev->_rx + rxq_index;
3042                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3043                 if (!flow_table)
3044                         goto out;
3045                 flow_id = skb_get_hash(skb) & flow_table->mask;
3046                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3047                                                         rxq_index, flow_id);
3048                 if (rc < 0)
3049                         goto out;
3050                 old_rflow = rflow;
3051                 rflow = &flow_table->flows[flow_id];
3052                 rflow->filter = rc;
3053                 if (old_rflow->filter == rflow->filter)
3054                         old_rflow->filter = RPS_NO_FILTER;
3055         out:
3056 #endif
3057                 rflow->last_qtail =
3058                         per_cpu(softnet_data, next_cpu).input_queue_head;
3059         }
3060
3061         rflow->cpu = next_cpu;
3062         return rflow;
3063 }
3064
3065 /*
3066  * get_rps_cpu is called from netif_receive_skb and returns the target
3067  * CPU from the RPS map of the receiving queue for a given skb.
3068  * rcu_read_lock must be held on entry.
3069  */
3070 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3071                        struct rps_dev_flow **rflowp)
3072 {
3073         struct netdev_rx_queue *rxqueue;
3074         struct rps_map *map;
3075         struct rps_dev_flow_table *flow_table;
3076         struct rps_sock_flow_table *sock_flow_table;
3077         int cpu = -1;
3078         u16 tcpu;
3079         u32 hash;
3080
3081         if (skb_rx_queue_recorded(skb)) {
3082                 u16 index = skb_get_rx_queue(skb);
3083                 if (unlikely(index >= dev->real_num_rx_queues)) {
3084                         WARN_ONCE(dev->real_num_rx_queues > 1,
3085                                   "%s received packet on queue %u, but number "
3086                                   "of RX queues is %u\n",
3087                                   dev->name, index, dev->real_num_rx_queues);
3088                         goto done;
3089                 }
3090                 rxqueue = dev->_rx + index;
3091         } else
3092                 rxqueue = dev->_rx;
3093
3094         map = rcu_dereference(rxqueue->rps_map);
3095         if (map) {
3096                 if (map->len == 1 &&
3097                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3098                         tcpu = map->cpus[0];
3099                         if (cpu_online(tcpu))
3100                                 cpu = tcpu;
3101                         goto done;
3102                 }
3103         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3104                 goto done;
3105         }
3106
3107         skb_reset_network_header(skb);
3108         hash = skb_get_hash(skb);
3109         if (!hash)
3110                 goto done;
3111
3112         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3113         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3114         if (flow_table && sock_flow_table) {
3115                 u16 next_cpu;
3116                 struct rps_dev_flow *rflow;
3117
3118                 rflow = &flow_table->flows[hash & flow_table->mask];
3119                 tcpu = rflow->cpu;
3120
3121                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3122
3123                 /*
3124                  * If the desired CPU (where last recvmsg was done) is
3125                  * different from current CPU (one in the rx-queue flow
3126                  * table entry), switch if one of the following holds:
3127                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3128                  *   - Current CPU is offline.
3129                  *   - The current CPU's queue tail has advanced beyond the
3130                  *     last packet that was enqueued using this table entry.
3131                  *     This guarantees that all previous packets for the flow
3132                  *     have been dequeued, thus preserving in order delivery.
3133                  */
3134                 if (unlikely(tcpu != next_cpu) &&
3135                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3136                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3137                       rflow->last_qtail)) >= 0)) {
3138                         tcpu = next_cpu;
3139                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3140                 }
3141
3142                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3143                         *rflowp = rflow;
3144                         cpu = tcpu;
3145                         goto done;
3146                 }
3147         }
3148
3149         if (map) {
3150                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3151                 if (cpu_online(tcpu)) {
3152                         cpu = tcpu;
3153                         goto done;
3154                 }
3155         }
3156
3157 done:
3158         return cpu;
3159 }
3160
3161 #ifdef CONFIG_RFS_ACCEL
3162
3163 /**
3164  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3165  * @dev: Device on which the filter was set
3166  * @rxq_index: RX queue index
3167  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3168  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3169  *
3170  * Drivers that implement ndo_rx_flow_steer() should periodically call
3171  * this function for each installed filter and remove the filters for
3172  * which it returns %true.
3173  */
3174 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3175                          u32 flow_id, u16 filter_id)
3176 {
3177         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3178         struct rps_dev_flow_table *flow_table;
3179         struct rps_dev_flow *rflow;
3180         bool expire = true;
3181         int cpu;
3182
3183         rcu_read_lock();
3184         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3185         if (flow_table && flow_id <= flow_table->mask) {
3186                 rflow = &flow_table->flows[flow_id];
3187                 cpu = ACCESS_ONCE(rflow->cpu);
3188                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3189                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3190                            rflow->last_qtail) <
3191                      (int)(10 * flow_table->mask)))
3192                         expire = false;
3193         }
3194         rcu_read_unlock();
3195         return expire;
3196 }
3197 EXPORT_SYMBOL(rps_may_expire_flow);
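
/* A minimal sketch (not taken from any real driver) of how a driver that
 * implements ndo_rx_flow_steer() might age out its accelerated RFS filters
 * with rps_may_expire_flow().  The my_rfs_filter layout and
 * my_expire_rfs_filters() are hypothetical; only the call into
 * rps_may_expire_flow() reflects the API documented above.
 */
struct my_rfs_filter {
	bool installed;
	u16 rxq_index;		/* queue passed to ndo_rx_flow_steer() */
	u32 flow_id;		/* flow_id passed to ndo_rx_flow_steer() */
	u16 filter_id;		/* value returned by ndo_rx_flow_steer() */
};

static void my_expire_rfs_filters(struct net_device *dev,
				  struct my_rfs_filter *filters, int count)
{
	int i;

	for (i = 0; i < count; i++) {
		struct my_rfs_filter *f = &filters[i];

		if (!f->installed)
			continue;
		/* Ask the stack whether this flow may be forgotten */
		if (rps_may_expire_flow(dev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			/* remove the hardware filter here (device specific) */
			f->installed = false;
		}
	}
}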
3198
3199 #endif /* CONFIG_RFS_ACCEL */
3200
3201 /* Called from hardirq (IPI) context */
3202 static void rps_trigger_softirq(void *data)
3203 {
3204         struct softnet_data *sd = data;
3205
3206         ____napi_schedule(sd, &sd->backlog);
3207         sd->received_rps++;
3208 }
3209
3210 #endif /* CONFIG_RPS */
3211
3212 /*
3213  * Check if this softnet_data structure belongs to another CPU.
3214  * If yes, queue it to our IPI list and return 1.
3215  * If no, return 0.
3216  */
3217 static int rps_ipi_queued(struct softnet_data *sd)
3218 {
3219 #ifdef CONFIG_RPS
3220         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3221
3222         if (sd != mysd) {
3223                 sd->rps_ipi_next = mysd->rps_ipi_list;
3224                 mysd->rps_ipi_list = sd;
3225
3226                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3227                 return 1;
3228         }
3229 #endif /* CONFIG_RPS */
3230         return 0;
3231 }
3232
3233 #ifdef CONFIG_NET_FLOW_LIMIT
3234 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3235 #endif
3236
3237 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3238 {
3239 #ifdef CONFIG_NET_FLOW_LIMIT
3240         struct sd_flow_limit *fl;
3241         struct softnet_data *sd;
3242         unsigned int old_flow, new_flow;
3243
3244         if (qlen < (netdev_max_backlog >> 1))
3245                 return false;
3246
3247         sd = &__get_cpu_var(softnet_data);
3248
3249         rcu_read_lock();
3250         fl = rcu_dereference(sd->flow_limit);
3251         if (fl) {
3252                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3253                 old_flow = fl->history[fl->history_head];
3254                 fl->history[fl->history_head] = new_flow;
3255
3256                 fl->history_head++;
3257                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3258
3259                 if (likely(fl->buckets[old_flow]))
3260                         fl->buckets[old_flow]--;
3261
3262                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3263                         fl->count++;
3264                         rcu_read_unlock();
3265                         return true;
3266                 }
3267         }
3268         rcu_read_unlock();
3269 #endif
3270         return false;
3271 }
3272
3273 /*
3274  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3275  * queue (may be a remote CPU queue).
3276  */
3277 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3278                               unsigned int *qtail)
3279 {
3280         struct softnet_data *sd;
3281         unsigned long flags;
3282         unsigned int qlen;
3283
3284         sd = &per_cpu(softnet_data, cpu);
3285
3286         local_irq_save(flags);
3287
3288         rps_lock(sd);
3289         qlen = skb_queue_len(&sd->input_pkt_queue);
3290         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3291                 if (skb_queue_len(&sd->input_pkt_queue)) {
3292 enqueue:
3293                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3294                         input_queue_tail_incr_save(sd, qtail);
3295                         rps_unlock(sd);
3296                         local_irq_restore(flags);
3297                         return NET_RX_SUCCESS;
3298                 }
3299
3300                 /* Schedule NAPI for the backlog device.
3301                  * We can use a non-atomic operation since we own the queue lock.
3302                  */
3303                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3304                         if (!rps_ipi_queued(sd))
3305                                 ____napi_schedule(sd, &sd->backlog);
3306                 }
3307                 goto enqueue;
3308         }
3309
3310         sd->dropped++;
3311         rps_unlock(sd);
3312
3313         local_irq_restore(flags);
3314
3315         atomic_long_inc(&skb->dev->rx_dropped);
3316         kfree_skb(skb);
3317         return NET_RX_DROP;
3318 }
3319
3320 static int netif_rx_internal(struct sk_buff *skb)
3321 {
3322         int ret;
3323
3324         net_timestamp_check(netdev_tstamp_prequeue, skb);
3325
3326         trace_netif_rx(skb);
3327 #ifdef CONFIG_RPS
3328         if (static_key_false(&rps_needed)) {
3329                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3330                 int cpu;
3331
3332                 preempt_disable();
3333                 rcu_read_lock();
3334
3335                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3336                 if (cpu < 0)
3337                         cpu = smp_processor_id();
3338
3339                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3340
3341                 rcu_read_unlock();
3342                 preempt_enable();
3343         } else
3344 #endif
3345         {
3346                 unsigned int qtail;
3347                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3348                 put_cpu();
3349         }
3350         return ret;
3351 }
3352
3353 /**
3354  *      netif_rx        -       post buffer to the network code
3355  *      @skb: buffer to post
3356  *
3357  *      This function receives a packet from a device driver and queues it for
3358  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3359  *      may be dropped during processing for congestion control or by the
3360  *      protocol layers.
3361  *
3362  *      return values:
3363  *      NET_RX_SUCCESS  (no congestion)
3364  *      NET_RX_DROP     (packet was dropped)
3365  *
3366  */
3367
3368 int netif_rx(struct sk_buff *skb)
3369 {
3370         trace_netif_rx_entry(skb);
3371
3372         return netif_rx_internal(skb);
3373 }
3374 EXPORT_SYMBOL(netif_rx);
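
/* A minimal sketch, not from any real driver, of the classic non-NAPI
 * receive path: copy the frame out of the device, set skb->protocol and
 * queue it with netif_rx() from the interrupt handler.  my_legacy_rx()
 * and its hw_buf parameter are hypothetical.
 */
static void my_legacy_rx(struct net_device *dev, const void *hw_buf, int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), hw_buf, len);	/* copy frame out of the ring */
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);			/* enqueue to the per-CPU backlog */

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}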
3375
3376 int netif_rx_ni(struct sk_buff *skb)
3377 {
3378         int err;
3379
3380         trace_netif_rx_ni_entry(skb);
3381
3382         preempt_disable();
3383         err = netif_rx_internal(skb);
3384         if (local_softirq_pending())
3385                 do_softirq();
3386         preempt_enable();
3387
3388         return err;
3389 }
3390 EXPORT_SYMBOL(netif_rx_ni);
3391
3392 static void net_tx_action(struct softirq_action *h)
3393 {
3394         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3395
3396         if (sd->completion_queue) {
3397                 struct sk_buff *clist;
3398
3399                 local_irq_disable();
3400                 clist = sd->completion_queue;
3401                 sd->completion_queue = NULL;
3402                 local_irq_enable();
3403
3404                 while (clist) {
3405                         struct sk_buff *skb = clist;
3406                         clist = clist->next;
3407
3408                         WARN_ON(atomic_read(&skb->users));
3409                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3410                                 trace_consume_skb(skb);
3411                         else
3412                                 trace_kfree_skb(skb, net_tx_action);
3413                         __kfree_skb(skb);
3414                 }
3415         }
3416
3417         if (sd->output_queue) {
3418                 struct Qdisc *head;
3419
3420                 local_irq_disable();
3421                 head = sd->output_queue;
3422                 sd->output_queue = NULL;
3423                 sd->output_queue_tailp = &sd->output_queue;
3424                 local_irq_enable();
3425
3426                 while (head) {
3427                         struct Qdisc *q = head;
3428                         spinlock_t *root_lock;
3429
3430                         head = head->next_sched;
3431
3432                         root_lock = qdisc_lock(q);
3433                         if (spin_trylock(root_lock)) {
3434                                 smp_mb__before_atomic();
3435                                 clear_bit(__QDISC_STATE_SCHED,
3436                                           &q->state);
3437                                 qdisc_run(q);
3438                                 spin_unlock(root_lock);
3439                         } else {
3440                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3441                                               &q->state)) {
3442                                         __netif_reschedule(q);
3443                                 } else {
3444                                         smp_mb__before_atomic();
3445                                         clear_bit(__QDISC_STATE_SCHED,
3446                                                   &q->state);
3447                                 }
3448                         }
3449                 }
3450         }
3451 }
3452
3453 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3454     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3455 /* This hook is defined here for ATM LANE */
3456 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3457                              unsigned char *addr) __read_mostly;
3458 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3459 #endif
3460
3461 #ifdef CONFIG_NET_CLS_ACT
3462 /* TODO: Maybe we should just force sch_ingress to be compiled in
3463  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3464  * instructions (a compare and two extra stores) right now if we don't
3465  * have it on but do have CONFIG_NET_CLS_ACT.
3466  * NOTE: This doesn't stop any functionality; if you don't have
3467  * the ingress scheduler, you just can't add policies on ingress.
3468  *
3469  */
3470 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3471 {
3472         struct net_device *dev = skb->dev;
3473         u32 ttl = G_TC_RTTL(skb->tc_verd);
3474         int result = TC_ACT_OK;
3475         struct Qdisc *q;
3476
3477         if (unlikely(MAX_RED_LOOP < ttl++)) {
3478                 net_warn_ratelimited("Redir loop detected, dropping packet (%d->%d)\n",
3479                                      skb->skb_iif, dev->ifindex);
3480                 return TC_ACT_SHOT;
3481         }
3482
3483         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3484         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3485
3486         q = rcu_dereference(rxq->qdisc);
3487         if (q != &noop_qdisc) {
3488                 spin_lock(qdisc_lock(q));
3489                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3490                         result = qdisc_enqueue_root(skb, q);
3491                 spin_unlock(qdisc_lock(q));
3492         }
3493
3494         return result;
3495 }
3496
3497 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3498                                          struct packet_type **pt_prev,
3499                                          int *ret, struct net_device *orig_dev)
3500 {
3501         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3502
3503         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3504                 goto out;
3505
3506         if (*pt_prev) {
3507                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3508                 *pt_prev = NULL;
3509         }
3510
3511         switch (ing_filter(skb, rxq)) {
3512         case TC_ACT_SHOT:
3513         case TC_ACT_STOLEN:
3514                 kfree_skb(skb);
3515                 return NULL;
3516         }
3517
3518 out:
3519         skb->tc_verd = 0;
3520         return skb;
3521 }
3522 #endif
3523
3524 /**
3525  *      netdev_rx_handler_register - register receive handler
3526  *      @dev: device to register a handler for
3527  *      @rx_handler: receive handler to register
3528  *      @rx_handler_data: data pointer that is used by rx handler
3529  *
3530  *      Register a receive handler for a device. This handler will then be
3531  *      called from __netif_receive_skb. A negative errno code is returned
3532  *      on a failure.
3533  *
3534  *      The caller must hold the rtnl_mutex.
3535  *
3536  *      For a general description of rx_handler, see enum rx_handler_result.
3537  */
3538 int netdev_rx_handler_register(struct net_device *dev,
3539                                rx_handler_func_t *rx_handler,
3540                                void *rx_handler_data)
3541 {
3542         ASSERT_RTNL();
3543
3544         if (dev->rx_handler)
3545                 return -EBUSY;
3546
3547         /* Note: rx_handler_data must be set before rx_handler */
3548         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3549         rcu_assign_pointer(dev->rx_handler, rx_handler);
3550
3551         return 0;
3552 }
3553 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
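
/* A minimal sketch of how a bridge/bond-like upper device might attach
 * itself to a port with netdev_rx_handler_register().  struct my_port,
 * my_handle_frame() and my_attach_port() are hypothetical; the handler
 * prototype and return values follow rx_handler_func_t.
 */
struct my_port {
	struct net_device *dev;
};

static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct my_port *port;

	port = rcu_dereference(skb->dev->rx_handler_data);
	/* ... consume, redirect or mangle the skb here ... */
	(void)port;

	return RX_HANDLER_PASS;	/* let the normal stack see the packet */
}

static int my_attach_port(struct net_device *port_dev, struct my_port *port)
{
	ASSERT_RTNL();		/* registration requires the rtnl_mutex */

	return netdev_rx_handler_register(port_dev, my_handle_frame, port);
}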
3554
3555 /**
3556  *      netdev_rx_handler_unregister - unregister receive handler
3557  *      @dev: device to unregister a handler from
3558  *
3559  *      Unregister a receive handler from a device.
3560  *
3561  *      The caller must hold the rtnl_mutex.
3562  */
3563 void netdev_rx_handler_unregister(struct net_device *dev)
3564 {
3565
3566         ASSERT_RTNL();
3567         RCU_INIT_POINTER(dev->rx_handler, NULL);
3568         /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3569          * section is guaranteed to see a non-NULL rx_handler_data
3570          * as well.
3571          */
3572         synchronize_net();
3573         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3574 }
3575 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3576
3577 /*
3578  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3579  * the special handling of PFMEMALLOC skbs.
3580  */
3581 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3582 {
3583         switch (skb->protocol) {
3584         case htons(ETH_P_ARP):
3585         case htons(ETH_P_IP):
3586         case htons(ETH_P_IPV6):
3587         case htons(ETH_P_8021Q):
3588         case htons(ETH_P_8021AD):
3589                 return true;
3590         default:
3591                 return false;
3592         }
3593 }
3594
3595 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3596 {
3597         struct packet_type *ptype, *pt_prev;
3598         rx_handler_func_t *rx_handler;
3599         struct net_device *orig_dev;
3600         struct net_device *null_or_dev;
3601         bool deliver_exact = false;
3602         int ret = NET_RX_DROP;
3603         __be16 type;
3604
3605         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3606
3607         trace_netif_receive_skb(skb);
3608
3609         orig_dev = skb->dev;
3610
3611         skb_reset_network_header(skb);
3612         if (!skb_transport_header_was_set(skb))
3613                 skb_reset_transport_header(skb);
3614         skb_reset_mac_len(skb);
3615
3616         pt_prev = NULL;
3617
3618         rcu_read_lock();
3619
3620 another_round:
3621         skb->skb_iif = skb->dev->ifindex;
3622
3623         __this_cpu_inc(softnet_data.processed);
3624
3625         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3626             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3627                 skb = skb_vlan_untag(skb);
3628                 if (unlikely(!skb))
3629                         goto unlock;
3630         }
3631
3632 #ifdef CONFIG_NET_CLS_ACT
3633         if (skb->tc_verd & TC_NCLS) {
3634                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3635                 goto ncls;
3636         }
3637 #endif
3638
3639         if (pfmemalloc)
3640                 goto skip_taps;
3641
3642         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3643                 if (!ptype->dev || ptype->dev == skb->dev) {
3644                         if (pt_prev)
3645                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3646                         pt_prev = ptype;
3647                 }
3648         }
3649
3650 skip_taps:
3651 #ifdef CONFIG_NET_CLS_ACT
3652         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3653         if (!skb)
3654                 goto unlock;
3655 ncls:
3656 #endif
3657
3658         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3659                 goto drop;
3660
3661         if (vlan_tx_tag_present(skb)) {
3662                 if (pt_prev) {
3663                         ret = deliver_skb(skb, pt_prev, orig_dev);
3664                         pt_prev = NULL;
3665                 }
3666                 if (vlan_do_receive(&skb))
3667                         goto another_round;
3668                 else if (unlikely(!skb))
3669                         goto unlock;
3670         }
3671
3672         rx_handler = rcu_dereference(skb->dev->rx_handler);
3673         if (rx_handler) {
3674                 if (pt_prev) {
3675                         ret = deliver_skb(skb, pt_prev, orig_dev);
3676                         pt_prev = NULL;
3677                 }
3678                 switch (rx_handler(&skb)) {
3679                 case RX_HANDLER_CONSUMED:
3680                         ret = NET_RX_SUCCESS;
3681                         goto unlock;
3682                 case RX_HANDLER_ANOTHER:
3683                         goto another_round;
3684                 case RX_HANDLER_EXACT:
3685                         deliver_exact = true;
3686                 case RX_HANDLER_PASS:
3687                         break;
3688                 default:
3689                         BUG();
3690                 }
3691         }
3692
3693         if (unlikely(vlan_tx_tag_present(skb))) {
3694                 if (vlan_tx_tag_get_id(skb))
3695                         skb->pkt_type = PACKET_OTHERHOST;
3696                 /* Note: we might in the future use prio bits
3697                  * and set skb->priority like in vlan_do_receive().
3698                  * For the time being, just ignore the Priority Code Point.
3699                  */
3700                 skb->vlan_tci = 0;
3701         }
3702
3703         /* deliver only exact match when indicated */
3704         null_or_dev = deliver_exact ? skb->dev : NULL;
3705
3706         type = skb->protocol;
3707         list_for_each_entry_rcu(ptype,
3708                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3709                 if (ptype->type == type &&
3710                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3711                      ptype->dev == orig_dev)) {
3712                         if (pt_prev)
3713                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3714                         pt_prev = ptype;
3715                 }
3716         }
3717
3718         if (pt_prev) {
3719                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3720                         goto drop;
3721                 else
3722                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3723         } else {
3724 drop:
3725                 atomic_long_inc(&skb->dev->rx_dropped);
3726                 kfree_skb(skb);
3727                 /* Jamal, now you will not be able to escape explaining
3728                  * to me how you were going to use this. :-)
3729                  */
3730                 ret = NET_RX_DROP;
3731         }
3732
3733 unlock:
3734         rcu_read_unlock();
3735         return ret;
3736 }
3737
3738 static int __netif_receive_skb(struct sk_buff *skb)
3739 {
3740         int ret;
3741
3742         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3743                 unsigned long pflags = current->flags;
3744
3745                 /*
3746                  * PFMEMALLOC skbs are special, they should
3747                  * - be delivered to SOCK_MEMALLOC sockets only
3748                  * - stay away from userspace
3749                  * - have bounded memory usage
3750                  *
3751                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3752                  * context down to all allocation sites.
3753                  */
3754                 current->flags |= PF_MEMALLOC;
3755                 ret = __netif_receive_skb_core(skb, true);
3756                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3757         } else
3758                 ret = __netif_receive_skb_core(skb, false);
3759
3760         return ret;
3761 }
3762
3763 static int netif_receive_skb_internal(struct sk_buff *skb)
3764 {
3765         net_timestamp_check(netdev_tstamp_prequeue, skb);
3766
3767         if (skb_defer_rx_timestamp(skb))
3768                 return NET_RX_SUCCESS;
3769
3770 #ifdef CONFIG_RPS
3771         if (static_key_false(&rps_needed)) {
3772                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3773                 int cpu, ret;
3774
3775                 rcu_read_lock();
3776
3777                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3778
3779                 if (cpu >= 0) {
3780                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3781                         rcu_read_unlock();
3782                         return ret;
3783                 }
3784                 rcu_read_unlock();
3785         }
3786 #endif
3787         return __netif_receive_skb(skb);
3788 }
3789
3790 /**
3791  *      netif_receive_skb - process receive buffer from network
3792  *      @skb: buffer to process
3793  *
3794  *      netif_receive_skb() is the main receive data processing function.
3795  *      It always succeeds. The buffer may be dropped during processing
3796  *      for congestion control or by the protocol layers.
3797  *
3798  *      This function may only be called from softirq context and interrupts
3799  *      should be enabled.
3800  *
3801  *      Return values (usually ignored):
3802  *      NET_RX_SUCCESS: no congestion
3803  *      NET_RX_DROP: packet was dropped
3804  */
3805 int netif_receive_skb(struct sk_buff *skb)
3806 {
3807         trace_netif_receive_skb_entry(skb);
3808
3809         return netif_receive_skb_internal(skb);
3810 }
3811 EXPORT_SYMBOL(netif_receive_skb);
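
/* A minimal sketch of a virtual/tunnel-style receive path handing a
 * decapsulated skb to the stack with netif_receive_skb() from softirq
 * context.  my_tunnel_deliver() is hypothetical and assumes an IPv4
 * payload for brevity.
 */
static void my_tunnel_deliver(struct net_device *tunnel_dev, struct sk_buff *skb)
{
	skb->dev = tunnel_dev;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = htons(ETH_P_IP);	/* assumed inner protocol */
	skb_reset_network_header(skb);

	netif_receive_skb(skb);		/* delivered synchronously, may be dropped */
}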
3812
3813 /* Network device is going away, flush any packets still pending
3814  * Called with irqs disabled.
3815  */
3816 static void flush_backlog(void *arg)
3817 {
3818         struct net_device *dev = arg;
3819         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3820         struct sk_buff *skb, *tmp;
3821
3822         rps_lock(sd);
3823         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3824                 if (skb->dev == dev) {
3825                         __skb_unlink(skb, &sd->input_pkt_queue);
3826                         kfree_skb(skb);
3827                         input_queue_head_incr(sd);
3828                 }
3829         }
3830         rps_unlock(sd);
3831
3832         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3833                 if (skb->dev == dev) {
3834                         __skb_unlink(skb, &sd->process_queue);
3835                         kfree_skb(skb);
3836                         input_queue_head_incr(sd);
3837                 }
3838         }
3839 }
3840
3841 static int napi_gro_complete(struct sk_buff *skb)
3842 {
3843         struct packet_offload *ptype;
3844         __be16 type = skb->protocol;
3845         struct list_head *head = &offload_base;
3846         int err = -ENOENT;
3847
3848         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3849
3850         if (NAPI_GRO_CB(skb)->count == 1) {
3851                 skb_shinfo(skb)->gso_size = 0;
3852                 goto out;
3853         }
3854
3855         rcu_read_lock();
3856         list_for_each_entry_rcu(ptype, head, list) {
3857                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3858                         continue;
3859
3860                 err = ptype->callbacks.gro_complete(skb, 0);
3861                 break;
3862         }
3863         rcu_read_unlock();
3864
3865         if (err) {
3866                 WARN_ON(&ptype->list == head);
3867                 kfree_skb(skb);
3868                 return NET_RX_SUCCESS;
3869         }
3870
3871 out:
3872         return netif_receive_skb_internal(skb);
3873 }
3874
3875 /* napi->gro_list contains packets ordered by age, with the
3876  * youngest packets at the head of the list.
3877  * Complete skbs in reverse order to reduce latencies.
3878  */
3879 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3880 {
3881         struct sk_buff *skb, *prev = NULL;
3882
3883         /* scan list and build reverse chain */
3884         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3885                 skb->prev = prev;
3886                 prev = skb;
3887         }
3888
3889         for (skb = prev; skb; skb = prev) {
3890                 skb->next = NULL;
3891
3892                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3893                         return;
3894
3895                 prev = skb->prev;
3896                 napi_gro_complete(skb);
3897                 napi->gro_count--;
3898         }
3899
3900         napi->gro_list = NULL;
3901 }
3902 EXPORT_SYMBOL(napi_gro_flush);
3903
3904 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3905 {
3906         struct sk_buff *p;
3907         unsigned int maclen = skb->dev->hard_header_len;
3908         u32 hash = skb_get_hash_raw(skb);
3909
3910         for (p = napi->gro_list; p; p = p->next) {
3911                 unsigned long diffs;
3912
3913                 NAPI_GRO_CB(p)->flush = 0;
3914
3915                 if (hash != skb_get_hash_raw(p)) {
3916                         NAPI_GRO_CB(p)->same_flow = 0;
3917                         continue;
3918                 }
3919
3920                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3921                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3922                 if (maclen == ETH_HLEN)
3923                         diffs |= compare_ether_header(skb_mac_header(p),
3924                                                       skb_mac_header(skb));
3925                 else if (!diffs)
3926                         diffs = memcmp(skb_mac_header(p),
3927                                        skb_mac_header(skb),
3928                                        maclen);
3929                 NAPI_GRO_CB(p)->same_flow = !diffs;
3930         }
3931 }
3932
3933 static void skb_gro_reset_offset(struct sk_buff *skb)
3934 {
3935         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3936         const skb_frag_t *frag0 = &pinfo->frags[0];
3937
3938         NAPI_GRO_CB(skb)->data_offset = 0;
3939         NAPI_GRO_CB(skb)->frag0 = NULL;
3940         NAPI_GRO_CB(skb)->frag0_len = 0;
3941
3942         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3943             pinfo->nr_frags &&
3944             !PageHighMem(skb_frag_page(frag0))) {
3945                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3946                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3947         }
3948 }
3949
3950 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3951 {
3952         struct skb_shared_info *pinfo = skb_shinfo(skb);
3953
3954         BUG_ON(skb->end - skb->tail < grow);
3955
3956         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3957
3958         skb->data_len -= grow;
3959         skb->tail += grow;
3960
3961         pinfo->frags[0].page_offset += grow;
3962         skb_frag_size_sub(&pinfo->frags[0], grow);
3963
3964         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3965                 skb_frag_unref(skb, 0);
3966                 memmove(pinfo->frags, pinfo->frags + 1,
3967                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3968         }
3969 }
3970
3971 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3972 {
3973         struct sk_buff **pp = NULL;
3974         struct packet_offload *ptype;
3975         __be16 type = skb->protocol;
3976         struct list_head *head = &offload_base;
3977         int same_flow;
3978         enum gro_result ret;
3979         int grow;
3980
3981         if (!(skb->dev->features & NETIF_F_GRO))
3982                 goto normal;
3983
3984         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
3985                 goto normal;
3986
3987         gro_list_prepare(napi, skb);
3988
3989         rcu_read_lock();
3990         list_for_each_entry_rcu(ptype, head, list) {
3991                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3992                         continue;
3993
3994                 skb_set_network_header(skb, skb_gro_offset(skb));
3995                 skb_reset_mac_len(skb);
3996                 NAPI_GRO_CB(skb)->same_flow = 0;
3997                 NAPI_GRO_CB(skb)->flush = 0;
3998                 NAPI_GRO_CB(skb)->free = 0;
3999                 NAPI_GRO_CB(skb)->udp_mark = 0;
4000
4001                 /* Setup for GRO checksum validation */
4002                 switch (skb->ip_summed) {
4003                 case CHECKSUM_COMPLETE:
4004                         NAPI_GRO_CB(skb)->csum = skb->csum;
4005                         NAPI_GRO_CB(skb)->csum_valid = 1;
4006                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4007                         break;
4008                 case CHECKSUM_UNNECESSARY:
4009                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4010                         NAPI_GRO_CB(skb)->csum_valid = 0;
4011                         break;
4012                 default:
4013                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4014                         NAPI_GRO_CB(skb)->csum_valid = 0;
4015                 }
4016
4017                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4018                 break;
4019         }
4020         rcu_read_unlock();
4021
4022         if (&ptype->list == head)
4023                 goto normal;
4024
4025         same_flow = NAPI_GRO_CB(skb)->same_flow;
4026         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4027
4028         if (pp) {
4029                 struct sk_buff *nskb = *pp;
4030
4031                 *pp = nskb->next;
4032                 nskb->next = NULL;
4033                 napi_gro_complete(nskb);
4034                 napi->gro_count--;
4035         }
4036
4037         if (same_flow)
4038                 goto ok;
4039
4040         if (NAPI_GRO_CB(skb)->flush)
4041                 goto normal;
4042
4043         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4044                 struct sk_buff *nskb = napi->gro_list;
4045
4046                 /* locate the end of the list to select the 'oldest' flow */
4047                 while (nskb->next) {
4048                         pp = &nskb->next;
4049                         nskb = *pp;
4050                 }
4051                 *pp = NULL;
4052                 nskb->next = NULL;
4053                 napi_gro_complete(nskb);
4054         } else {
4055                 napi->gro_count++;
4056         }
4057         NAPI_GRO_CB(skb)->count = 1;
4058         NAPI_GRO_CB(skb)->age = jiffies;
4059         NAPI_GRO_CB(skb)->last = skb;
4060         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4061         skb->next = napi->gro_list;
4062         napi->gro_list = skb;
4063         ret = GRO_HELD;
4064
4065 pull:
4066         grow = skb_gro_offset(skb) - skb_headlen(skb);
4067         if (grow > 0)
4068                 gro_pull_from_frag0(skb, grow);
4069 ok:
4070         return ret;
4071
4072 normal:
4073         ret = GRO_NORMAL;
4074         goto pull;
4075 }
4076
4077 struct packet_offload *gro_find_receive_by_type(__be16 type)
4078 {
4079         struct list_head *offload_head = &offload_base;
4080         struct packet_offload *ptype;
4081
4082         list_for_each_entry_rcu(ptype, offload_head, list) {
4083                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4084                         continue;
4085                 return ptype;
4086         }
4087         return NULL;
4088 }
4089 EXPORT_SYMBOL(gro_find_receive_by_type);
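
/* A minimal sketch of how an encapsulation offload might chain to the
 * inner protocol's GRO handler via gro_find_receive_by_type().
 * my_encap_gro_receive() is hypothetical, assumes an IPv4 payload and
 * omits the header pull/validation a real implementation needs.
 */
static struct sk_buff **my_encap_gro_receive(struct sk_buff **head,
					     struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 inner_type = htons(ETH_P_IP);	/* assumed inner protocol */

	rcu_read_lock();
	ptype = gro_find_receive_by_type(inner_type);
	if (ptype)
		pp = ptype->callbacks.gro_receive(head, skb);
	rcu_read_unlock();

	return pp;
}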
4090
4091 struct packet_offload *gro_find_complete_by_type(__be16 type)
4092 {
4093         struct list_head *offload_head = &offload_base;
4094         struct packet_offload *ptype;
4095
4096         list_for_each_entry_rcu(ptype, offload_head, list) {
4097                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4098                         continue;
4099                 return ptype;
4100         }
4101         return NULL;
4102 }
4103 EXPORT_SYMBOL(gro_find_complete_by_type);
4104
4105 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4106 {
4107         switch (ret) {
4108         case GRO_NORMAL:
4109                 if (netif_receive_skb_internal(skb))
4110                         ret = GRO_DROP;
4111                 break;
4112
4113         case GRO_DROP:
4114                 kfree_skb(skb);
4115                 break;
4116
4117         case GRO_MERGED_FREE:
4118                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4119                         kmem_cache_free(skbuff_head_cache, skb);
4120                 else
4121                         __kfree_skb(skb);
4122                 break;
4123
4124         case GRO_HELD:
4125         case GRO_MERGED:
4126                 break;
4127         }
4128
4129         return ret;
4130 }
4131
4132 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4133 {
4134         trace_napi_gro_receive_entry(skb);
4135
4136         skb_gro_reset_offset(skb);
4137
4138         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4139 }
4140 EXPORT_SYMBOL(napi_gro_receive);
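
/* A minimal sketch of the common NAPI poll pattern feeding received
 * frames through napi_gro_receive().  my_napi_poll() and the
 * my_fetch_rx_skb() helper are hypothetical; a real driver would also
 * re-enable its RX interrupt after napi_complete().
 */
static struct sk_buff *my_fetch_rx_skb(struct net_device *dev);	/* hypothetical */

static int my_napi_poll(struct napi_struct *napi, int budget)
{
	struct net_device *dev = napi->dev;
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = my_fetch_rx_skb(dev);

		if (!skb)
			break;

		skb->protocol = eth_type_trans(skb, dev);
		napi_gro_receive(napi, skb);	/* merged by GRO or passed up */
		work++;
	}

	if (work < budget)
		napi_complete(napi);

	return work;
}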
4141
4142 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4143 {
4144         __skb_pull(skb, skb_headlen(skb));
4145         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4146         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4147         skb->vlan_tci = 0;
4148         skb->dev = napi->dev;
4149         skb->skb_iif = 0;
4150         skb->encapsulation = 0;
4151         skb_shinfo(skb)->gso_type = 0;
4152         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4153
4154         napi->skb = skb;
4155 }
4156
4157 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4158 {
4159         struct sk_buff *skb = napi->skb;
4160
4161         if (!skb) {
4162                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4163                 napi->skb = skb;
4164         }
4165         return skb;
4166 }
4167 EXPORT_SYMBOL(napi_get_frags);
4168
4169 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4170                                       struct sk_buff *skb,
4171                                       gro_result_t ret)
4172 {
4173         switch (ret) {
4174         case GRO_NORMAL:
4175         case GRO_HELD:
4176                 __skb_push(skb, ETH_HLEN);
4177                 skb->protocol = eth_type_trans(skb, skb->dev);
4178                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4179                         ret = GRO_DROP;
4180                 break;
4181
4182         case GRO_DROP:
4183         case GRO_MERGED_FREE:
4184                 napi_reuse_skb(napi, skb);
4185                 break;
4186
4187         case GRO_MERGED:
4188                 break;
4189         }
4190
4191         return ret;
4192 }
4193
4194 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4195  * Drivers could call both napi_gro_frags() and napi_gro_receive(),
4196  * so we copy the ethernet header into skb->data to have a common layout.
4197  */
4198 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4199 {
4200         struct sk_buff *skb = napi->skb;
4201         const struct ethhdr *eth;
4202         unsigned int hlen = sizeof(*eth);
4203
4204         napi->skb = NULL;
4205
4206         skb_reset_mac_header(skb);
4207         skb_gro_reset_offset(skb);
4208
4209         eth = skb_gro_header_fast(skb, 0);
4210         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4211                 eth = skb_gro_header_slow(skb, hlen, 0);
4212                 if (unlikely(!eth)) {
4213                         napi_reuse_skb(napi, skb);
4214                         return NULL;
4215                 }
4216         } else {
4217                 gro_pull_from_frag0(skb, hlen);
4218                 NAPI_GRO_CB(skb)->frag0 += hlen;
4219                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4220         }
4221         __skb_pull(skb, hlen);
4222
4223         /*
4224          * This works because the only protocols we care about don't require
4225          * special handling.
4226          * We'll fix it up properly in napi_frags_finish()
4227          */
4228         skb->protocol = eth->h_proto;
4229
4230         return skb;
4231 }
4232
4233 gro_result_t napi_gro_frags(struct napi_struct *napi)
4234 {
4235         struct sk_buff *skb = napi_frags_skb(napi);
4236
4237         if (!skb)
4238                 return GRO_DROP;
4239
4240         trace_napi_gro_frags_entry(skb);
4241
4242         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4243 }
4244 EXPORT_SYMBOL(napi_gro_frags);
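
/* A minimal sketch of a page-based receive path: attach the received
 * page to the napi-owned skb from napi_get_frags() and let
 * napi_gro_frags() pull the Ethernet header and run GRO.
 * my_receive_page() and its parameters are hypothetical.
 */
static int my_receive_page(struct napi_struct *napi, struct page *page,
			   unsigned int offset, unsigned int len)
{
	struct sk_buff *skb;

	skb = napi_get_frags(napi);
	if (!skb)
		return -ENOMEM;

	/* the whole frame, Ethernet header included, lives in the page */
	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;

	napi_gro_frags(napi);
	return 0;
}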
4245
4246 /* Compute the checksum from gro_offset and return the folded value
4247  * after adding in any pseudo checksum.
4248  */
4249 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4250 {
4251         __wsum wsum;
4252         __sum16 sum;
4253
4254         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4255
4256         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4257         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4258         if (likely(!sum)) {
4259                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4260                     !skb->csum_complete_sw)
4261                         netdev_rx_csum_fault(skb->dev);
4262         }
4263
4264         NAPI_GRO_CB(skb)->csum = wsum;
4265         NAPI_GRO_CB(skb)->csum_valid = 1;
4266
4267         return sum;
4268 }
4269 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4270
4271 /*
4272  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4273  * Note: called with local irq disabled, but exits with local irq enabled.
4274  */
4275 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4276 {
4277 #ifdef CONFIG_RPS
4278         struct softnet_data *remsd = sd->rps_ipi_list;
4279
4280         if (remsd) {
4281                 sd->rps_ipi_list = NULL;
4282
4283                 local_irq_enable();
4284
4285                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4286                 while (remsd) {
4287                         struct softnet_data *next = remsd->rps_ipi_next;
4288
4289                         if (cpu_online(remsd->cpu))
4290                                 smp_call_function_single_async(remsd->cpu,
4291                                                            &remsd->csd);
4292                         remsd = next;
4293                 }
4294         } else
4295 #endif
4296                 local_irq_enable();
4297 }
4298
4299 static int process_backlog(struct napi_struct *napi, int quota)
4300 {
4301         int work = 0;
4302         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4303
4304 #ifdef CONFIG_RPS
4305         /* Check if we have pending IPIs; it is better to send them now
4306          * rather than waiting for net_rx_action() to end.
4307          */
4308         if (sd->rps_ipi_list) {
4309                 local_irq_disable();
4310                 net_rps_action_and_irq_enable(sd);
4311         }
4312 #endif
4313         napi->weight = weight_p;
4314         local_irq_disable();
4315         while (1) {
4316                 struct sk_buff *skb;
4317
4318                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4319                         local_irq_enable();
4320                         __netif_receive_skb(skb);
4321                         local_irq_disable();
4322                         input_queue_head_incr(sd);
4323                         if (++work >= quota) {
4324                                 local_irq_enable();
4325                                 return work;
4326                         }
4327                 }
4328
4329                 rps_lock(sd);
4330                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4331                         /*
4332                          * Inline a custom version of __napi_complete().
4333                          * Only the current cpu owns and manipulates this napi,
4334                          * and NAPI_STATE_SCHED is the only possible flag set
4335                          * on backlog.
4336                          * We can use a plain write instead of clear_bit(),
4337                          * and we don't need an smp_mb() memory barrier.
4338                          */
4339                         list_del(&napi->poll_list);
4340                         napi->state = 0;
4341                         rps_unlock(sd);
4342
4343                         break;
4344                 }
4345
4346                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4347                                            &sd->process_queue);
4348                 rps_unlock(sd);
4349         }
4350         local_irq_enable();
4351
4352         return work;
4353 }
4354
4355 /**
4356  * __napi_schedule - schedule for receive
4357  * @n: entry to schedule
4358  *
4359  * The entry's receive function will be scheduled to run
4360  */
4361 void __napi_schedule(struct napi_struct *n)
4362 {
4363         unsigned long flags;
4364
4365         local_irq_save(flags);
4366         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4367         local_irq_restore(flags);
4368 }
4369 EXPORT_SYMBOL(__napi_schedule);
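
/* A minimal sketch of the usual interrupt handler pattern: mask the
 * device's RX interrupt, then schedule NAPI.  Drivers normally test
 * napi_schedule_prep() (or simply call napi_schedule()) before ending
 * up in __napi_schedule().  struct my_adapter and my_rx_interrupt()
 * are hypothetical.
 */
struct my_adapter {
	struct napi_struct napi;
};

static irqreturn_t my_rx_interrupt(int irq, void *dev_id)
{
	struct my_adapter *adapter = dev_id;

	if (napi_schedule_prep(&adapter->napi)) {
		/* mask further RX interrupts here (device specific) */
		__napi_schedule(&adapter->napi);
	}

	return IRQ_HANDLED;
}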
4370
4371 void __napi_complete(struct napi_struct *n)
4372 {
4373         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4374         BUG_ON(n->gro_list);
4375
4376         list_del(&n->poll_list);
4377         smp_mb__before_atomic();
4378         clear_bit(NAPI_STATE_SCHED, &n->state);
4379 }
4380 EXPORT_SYMBOL(__napi_complete);
4381
4382 void napi_complete(struct napi_struct *n)
4383 {
4384         unsigned long flags;
4385
4386         /*
4387          * don't let napi dequeue from the cpu poll list,
4388          * just in case it's running on a different cpu
4389          */
4390         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4391                 return;
4392
4393         napi_gro_flush(n, false);
4394         local_irq_save(flags);
4395         __napi_complete(n);
4396         local_irq_restore(flags);
4397 }
4398 EXPORT_SYMBOL(napi_complete);
4399
4400 /* must be called under rcu_read_lock(), as we don't take a reference */
4401 struct napi_struct *napi_by_id(unsigned int napi_id)
4402 {
4403         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4404         struct napi_struct *napi;
4405
4406         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4407                 if (napi->napi_id == napi_id)
4408                         return napi;
4409
4410         return NULL;
4411 }
4412 EXPORT_SYMBOL_GPL(napi_by_id);
4413
4414 void napi_hash_add(struct napi_struct *napi)
4415 {
4416         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4417
4418                 spin_lock(&napi_hash_lock);
4419
4420                 /* 0 is not a valid id, and we also skip an id that is taken;
4421                  * we expect both events to be extremely rare
4422                  */
4423                 napi->napi_id = 0;
4424                 while (!napi->napi_id) {
4425                         napi->napi_id = ++napi_gen_id;
4426                         if (napi_by_id(napi->napi_id))
4427                                 napi->napi_id = 0;
4428                 }
4429
4430                 hlist_add_head_rcu(&napi->napi_hash_node,
4431                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4432
4433                 spin_unlock(&napi_hash_lock);
4434         }
4435 }
4436 EXPORT_SYMBOL_GPL(napi_hash_add);
4437
4438 /* Warning: the caller is responsible for making sure an RCU grace period
4439  * is respected before freeing the memory containing @napi
4440  */
4441 void napi_hash_del(struct napi_struct *napi)
4442 {
4443         spin_lock(&napi_hash_lock);
4444
4445         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4446                 hlist_del_rcu(&napi->napi_hash_node);
4447
4448         spin_unlock(&napi_hash_lock);
4449 }
4450 EXPORT_SYMBOL_GPL(napi_hash_del);
4451
4452 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4453                     int (*poll)(struct napi_struct *, int), int weight)
4454 {
4455         INIT_LIST_HEAD(&napi->poll_list);
4456         napi->gro_count = 0;
4457         napi->gro_list = NULL;
4458         napi->skb = NULL;
4459         napi->poll = poll;
4460         if (weight > NAPI_POLL_WEIGHT)
4461                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4462                             weight, dev->name);
4463         napi->weight = weight;
4464         list_add(&napi->dev_list, &dev->napi_list);
4465         napi->dev = dev;
4466 #ifdef CONFIG_NETPOLL
4467         spin_lock_init(&napi->poll_lock);
4468         napi->poll_owner = -1;
4469 #endif
4470         set_bit(NAPI_STATE_SCHED, &napi->state);
4471 }
4472 EXPORT_SYMBOL(netif_napi_add);
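
/* A minimal sketch of wiring NAPI up at probe/open time, reusing the
 * hypothetical my_adapter and my_napi_poll() from the sketches above:
 * register the poll routine with netif_napi_add() and enable it before
 * RX interrupts are unmasked.
 */
static void my_setup_napi(struct my_adapter *adapter, struct net_device *dev)
{
	/* NAPI_POLL_WEIGHT (64) is the conventional weight */
	netif_napi_add(dev, &adapter->napi, my_napi_poll, NAPI_POLL_WEIGHT);

	/* typically done in ndo_open(), before unmasking RX interrupts */
	napi_enable(&adapter->napi);
}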
4473
4474 void netif_napi_del(struct napi_struct *napi)
4475 {
4476         list_del_init(&napi->dev_list);
4477         napi_free_frags(napi);
4478
4479         kfree_skb_list(napi->gro_list);
4480         napi->gro_list = NULL;
4481         napi->gro_count = 0;
4482 }
4483 EXPORT_SYMBOL(netif_napi_del);
4484
4485 static void net_rx_action(struct softirq_action *h)
4486 {
4487         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4488         unsigned long time_limit = jiffies + 2;
4489         int budget = netdev_budget;
4490         void *have;
4491
4492         local_irq_disable();
4493
4494         while (!list_empty(&sd->poll_list)) {
4495                 struct napi_struct *n;
4496                 int work, weight;
4497
4498                 /* If the softirq window is exhausted then punt.
4499                  * Allow this to run for 2 jiffies, which allows
4500                  * an average latency of 1.5/HZ.
4501                  */
4502                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4503                         goto softnet_break;
4504
4505                 local_irq_enable();
4506
4507                 /* Even though interrupts have been re-enabled, this
4508                  * access is safe because interrupts can only add new
4509                  * entries to the tail of this list, and only ->poll()
4510                  * calls can remove this head entry from the list.
4511                  */
4512                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4513
4514                 have = netpoll_poll_lock(n);
4515
4516                 weight = n->weight;
4517
4518                 /* This NAPI_STATE_SCHED test is for avoiding a race
4519                  * with netpoll's poll_napi().  Only the entity which
4520                  * obtains the lock and sees NAPI_STATE_SCHED set will
4521                  * actually make the ->poll() call.  Therefore we avoid
4522                  * accidentally calling ->poll() when NAPI is not scheduled.
4523                  */
4524                 work = 0;
4525                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4526                         work = n->poll(n, weight);
4527                         trace_napi_poll(n);
4528                 }
4529
4530                 WARN_ON_ONCE(work > weight);
4531
4532                 budget -= work;
4533
4534                 local_irq_disable();
4535
4536                 /* Drivers must not modify the NAPI state if they
4537                  * consume the entire weight.  In such cases this code
4538                  * still "owns" the NAPI instance and therefore can
4539                  * move the instance around on the list at-will.
4540                  */
4541                 if (unlikely(work == weight)) {
4542                         if (unlikely(napi_disable_pending(n))) {
4543                                 local_irq_enable();
4544                                 napi_complete(n);
4545                                 local_irq_disable();
4546                         } else {
4547                                 if (n->gro_list) {
4548                                         /* flush too old packets
4549                                          * If HZ < 1000, flush all packets.
4550                                          */
4551                                         local_irq_enable();
4552                                         napi_gro_flush(n, HZ >= 1000);
4553                                         local_irq_disable();
4554                                 }
4555                                 list_move_tail(&n->poll_list, &sd->poll_list);
4556                         }
4557                 }
4558
4559                 netpoll_poll_unlock(have);
4560         }
4561 out:
4562         net_rps_action_and_irq_enable(sd);
4563
4564 #ifdef CONFIG_NET_DMA
4565         /*
4566          * There may not be any more sk_buffs coming right now, so push
4567          * any pending DMA copies to hardware
4568          */
4569         dma_issue_pending_all();
4570 #endif
4571
4572         return;
4573
4574 softnet_break:
4575         sd->time_squeeze++;
4576         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4577         goto out;
4578 }
4579
4580 struct netdev_adjacent {
4581         struct net_device *dev;
4582
4583         /* upper master flag; there can only be one master device per list */
4584         bool master;
4585
4586         /* counter for the number of times this device was added to us */
4587         u16 ref_nr;
4588
4589         /* private field for the users */
4590         void *private;
4591
4592         struct list_head list;
4593         struct rcu_head rcu;
4594 };
4595
4596 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4597                                                  struct net_device *adj_dev,
4598                                                  struct list_head *adj_list)
4599 {
4600         struct netdev_adjacent *adj;
4601
4602         list_for_each_entry(adj, adj_list, list) {
4603                 if (adj->dev == adj_dev)
4604                         return adj;
4605         }
4606         return NULL;
4607 }
4608
4609 /**
4610  * netdev_has_upper_dev - Check if device is linked to an upper device
4611  * @dev: device
4612  * @upper_dev: upper device to check
4613  *
4614  * Find out if a device is linked to the specified upper device and return true
4615  * in case it is. Note that this checks only the immediate upper device,
4616  * not through a complete stack of devices. The caller must hold the RTNL lock.
4617  */
4618 bool netdev_has_upper_dev(struct net_device *dev,
4619                           struct net_device *upper_dev)
4620 {
4621         ASSERT_RTNL();
4622
4623         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4624 }
4625 EXPORT_SYMBOL(netdev_has_upper_dev);
4626
4627 /**
4628  * netdev_has_any_upper_dev - Check if device is linked to some device
4629  * @dev: device
4630  *
4631  * Find out if a device is linked to an upper device and return true in case
4632  * it is. The caller must hold the RTNL lock.
4633  */
4634 static bool netdev_has_any_upper_dev(struct net_device *dev)
4635 {
4636         ASSERT_RTNL();
4637
4638         return !list_empty(&dev->all_adj_list.upper);
4639 }
4640
4641 /**
4642  * netdev_master_upper_dev_get - Get master upper device
4643  * @dev: device
4644  *
4645  * Find a master upper device and return a pointer to it or NULL in case
4646  * it's not there. The caller must hold the RTNL lock.
4647  */
4648 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4649 {
4650         struct netdev_adjacent *upper;
4651
4652         ASSERT_RTNL();
4653
4654         if (list_empty(&dev->adj_list.upper))
4655                 return NULL;
4656
4657         upper = list_first_entry(&dev->adj_list.upper,
4658                                  struct netdev_adjacent, list);
4659         if (likely(upper->master))
4660                 return upper->dev;
4661         return NULL;
4662 }
4663 EXPORT_SYMBOL(netdev_master_upper_dev_get);
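
/*
 * Illustrative sketch (not part of this file): how an RTNL-holding caller
 * might combine netdev_has_upper_dev() and netdev_master_upper_dev_get().
 * The function name is hypothetical; only the netdev_* calls are real.
 */
static void example_report_upper(struct net_device *dev,
				 struct net_device *candidate)
{
	struct net_device *master;

	ASSERT_RTNL();

	if (netdev_has_upper_dev(dev, candidate))
		pr_info("%s is directly upper to %s\n",
			candidate->name, dev->name);

	master = netdev_master_upper_dev_get(dev);
	if (master)
		pr_info("%s is enslaved to %s\n", dev->name, master->name);
}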
4664
4665 void *netdev_adjacent_get_private(struct list_head *adj_list)
4666 {
4667         struct netdev_adjacent *adj;
4668
4669         adj = list_entry(adj_list, struct netdev_adjacent, list);
4670
4671         return adj->private;
4672 }
4673 EXPORT_SYMBOL(netdev_adjacent_get_private);
4674
4675 /**
4676  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4677  * @dev: device
4678  * @iter: list_head ** of the current position
4679  *
4680  * Gets the next device from the dev's upper list, starting from iter
4681  * position. The caller must hold RCU read lock.
4682  */
4683 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4684                                                  struct list_head **iter)
4685 {
4686         struct netdev_adjacent *upper;
4687
4688         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4689
4690         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4691
4692         if (&upper->list == &dev->adj_list.upper)
4693                 return NULL;
4694
4695         *iter = &upper->list;
4696
4697         return upper->dev;
4698 }
4699 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4700
4701 /**
4702  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4703  * @dev: device
4704  * @iter: list_head ** of the current position
4705  *
4706  * Gets the next device from the dev's upper list, starting from iter
4707  * position. The caller must hold RCU read lock.
4708  */
4709 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4710                                                      struct list_head **iter)
4711 {
4712         struct netdev_adjacent *upper;
4713
4714         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4715
4716         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4717
4718         if (&upper->list == &dev->all_adj_list.upper)
4719                 return NULL;
4720
4721         *iter = &upper->list;
4722
4723         return upper->dev;
4724 }
4725 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
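
/*
 * Illustrative sketch (not part of this file): walking the immediate upper
 * list with the iterator pattern expected by netdev_upper_get_next_dev_rcu().
 * The iterator starts at the list head itself; the function name below is
 * hypothetical.
 */
static void example_walk_uppers(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.upper;
	struct net_device *upper;

	rcu_read_lock();
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		pr_info("%s has upper device %s\n", dev->name, upper->name);
	rcu_read_unlock();
}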
4726
4727 /**
4728  * netdev_lower_get_next_private - Get the next ->private from the
4729  *                                 lower neighbour list
4730  * @dev: device
4731  * @iter: list_head ** of the current position
4732  *
4733  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4734  * list, starting from iter position. The caller must either hold the
4735  * RTNL lock or its own locking that guarantees that the neighbour lower
4736  * list will remain unchanged.
4737  */
4738 void *netdev_lower_get_next_private(struct net_device *dev,
4739                                     struct list_head **iter)
4740 {
4741         struct netdev_adjacent *lower;
4742
4743         lower = list_entry(*iter, struct netdev_adjacent, list);
4744
4745         if (&lower->list == &dev->adj_list.lower)
4746                 return NULL;
4747
4748         *iter = lower->list.next;
4749
4750         return lower->private;
4751 }
4752 EXPORT_SYMBOL(netdev_lower_get_next_private);
4753
4754 /**
4755  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4756  *                                     lower neighbour list, RCU
4757  *                                     variant
4758  * @dev: device
4759  * @iter: list_head ** of the current position
4760  *
4761  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4762  * list, starting from iter position. The caller must hold RCU read lock.
4763  */
4764 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4765                                         struct list_head **iter)
4766 {
4767         struct netdev_adjacent *lower;
4768
4769         WARN_ON_ONCE(!rcu_read_lock_held());
4770
4771         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4772
4773         if (&lower->list == &dev->adj_list.lower)
4774                 return NULL;
4775
4776         *iter = &lower->list;
4777
4778         return lower->private;
4779 }
4780 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4781
4782 /**
4783  * netdev_lower_get_next - Get the next device from the lower neighbour
4784  *                         list
4785  * @dev: device
4786  * @iter: list_head ** of the current position
4787  *
4788  * Gets the next netdev_adjacent from the dev's lower neighbour
4789  * list, starting from iter position. The caller must hold RTNL lock or
4790  * its own locking that guarantees that the neighbour lower
4791  * list will remain unchanged.
4792  */
4793 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4794 {
4795         struct netdev_adjacent *lower;
4796
4797         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4798
4799         if (&lower->list == &dev->adj_list.lower)
4800                 return NULL;
4801
4802         *iter = &lower->list;
4803
4804         return lower->dev;
4805 }
4806 EXPORT_SYMBOL(netdev_lower_get_next);
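
/*
 * Illustrative sketch (not part of this file): walking the immediate lower
 * list under RTNL with netdev_lower_get_next(); netdev_for_each_lower_dev(),
 * used further below in this file, wraps a loop like this. The function name
 * is hypothetical.
 */
static void example_walk_lowers(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.lower;
	struct net_device *lower;

	ASSERT_RTNL();

	while ((lower = netdev_lower_get_next(dev, &iter)) != NULL)
		pr_info("%s has lower device %s\n", dev->name, lower->name);
}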
4807
4808 /**
4809  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4810  *                                     lower neighbour list, RCU
4811  *                                     variant
4812  * @dev: device
4813  *
4814  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4815  * list. The caller must hold RCU read lock.
4816  */
4817 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4818 {
4819         struct netdev_adjacent *lower;
4820
4821         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4822                         struct netdev_adjacent, list);
4823         if (lower)
4824                 return lower->private;
4825         return NULL;
4826 }
4827 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4828
4829 /**
4830  * netdev_master_upper_dev_get_rcu - Get master upper device
4831  * @dev: device
4832  *
4833  * Find a master upper device and return a pointer to it or NULL in case
4834  * it's not there. The caller must hold the RCU read lock.
4835  */
4836 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4837 {
4838         struct netdev_adjacent *upper;
4839
4840         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4841                                        struct netdev_adjacent, list);
4842         if (upper && likely(upper->master))
4843                 return upper->dev;
4844         return NULL;
4845 }
4846 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
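
/*
 * Illustrative sketch (not part of this file): reading the master device
 * from a context that only holds the RCU read lock, e.g. a receive path.
 * The function name is hypothetical.
 */
static bool example_has_master_rcu(struct net_device *dev)
{
	struct net_device *master;
	bool ret;

	rcu_read_lock();
	master = netdev_master_upper_dev_get_rcu(dev);
	ret = master != NULL;
	rcu_read_unlock();

	return ret;
}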
4847
4848 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4849                               struct net_device *adj_dev,
4850                               struct list_head *dev_list)
4851 {
4852         char linkname[IFNAMSIZ+7];
4853         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4854                 "upper_%s" : "lower_%s", adj_dev->name);
4855         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4856                                  linkname);
4857 }
4858 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4859                                char *name,
4860                                struct list_head *dev_list)
4861 {
4862         char linkname[IFNAMSIZ+7];
4863         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4864                 "upper_%s" : "lower_%s", name);
4865         sysfs_remove_link(&(dev->dev.kobj), linkname);
4866 }
4867
4868 #define netdev_adjacent_is_neigh_list(dev, dev_list) \
4869                 (dev_list == &dev->adj_list.upper || \
4870                  dev_list == &dev->adj_list.lower)
4871
4872 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4873                                         struct net_device *adj_dev,
4874                                         struct list_head *dev_list,
4875                                         void *private, bool master)
4876 {
4877         struct netdev_adjacent *adj;
4878         int ret;
4879
4880         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4881
4882         if (adj) {
4883                 adj->ref_nr++;
4884                 return 0;
4885         }
4886
4887         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4888         if (!adj)
4889                 return -ENOMEM;
4890
4891         adj->dev = adj_dev;
4892         adj->master = master;
4893         adj->ref_nr = 1;
4894         adj->private = private;
4895         dev_hold(adj_dev);
4896
4897         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4898                  adj_dev->name, dev->name, adj_dev->name);
4899
4900         if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4901                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4902                 if (ret)
4903                         goto free_adj;
4904         }
4905
4906         /* Ensure that the master link is always the first item in the list. */
4907         if (master) {
4908                 ret = sysfs_create_link(&(dev->dev.kobj),
4909                                         &(adj_dev->dev.kobj), "master");
4910                 if (ret)
4911                         goto remove_symlinks;
4912
4913                 list_add_rcu(&adj->list, dev_list);
4914         } else {
4915                 list_add_tail_rcu(&adj->list, dev_list);
4916         }
4917
4918         return 0;
4919
4920 remove_symlinks:
4921         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4922                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4923 free_adj:
4924         kfree(adj);
4925         dev_put(adj_dev);
4926
4927         return ret;
4928 }
4929
4930 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4931                                          struct net_device *adj_dev,
4932                                          struct list_head *dev_list)
4933 {
4934         struct netdev_adjacent *adj;
4935
4936         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4937
4938         if (!adj) {
4939                 pr_err("tried to remove device %s from %s\n",
4940                        dev->name, adj_dev->name);
4941                 BUG();
4942         }
4943
4944         if (adj->ref_nr > 1) {
4945                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4946                          adj->ref_nr-1);
4947                 adj->ref_nr--;
4948                 return;
4949         }
4950
4951         if (adj->master)
4952                 sysfs_remove_link(&(dev->dev.kobj), "master");
4953
4954         if (netdev_adjacent_is_neigh_list(dev, dev_list) &&
4955             net_eq(dev_net(dev), dev_net(adj_dev)))
4956                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4957
4958         list_del_rcu(&adj->list);
4959         pr_debug("dev_put for %s, because link removed from %s to %s\n",
4960                  adj_dev->name, dev->name, adj_dev->name);
4961         dev_put(adj_dev);
4962         kfree_rcu(adj, rcu);
4963 }
4964
4965 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4966                                             struct net_device *upper_dev,
4967                                             struct list_head *up_list,
4968                                             struct list_head *down_list,
4969                                             void *private, bool master)
4970 {
4971         int ret;
4972
4973         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4974                                            master);
4975         if (ret)
4976                 return ret;
4977
4978         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4979                                            false);
4980         if (ret) {
4981                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4982                 return ret;
4983         }
4984
4985         return 0;
4986 }
4987
4988 static int __netdev_adjacent_dev_link(struct net_device *dev,
4989                                       struct net_device *upper_dev)
4990 {
4991         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4992                                                 &dev->all_adj_list.upper,
4993                                                 &upper_dev->all_adj_list.lower,
4994                                                 NULL, false);
4995 }
4996
4997 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4998                                                struct net_device *upper_dev,
4999                                                struct list_head *up_list,
5000                                                struct list_head *down_list)
5001 {
5002         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5003         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5004 }
5005
5006 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5007                                          struct net_device *upper_dev)
5008 {
5009         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5010                                            &dev->all_adj_list.upper,
5011                                            &upper_dev->all_adj_list.lower);
5012 }
5013
5014 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5015                                                 struct net_device *upper_dev,
5016                                                 void *private, bool master)
5017 {
5018         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5019
5020         if (ret)
5021                 return ret;
5022
5023         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5024                                                &dev->adj_list.upper,
5025                                                &upper_dev->adj_list.lower,
5026                                                private, master);
5027         if (ret) {
5028                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5029                 return ret;
5030         }
5031
5032         return 0;
5033 }
5034
5035 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5036                                                    struct net_device *upper_dev)
5037 {
5038         __netdev_adjacent_dev_unlink(dev, upper_dev);
5039         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5040                                            &dev->adj_list.upper,
5041                                            &upper_dev->adj_list.lower);
5042 }
5043
5044 static int __netdev_upper_dev_link(struct net_device *dev,
5045                                    struct net_device *upper_dev, bool master,
5046                                    void *private)
5047 {
5048         struct netdev_adjacent *i, *j, *to_i, *to_j;
5049         int ret = 0;
5050
5051         ASSERT_RTNL();
5052
5053         if (dev == upper_dev)
5054                 return -EBUSY;
5055
5056         /* To prevent loops, check that dev is not an upper device of upper_dev. */
5057         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5058                 return -EBUSY;
5059
5060         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5061                 return -EEXIST;
5062
5063         if (master && netdev_master_upper_dev_get(dev))
5064                 return -EBUSY;
5065
5066         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5067                                                    master);
5068         if (ret)
5069                 return ret;
5070
5071         /* Now that we linked these devs, make all of upper_dev's
5072          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5073          * vice versa, and don't forget the devices themselves. All of these
5074          * links are non-neighbours.
5075          */
5076         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5077                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5078                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5079                                  i->dev->name, j->dev->name);
5080                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5081                         if (ret)
5082                                 goto rollback_mesh;
5083                 }
5084         }
5085
5086         /* add dev to every upper_dev's upper device */
5087         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5088                 pr_debug("linking %s's upper device %s with %s\n",
5089                          upper_dev->name, i->dev->name, dev->name);
5090                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5091                 if (ret)
5092                         goto rollback_upper_mesh;
5093         }
5094
5095         /* add upper_dev to every dev's lower device */
5096         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5097                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5098                          i->dev->name, upper_dev->name);
5099                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5100                 if (ret)
5101                         goto rollback_lower_mesh;
5102         }
5103
5104         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5105         return 0;
5106
5107 rollback_lower_mesh:
5108         to_i = i;
5109         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5110                 if (i == to_i)
5111                         break;
5112                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5113         }
5114
5115         i = NULL;
5116
5117 rollback_upper_mesh:
5118         to_i = i;
5119         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5120                 if (i == to_i)
5121                         break;
5122                 __netdev_adjacent_dev_unlink(dev, i->dev);
5123         }
5124
5125         i = j = NULL;
5126
5127 rollback_mesh:
5128         to_i = i;
5129         to_j = j;
5130         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5131                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5132                         if (i == to_i && j == to_j)
5133                                 break;
5134                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5135                 }
5136                 if (i == to_i)
5137                         break;
5138         }
5139
5140         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5141
5142         return ret;
5143 }
5144
5145 /**
5146  * netdev_upper_dev_link - Add a link to the upper device
5147  * @dev: device
5148  * @upper_dev: new upper device
5149  *
5150  * Adds a link to a device which is upper to this one. The caller must hold
5151  * the RTNL lock. On a failure a negative errno code is returned.
5152  * On success the reference counts are adjusted and the function
5153  * returns zero.
5154  */
5155 int netdev_upper_dev_link(struct net_device *dev,
5156                           struct net_device *upper_dev)
5157 {
5158         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5159 }
5160 EXPORT_SYMBOL(netdev_upper_dev_link);
5161
5162 /**
5163  * netdev_master_upper_dev_link - Add a master link to the upper device
5164  * @dev: device
5165  * @upper_dev: new upper device
5166  *
5167  * Adds a link to a device which is upper to this one. In this case, only
5168  * one master upper device can be linked, although other non-master devices
5169  * might be linked as well. The caller must hold the RTNL lock.
5170  * On a failure a negative errno code is returned. On success the reference
5171  * counts are adjusted and the function returns zero.
5172  */
5173 int netdev_master_upper_dev_link(struct net_device *dev,
5174                                  struct net_device *upper_dev)
5175 {
5176         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5177 }
5178 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5179
5180 int netdev_master_upper_dev_link_private(struct net_device *dev,
5181                                          struct net_device *upper_dev,
5182                                          void *private)
5183 {
5184         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5185 }
5186 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
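
/*
 * Illustrative sketch (not part of this file): a hypothetical master driver
 * (bonding-style) attaching a lower device and storing per-link private
 * data. "example_slave_info" and the function name are made up; only the
 * netdev_master_upper_dev_link_private() call is real.
 */
struct example_slave_info {
	int id;
};

static int example_enslave(struct net_device *master_dev,
			   struct net_device *slave_dev,
			   struct example_slave_info *info)
{
	int err;

	ASSERT_RTNL();

	err = netdev_master_upper_dev_link_private(slave_dev, master_dev,
						   info);
	if (err)
		return err;

	pr_info("%s: enslaved %s\n", master_dev->name, slave_dev->name);
	return 0;
}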
5187
5188 /**
5189  * netdev_upper_dev_unlink - Removes a link to upper device
5190  * @dev: device
5191  * @upper_dev: upper device to remove the link to
5192  *
5193  * Removes a link to a device which is upper to this one. The caller must hold
5194  * the RTNL lock.
5195  */
5196 void netdev_upper_dev_unlink(struct net_device *dev,
5197                              struct net_device *upper_dev)
5198 {
5199         struct netdev_adjacent *i, *j;
5200         ASSERT_RTNL();
5201
5202         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5203
5204         /* Here is the tricky part. We must remove all dev's lower
5205          * devices from all upper_dev's upper devices and vice
5206          * versa, to maintain the graph relationship.
5207          */
5208         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5209                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5210                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5211
5212         /* also remove the devices themselves from the lower/upper device
5213          * lists
5214          */
5215         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5216                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5217
5218         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5219                 __netdev_adjacent_dev_unlink(dev, i->dev);
5220
5221         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5222 }
5223 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5224
5225 void netdev_adjacent_add_links(struct net_device *dev)
5226 {
5227         struct netdev_adjacent *iter;
5228
5229         struct net *net = dev_net(dev);
5230
5231         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5232                 if (!net_eq(net, dev_net(iter->dev)))
5233                         continue;
5234                 netdev_adjacent_sysfs_add(iter->dev, dev,
5235                                           &iter->dev->adj_list.lower);
5236                 netdev_adjacent_sysfs_add(dev, iter->dev,
5237                                           &dev->adj_list.upper);
5238         }
5239
5240         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5241                 if (!net_eq(net, dev_net(iter->dev)))
5242                         continue;
5243                 netdev_adjacent_sysfs_add(iter->dev, dev,
5244                                           &iter->dev->adj_list.upper);
5245                 netdev_adjacent_sysfs_add(dev, iter->dev,
5246                                           &dev->adj_list.lower);
5247         }
5248 }
5249
5250 void netdev_adjacent_del_links(struct net_device *dev)
5251 {
5252         struct netdev_adjacent *iter;
5253
5254         struct net *net = dev_net(dev);
5255
5256         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5257                 if (!net_eq(net, dev_net(iter->dev)))
5258                         continue;
5259                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5260                                           &iter->dev->adj_list.lower);
5261                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5262                                           &dev->adj_list.upper);
5263         }
5264
5265         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5266                 if (!net_eq(net, dev_net(iter->dev)))
5267                         continue;
5268                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5269                                           &iter->dev->adj_list.upper);
5270                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5271                                           &dev->adj_list.lower);
5272         }
5273 }
5274
5275 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5276 {
5277         struct netdev_adjacent *iter;
5278
5279         struct net *net = dev_net(dev);
5280
5281         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5282                 if (!net_eq(net, dev_net(iter->dev)))
5283                         continue;
5284                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5285                                           &iter->dev->adj_list.lower);
5286                 netdev_adjacent_sysfs_add(iter->dev, dev,
5287                                           &iter->dev->adj_list.lower);
5288         }
5289
5290         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5291                 if (!net_eq(net, dev_net(iter->dev)))
5292                         continue;
5293                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5294                                           &iter->dev->adj_list.upper);
5295                 netdev_adjacent_sysfs_add(iter->dev, dev,
5296                                           &iter->dev->adj_list.upper);
5297         }
5298 }
5299
5300 void *netdev_lower_dev_get_private(struct net_device *dev,
5301                                    struct net_device *lower_dev)
5302 {
5303         struct netdev_adjacent *lower;
5304
5305         if (!lower_dev)
5306                 return NULL;
5307         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5308         if (!lower)
5309                 return NULL;
5310
5311         return lower->private;
5312 }
5313 EXPORT_SYMBOL(netdev_lower_dev_get_private);
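
/*
 * Illustrative sketch (not part of this file): the hypothetical master
 * driver from the sketch above retrieving its per-link private data again.
 * The structure and function names are made up.
 */
static int example_slave_id(struct net_device *master_dev,
			    struct net_device *slave_dev)
{
	struct example_slave_info *info;

	info = netdev_lower_dev_get_private(master_dev, slave_dev);

	return info ? info->id : -1;
}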
5314
5315
5316 int dev_get_nest_level(struct net_device *dev,
5317                        bool (*type_check)(struct net_device *dev))
5318 {
5319         struct net_device *lower = NULL;
5320         struct list_head *iter;
5321         int max_nest = -1;
5322         int nest;
5323
5324         ASSERT_RTNL();
5325
5326         netdev_for_each_lower_dev(dev, lower, iter) {
5327                 nest = dev_get_nest_level(lower, type_check);
5328                 if (max_nest < nest)
5329                         max_nest = nest;
5330         }
5331
5332         if (type_check(dev))
5333                 max_nest++;
5334
5335         return max_nest;
5336 }
5337 EXPORT_SYMBOL(dev_get_nest_level);
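
/*
 * Illustrative sketch (not part of this file): using dev_get_nest_level()
 * with a type_check callback to learn how deeply devices of one kind are
 * stacked below @dev. Both function names are hypothetical.
 */
static bool example_is_stacked_type(struct net_device *dev)
{
	/* A real callback would test something like dev->priv_flags. */
	return false;
}

static int example_nest_level(struct net_device *dev)
{
	ASSERT_RTNL();

	return dev_get_nest_level(dev, example_is_stacked_type);
}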
5338
5339 static void dev_change_rx_flags(struct net_device *dev, int flags)
5340 {
5341         const struct net_device_ops *ops = dev->netdev_ops;
5342
5343         if (ops->ndo_change_rx_flags)
5344                 ops->ndo_change_rx_flags(dev, flags);
5345 }
5346
5347 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5348 {
5349         unsigned int old_flags = dev->flags;
5350         kuid_t uid;
5351         kgid_t gid;
5352
5353         ASSERT_RTNL();
5354
5355         dev->flags |= IFF_PROMISC;
5356         dev->promiscuity += inc;
5357         if (dev->promiscuity == 0) {
5358                 /*
5359                  * Avoid overflow.
5360                  * If inc causes overflow, untouch promisc and return error.
5361                  */
5362                 if (inc < 0)
5363                         dev->flags &= ~IFF_PROMISC;
5364                 else {
5365                         dev->promiscuity -= inc;
5366                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5367                                 dev->name);
5368                         return -EOVERFLOW;
5369                 }
5370         }
5371         if (dev->flags != old_flags) {
5372                 pr_info("device %s %s promiscuous mode\n",
5373                         dev->name,
5374                         dev->flags & IFF_PROMISC ? "entered" : "left");
5375                 if (audit_enabled) {
5376                         current_uid_gid(&uid, &gid);
5377                         audit_log(current->audit_context, GFP_ATOMIC,
5378                                 AUDIT_ANOM_PROMISCUOUS,
5379                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5380                                 dev->name, (dev->flags & IFF_PROMISC),
5381                                 (old_flags & IFF_PROMISC),
5382                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5383                                 from_kuid(&init_user_ns, uid),
5384                                 from_kgid(&init_user_ns, gid),
5385                                 audit_get_sessionid(current));
5386                 }
5387
5388                 dev_change_rx_flags(dev, IFF_PROMISC);
5389         }
5390         if (notify)
5391                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5392         return 0;
5393 }
5394
5395 /**
5396  *      dev_set_promiscuity     - update promiscuity count on a device
5397  *      @dev: device
5398  *      @inc: modifier
5399  *
5400  *      Add or remove promiscuity from a device. While the count in the device
5401  *      remains above zero the interface remains promiscuous. Once it hits zero
5402  *      the device reverts back to normal filtering operation. A negative inc
5403  *      value is used to drop promiscuity on the device.
5404  *      Return 0 if successful or a negative errno code on error.
5405  */
5406 int dev_set_promiscuity(struct net_device *dev, int inc)
5407 {
5408         unsigned int old_flags = dev->flags;
5409         int err;
5410
5411         err = __dev_set_promiscuity(dev, inc, true);
5412         if (err < 0)
5413                 return err;
5414         if (dev->flags != old_flags)
5415                 dev_set_rx_mode(dev);
5416         return err;
5417 }
5418 EXPORT_SYMBOL(dev_set_promiscuity);
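
/*
 * Illustrative sketch (not part of this file): a capture-like user taking a
 * promiscuity reference while it runs and dropping it on teardown, keeping
 * the +1/-1 calls balanced. Both functions assume the caller holds RTNL;
 * the function names are hypothetical.
 */
static int example_capture_start(struct net_device *dev)
{
	return dev_set_promiscuity(dev, 1);
}

static void example_capture_stop(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);
}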
5419
5420 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5421 {
5422         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5423
5424         ASSERT_RTNL();
5425
5426         dev->flags |= IFF_ALLMULTI;
5427         dev->allmulti += inc;
5428         if (dev->allmulti == 0) {
5429                 /*
5430                  * Avoid overflow.
5431                  * If inc causes overflow, untouch allmulti and return error.
5432                  */
5433                 if (inc < 0)
5434                         dev->flags &= ~IFF_ALLMULTI;
5435                 else {
5436                         dev->allmulti -= inc;
5437                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5438                                 dev->name);
5439                         return -EOVERFLOW;
5440                 }
5441         }
5442         if (dev->flags ^ old_flags) {
5443                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5444                 dev_set_rx_mode(dev);
5445                 if (notify)
5446                         __dev_notify_flags(dev, old_flags,
5447                                            dev->gflags ^ old_gflags);
5448         }
5449         return 0;
5450 }
5451
5452 /**
5453  *      dev_set_allmulti        - update allmulti count on a device
5454  *      @dev: device
5455  *      @inc: modifier
5456  *
5457  *      Add or remove reception of all multicast frames to a device. While the
5458  *      count in the device remains above zero the interface keeps listening to
5459  *      all multicast frames. Once it hits zero the device reverts back to normal
5460  *      filtering operation. A negative @inc value is used to drop the counter
5461  *      when releasing a resource needing all multicasts.
5462  *      Return 0 if successful or a negative errno code on error.
5463  */
5464
5465 int dev_set_allmulti(struct net_device *dev, int inc)
5466 {
5467         return __dev_set_allmulti(dev, inc, true);
5468 }
5469 EXPORT_SYMBOL(dev_set_allmulti);
5470
5471 /*
5472  *      Upload unicast and multicast address lists to device and
5473  *      configure RX filtering. When the device doesn't support unicast
5474  *      filtering it is put in promiscuous mode while unicast addresses
5475  *      are present.
5476  */
5477 void __dev_set_rx_mode(struct net_device *dev)
5478 {
5479         const struct net_device_ops *ops = dev->netdev_ops;
5480
5481         /* dev_open will call this function so the list will stay sane. */
5482         if (!(dev->flags&IFF_UP))
5483                 return;
5484
5485         if (!netif_device_present(dev))
5486                 return;
5487
5488         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5489                 /* Unicast address changes may only happen under the rtnl,
5490                  * therefore calling __dev_set_promiscuity here is safe.
5491                  */
5492                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5493                         __dev_set_promiscuity(dev, 1, false);
5494                         dev->uc_promisc = true;
5495                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5496                         __dev_set_promiscuity(dev, -1, false);
5497                         dev->uc_promisc = false;
5498                 }
5499         }
5500
5501         if (ops->ndo_set_rx_mode)
5502                 ops->ndo_set_rx_mode(dev);
5503 }
5504
5505 void dev_set_rx_mode(struct net_device *dev)
5506 {
5507         netif_addr_lock_bh(dev);
5508         __dev_set_rx_mode(dev);
5509         netif_addr_unlock_bh(dev);
5510 }
5511
5512 /**
5513  *      dev_get_flags - get flags reported to userspace
5514  *      @dev: device
5515  *
5516  *      Get the combination of flag bits exported through APIs to userspace.
5517  */
5518 unsigned int dev_get_flags(const struct net_device *dev)
5519 {
5520         unsigned int flags;
5521
5522         flags = (dev->flags & ~(IFF_PROMISC |
5523                                 IFF_ALLMULTI |
5524                                 IFF_RUNNING |
5525                                 IFF_LOWER_UP |
5526                                 IFF_DORMANT)) |
5527                 (dev->gflags & (IFF_PROMISC |
5528                                 IFF_ALLMULTI));
5529
5530         if (netif_running(dev)) {
5531                 if (netif_oper_up(dev))
5532                         flags |= IFF_RUNNING;
5533                 if (netif_carrier_ok(dev))
5534                         flags |= IFF_LOWER_UP;
5535                 if (netif_dormant(dev))
5536                         flags |= IFF_DORMANT;
5537         }
5538
5539         return flags;
5540 }
5541 EXPORT_SYMBOL(dev_get_flags);
5542
5543 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5544 {
5545         unsigned int old_flags = dev->flags;
5546         int ret;
5547
5548         ASSERT_RTNL();
5549
5550         /*
5551          *      Set the flags on our device.
5552          */
5553
5554         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5555                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5556                                IFF_AUTOMEDIA)) |
5557                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5558                                     IFF_ALLMULTI));
5559
5560         /*
5561          *      Load in the correct multicast list now that the flags have changed.
5562          */
5563
5564         if ((old_flags ^ flags) & IFF_MULTICAST)
5565                 dev_change_rx_flags(dev, IFF_MULTICAST);
5566
5567         dev_set_rx_mode(dev);
5568
5569         /*
5570          *      Have we downed the interface? We handle IFF_UP ourselves
5571          *      according to user attempts to set it, rather than blindly
5572          *      setting it.
5573          */
5574
5575         ret = 0;
5576         if ((old_flags ^ flags) & IFF_UP)
5577                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5578
5579         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5580                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5581                 unsigned int old_flags = dev->flags;
5582
5583                 dev->gflags ^= IFF_PROMISC;
5584
5585                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5586                         if (dev->flags != old_flags)
5587                                 dev_set_rx_mode(dev);
5588         }
5589
5590         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5591            is important. Some (broken) drivers set IFF_PROMISC when
5592            IFF_ALLMULTI is requested, without asking us and without reporting it.
5593          */
5594         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5595                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5596
5597                 dev->gflags ^= IFF_ALLMULTI;
5598                 __dev_set_allmulti(dev, inc, false);
5599         }
5600
5601         return ret;
5602 }
5603
5604 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5605                         unsigned int gchanges)
5606 {
5607         unsigned int changes = dev->flags ^ old_flags;
5608
5609         if (gchanges)
5610                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5611
5612         if (changes & IFF_UP) {
5613                 if (dev->flags & IFF_UP)
5614                         call_netdevice_notifiers(NETDEV_UP, dev);
5615                 else
5616                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5617         }
5618
5619         if (dev->flags & IFF_UP &&
5620             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5621                 struct netdev_notifier_change_info change_info;
5622
5623                 change_info.flags_changed = changes;
5624                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5625                                               &change_info.info);
5626         }
5627 }
5628
5629 /**
5630  *      dev_change_flags - change device settings
5631  *      @dev: device
5632  *      @flags: device state flags
5633  *
5634  *      Change settings on device based state flags. The flags are
5635  *      in the userspace exported format.
5636  */
5637 int dev_change_flags(struct net_device *dev, unsigned int flags)
5638 {
5639         int ret;
5640         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5641
5642         ret = __dev_change_flags(dev, flags);
5643         if (ret < 0)
5644                 return ret;
5645
5646         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5647         __dev_notify_flags(dev, old_flags, changes);
5648         return ret;
5649 }
5650 EXPORT_SYMBOL(dev_change_flags);
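
/*
 * Illustrative sketch (not part of this file): bringing an interface up the
 * way an ioctl/netlink handler would, by adding IFF_UP to the userspace-
 * visible flags under RTNL. The function name is hypothetical.
 */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();

	return err;
}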
5651
5652 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5653 {
5654         const struct net_device_ops *ops = dev->netdev_ops;
5655
5656         if (ops->ndo_change_mtu)
5657                 return ops->ndo_change_mtu(dev, new_mtu);
5658
5659         dev->mtu = new_mtu;
5660         return 0;
5661 }
5662
5663 /**
5664  *      dev_set_mtu - Change maximum transfer unit
5665  *      @dev: device
5666  *      @new_mtu: new transfer unit
5667  *
5668  *      Change the maximum transfer size of the network device.
5669  */
5670 int dev_set_mtu(struct net_device *dev, int new_mtu)
5671 {
5672         int err, orig_mtu;
5673
5674         if (new_mtu == dev->mtu)
5675                 return 0;
5676
5677         /*      MTU must be positive.    */
5678         if (new_mtu < 0)
5679                 return -EINVAL;
5680
5681         if (!netif_device_present(dev))
5682                 return -ENODEV;
5683
5684         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5685         err = notifier_to_errno(err);
5686         if (err)
5687                 return err;
5688
5689         orig_mtu = dev->mtu;
5690         err = __dev_set_mtu(dev, new_mtu);
5691
5692         if (!err) {
5693                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5694                 err = notifier_to_errno(err);
5695                 if (err) {
5696                         /* setting mtu back and notifying everyone again,
5697                          * so that they have a chance to revert changes.
5698                          */
5699                         __dev_set_mtu(dev, orig_mtu);
5700                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5701                 }
5702         }
5703         return err;
5704 }
5705 EXPORT_SYMBOL(dev_set_mtu);
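
/*
 * Illustrative sketch (not part of this file): changing the MTU under RTNL
 * and reporting a veto from the notifier chain or the driver. The function
 * name is hypothetical.
 */
static int example_set_mtu(struct net_device *dev, int new_mtu)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, new_mtu);
	rtnl_unlock();

	if (err)
		pr_warn("%s: could not set MTU to %d: %d\n",
			dev->name, new_mtu, err);
	return err;
}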
5706
5707 /**
5708  *      dev_set_group - Change group this device belongs to
5709  *      @dev: device
5710  *      @new_group: group this device should belong to
5711  */
5712 void dev_set_group(struct net_device *dev, int new_group)
5713 {
5714         dev->group = new_group;
5715 }
5716 EXPORT_SYMBOL(dev_set_group);
5717
5718 /**
5719  *      dev_set_mac_address - Change Media Access Control Address
5720  *      @dev: device
5721  *      @sa: new address
5722  *
5723  *      Change the hardware (MAC) address of the device
5724  */
5725 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5726 {
5727         const struct net_device_ops *ops = dev->netdev_ops;
5728         int err;
5729
5730         if (!ops->ndo_set_mac_address)
5731                 return -EOPNOTSUPP;
5732         if (sa->sa_family != dev->type)
5733                 return -EINVAL;
5734         if (!netif_device_present(dev))
5735                 return -ENODEV;
5736         err = ops->ndo_set_mac_address(dev, sa);
5737         if (err)
5738                 return err;
5739         dev->addr_assign_type = NET_ADDR_SET;
5740         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5741         add_device_randomness(dev->dev_addr, dev->addr_len);
5742         return 0;
5743 }
5744 EXPORT_SYMBOL(dev_set_mac_address);
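
/*
 * Illustrative sketch (not part of this file): programming a new hardware
 * address from a caller that already holds RTNL. The function name is made
 * up; note that sa_family must match dev->type.
 */
static int example_set_mac(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;

	ASSERT_RTNL();

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, dev->addr_len);

	return dev_set_mac_address(dev, &sa);
}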
5745
5746 /**
5747  *      dev_change_carrier - Change device carrier
5748  *      @dev: device
5749  *      @new_carrier: new value
5750  *
5751  *      Change device carrier
5752  */
5753 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5754 {
5755         const struct net_device_ops *ops = dev->netdev_ops;
5756
5757         if (!ops->ndo_change_carrier)
5758                 return -EOPNOTSUPP;
5759         if (!netif_device_present(dev))
5760                 return -ENODEV;
5761         return ops->ndo_change_carrier(dev, new_carrier);
5762 }
5763 EXPORT_SYMBOL(dev_change_carrier);
5764
5765 /**
5766  *      dev_get_phys_port_id - Get device physical port ID
5767  *      @dev: device
5768  *      @ppid: port ID
5769  *
5770  *      Get device physical port ID
5771  */
5772 int dev_get_phys_port_id(struct net_device *dev,
5773                          struct netdev_phys_port_id *ppid)
5774 {
5775         const struct net_device_ops *ops = dev->netdev_ops;
5776
5777         if (!ops->ndo_get_phys_port_id)
5778                 return -EOPNOTSUPP;
5779         return ops->ndo_get_phys_port_id(dev, ppid);
5780 }
5781 EXPORT_SYMBOL(dev_get_phys_port_id);
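
/*
 * Illustrative sketch (not part of this file): querying the physical port ID
 * and dumping it as hex. The function name is hypothetical; drivers without
 * ndo_get_phys_port_id make this return -EOPNOTSUPP.
 */
static void example_show_phys_port_id(struct net_device *dev)
{
	struct netdev_phys_port_id ppid;

	if (dev_get_phys_port_id(dev, &ppid) == 0)
		pr_info("%s: phys port id %*phN\n",
			dev->name, ppid.id_len, ppid.id);
}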
5782
5783 /**
5784  *      dev_new_index   -       allocate an ifindex
5785  *      @net: the applicable net namespace
5786  *
5787  *      Returns a suitable unique value for a new device interface
5788  *      number.  The caller must hold the rtnl semaphore or the
5789  *      dev_base_lock to be sure it remains unique.
5790  */
5791 static int dev_new_index(struct net *net)
5792 {
5793         int ifindex = net->ifindex;
5794         for (;;) {
5795                 if (++ifindex <= 0)
5796                         ifindex = 1;
5797                 if (!__dev_get_by_index(net, ifindex))
5798                         return net->ifindex = ifindex;
5799         }
5800 }
5801
5802 /* Delayed registration/unregistration */
5803 static LIST_HEAD(net_todo_list);
5804 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5805
5806 static void net_set_todo(struct net_device *dev)
5807 {
5808         list_add_tail(&dev->todo_list, &net_todo_list);
5809         dev_net(dev)->dev_unreg_count++;
5810 }
5811
5812 static void rollback_registered_many(struct list_head *head)
5813 {
5814         struct net_device *dev, *tmp;
5815         LIST_HEAD(close_head);
5816
5817         BUG_ON(dev_boot_phase);
5818         ASSERT_RTNL();
5819
5820         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5821                 /* Some devices call this without ever having been
5822                  * registered, as part of initialization unwind. Remove
5823                  * those devices and proceed with the remaining ones.
5824                  */
5825                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5826                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5827                                  dev->name, dev);
5828
5829                         WARN_ON(1);
5830                         list_del(&dev->unreg_list);
5831                         continue;
5832                 }
5833                 dev->dismantle = true;
5834                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5835         }
5836
5837         /* If device is running, close it first. */
5838         list_for_each_entry(dev, head, unreg_list)
5839                 list_add_tail(&dev->close_list, &close_head);
5840         dev_close_many(&close_head);
5841
5842         list_for_each_entry(dev, head, unreg_list) {
5843                 /* And unlink it from device chain. */
5844                 unlist_netdevice(dev);
5845
5846                 dev->reg_state = NETREG_UNREGISTERING;
5847         }
5848
5849         synchronize_net();
5850
5851         list_for_each_entry(dev, head, unreg_list) {
5852                 /* Shutdown queueing discipline. */
5853                 dev_shutdown(dev);
5854
5855
5856                 /* Notify protocols that we are about to destroy
5857                    this device. They should clean up all of their state.
5858                 */
5859                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5860
5861                 /*
5862                  *      Flush the unicast and multicast chains
5863                  */
5864                 dev_uc_flush(dev);
5865                 dev_mc_flush(dev);
5866
5867                 if (dev->netdev_ops->ndo_uninit)
5868                         dev->netdev_ops->ndo_uninit(dev);
5869
5870                 if (!dev->rtnl_link_ops ||
5871                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5872                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5873
5874                 /* Notifier chain MUST detach us from all upper devices. */
5875                 WARN_ON(netdev_has_any_upper_dev(dev));
5876
5877                 /* Remove entries from kobject tree */
5878                 netdev_unregister_kobject(dev);
5879 #ifdef CONFIG_XPS
5880                 /* Remove XPS queueing entries */
5881                 netif_reset_xps_queues_gt(dev, 0);
5882 #endif
5883         }
5884
5885         synchronize_net();
5886
5887         list_for_each_entry(dev, head, unreg_list)
5888                 dev_put(dev);
5889 }
5890
5891 static void rollback_registered(struct net_device *dev)
5892 {
5893         LIST_HEAD(single);
5894
5895         list_add(&dev->unreg_list, &single);
5896         rollback_registered_many(&single);
5897         list_del(&single);
5898 }
5899
5900 static netdev_features_t netdev_fix_features(struct net_device *dev,
5901         netdev_features_t features)
5902 {
5903         /* Fix illegal checksum combinations */
5904         if ((features & NETIF_F_HW_CSUM) &&
5905             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5906                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5907                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5908         }
5909
5910         /* TSO requires that SG is present as well. */
5911         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5912                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5913                 features &= ~NETIF_F_ALL_TSO;
5914         }
5915
5916         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5917                                         !(features & NETIF_F_IP_CSUM)) {
5918                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5919                 features &= ~NETIF_F_TSO;
5920                 features &= ~NETIF_F_TSO_ECN;
5921         }
5922
5923         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5924                                          !(features & NETIF_F_IPV6_CSUM)) {
5925                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5926                 features &= ~NETIF_F_TSO6;
5927         }
5928
5929         /* TSO ECN requires that TSO is present as well. */
5930         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5931                 features &= ~NETIF_F_TSO_ECN;
5932
5933         /* Software GSO depends on SG. */
5934         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5935                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5936                 features &= ~NETIF_F_GSO;
5937         }
5938
5939         /* UFO needs SG and checksumming */
5940         if (features & NETIF_F_UFO) {
5941                 /* maybe split UFO into V4 and V6? */
5942                 if (!((features & NETIF_F_GEN_CSUM) ||
5943                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5944                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5945                         netdev_dbg(dev,
5946                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5947                         features &= ~NETIF_F_UFO;
5948                 }
5949
5950                 if (!(features & NETIF_F_SG)) {
5951                         netdev_dbg(dev,
5952                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5953                         features &= ~NETIF_F_UFO;
5954                 }
5955         }
5956
5957 #ifdef CONFIG_NET_RX_BUSY_POLL
5958         if (dev->netdev_ops->ndo_busy_poll)
5959                 features |= NETIF_F_BUSY_POLL;
5960         else
5961 #endif
5962                 features &= ~NETIF_F_BUSY_POLL;
5963
5964         return features;
5965 }
5966
5967 int __netdev_update_features(struct net_device *dev)
5968 {
5969         netdev_features_t features;
5970         int err = 0;
5971
5972         ASSERT_RTNL();
5973
5974         features = netdev_get_wanted_features(dev);
5975
5976         if (dev->netdev_ops->ndo_fix_features)
5977                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5978
5979         /* driver might be less strict about feature dependencies */
5980         features = netdev_fix_features(dev, features);
5981
5982         if (dev->features == features)
5983                 return 0;
5984
5985         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5986                 &dev->features, &features);
5987
5988         if (dev->netdev_ops->ndo_set_features)
5989                 err = dev->netdev_ops->ndo_set_features(dev, features);
5990
5991         if (unlikely(err < 0)) {
5992                 netdev_err(dev,
5993                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5994                         err, &features, &dev->features);
5995                 return -1;
5996         }
5997
5998         if (!err)
5999                 dev->features = features;
6000
6001         return 1;
6002 }
6003
6004 /**
6005  *      netdev_update_features - recalculate device features
6006  *      @dev: the device to check
6007  *
6008  *      Recalculate dev->features set and send notifications if it
6009  *      has changed. Should be called after driver or hardware dependent
6010  *      conditions might have changed that influence the features.
6011  */
6012 void netdev_update_features(struct net_device *dev)
6013 {
6014         if (__netdev_update_features(dev))
6015                 netdev_features_change(dev);
6016 }
6017 EXPORT_SYMBOL(netdev_update_features);
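
/*
 * Illustrative sketch (not part of this file): a driver re-evaluating its
 * features after a hardware-dependent condition changed (for example a
 * firmware capability toggle). The flag handling shown is made up; only the
 * netdev_update_features() call is real, and it assumes RTNL is held.
 */
static void example_hw_caps_changed(struct net_device *dev, bool tso_ok)
{
	ASSERT_RTNL();

	if (tso_ok)
		dev->hw_features |= NETIF_F_TSO;
	else
		dev->hw_features &= ~NETIF_F_TSO;

	netdev_update_features(dev);
}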
6018
6019 /**
6020  *      netdev_change_features - recalculate device features
6021  *      @dev: the device to check
6022  *
6023  *      Recalculate dev->features set and send notifications even
6024  *      if they have not changed. Should be called instead of
6025  *      netdev_update_features() if also dev->vlan_features might
6026  *      have changed to allow the changes to be propagated to stacked
6027  *      VLAN devices.
6028  */
6029 void netdev_change_features(struct net_device *dev)
6030 {
6031         __netdev_update_features(dev);
6032         netdev_features_change(dev);
6033 }
6034 EXPORT_SYMBOL(netdev_change_features);
6035
6036 /**
6037  *      netif_stacked_transfer_operstate -      transfer operstate
6038  *      @rootdev: the root or lower level device to transfer state from
6039  *      @dev: the device to transfer operstate to
6040  *
6041  *      Transfer operational state from root to device. This is normally
6042  *      called when a stacking relationship exists between the root
6043  *      device and the device(a leaf device).
6044  *      device and the device (a leaf device).
6045 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6046                                         struct net_device *dev)
6047 {
6048         if (rootdev->operstate == IF_OPER_DORMANT)
6049                 netif_dormant_on(dev);
6050         else
6051                 netif_dormant_off(dev);
6052
6053         if (netif_carrier_ok(rootdev)) {
6054                 if (!netif_carrier_ok(dev))
6055                         netif_carrier_on(dev);
6056         } else {
6057                 if (netif_carrier_ok(dev))
6058                         netif_carrier_off(dev);
6059         }
6060 }
6061 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6062
6063 #ifdef CONFIG_SYSFS
6064 static int netif_alloc_rx_queues(struct net_device *dev)
6065 {
6066         unsigned int i, count = dev->num_rx_queues;
6067         struct netdev_rx_queue *rx;
6068
6069         BUG_ON(count < 1);
6070
6071         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6072         if (!rx)
6073                 return -ENOMEM;
6074
6075         dev->_rx = rx;
6076
6077         for (i = 0; i < count; i++)
6078                 rx[i].dev = dev;
6079         return 0;
6080 }
6081 #endif
6082
6083 static void netdev_init_one_queue(struct net_device *dev,
6084                                   struct netdev_queue *queue, void *_unused)
6085 {
6086         /* Initialize queue lock */
6087         spin_lock_init(&queue->_xmit_lock);
6088         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6089         queue->xmit_lock_owner = -1;
6090         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6091         queue->dev = dev;
6092 #ifdef CONFIG_BQL
6093         dql_init(&queue->dql, HZ);
6094 #endif
6095 }
6096
6097 static void netif_free_tx_queues(struct net_device *dev)
6098 {
6099         kvfree(dev->_tx);
6100 }
6101
6102 static int netif_alloc_netdev_queues(struct net_device *dev)
6103 {
6104         unsigned int count = dev->num_tx_queues;
6105         struct netdev_queue *tx;
6106         size_t sz = count * sizeof(*tx);
6107
6108         BUG_ON(count < 1 || count > 0xffff);
6109
6110         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6111         if (!tx) {
6112                 tx = vzalloc(sz);
6113                 if (!tx)
6114                         return -ENOMEM;
6115         }
6116         dev->_tx = tx;
6117
6118         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6119         spin_lock_init(&dev->tx_global_lock);
6120
6121         return 0;
6122 }
6123
6124 /**
6125  *      register_netdevice      - register a network device
6126  *      @dev: device to register
6127  *
6128  *      Take a completed network device structure and add it to the kernel
6129  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6130  *      chain. 0 is returned on success. A negative errno code is returned
6131  *      on a failure to set up the device, or if the name is a duplicate.
6132  *
6133  *      Callers must hold the rtnl semaphore. You may want
6134  *      register_netdev() instead of this.
6135  *
6136  *      BUGS:
6137  *      The locking appears insufficient to guarantee two parallel registers
6138  *      will not get the same name.
6139  */
6140
6141 int register_netdevice(struct net_device *dev)
6142 {
6143         int ret;
6144         struct net *net = dev_net(dev);
6145
6146         BUG_ON(dev_boot_phase);
6147         ASSERT_RTNL();
6148
6149         might_sleep();
6150
6151         /* When net_devices are persistent, this will be fatal. */
6152         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6153         BUG_ON(!net);
6154
6155         spin_lock_init(&dev->addr_list_lock);
6156         netdev_set_addr_lockdep_class(dev);
6157
6158         dev->iflink = -1;
6159
6160         ret = dev_get_valid_name(net, dev, dev->name);
6161         if (ret < 0)
6162                 goto out;
6163
6164         /* Init, if this function is available */
6165         if (dev->netdev_ops->ndo_init) {
6166                 ret = dev->netdev_ops->ndo_init(dev);
6167                 if (ret) {
6168                         if (ret > 0)
6169                                 ret = -EIO;
6170                         goto out;
6171                 }
6172         }
6173
6174         if (((dev->hw_features | dev->features) &
6175              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6176             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6177              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6178                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6179                 ret = -EINVAL;
6180                 goto err_uninit;
6181         }
6182
6183         ret = -EBUSY;
6184         if (!dev->ifindex)
6185                 dev->ifindex = dev_new_index(net);
6186         else if (__dev_get_by_index(net, dev->ifindex))
6187                 goto err_uninit;
6188
6189         if (dev->iflink == -1)
6190                 dev->iflink = dev->ifindex;
6191
6192         /* Transfer changeable features to wanted_features and enable
6193          * software offloads (GSO and GRO).
6194          */
6195         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6196         dev->features |= NETIF_F_SOFT_FEATURES;
6197         dev->wanted_features = dev->features & dev->hw_features;
6198
6199         if (!(dev->flags & IFF_LOOPBACK)) {
6200                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6201         }
6202
6203         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6204          */
6205         dev->vlan_features |= NETIF_F_HIGHDMA;
6206
6207         /* Make NETIF_F_SG inheritable to tunnel devices.
6208          */
6209         dev->hw_enc_features |= NETIF_F_SG;
6210
6211         /* Make NETIF_F_SG inheritable to MPLS.
6212          */
6213         dev->mpls_features |= NETIF_F_SG;
6214
6215         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6216         ret = notifier_to_errno(ret);
6217         if (ret)
6218                 goto err_uninit;
6219
6220         ret = netdev_register_kobject(dev);
6221         if (ret)
6222                 goto err_uninit;
6223         dev->reg_state = NETREG_REGISTERED;
6224
6225         __netdev_update_features(dev);
6226
6227         /*
6228          *      Default initial state at registration is that the
6229          *      device is present.
6230          */
6231
6232         set_bit(__LINK_STATE_PRESENT, &dev->state);
6233
6234         linkwatch_init_dev(dev);
6235
6236         dev_init_scheduler(dev);
6237         dev_hold(dev);
6238         list_netdevice(dev);
6239         add_device_randomness(dev->dev_addr, dev->addr_len);
6240
6241         /* If the device has a permanent device address, the driver should
6242          * set dev_addr, and addr_assign_type should also be set to
6243          * NET_ADDR_PERM (the default value).
6244          */
6245         if (dev->addr_assign_type == NET_ADDR_PERM)
6246                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6247
6248         /* Notify protocols, that a new device appeared. */
6249         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6250         ret = notifier_to_errno(ret);
6251         if (ret) {
6252                 rollback_registered(dev);
6253                 dev->reg_state = NETREG_UNREGISTERED;
6254         }
6255         /*
6256          *      Prevent userspace races by waiting until the network
6257          *      device is fully setup before sending notifications.
6258          */
6259         if (!dev->rtnl_link_ops ||
6260             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6261                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6262
6263 out:
6264         return ret;
6265
6266 err_uninit:
6267         if (dev->netdev_ops->ndo_uninit)
6268                 dev->netdev_ops->ndo_uninit(dev);
6269         goto out;
6270 }
6271 EXPORT_SYMBOL(register_netdevice);
6272
6273 /**
6274  *      init_dummy_netdev       - init a dummy network device for NAPI
6275  *      @dev: device to init
6276  *
6277  *      This takes a network device structure and initializes the minimum
6278  *      number of fields so it can be used to schedule NAPI polls without
6279  *      registering a full-blown interface. This is to be used by drivers
6280  *      that need to tie several hardware interfaces to a single NAPI
6281  *      poll scheduler due to HW limitations.
6282  */
6283 int init_dummy_netdev(struct net_device *dev)
6284 {
6285         /* Clear everything. Note we don't initialize spinlocks
6286          * as they aren't supposed to be taken by any of the
6287          * NAPI code and this dummy netdev is supposed to be
6288          * only ever used for NAPI polls
6289          */
6290         memset(dev, 0, sizeof(struct net_device));
6291
6292         /* make sure we BUG if trying to hit standard
6293          * register/unregister code path
6294          */
6295         dev->reg_state = NETREG_DUMMY;
6296
6297         /* NAPI wants this */
6298         INIT_LIST_HEAD(&dev->napi_list);
6299
6300         /* a dummy interface is started by default */
6301         set_bit(__LINK_STATE_PRESENT, &dev->state);
6302         set_bit(__LINK_STATE_START, &dev->state);
6303
6304         /* Note: We don't allocate pcpu_refcnt for dummy devices,
6305          * because users of this 'device' don't need to change
6306          * its refcount.
6307          */
6308
6309         return 0;
6310 }
6311 EXPORT_SYMBOL_GPL(init_dummy_netdev);
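
/* Editorial illustration: a minimal sketch of the pattern init_dummy_netdev()
 * exists for.  A driver that funnels several hardware channels into a single
 * NAPI context hosts that context on an embedded dummy netdev.  The
 * example_adapter structure and example_poll() are hypothetical.
 */
struct example_adapter {
	struct net_device dummy_dev;	/* never registered, NAPI anchor only */
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	/* A real poll routine would process up to @budget packets here */
	napi_complete(napi);
	return 0;
}

static void __maybe_unused example_setup_napi(struct example_adapter *ad)
{
	init_dummy_netdev(&ad->dummy_dev);
	netif_napi_add(&ad->dummy_dev, &ad->napi, example_poll, 64);
	napi_enable(&ad->napi);
}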
6312
6313
6314 /**
6315  *      register_netdev - register a network device
6316  *      @dev: device to register
6317  *
6318  *      Take a completed network device structure and add it to the kernel
6319  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6320  *      chain. 0 is returned on success. A negative errno code is returned
6321  *      on a failure to set up the device, or if the name is a duplicate.
6322  *
6323  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6324  *      and expands the device name if you passed a format string to
6325  *      alloc_netdev.
6326  */
6327 int register_netdev(struct net_device *dev)
6328 {
6329         int err;
6330
6331         rtnl_lock();
6332         err = register_netdevice(dev);
6333         rtnl_unlock();
6334         return err;
6335 }
6336 EXPORT_SYMBOL(register_netdev);
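
/* Editorial illustration: a minimal sketch of the classic driver probe path
 * built on top of register_netdev().  The private structure, the ops pointer
 * and the parent device argument are hypothetical placeholders; error
 * handling relies on free_netdev() accepting a never-registered device.
 */
struct example_priv {
	int link_up;
};

static int __maybe_unused example_probe(struct device *parent,
					const struct net_device_ops *ops)
{
	struct net_device *ndev;
	int err;

	ndev = alloc_etherdev(sizeof(struct example_priv));
	if (!ndev)
		return -ENOMEM;

	ndev->netdev_ops = ops;
	SET_NETDEV_DEV(ndev, parent);	/* show up under the parent in sysfs */

	err = register_netdev(ndev);	/* takes the rtnl lock internally */
	if (err) {
		free_netdev(ndev);	/* reg_state still NETREG_UNINITIALIZED */
		return err;
	}
	return 0;
}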
6337
6338 int netdev_refcnt_read(const struct net_device *dev)
6339 {
6340         int i, refcnt = 0;
6341
6342         for_each_possible_cpu(i)
6343                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6344         return refcnt;
6345 }
6346 EXPORT_SYMBOL(netdev_refcnt_read);
6347
6348 /**
6349  * netdev_wait_allrefs - wait until all references are gone.
6350  * @dev: target net_device
6351  *
6352  * This is called when unregistering network devices.
6353  *
6354  * Any protocol or device that holds a reference should register
6355  * for netdevice notification, and clean up and put back the
6356  * reference if they receive an UNREGISTER event.
6357  * We can get stuck here if buggy protocols don't correctly
6358  * call dev_put.
6359  */
6360 static void netdev_wait_allrefs(struct net_device *dev)
6361 {
6362         unsigned long rebroadcast_time, warning_time;
6363         int refcnt;
6364
6365         linkwatch_forget_dev(dev);
6366
6367         rebroadcast_time = warning_time = jiffies;
6368         refcnt = netdev_refcnt_read(dev);
6369
6370         while (refcnt != 0) {
6371                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6372                         rtnl_lock();
6373
6374                         /* Rebroadcast unregister notification */
6375                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6376
6377                         __rtnl_unlock();
6378                         rcu_barrier();
6379                         rtnl_lock();
6380
6381                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6382                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6383                                      &dev->state)) {
6384                                 /* We must not have linkwatch events
6385                                  * pending on unregister. If this
6386                                  * happens, we simply run the queue
6387                                  * unscheduled, resulting in a noop
6388                                  * for this device.
6389                                  */
6390                                 linkwatch_run_queue();
6391                         }
6392
6393                         __rtnl_unlock();
6394
6395                         rebroadcast_time = jiffies;
6396                 }
6397
6398                 msleep(250);
6399
6400                 refcnt = netdev_refcnt_read(dev);
6401
6402                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6403                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6404                                  dev->name, refcnt);
6405                         warning_time = jiffies;
6406                 }
6407         }
6408 }
6409
6410 /* The sequence is:
6411  *
6412  *      rtnl_lock();
6413  *      ...
6414  *      register_netdevice(x1);
6415  *      register_netdevice(x2);
6416  *      ...
6417  *      unregister_netdevice(y1);
6418  *      unregister_netdevice(y2);
6419  *      ...
6420  *      rtnl_unlock();
6421  *      free_netdev(y1);
6422  *      free_netdev(y2);
6423  *
6424  * We are invoked by rtnl_unlock().
6425  * This allows us to deal with problems:
6426  * 1) We can delete sysfs objects which invoke hotplug
6427  *    without deadlocking with linkwatch via keventd.
6428  * 2) Since we run with the RTNL semaphore not held, we can sleep
6429  *    safely in order to wait for the netdev refcnt to drop to zero.
6430  *
6431  * We must not return until all unregister events added during
6432  * the interval the lock was held have been completed.
6433  */
6434 void netdev_run_todo(void)
6435 {
6436         struct list_head list;
6437
6438         /* Snapshot list, allow later requests */
6439         list_replace_init(&net_todo_list, &list);
6440
6441         __rtnl_unlock();
6442
6443
6444         /* Wait for rcu callbacks to finish before next phase */
6445         if (!list_empty(&list))
6446                 rcu_barrier();
6447
6448         while (!list_empty(&list)) {
6449                 struct net_device *dev
6450                         = list_first_entry(&list, struct net_device, todo_list);
6451                 list_del(&dev->todo_list);
6452
6453                 rtnl_lock();
6454                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6455                 __rtnl_unlock();
6456
6457                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6458                         pr_err("network todo '%s' but state %d\n",
6459                                dev->name, dev->reg_state);
6460                         dump_stack();
6461                         continue;
6462                 }
6463
6464                 dev->reg_state = NETREG_UNREGISTERED;
6465
6466                 on_each_cpu(flush_backlog, dev, 1);
6467
6468                 netdev_wait_allrefs(dev);
6469
6470                 /* paranoia */
6471                 BUG_ON(netdev_refcnt_read(dev));
6472                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6473                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6474                 WARN_ON(dev->dn_ptr);
6475
6476                 if (dev->destructor)
6477                         dev->destructor(dev);
6478
6479                 /* Report a network device has been unregistered */
6480                 rtnl_lock();
6481                 dev_net(dev)->dev_unreg_count--;
6482                 __rtnl_unlock();
6483                 wake_up(&netdev_unregistering_wq);
6484
6485                 /* Free network device */
6486                 kobject_put(&dev->dev.kobj);
6487         }
6488 }
6489
6490 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6491  * fields in the same order, with only the type differing.
6492  */
6493 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6494                              const struct net_device_stats *netdev_stats)
6495 {
6496 #if BITS_PER_LONG == 64
6497         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6498         memcpy(stats64, netdev_stats, sizeof(*stats64));
6499 #else
6500         size_t i, n = sizeof(*stats64) / sizeof(u64);
6501         const unsigned long *src = (const unsigned long *)netdev_stats;
6502         u64 *dst = (u64 *)stats64;
6503
6504         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6505                      sizeof(*stats64) / sizeof(u64));
6506         for (i = 0; i < n; i++)
6507                 dst[i] = src[i];
6508 #endif
6509 }
6510 EXPORT_SYMBOL(netdev_stats_to_stats64);
6511
6512 /**
6513  *      dev_get_stats   - get network device statistics
6514  *      @dev: device to get statistics from
6515  *      @storage: place to store stats
6516  *
6517  *      Get network statistics from device. Return @storage.
6518  *      The device driver may provide its own method by setting
6519  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6520  *      otherwise the internal statistics structure is used.
6521  */
6522 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6523                                         struct rtnl_link_stats64 *storage)
6524 {
6525         const struct net_device_ops *ops = dev->netdev_ops;
6526
6527         if (ops->ndo_get_stats64) {
6528                 memset(storage, 0, sizeof(*storage));
6529                 ops->ndo_get_stats64(dev, storage);
6530         } else if (ops->ndo_get_stats) {
6531                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6532         } else {
6533                 netdev_stats_to_stats64(storage, &dev->stats);
6534         }
6535         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6536         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6537         return storage;
6538 }
6539 EXPORT_SYMBOL(dev_get_stats);
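
/* Editorial illustration: a minimal sketch of a dev_get_stats() caller.  The
 * snapshot is written into caller-provided storage, so the helper below (a
 * hypothetical name) needs no extra locking of its own.
 */
static u64 __maybe_unused example_rx_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	return stats.rx_packets;
}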
6540
6541 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6542 {
6543         struct netdev_queue *queue = dev_ingress_queue(dev);
6544
6545 #ifdef CONFIG_NET_CLS_ACT
6546         if (queue)
6547                 return queue;
6548         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6549         if (!queue)
6550                 return NULL;
6551         netdev_init_one_queue(dev, queue, NULL);
6552         queue->qdisc = &noop_qdisc;
6553         queue->qdisc_sleeping = &noop_qdisc;
6554         rcu_assign_pointer(dev->ingress_queue, queue);
6555 #endif
6556         return queue;
6557 }
6558
6559 static const struct ethtool_ops default_ethtool_ops;
6560
6561 void netdev_set_default_ethtool_ops(struct net_device *dev,
6562                                     const struct ethtool_ops *ops)
6563 {
6564         if (dev->ethtool_ops == &default_ethtool_ops)
6565                 dev->ethtool_ops = ops;
6566 }
6567 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
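
/* Editorial illustration: netdev_set_default_ethtool_ops() is meant for a
 * mid-layer (a USB-networking style framework, for instance) that wants to
 * provide fallback ethtool ops without clobbering ops a specific driver has
 * already installed.  The ops structure and helper below are hypothetical
 * placeholders.
 */
static const struct ethtool_ops example_fallback_ethtool_ops;

static void __maybe_unused example_attach(struct net_device *dev)
{
	/* Only takes effect if the driver left dev->ethtool_ops at the
	 * core's default_ethtool_ops.
	 */
	netdev_set_default_ethtool_ops(dev, &example_fallback_ethtool_ops);
}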
6568
6569 void netdev_freemem(struct net_device *dev)
6570 {
6571         char *addr = (char *)dev - dev->padded;
6572
6573         kvfree(addr);
6574 }
6575
6576 /**
6577  *      alloc_netdev_mqs - allocate network device
6578  *      @sizeof_priv:           size of private data to allocate space for
6579  *      @name:                  device name format string
6580  *      @name_assign_type:      origin of device name
6581  *      @setup:                 callback to initialize device
6582  *      @txqs:                  the number of TX subqueues to allocate
6583  *      @rxqs:                  the number of RX subqueues to allocate
6584  *
6585  *      Allocates a struct net_device with private data area for driver use
6586  *      and performs basic initialization.  Also allocates subqueue structs
6587  *      for each queue on the device.
6588  */
6589 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6590                 unsigned char name_assign_type,
6591                 void (*setup)(struct net_device *),
6592                 unsigned int txqs, unsigned int rxqs)
6593 {
6594         struct net_device *dev;
6595         size_t alloc_size;
6596         struct net_device *p;
6597
6598         BUG_ON(strlen(name) >= sizeof(dev->name));
6599
6600         if (txqs < 1) {
6601                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6602                 return NULL;
6603         }
6604
6605 #ifdef CONFIG_SYSFS
6606         if (rxqs < 1) {
6607                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6608                 return NULL;
6609         }
6610 #endif
6611
6612         alloc_size = sizeof(struct net_device);
6613         if (sizeof_priv) {
6614                 /* ensure 32-byte alignment of private area */
6615                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6616                 alloc_size += sizeof_priv;
6617         }
6618         /* ensure 32-byte alignment of whole construct */
6619         alloc_size += NETDEV_ALIGN - 1;
6620
6621         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6622         if (!p)
6623                 p = vzalloc(alloc_size);
6624         if (!p)
6625                 return NULL;
6626
6627         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6628         dev->padded = (char *)dev - (char *)p;
6629
6630         dev->pcpu_refcnt = alloc_percpu(int);
6631         if (!dev->pcpu_refcnt)
6632                 goto free_dev;
6633
6634         if (dev_addr_init(dev))
6635                 goto free_pcpu;
6636
6637         dev_mc_init(dev);
6638         dev_uc_init(dev);
6639
6640         dev_net_set(dev, &init_net);
6641
6642         dev->gso_max_size = GSO_MAX_SIZE;
6643         dev->gso_max_segs = GSO_MAX_SEGS;
6644
6645         INIT_LIST_HEAD(&dev->napi_list);
6646         INIT_LIST_HEAD(&dev->unreg_list);
6647         INIT_LIST_HEAD(&dev->close_list);
6648         INIT_LIST_HEAD(&dev->link_watch_list);
6649         INIT_LIST_HEAD(&dev->adj_list.upper);
6650         INIT_LIST_HEAD(&dev->adj_list.lower);
6651         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6652         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6653         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6654         setup(dev);
6655
6656         dev->num_tx_queues = txqs;
6657         dev->real_num_tx_queues = txqs;
6658         if (netif_alloc_netdev_queues(dev))
6659                 goto free_all;
6660
6661 #ifdef CONFIG_SYSFS
6662         dev->num_rx_queues = rxqs;
6663         dev->real_num_rx_queues = rxqs;
6664         if (netif_alloc_rx_queues(dev))
6665                 goto free_all;
6666 #endif
6667
6668         strcpy(dev->name, name);
6669         dev->name_assign_type = name_assign_type;
6670         dev->group = INIT_NETDEV_GROUP;
6671         if (!dev->ethtool_ops)
6672                 dev->ethtool_ops = &default_ethtool_ops;
6673         return dev;
6674
6675 free_all:
6676         free_netdev(dev);
6677         return NULL;
6678
6679 free_pcpu:
6680         free_percpu(dev->pcpu_refcnt);
6681 free_dev:
6682         netdev_freemem(dev);
6683         return NULL;
6684 }
6685 EXPORT_SYMBOL(alloc_netdev_mqs);
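
/* Editorial illustration: a minimal sketch of calling alloc_netdev_mqs()
 * directly for a multiqueue Ethernet-style device.  The queue count and the
 * use of ether_setup() are illustrative; most drivers use the
 * alloc_etherdev_mqs()/alloc_etherdev() wrappers instead.
 */
static struct net_device * __maybe_unused example_alloc(unsigned int nqueues)
{
	/* no private data, kernel-enumerated "eth%d" name,
	 * nqueues TX and nqueues RX queues
	 */
	return alloc_netdev_mqs(0, "eth%d", NET_NAME_ENUM, ether_setup,
				nqueues, nqueues);
}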
6686
6687 /**
6688  *      free_netdev - free network device
6689  *      @dev: device
6690  *
6691  *      This function does the last stage of destroying an allocated device
6692  *      interface. The reference to the device object is released.
6693  *      If this is the last reference then it will be freed.
6694  */
6695 void free_netdev(struct net_device *dev)
6696 {
6697         struct napi_struct *p, *n;
6698
6699         release_net(dev_net(dev));
6700
6701         netif_free_tx_queues(dev);
6702 #ifdef CONFIG_SYSFS
6703         kfree(dev->_rx);
6704 #endif
6705
6706         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6707
6708         /* Flush device addresses */
6709         dev_addr_flush(dev);
6710
6711         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6712                 netif_napi_del(p);
6713
6714         free_percpu(dev->pcpu_refcnt);
6715         dev->pcpu_refcnt = NULL;
6716
6717         /*  Compatibility with error handling in drivers */
6718         if (dev->reg_state == NETREG_UNINITIALIZED) {
6719                 netdev_freemem(dev);
6720                 return;
6721         }
6722
6723         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6724         dev->reg_state = NETREG_RELEASED;
6725
6726         /* will free via device release */
6727         put_device(&dev->dev);
6728 }
6729 EXPORT_SYMBOL(free_netdev);
6730
6731 /**
6732  *      synchronize_net -  Synchronize with packet receive processing
6733  *
6734  *      Wait for packets currently being received to be done.
6735  *      Does not block later packets from starting.
6736  */
6737 void synchronize_net(void)
6738 {
6739         might_sleep();
6740         if (rtnl_is_locked())
6741                 synchronize_rcu_expedited();
6742         else
6743                 synchronize_rcu();
6744 }
6745 EXPORT_SYMBOL(synchronize_net);
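
/* Editorial illustration: a minimal sketch of the unpublish-then-wait pattern
 * synchronize_net() supports.  The hook structure and pointer below are
 * hypothetical; once synchronize_net() returns, no packet still in the
 * receive path can be using the old pointer, so it may be freed.
 */
struct example_hook {
	void (*fn)(struct sk_buff *skb);
};

static struct example_hook __rcu *example_hook_ptr;

static void __maybe_unused example_unregister_hook(struct example_hook *hook)
{
	RCU_INIT_POINTER(example_hook_ptr, NULL);
	synchronize_net();	/* wait out in-flight readers */
	kfree(hook);
}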
6746
6747 /**
6748  *      unregister_netdevice_queue - remove device from the kernel
6749  *      @dev: device
6750  *      @head: list
6751  *
6752  *      This function shuts down a device interface and removes it
6753  *      from the kernel tables.
6754  *      If head is not NULL, the device is queued to be unregistered later.
6755  *
6756  *      Callers must hold the rtnl semaphore.  You may want
6757  *      unregister_netdev() instead of this.
6758  */
6759
6760 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6761 {
6762         ASSERT_RTNL();
6763
6764         if (head) {
6765                 list_move_tail(&dev->unreg_list, head);
6766         } else {
6767                 rollback_registered(dev);
6768                 /* Finish processing unregister after unlock */
6769                 net_set_todo(dev);
6770         }
6771 }
6772 EXPORT_SYMBOL(unregister_netdevice_queue);
6773
6774 /**
6775  *      unregister_netdevice_many - unregister many devices
6776  *      @head: list of devices
6777  *
6778  *  Note: As most callers use a stack-allocated list_head,
6779  *  we force a list_del() to make sure the stack won't be corrupted later.
6780  */
6781 void unregister_netdevice_many(struct list_head *head)
6782 {
6783         struct net_device *dev;
6784
6785         if (!list_empty(head)) {
6786                 rollback_registered_many(head);
6787                 list_for_each_entry(dev, head, unreg_list)
6788                         net_set_todo(dev);
6789                 list_del(head);
6790         }
6791 }
6792 EXPORT_SYMBOL(unregister_netdevice_many);
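
/* Editorial illustration: a minimal sketch of batched teardown, the pattern
 * used by veth-style paired devices, so that the expensive synchronization in
 * rollback_registered_many() is paid once for the whole batch.  The function
 * name is hypothetical; the caller holds the rtnl lock.
 */
static void __maybe_unused example_destroy_pair(struct net_device *a,
						struct net_device *b)
{
	LIST_HEAD(kill_list);

	ASSERT_RTNL();

	unregister_netdevice_queue(a, &kill_list);
	unregister_netdevice_queue(b, &kill_list);
	unregister_netdevice_many(&kill_list);
}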
6793
6794 /**
6795  *      unregister_netdev - remove device from the kernel
6796  *      @dev: device
6797  *
6798  *      This function shuts down a device interface and removes it
6799  *      from the kernel tables.
6800  *
6801  *      This is just a wrapper for unregister_netdevice that takes
6802  *      the rtnl semaphore.  In general you want to use this and not
6803  *      unregister_netdevice.
6804  */
6805 void unregister_netdev(struct net_device *dev)
6806 {
6807         rtnl_lock();
6808         unregister_netdevice(dev);
6809         rtnl_unlock();
6810 }
6811 EXPORT_SYMBOL(unregister_netdev);
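
/* Editorial illustration: a minimal sketch of the usual driver remove path,
 * the mirror image of a probe that called register_netdev().  The function
 * name is hypothetical; unregister_netdev() sleeps until all references to
 * the device have been dropped, after which the memory can be released.
 */
static void __maybe_unused example_remove(struct net_device *ndev)
{
	unregister_netdev(ndev);	/* takes rtnl, waits for references */
	free_netdev(ndev);
}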
6812
6813 /**
6814  *      dev_change_net_namespace - move device to a different network namespace
6815  *      @dev: device
6816  *      @net: network namespace
6817  *      @pat: If not NULL name pattern to try if the current device name
6818  *            is already taken in the destination network namespace.
6819  *
6820  *      This function shuts down a device interface and moves it
6821  *      to a new network namespace. On success 0 is returned, on
6822  *      a failure a negative errno code is returned.
6823  *
6824  *      Callers must hold the rtnl semaphore.
6825  */
6826
6827 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6828 {
6829         int err;
6830
6831         ASSERT_RTNL();
6832
6833         /* Don't allow namespace local devices to be moved. */
6834         err = -EINVAL;
6835         if (dev->features & NETIF_F_NETNS_LOCAL)
6836                 goto out;
6837
6838         /* Ensure the device has been registered */
6839         if (dev->reg_state != NETREG_REGISTERED)
6840                 goto out;
6841
6842         /* Get out if there is nothing to do */
6843         err = 0;
6844         if (net_eq(dev_net(dev), net))
6845                 goto out;
6846
6847         /* Pick the destination device name, and ensure
6848          * we can use it in the destination network namespace.
6849          */
6850         err = -EEXIST;
6851         if (__dev_get_by_name(net, dev->name)) {
6852                 /* We get here if we can't use the current device name */
6853                 if (!pat)
6854                         goto out;
6855                 if (dev_get_valid_name(net, dev, pat) < 0)
6856                         goto out;
6857         }
6858
6859         /*
6860          * And now a mini version of register_netdevice and unregister_netdevice.
6861          */
6862
6863         /* If the device is running, close it first. */
6864         dev_close(dev);
6865
6866         /* And unlink it from device chain */
6867         err = -ENODEV;
6868         unlist_netdevice(dev);
6869
6870         synchronize_net();
6871
6872         /* Shutdown queueing discipline. */
6873         dev_shutdown(dev);
6874
6875         /* Notify protocols, that we are about to destroy
6876            this device. They should clean all the things.
6877
6878            Note that dev->reg_state stays at NETREG_REGISTERED.
6879            This is wanted because this way 8021q and macvlan know
6880            the device is just moving and can keep their slaves up.
6881         */
6882         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6883         rcu_barrier();
6884         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6885         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6886
6887         /*
6888          *      Flush the unicast and multicast chains
6889          */
6890         dev_uc_flush(dev);
6891         dev_mc_flush(dev);
6892
6893         /* Send a netdev-removed uevent to the old namespace */
6894         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6895         netdev_adjacent_del_links(dev);
6896
6897         /* Actually switch the network namespace */
6898         dev_net_set(dev, net);
6899
6900         /* If there is an ifindex conflict assign a new one */
6901         if (__dev_get_by_index(net, dev->ifindex)) {
6902                 int iflink = (dev->iflink == dev->ifindex);
6903                 dev->ifindex = dev_new_index(net);
6904                 if (iflink)
6905                         dev->iflink = dev->ifindex;
6906         }
6907
6908         /* Send a netdev-add uevent to the new namespace */
6909         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6910         netdev_adjacent_add_links(dev);
6911
6912         /* Fixup kobjects */
6913         err = device_rename(&dev->dev, dev->name);
6914         WARN_ON(err);
6915
6916         /* Add the device back in the hashes */
6917         list_netdevice(dev);
6918
6919         /* Notify protocols, that a new device appeared. */
6920         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6921
6922         /*
6923          *      Prevent userspace races by waiting until the network
6924          *      device is fully setup before sending notifications.
6925          */
6926         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6927
6928         synchronize_net();
6929         err = 0;
6930 out:
6931         return err;
6932 }
6933 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
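
/* Editorial illustration: a minimal sketch of a dev_change_net_namespace()
 * caller, similar to how default_device_exit() below pushes devices back to
 * init_net.  Falling back to a "dev%d" pattern copes with name collisions in
 * the target namespace.  The function name is hypothetical; the caller must
 * hold the rtnl lock.
 */
static int __maybe_unused example_move_to_netns(struct net_device *dev,
						struct net *net)
{
	ASSERT_RTNL();

	return dev_change_net_namespace(dev, net, "dev%d");
}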
6934
6935 static int dev_cpu_callback(struct notifier_block *nfb,
6936                             unsigned long action,
6937                             void *ocpu)
6938 {
6939         struct sk_buff **list_skb;
6940         struct sk_buff *skb;
6941         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6942         struct softnet_data *sd, *oldsd;
6943
6944         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6945                 return NOTIFY_OK;
6946
6947         local_irq_disable();
6948         cpu = smp_processor_id();
6949         sd = &per_cpu(softnet_data, cpu);
6950         oldsd = &per_cpu(softnet_data, oldcpu);
6951
6952         /* Find end of our completion_queue. */
6953         list_skb = &sd->completion_queue;
6954         while (*list_skb)
6955                 list_skb = &(*list_skb)->next;
6956         /* Append completion queue from offline CPU. */
6957         *list_skb = oldsd->completion_queue;
6958         oldsd->completion_queue = NULL;
6959
6960         /* Append output queue from offline CPU. */
6961         if (oldsd->output_queue) {
6962                 *sd->output_queue_tailp = oldsd->output_queue;
6963                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6964                 oldsd->output_queue = NULL;
6965                 oldsd->output_queue_tailp = &oldsd->output_queue;
6966         }
6967         /* Append NAPI poll list from offline CPU. */
6968         if (!list_empty(&oldsd->poll_list)) {
6969                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6970                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6971         }
6972
6973         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6974         local_irq_enable();
6975
6976         /* Process offline CPU's input_pkt_queue */
6977         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6978                 netif_rx_internal(skb);
6979                 input_queue_head_incr(oldsd);
6980         }
6981         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6982                 netif_rx_internal(skb);
6983                 input_queue_head_incr(oldsd);
6984         }
6985
6986         return NOTIFY_OK;
6987 }
6988
6989
6990 /**
6991  *      netdev_increment_features - increment feature set by one
6992  *      @all: current feature set
6993  *      @one: new feature set
6994  *      @mask: mask feature set
6995  *
6996  *      Computes a new feature set after adding a device with feature set
6997  *      @one to the master device with current feature set @all.  Will not
6998  *      enable anything that is off in @mask. Returns the new feature set.
6999  */
7000 netdev_features_t netdev_increment_features(netdev_features_t all,
7001         netdev_features_t one, netdev_features_t mask)
7002 {
7003         if (mask & NETIF_F_GEN_CSUM)
7004                 mask |= NETIF_F_ALL_CSUM;
7005         mask |= NETIF_F_VLAN_CHALLENGED;
7006
7007         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7008         all &= one | ~NETIF_F_ALL_FOR_ALL;
7009
7010         /* If one device supports hw checksumming, set for all. */
7011         if (all & NETIF_F_GEN_CSUM)
7012                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7013
7014         return all;
7015 }
7016 EXPORT_SYMBOL(netdev_increment_features);
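
/* Editorial illustration: a minimal sketch of how an aggregating master
 * (bridge/bonding style) folds its lower devices' feature sets into its own,
 * loosely mirroring the per-port loop the bridge uses.  The example_slave
 * structure and helper name are hypothetical.
 */
struct example_slave {
	struct list_head list;
	struct net_device *dev;
};

static netdev_features_t __maybe_unused
example_master_features(struct list_head *slaves, netdev_features_t features)
{
	struct example_slave *s;
	netdev_features_t mask = features;

	features &= ~NETIF_F_ONE_FOR_ALL;
	list_for_each_entry(s, slaves, list)
		features = netdev_increment_features(features,
						     s->dev->features, mask);
	return features;
}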
7017
7018 static struct hlist_head * __net_init netdev_create_hash(void)
7019 {
7020         int i;
7021         struct hlist_head *hash;
7022
7023         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7024         if (hash != NULL)
7025                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7026                         INIT_HLIST_HEAD(&hash[i]);
7027
7028         return hash;
7029 }
7030
7031 /* Initialize per network namespace state */
7032 static int __net_init netdev_init(struct net *net)
7033 {
7034         if (net != &init_net)
7035                 INIT_LIST_HEAD(&net->dev_base_head);
7036
7037         net->dev_name_head = netdev_create_hash();
7038         if (net->dev_name_head == NULL)
7039                 goto err_name;
7040
7041         net->dev_index_head = netdev_create_hash();
7042         if (net->dev_index_head == NULL)
7043                 goto err_idx;
7044
7045         return 0;
7046
7047 err_idx:
7048         kfree(net->dev_name_head);
7049 err_name:
7050         return -ENOMEM;
7051 }
7052
7053 /**
7054  *      netdev_drivername - network driver for the device
7055  *      @dev: network device
7056  *
7057  *      Determine network driver for device.
7058  */
7059 const char *netdev_drivername(const struct net_device *dev)
7060 {
7061         const struct device_driver *driver;
7062         const struct device *parent;
7063         const char *empty = "";
7064
7065         parent = dev->dev.parent;
7066         if (!parent)
7067                 return empty;
7068
7069         driver = parent->driver;
7070         if (driver && driver->name)
7071                 return driver->name;
7072         return empty;
7073 }
7074
7075 static int __netdev_printk(const char *level, const struct net_device *dev,
7076                            struct va_format *vaf)
7077 {
7078         int r;
7079
7080         if (dev && dev->dev.parent) {
7081                 r = dev_printk_emit(level[1] - '0',
7082                                     dev->dev.parent,
7083                                     "%s %s %s%s: %pV",
7084                                     dev_driver_string(dev->dev.parent),
7085                                     dev_name(dev->dev.parent),
7086                                     netdev_name(dev), netdev_reg_state(dev),
7087                                     vaf);
7088         } else if (dev) {
7089                 r = printk("%s%s%s: %pV", level, netdev_name(dev),
7090                            netdev_reg_state(dev), vaf);
7091         } else {
7092                 r = printk("%s(NULL net_device): %pV", level, vaf);
7093         }
7094
7095         return r;
7096 }
7097
7098 int netdev_printk(const char *level, const struct net_device *dev,
7099                   const char *format, ...)
7100 {
7101         struct va_format vaf;
7102         va_list args;
7103         int r;
7104
7105         va_start(args, format);
7106
7107         vaf.fmt = format;
7108         vaf.va = &args;
7109
7110         r = __netdev_printk(level, dev, &vaf);
7111
7112         va_end(args);
7113
7114         return r;
7115 }
7116 EXPORT_SYMBOL(netdev_printk);
7117
7118 #define define_netdev_printk_level(func, level)                 \
7119 int func(const struct net_device *dev, const char *fmt, ...)    \
7120 {                                                               \
7121         int r;                                                  \
7122         struct va_format vaf;                                   \
7123         va_list args;                                           \
7124                                                                 \
7125         va_start(args, fmt);                                    \
7126                                                                 \
7127         vaf.fmt = fmt;                                          \
7128         vaf.va = &args;                                         \
7129                                                                 \
7130         r = __netdev_printk(level, dev, &vaf);                  \
7131                                                                 \
7132         va_end(args);                                           \
7133                                                                 \
7134         return r;                                               \
7135 }                                                               \
7136 EXPORT_SYMBOL(func);
7137
7138 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7139 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7140 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7141 define_netdev_printk_level(netdev_err, KERN_ERR);
7142 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7143 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7144 define_netdev_printk_level(netdev_info, KERN_INFO);
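
/* Editorial illustration: a minimal sketch of a driver using the wrappers
 * generated above.  Messages are automatically prefixed with the driver, bus
 * and interface name plus registration state (when a parent device is
 * present); the function and parameter names are hypothetical.
 */
static void __maybe_unused example_report_link(struct net_device *dev, u32 speed)
{
	if (netif_carrier_ok(dev))
		netdev_info(dev, "link up, %u Mb/s\n", speed);
	else
		netdev_warn(dev, "link down\n");
}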
7145
7146 static void __net_exit netdev_exit(struct net *net)
7147 {
7148         kfree(net->dev_name_head);
7149         kfree(net->dev_index_head);
7150 }
7151
7152 static struct pernet_operations __net_initdata netdev_net_ops = {
7153         .init = netdev_init,
7154         .exit = netdev_exit,
7155 };
7156
7157 static void __net_exit default_device_exit(struct net *net)
7158 {
7159         struct net_device *dev, *aux;
7160         /*
7161          * Push all migratable network devices back to the
7162          * initial network namespace
7163          */
7164         rtnl_lock();
7165         for_each_netdev_safe(net, dev, aux) {
7166                 int err;
7167                 char fb_name[IFNAMSIZ];
7168
7169                 /* Ignore unmoveable devices (i.e. loopback) */
7170                 if (dev->features & NETIF_F_NETNS_LOCAL)
7171                         continue;
7172
7173                 /* Leave virtual devices for the generic cleanup */
7174                 if (dev->rtnl_link_ops)
7175                         continue;
7176
7177                 /* Push remaining network devices to init_net */
7178                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7179                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7180                 if (err) {
7181                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7182                                  __func__, dev->name, err);
7183                         BUG();
7184                 }
7185         }
7186         rtnl_unlock();
7187 }
7188
7189 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7190 {
7191         /* Return with the rtnl_lock held when there are no network
7192          * devices unregistering in any network namespace in net_list.
7193          */
7194         struct net *net;
7195         bool unregistering;
7196         DEFINE_WAIT(wait);
7197
7198         for (;;) {
7199                 prepare_to_wait(&netdev_unregistering_wq, &wait,
7200                                 TASK_UNINTERRUPTIBLE);
7201                 unregistering = false;
7202                 rtnl_lock();
7203                 list_for_each_entry(net, net_list, exit_list) {
7204                         if (net->dev_unreg_count > 0) {
7205                                 unregistering = true;
7206                                 break;
7207                         }
7208                 }
7209                 if (!unregistering)
7210                         break;
7211                 __rtnl_unlock();
7212                 schedule();
7213         }
7214         finish_wait(&netdev_unregistering_wq, &wait);
7215 }
7216
7217 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7218 {
7219         /* At exit all network devices must be removed from a network
7220          * namespace.  Do this in the reverse order of registration.
7221          * Do this across as many network namespaces as possible to
7222          * improve batching efficiency.
7223          */
7224         struct net_device *dev;
7225         struct net *net;
7226         LIST_HEAD(dev_kill_list);
7227
7228         /* To prevent network device cleanup code from dereferencing
7229          * loopback devices or network devices that have been freed,
7230          * wait here for all pending unregistrations to complete
7231          * before unregistering the loopback device and allowing the
7232          * network namespace to be freed.
7233          *
7234          * The netdev todo list containing all network device
7235          * unregistrations that happen in default_device_exit_batch
7236          * will run in the rtnl_unlock() at the end of
7237          * default_device_exit_batch.
7238          */
7239         rtnl_lock_unregistering(net_list);
7240         list_for_each_entry(net, net_list, exit_list) {
7241                 for_each_netdev_reverse(net, dev) {
7242                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7243                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7244                         else
7245                                 unregister_netdevice_queue(dev, &dev_kill_list);
7246                 }
7247         }
7248         unregister_netdevice_many(&dev_kill_list);
7249         rtnl_unlock();
7250 }
7251
7252 static struct pernet_operations __net_initdata default_device_ops = {
7253         .exit = default_device_exit,
7254         .exit_batch = default_device_exit_batch,
7255 };
7256
7257 /*
7258  *      Initialize the DEV module. At boot time this walks the device list and
7259  *      unhooks any devices that fail to initialise (normally hardware not
7260  *      present) and leaves us with a valid list of present and active devices.
7261  *
7262  */
7263
7264 /*
7265  *       This is called single threaded during boot, so no need
7266  *       to take the rtnl semaphore.
7267  */
7268 static int __init net_dev_init(void)
7269 {
7270         int i, rc = -ENOMEM;
7271
7272         BUG_ON(!dev_boot_phase);
7273
7274         if (dev_proc_init())
7275                 goto out;
7276
7277         if (netdev_kobject_init())
7278                 goto out;
7279
7280         INIT_LIST_HEAD(&ptype_all);
7281         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7282                 INIT_LIST_HEAD(&ptype_base[i]);
7283
7284         INIT_LIST_HEAD(&offload_base);
7285
7286         if (register_pernet_subsys(&netdev_net_ops))
7287                 goto out;
7288
7289         /*
7290          *      Initialise the packet receive queues.
7291          */
7292
7293         for_each_possible_cpu(i) {
7294                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7295
7296                 skb_queue_head_init(&sd->input_pkt_queue);
7297                 skb_queue_head_init(&sd->process_queue);
7298                 INIT_LIST_HEAD(&sd->poll_list);
7299                 sd->output_queue_tailp = &sd->output_queue;
7300 #ifdef CONFIG_RPS
7301                 sd->csd.func = rps_trigger_softirq;
7302                 sd->csd.info = sd;
7303                 sd->cpu = i;
7304 #endif
7305
7306                 sd->backlog.poll = process_backlog;
7307                 sd->backlog.weight = weight_p;
7308         }
7309
7310         dev_boot_phase = 0;
7311
7312         /* The loopback device is special: if any other network device
7313          * is present in a network namespace, the loopback device must
7314          * be present. Since we now dynamically allocate and free the
7315          * loopback device, ensure this invariant is maintained by
7316          * keeping the loopback device as the first device on the
7317          * list of network devices, ensuring the loopback device
7318          * is the first device that appears and the last network device
7319          * that disappears.
7320          */
7321         if (register_pernet_device(&loopback_net_ops))
7322                 goto out;
7323
7324         if (register_pernet_device(&default_device_ops))
7325                 goto out;
7326
7327         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7328         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7329
7330         hotcpu_notifier(dev_cpu_callback, 0);
7331         dst_init();
7332         rc = 0;
7333 out:
7334         return rc;
7335 }
7336
7337 subsys_initcall(net_dev_init);