net_sched: transform qdisc running bit into a seqcount
net/core/dev.c
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <net/busy_poll.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/stat.h>
102 #include <net/dst.h>
103 #include <net/dst_metadata.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/iw_handler.h>
114 #include <asm/current.h>
115 #include <linux/audit.h>
116 #include <linux/dmaengine.h>
117 #include <linux/err.h>
118 #include <linux/ctype.h>
119 #include <linux/if_arp.h>
120 #include <linux/if_vlan.h>
121 #include <linux/ip.h>
122 #include <net/ip.h>
123 #include <net/mpls.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
135 #include <linux/hashtable.h>
136 #include <linux/vmalloc.h>
137 #include <linux/if_macvlan.h>
138 #include <linux/errqueue.h>
139 #include <linux/hrtimer.h>
140 #include <linux/netfilter_ingress.h>
141 #include <linux/sctp.h>
142
143 #include "net-sysfs.h"
144
145 /* Instead of increasing this, you should create a hash table. */
146 #define MAX_GRO_SKBS 8
147
148 /* This should be increased if a protocol with a bigger head is added. */
149 #define GRO_MAX_HEAD (MAX_HEADER + 128)
150
151 static DEFINE_SPINLOCK(ptype_lock);
152 static DEFINE_SPINLOCK(offload_lock);
153 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
154 struct list_head ptype_all __read_mostly;       /* Taps */
155 static struct list_head offload_base __read_mostly;
156
157 static int netif_rx_internal(struct sk_buff *skb);
158 static int call_netdevice_notifiers_info(unsigned long val,
159                                          struct net_device *dev,
160                                          struct netdev_notifier_info *info);
161
162 /*
163  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
164  * semaphore.
165  *
166  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
167  *
168  * Writers must hold the rtnl semaphore while they loop through the
169  * dev_base_head list, and hold dev_base_lock for writing when they do the
170  * actual updates.  This allows pure readers to access the list even
171  * while a writer is preparing to update it.
172  *
173  * To put it another way, dev_base_lock is held for writing only to
174  * protect against pure readers; the rtnl semaphore provides the
175  * protection against other writers.
176  *
177  * See, for example usages, register_netdevice() and
178  * unregister_netdevice(), which must be called with the rtnl
179  * semaphore held.
180  */
181 DEFINE_RWLOCK(dev_base_lock);
182 EXPORT_SYMBOL(dev_base_lock);
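/* Illustrative sketch, not part of the original file: the two "pure reader"
 * patterns described in the comment above. Readers may either take
 * dev_base_lock for reading or rely on rcu_read_lock(); the helpers below
 * are hypothetical and only show the locking shape.
 */
#if 0
static int example_count_devices_rcu(struct net *net)
{
        struct net_device *dev;
        int count = 0;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                count++;
        rcu_read_unlock();

        return count;
}

static int example_count_devices_locked(struct net *net)
{
        struct net_device *dev;
        int count = 0;

        read_lock(&dev_base_lock);
        for_each_netdev(net, dev)
                count++;
        read_unlock(&dev_base_lock);

        return count;
}
#endif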
183
184 /* protects napi_hash addition/deletion and napi_gen_id */
185 static DEFINE_SPINLOCK(napi_hash_lock);
186
187 static unsigned int napi_gen_id = NR_CPUS;
188 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
189
190 static seqcount_t devnet_rename_seq;
191
192 static inline void dev_base_seq_inc(struct net *net)
193 {
194         while (++net->dev_base_seq == 0);
195 }
196
197 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
198 {
199         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
200
201         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
202 }
203
204 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
205 {
206         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
207 }
208
209 static inline void rps_lock(struct softnet_data *sd)
210 {
211 #ifdef CONFIG_RPS
212         spin_lock(&sd->input_pkt_queue.lock);
213 #endif
214 }
215
216 static inline void rps_unlock(struct softnet_data *sd)
217 {
218 #ifdef CONFIG_RPS
219         spin_unlock(&sd->input_pkt_queue.lock);
220 #endif
221 }
222
223 /* Device list insertion */
224 static void list_netdevice(struct net_device *dev)
225 {
226         struct net *net = dev_net(dev);
227
228         ASSERT_RTNL();
229
230         write_lock_bh(&dev_base_lock);
231         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
232         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
233         hlist_add_head_rcu(&dev->index_hlist,
234                            dev_index_hash(net, dev->ifindex));
235         write_unlock_bh(&dev_base_lock);
236
237         dev_base_seq_inc(net);
238 }
239
240 /* Device list removal
241  * caller must respect a RCU grace period before freeing/reusing dev
242  */
243 static void unlist_netdevice(struct net_device *dev)
244 {
245         ASSERT_RTNL();
246
247         /* Unlink dev from the device chain */
248         write_lock_bh(&dev_base_lock);
249         list_del_rcu(&dev->dev_list);
250         hlist_del_rcu(&dev->name_hlist);
251         hlist_del_rcu(&dev->index_hlist);
252         write_unlock_bh(&dev_base_lock);
253
254         dev_base_seq_inc(dev_net(dev));
255 }
256
257 /*
258  *      Our notifier list
259  */
260
261 static RAW_NOTIFIER_HEAD(netdev_chain);
262
263 /*
264  *      Device drivers call our routines to queue packets here. We empty the
265  *      queue in the local softnet handler.
266  */
267
268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
269 EXPORT_PER_CPU_SYMBOL(softnet_data);
270
271 #ifdef CONFIG_LOCKDEP
272 /*
273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
274  * according to dev->type
275  */
276 static const unsigned short netdev_lock_type[] =
277         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
278          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
279          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
280          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
281          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
282          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
283          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
284          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
285          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
286          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
287          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
288          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
289          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
290          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
291          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
292
293 static const char *const netdev_lock_name[] =
294         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
295          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
296          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
297          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
298          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
299          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
300          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
301          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
302          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
303          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
304          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
305          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
306          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
307          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
308          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
309
310 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
311 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
312
313 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
314 {
315         int i;
316
317         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
318                 if (netdev_lock_type[i] == dev_type)
319                         return i;
320         /* the last key is used by default */
321         return ARRAY_SIZE(netdev_lock_type) - 1;
322 }
323
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325                                                  unsigned short dev_type)
326 {
327         int i;
328
329         i = netdev_lock_pos(dev_type);
330         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
331                                    netdev_lock_name[i]);
332 }
333
334 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
335 {
336         int i;
337
338         i = netdev_lock_pos(dev->type);
339         lockdep_set_class_and_name(&dev->addr_list_lock,
340                                    &netdev_addr_lock_key[i],
341                                    netdev_lock_name[i]);
342 }
343 #else
344 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
345                                                  unsigned short dev_type)
346 {
347 }
348 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349 {
350 }
351 #endif
352
353 /*******************************************************************************
354
355                 Protocol management and registration routines
356
357 *******************************************************************************/
358
359 /*
360  *      Add a protocol ID to the list. Now that the input handler is
361  *      smarter we can dispense with all the messy stuff that used to be
362  *      here.
363  *
364  *      BEWARE!!! Protocol handlers, mangling input packets,
365  *      MUST BE last in hash buckets and checking protocol handlers
366  *      MUST start from promiscuous ptype_all chain in net_bh.
367  *      It is true now, do not change it.
368  *      Explanation follows: if a protocol handler that mangles packets
369  *      were first on the list, it could not sense that the packet is
370  *      cloned and should be copied-on-write, so it would change it and
371  *      subsequent readers would get a broken packet.
372  *                                                      --ANK (980803)
373  */
374
375 static inline struct list_head *ptype_head(const struct packet_type *pt)
376 {
377         if (pt->type == htons(ETH_P_ALL))
378                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
379         else
380                 return pt->dev ? &pt->dev->ptype_specific :
381                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
382 }
383
384 /**
385  *      dev_add_pack - add packet handler
386  *      @pt: packet type declaration
387  *
388  *      Add a protocol handler to the networking stack. The passed &packet_type
389  *      is linked into kernel lists and may not be freed until it has been
390  *      removed from the kernel lists.
391  *
392  *      This call does not sleep, therefore it cannot guarantee that
393  *      all CPUs that are in the middle of receiving packets
394  *      will see the new packet type (until the next received packet).
395  */
396
397 void dev_add_pack(struct packet_type *pt)
398 {
399         struct list_head *head = ptype_head(pt);
400
401         spin_lock(&ptype_lock);
402         list_add_rcu(&pt->list, head);
403         spin_unlock(&ptype_lock);
404 }
405 EXPORT_SYMBOL(dev_add_pack);
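/* Illustrative sketch, not part of the original file: how a protocol module
 * typically uses dev_add_pack()/dev_remove_pack(). The EtherType 0x88b5
 * (IEEE local experimental) and the handler below are hypothetical.
 */
#if 0
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        /* Inspect the packet here; always consume or free the skb. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_pt __read_mostly = {
        .type = cpu_to_be16(0x88b5),
        .func = example_rcv,
        /* .dev left NULL means "all devices"; see ptype_head() above. */
};

/* In module init/exit:
 *      dev_add_pack(&example_pt);
 *      dev_remove_pack(&example_pt);
 */
#endif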
406
407 /**
408  *      __dev_remove_pack        - remove packet handler
409  *      @pt: packet type declaration
410  *
411  *      Remove a protocol handler that was previously added to the kernel
412  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
413  *      from the kernel lists and can be freed or reused once this function
414  *      returns.
415  *
416  *      The packet type might still be in use by receivers
417  *      and must not be freed until after all the CPUs have gone
418  *      through a quiescent state.
419  */
420 void __dev_remove_pack(struct packet_type *pt)
421 {
422         struct list_head *head = ptype_head(pt);
423         struct packet_type *pt1;
424
425         spin_lock(&ptype_lock);
426
427         list_for_each_entry(pt1, head, list) {
428                 if (pt == pt1) {
429                         list_del_rcu(&pt->list);
430                         goto out;
431                 }
432         }
433
434         pr_warn("dev_remove_pack: %p not found\n", pt);
435 out:
436         spin_unlock(&ptype_lock);
437 }
438 EXPORT_SYMBOL(__dev_remove_pack);
439
440 /**
441  *      dev_remove_pack  - remove packet handler
442  *      @pt: packet type declaration
443  *
444  *      Remove a protocol handler that was previously added to the kernel
445  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
446  *      from the kernel lists and can be freed or reused once this function
447  *      returns.
448  *
449  *      This call sleeps to guarantee that no CPU is looking at the packet
450  *      type after return.
451  */
452 void dev_remove_pack(struct packet_type *pt)
453 {
454         __dev_remove_pack(pt);
455
456         synchronize_net();
457 }
458 EXPORT_SYMBOL(dev_remove_pack);
459
460
461 /**
462  *      dev_add_offload - register offload handlers
463  *      @po: protocol offload declaration
464  *
465  *      Add protocol offload handlers to the networking stack. The passed
466  *      &proto_offload is linked into kernel lists and may not be freed until
467  *      it has been removed from the kernel lists.
468  *
469  *      This call does not sleep, therefore it cannot guarantee that
470  *      all CPUs that are in the middle of receiving packets
471  *      will see the new offload handlers (until the next received packet).
472  */
473 void dev_add_offload(struct packet_offload *po)
474 {
475         struct packet_offload *elem;
476
477         spin_lock(&offload_lock);
478         list_for_each_entry(elem, &offload_base, list) {
479                 if (po->priority < elem->priority)
480                         break;
481         }
482         list_add_rcu(&po->list, elem->list.prev);
483         spin_unlock(&offload_lock);
484 }
485 EXPORT_SYMBOL(dev_add_offload);
486
487 /**
488  *      __dev_remove_offload     - remove offload handler
489  *      @po: packet offload declaration
490  *
491  *      Remove a protocol offload handler that was previously added to the
492  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
493  *      is removed from the kernel lists and can be freed or reused once this
494  *      function returns.
495  *
496  *      The packet type might still be in use by receivers
497  *      and must not be freed until after all the CPUs have gone
498  *      through a quiescent state.
499  */
500 static void __dev_remove_offload(struct packet_offload *po)
501 {
502         struct list_head *head = &offload_base;
503         struct packet_offload *po1;
504
505         spin_lock(&offload_lock);
506
507         list_for_each_entry(po1, head, list) {
508                 if (po == po1) {
509                         list_del_rcu(&po->list);
510                         goto out;
511                 }
512         }
513
514         pr_warn("dev_remove_offload: %p not found\n", po);
515 out:
516         spin_unlock(&offload_lock);
517 }
518
519 /**
520  *      dev_remove_offload       - remove packet offload handler
521  *      @po: packet offload declaration
522  *
523  *      Remove a packet offload handler that was previously added to the kernel
524  *      offload handlers by dev_add_offload(). The passed &offload_type is
525  *      removed from the kernel lists and can be freed or reused once this
526  *      function returns.
527  *
528  *      This call sleeps to guarantee that no CPU is looking at the packet
529  *      type after return.
530  */
531 void dev_remove_offload(struct packet_offload *po)
532 {
533         __dev_remove_offload(po);
534
535         synchronize_net();
536 }
537 EXPORT_SYMBOL(dev_remove_offload);
538
539 /******************************************************************************
540
541                       Device Boot-time Settings Routines
542
543 *******************************************************************************/
544
545 /* Boot time configuration table */
546 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
547
548 /**
549  *      netdev_boot_setup_add   - add new setup entry
550  *      @name: name of the device
551  *      @map: configured settings for the device
552  *
553  *      Adds new setup entry to the dev_boot_setup list.  The function
554  *      returns 0 on error and 1 on success.  This is a generic routine for
555  *      all netdevices.
556  */
557 static int netdev_boot_setup_add(char *name, struct ifmap *map)
558 {
559         struct netdev_boot_setup *s;
560         int i;
561
562         s = dev_boot_setup;
563         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
564                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
565                         memset(s[i].name, 0, sizeof(s[i].name));
566                         strlcpy(s[i].name, name, IFNAMSIZ);
567                         memcpy(&s[i].map, map, sizeof(s[i].map));
568                         break;
569                 }
570         }
571
572         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
573 }
574
575 /**
576  *      netdev_boot_setup_check - check boot time settings
577  *      @dev: the netdevice
578  *
579  *      Check boot time settings for the device.
580  *      Any settings found are applied to the device for use
581  *      later during device probing.
582  *      Returns 0 if no settings are found, 1 if they are.
583  */
584 int netdev_boot_setup_check(struct net_device *dev)
585 {
586         struct netdev_boot_setup *s = dev_boot_setup;
587         int i;
588
589         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
590                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
591                     !strcmp(dev->name, s[i].name)) {
592                         dev->irq        = s[i].map.irq;
593                         dev->base_addr  = s[i].map.base_addr;
594                         dev->mem_start  = s[i].map.mem_start;
595                         dev->mem_end    = s[i].map.mem_end;
596                         return 1;
597                 }
598         }
599         return 0;
600 }
601 EXPORT_SYMBOL(netdev_boot_setup_check);
602
603
604 /**
605  *      netdev_boot_base        - get address from boot time settings
606  *      @prefix: prefix for network device
607  *      @unit: id for network device
608  *
609  *      Check boot time settings for the base address of the device.
610  *      Any settings found are applied to the device for use
611  *      later during device probing.
612  *      Returns 0 if no settings are found.
613  */
614 unsigned long netdev_boot_base(const char *prefix, int unit)
615 {
616         const struct netdev_boot_setup *s = dev_boot_setup;
617         char name[IFNAMSIZ];
618         int i;
619
620         sprintf(name, "%s%d", prefix, unit);
621
622         /*
623          * If device already registered then return base of 1
624          * to indicate not to probe for this interface
625          */
626         if (__dev_get_by_name(&init_net, name))
627                 return 1;
628
629         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
630                 if (!strcmp(name, s[i].name))
631                         return s[i].map.base_addr;
632         return 0;
633 }
634
635 /*
636  * Saves the settings configured at boot time for any netdevice.
637  */
638 int __init netdev_boot_setup(char *str)
639 {
640         int ints[5];
641         struct ifmap map;
642
643         str = get_options(str, ARRAY_SIZE(ints), ints);
644         if (!str || !*str)
645                 return 0;
646
647         /* Save settings */
648         memset(&map, 0, sizeof(map));
649         if (ints[0] > 0)
650                 map.irq = ints[1];
651         if (ints[0] > 1)
652                 map.base_addr = ints[2];
653         if (ints[0] > 2)
654                 map.mem_start = ints[3];
655         if (ints[0] > 3)
656                 map.mem_end = ints[4];
657
658         /* Add new entry to the list */
659         return netdev_boot_setup_add(str, &map);
660 }
661
662 __setup("netdev=", netdev_boot_setup);
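/* Illustrative example, not part of the original file: with the parser
 * above, a kernel command line containing e.g.
 *
 *      netdev=9,0x300,0,0,eth0
 *
 * records irq 9 and base address 0x300 (mem_start/mem_end 0) to be applied
 * to "eth0" by netdev_boot_setup_check() during probing.
 */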
663
664 /*******************************************************************************
665
666                             Device Interface Subroutines
667
668 *******************************************************************************/
669
670 /**
671  *      dev_get_iflink  - get 'iflink' value of an interface
672  *      @dev: targeted interface
673  *
674  *      Indicates the ifindex the interface is linked to.
675  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
676  */
677
678 int dev_get_iflink(const struct net_device *dev)
679 {
680         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
681                 return dev->netdev_ops->ndo_get_iflink(dev);
682
683         return dev->ifindex;
684 }
685 EXPORT_SYMBOL(dev_get_iflink);
686
687 /**
688  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
689  *      @dev: targeted interface
690  *      @skb: The packet.
691  *
692  *      For better visibility of tunnel traffic, OVS needs to retrieve
693  *      egress tunnel information for a packet. The following API allows
694  *      the user to get this info.
695  */
696 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
697 {
698         struct ip_tunnel_info *info;
699
700         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
701                 return -EINVAL;
702
703         info = skb_tunnel_info_unclone(skb);
704         if (!info)
705                 return -ENOMEM;
706         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
707                 return -EINVAL;
708
709         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
710 }
711 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
712
713 /**
714  *      __dev_get_by_name       - find a device by its name
715  *      @net: the applicable net namespace
716  *      @name: name to find
717  *
718  *      Find an interface by name. Must be called under RTNL semaphore
719  *      or @dev_base_lock. If the name is found a pointer to the device
720  *      is returned. If the name is not found then %NULL is returned. The
721  *      reference counters are not incremented so the caller must be
722  *      careful with locks.
723  */
724
725 struct net_device *__dev_get_by_name(struct net *net, const char *name)
726 {
727         struct net_device *dev;
728         struct hlist_head *head = dev_name_hash(net, name);
729
730         hlist_for_each_entry(dev, head, name_hlist)
731                 if (!strncmp(dev->name, name, IFNAMSIZ))
732                         return dev;
733
734         return NULL;
735 }
736 EXPORT_SYMBOL(__dev_get_by_name);
737
738 /**
739  *      dev_get_by_name_rcu     - find a device by its name
740  *      @net: the applicable net namespace
741  *      @name: name to find
742  *
743  *      Find an interface by name.
744  *      If the name is found a pointer to the device is returned.
745  *      If the name is not found then %NULL is returned.
746  *      The reference counters are not incremented so the caller must be
747  *      careful with locks. The caller must hold RCU lock.
748  */
749
750 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
751 {
752         struct net_device *dev;
753         struct hlist_head *head = dev_name_hash(net, name);
754
755         hlist_for_each_entry_rcu(dev, head, name_hlist)
756                 if (!strncmp(dev->name, name, IFNAMSIZ))
757                         return dev;
758
759         return NULL;
760 }
761 EXPORT_SYMBOL(dev_get_by_name_rcu);
762
763 /**
764  *      dev_get_by_name         - find a device by its name
765  *      @net: the applicable net namespace
766  *      @name: name to find
767  *
768  *      Find an interface by name. This can be called from any
769  *      context and does its own locking. The returned handle has
770  *      the usage count incremented and the caller must use dev_put() to
771  *      release it when it is no longer needed. %NULL is returned if no
772  *      matching device is found.
773  */
774
775 struct net_device *dev_get_by_name(struct net *net, const char *name)
776 {
777         struct net_device *dev;
778
779         rcu_read_lock();
780         dev = dev_get_by_name_rcu(net, name);
781         if (dev)
782                 dev_hold(dev);
783         rcu_read_unlock();
784         return dev;
785 }
786 EXPORT_SYMBOL(dev_get_by_name);
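/* Illustrative sketch, not part of the original file: the two lookup styles
 * and their reference rules. The device name used here is hypothetical.
 */
#if 0
static void example_lookup(struct net *net)
{
        struct net_device *dev;

        /* Refcounted lookup: usable from any context, must dev_put(). */
        dev = dev_get_by_name(net, "eth0");
        if (dev) {
                /* ... use dev ... */
                dev_put(dev);
        }

        /* RCU lookup: no reference taken, dev is only valid inside the
         * read-side critical section unless dev_hold() is called.
         */
        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, "eth0");
        if (dev) {
                /* ... use dev ... */
        }
        rcu_read_unlock();
}
#endif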
787
788 /**
789  *      __dev_get_by_index - find a device by its ifindex
790  *      @net: the applicable net namespace
791  *      @ifindex: index of device
792  *
793  *      Search for an interface by index. Returns a pointer to the device,
794  *      or %NULL if it is not found. The device has not
795  *      had its reference counter increased so the caller must be careful
796  *      about locking. The caller must hold either the RTNL semaphore
797  *      or @dev_base_lock.
798  */
799
800 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
801 {
802         struct net_device *dev;
803         struct hlist_head *head = dev_index_hash(net, ifindex);
804
805         hlist_for_each_entry(dev, head, index_hlist)
806                 if (dev->ifindex == ifindex)
807                         return dev;
808
809         return NULL;
810 }
811 EXPORT_SYMBOL(__dev_get_by_index);
812
813 /**
814  *      dev_get_by_index_rcu - find a device by its ifindex
815  *      @net: the applicable net namespace
816  *      @ifindex: index of device
817  *
818  *      Search for an interface by index. Returns a pointer to the device,
819  *      or %NULL if it is not found. The device has not
820  *      had its reference counter increased so the caller must be careful
821  *      about locking. The caller must hold RCU lock.
822  */
823
824 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
825 {
826         struct net_device *dev;
827         struct hlist_head *head = dev_index_hash(net, ifindex);
828
829         hlist_for_each_entry_rcu(dev, head, index_hlist)
830                 if (dev->ifindex == ifindex)
831                         return dev;
832
833         return NULL;
834 }
835 EXPORT_SYMBOL(dev_get_by_index_rcu);
836
837
838 /**
839  *      dev_get_by_index - find a device by its ifindex
840  *      @net: the applicable net namespace
841  *      @ifindex: index of device
842  *
843  *      Search for an interface by index. Returns a pointer to the device,
844  *      or NULL if it is not found. The device returned has
845  *      had a reference added and the pointer is safe until the user calls
846  *      dev_put to indicate they have finished with it.
847  */
848
849 struct net_device *dev_get_by_index(struct net *net, int ifindex)
850 {
851         struct net_device *dev;
852
853         rcu_read_lock();
854         dev = dev_get_by_index_rcu(net, ifindex);
855         if (dev)
856                 dev_hold(dev);
857         rcu_read_unlock();
858         return dev;
859 }
860 EXPORT_SYMBOL(dev_get_by_index);
861
862 /**
863  *      netdev_get_name - get a netdevice name, knowing its ifindex.
864  *      @net: network namespace
865  *      @name: a pointer to the buffer where the name will be stored.
866  *      @ifindex: the ifindex of the interface to get the name from.
867  *
868  *      The use of raw_seqcount_begin() and cond_resched() before
869  *      retrying is required as we want to give the writers a chance
870  *      to complete when CONFIG_PREEMPT is not set.
871  */
872 int netdev_get_name(struct net *net, char *name, int ifindex)
873 {
874         struct net_device *dev;
875         unsigned int seq;
876
877 retry:
878         seq = raw_seqcount_begin(&devnet_rename_seq);
879         rcu_read_lock();
880         dev = dev_get_by_index_rcu(net, ifindex);
881         if (!dev) {
882                 rcu_read_unlock();
883                 return -ENODEV;
884         }
885
886         strcpy(name, dev->name);
887         rcu_read_unlock();
888         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
889                 cond_resched();
890                 goto retry;
891         }
892
893         return 0;
894 }
895
896 /**
897  *      dev_getbyhwaddr_rcu - find a device by its hardware address
898  *      @net: the applicable net namespace
899  *      @type: media type of device
900  *      @ha: hardware address
901  *
902  *      Search for an interface by MAC address. Returns a pointer to the
903  *      device, or NULL if it is not found.
904  *      The caller must hold RCU or RTNL.
905  *      The returned device has not had its ref count increased
906  *      and the caller must therefore be careful about locking
907  *
908  */
909
910 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
911                                        const char *ha)
912 {
913         struct net_device *dev;
914
915         for_each_netdev_rcu(net, dev)
916                 if (dev->type == type &&
917                     !memcmp(dev->dev_addr, ha, dev->addr_len))
918                         return dev;
919
920         return NULL;
921 }
922 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
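/* Illustrative sketch, not part of the original file: the caller supplies
 * the RCU protection and must take its own reference if the device is to
 * be used outside the read-side section. The helper name is hypothetical.
 */
#if 0
static struct net_device *example_find_by_mac(struct net *net,
                                              const char *mac)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();

        return dev;     /* caller must dev_put() when done */
}
#endif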
923
924 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
925 {
926         struct net_device *dev;
927
928         ASSERT_RTNL();
929         for_each_netdev(net, dev)
930                 if (dev->type == type)
931                         return dev;
932
933         return NULL;
934 }
935 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
936
937 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
938 {
939         struct net_device *dev, *ret = NULL;
940
941         rcu_read_lock();
942         for_each_netdev_rcu(net, dev)
943                 if (dev->type == type) {
944                         dev_hold(dev);
945                         ret = dev;
946                         break;
947                 }
948         rcu_read_unlock();
949         return ret;
950 }
951 EXPORT_SYMBOL(dev_getfirstbyhwtype);
952
953 /**
954  *      __dev_get_by_flags - find any device with given flags
955  *      @net: the applicable net namespace
956  *      @if_flags: IFF_* values
957  *      @mask: bitmask of bits in if_flags to check
958  *
959  *      Search for any interface with the given flags. Returns a pointer to
960  *      the device, or NULL if none is found. Must be called inside
961  *      rtnl_lock(), and result refcount is unchanged.
962  */
963
964 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
965                                       unsigned short mask)
966 {
967         struct net_device *dev, *ret;
968
969         ASSERT_RTNL();
970
971         ret = NULL;
972         for_each_netdev(net, dev) {
973                 if (((dev->flags ^ if_flags) & mask) == 0) {
974                         ret = dev;
975                         break;
976                 }
977         }
978         return ret;
979 }
980 EXPORT_SYMBOL(__dev_get_by_flags);
981
982 /**
983  *      dev_valid_name - check if name is okay for network device
984  *      @name: name string
985  *
986  *      Network device names need to be valid file names
987  *      to allow sysfs to work.  We also disallow any kind of
988  *      whitespace.
989  */
990 bool dev_valid_name(const char *name)
991 {
992         if (*name == '\0')
993                 return false;
994         if (strlen(name) >= IFNAMSIZ)
995                 return false;
996         if (!strcmp(name, ".") || !strcmp(name, ".."))
997                 return false;
998
999         while (*name) {
1000                 if (*name == '/' || *name == ':' || isspace(*name))
1001                         return false;
1002                 name++;
1003         }
1004         return true;
1005 }
1006 EXPORT_SYMBOL(dev_valid_name);
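/* Illustrative examples, not part of the original file, of what the checks
 * above accept and reject:
 *
 *      dev_valid_name("eth0")    -> true
 *      dev_valid_name("")        -> false (empty)
 *      dev_valid_name(".")       -> false (reserved)
 *      dev_valid_name("a/b")     -> false ('/' not allowed)
 *      dev_valid_name("my dev")  -> false (whitespace)
 *
 * Names of IFNAMSIZ or more characters are rejected as too long.
 */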
1007
1008 /**
1009  *      __dev_alloc_name - allocate a name for a device
1010  *      @net: network namespace to allocate the device name in
1011  *      @name: name format string
1012  *      @buf:  scratch buffer and result name string
1013  *
1014  *      Passed a format string - eg "lt%d" it will try and find a suitable
1015  *      id. It scans list of devices to build up a free map, then chooses
1016  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1017  *      while allocating the name and adding the device in order to avoid
1018  *      duplicates.
1019  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1020  *      Returns the number of the unit assigned or a negative errno code.
1021  */
1022
1023 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1024 {
1025         int i = 0;
1026         const char *p;
1027         const int max_netdevices = 8*PAGE_SIZE;
1028         unsigned long *inuse;
1029         struct net_device *d;
1030
1031         p = strnchr(name, IFNAMSIZ-1, '%');
1032         if (p) {
1033                 /*
1034                  * Verify the string as this thing may have come from
1035                  * the user.  There must be exactly one "%d" and no other "%"
1036                  * characters.
1037                  */
1038                 if (p[1] != 'd' || strchr(p + 2, '%'))
1039                         return -EINVAL;
1040
1041                 /* Use one page as a bit array of possible slots */
1042                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1043                 if (!inuse)
1044                         return -ENOMEM;
1045
1046                 for_each_netdev(net, d) {
1047                         if (!sscanf(d->name, name, &i))
1048                                 continue;
1049                         if (i < 0 || i >= max_netdevices)
1050                                 continue;
1051
1052                         /*  avoid cases where sscanf is not exact inverse of printf */
1053                         snprintf(buf, IFNAMSIZ, name, i);
1054                         if (!strncmp(buf, d->name, IFNAMSIZ))
1055                                 set_bit(i, inuse);
1056                 }
1057
1058                 i = find_first_zero_bit(inuse, max_netdevices);
1059                 free_page((unsigned long) inuse);
1060         }
1061
1062         if (buf != name)
1063                 snprintf(buf, IFNAMSIZ, name, i);
1064         if (!__dev_get_by_name(net, buf))
1065                 return i;
1066
1067         /* It is possible to run out of possible slots
1068          * when the name is long and there isn't enough space left
1069          * for the digits, or if all bits are used.
1070          */
1071         return -ENFILE;
1072 }
1073
1074 /**
1075  *      dev_alloc_name - allocate a name for a device
1076  *      @dev: device
1077  *      @name: name format string
1078  *
1079  *      Passed a format string - eg "lt%d" it will try and find a suitable
1080  *      id. It scans list of devices to build up a free map, then chooses
1081  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1082  *      while allocating the name and adding the device in order to avoid
1083  *      duplicates.
1084  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1085  *      Returns the number of the unit assigned or a negative errno code.
1086  */
1087
1088 int dev_alloc_name(struct net_device *dev, const char *name)
1089 {
1090         char buf[IFNAMSIZ];
1091         struct net *net;
1092         int ret;
1093
1094         BUG_ON(!dev_net(dev));
1095         net = dev_net(dev);
1096         ret = __dev_alloc_name(net, name, buf);
1097         if (ret >= 0)
1098                 strlcpy(dev->name, buf, IFNAMSIZ);
1099         return ret;
1100 }
1101 EXPORT_SYMBOL(dev_alloc_name);
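/* Illustrative example, not part of the original file: given existing
 * devices "eth0" and "eth2", dev_alloc_name(dev, "eth%d") fills the in-use
 * bitmap from the scan above, picks the first free unit (1), writes "eth1"
 * into dev->name and returns 1.
 */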
1102
1103 static int dev_alloc_name_ns(struct net *net,
1104                              struct net_device *dev,
1105                              const char *name)
1106 {
1107         char buf[IFNAMSIZ];
1108         int ret;
1109
1110         ret = __dev_alloc_name(net, name, buf);
1111         if (ret >= 0)
1112                 strlcpy(dev->name, buf, IFNAMSIZ);
1113         return ret;
1114 }
1115
1116 static int dev_get_valid_name(struct net *net,
1117                               struct net_device *dev,
1118                               const char *name)
1119 {
1120         BUG_ON(!net);
1121
1122         if (!dev_valid_name(name))
1123                 return -EINVAL;
1124
1125         if (strchr(name, '%'))
1126                 return dev_alloc_name_ns(net, dev, name);
1127         else if (__dev_get_by_name(net, name))
1128                 return -EEXIST;
1129         else if (dev->name != name)
1130                 strlcpy(dev->name, name, IFNAMSIZ);
1131
1132         return 0;
1133 }
1134
1135 /**
1136  *      dev_change_name - change name of a device
1137  *      @dev: device
1138  *      @newname: name (or format string) must be at least IFNAMSIZ
1139  *
1140  *      Change the name of a device. A format string such as "eth%d"
1141  *      can be passed for wildcarding.
1142  */
1143 int dev_change_name(struct net_device *dev, const char *newname)
1144 {
1145         unsigned char old_assign_type;
1146         char oldname[IFNAMSIZ];
1147         int err = 0;
1148         int ret;
1149         struct net *net;
1150
1151         ASSERT_RTNL();
1152         BUG_ON(!dev_net(dev));
1153
1154         net = dev_net(dev);
1155         if (dev->flags & IFF_UP)
1156                 return -EBUSY;
1157
1158         write_seqcount_begin(&devnet_rename_seq);
1159
1160         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1161                 write_seqcount_end(&devnet_rename_seq);
1162                 return 0;
1163         }
1164
1165         memcpy(oldname, dev->name, IFNAMSIZ);
1166
1167         err = dev_get_valid_name(net, dev, newname);
1168         if (err < 0) {
1169                 write_seqcount_end(&devnet_rename_seq);
1170                 return err;
1171         }
1172
1173         if (oldname[0] && !strchr(oldname, '%'))
1174                 netdev_info(dev, "renamed from %s\n", oldname);
1175
1176         old_assign_type = dev->name_assign_type;
1177         dev->name_assign_type = NET_NAME_RENAMED;
1178
1179 rollback:
1180         ret = device_rename(&dev->dev, dev->name);
1181         if (ret) {
1182                 memcpy(dev->name, oldname, IFNAMSIZ);
1183                 dev->name_assign_type = old_assign_type;
1184                 write_seqcount_end(&devnet_rename_seq);
1185                 return ret;
1186         }
1187
1188         write_seqcount_end(&devnet_rename_seq);
1189
1190         netdev_adjacent_rename_links(dev, oldname);
1191
1192         write_lock_bh(&dev_base_lock);
1193         hlist_del_rcu(&dev->name_hlist);
1194         write_unlock_bh(&dev_base_lock);
1195
1196         synchronize_rcu();
1197
1198         write_lock_bh(&dev_base_lock);
1199         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1200         write_unlock_bh(&dev_base_lock);
1201
1202         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1203         ret = notifier_to_errno(ret);
1204
1205         if (ret) {
1206                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1207                 if (err >= 0) {
1208                         err = ret;
1209                         write_seqcount_begin(&devnet_rename_seq);
1210                         memcpy(dev->name, oldname, IFNAMSIZ);
1211                         memcpy(oldname, newname, IFNAMSIZ);
1212                         dev->name_assign_type = old_assign_type;
1213                         old_assign_type = NET_NAME_RENAMED;
1214                         goto rollback;
1215                 } else {
1216                         pr_err("%s: name change rollback failed: %d\n",
1217                                dev->name, ret);
1218                 }
1219         }
1220
1221         return err;
1222 }
1223
1224 /**
1225  *      dev_set_alias - change ifalias of a device
1226  *      @dev: device
1227  *      @alias: name up to IFALIASZ
1228  *      @len: limit of bytes to copy from info
1229  *
1230  *      Set ifalias for a device.
1231  */
1232 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233 {
1234         char *new_ifalias;
1235
1236         ASSERT_RTNL();
1237
1238         if (len >= IFALIASZ)
1239                 return -EINVAL;
1240
1241         if (!len) {
1242                 kfree(dev->ifalias);
1243                 dev->ifalias = NULL;
1244                 return 0;
1245         }
1246
1247         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248         if (!new_ifalias)
1249                 return -ENOMEM;
1250         dev->ifalias = new_ifalias;
1251
1252         strlcpy(dev->ifalias, alias, len+1);
1253         return len;
1254 }
1255
1256
1257 /**
1258  *      netdev_features_change - device changes features
1259  *      @dev: device to cause notification
1260  *
1261  *      Called to indicate a device has changed features.
1262  */
1263 void netdev_features_change(struct net_device *dev)
1264 {
1265         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1266 }
1267 EXPORT_SYMBOL(netdev_features_change);
1268
1269 /**
1270  *      netdev_state_change - device changes state
1271  *      @dev: device to cause notification
1272  *
1273  *      Called to indicate a device has changed state. This function calls
1274  *      the notifier chains for netdev_chain and sends a NEWLINK message
1275  *      to the routing socket.
1276  */
1277 void netdev_state_change(struct net_device *dev)
1278 {
1279         if (dev->flags & IFF_UP) {
1280                 struct netdev_notifier_change_info change_info;
1281
1282                 change_info.flags_changed = 0;
1283                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284                                               &change_info.info);
1285                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1286         }
1287 }
1288 EXPORT_SYMBOL(netdev_state_change);
1289
1290 /**
1291  *      netdev_notify_peers - notify network peers about existence of @dev
1292  *      @dev: network device
1293  *
1294  * Generate traffic such that interested network peers are aware of
1295  * @dev, such as by generating a gratuitous ARP. This may be used when
1296  * a device wants to inform the rest of the network about some sort of
1297  * reconfiguration such as a failover event or virtual machine
1298  * migration.
1299  */
1300 void netdev_notify_peers(struct net_device *dev)
1301 {
1302         rtnl_lock();
1303         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1304         rtnl_unlock();
1305 }
1306 EXPORT_SYMBOL(netdev_notify_peers);
1307
1308 static int __dev_open(struct net_device *dev)
1309 {
1310         const struct net_device_ops *ops = dev->netdev_ops;
1311         int ret;
1312
1313         ASSERT_RTNL();
1314
1315         if (!netif_device_present(dev))
1316                 return -ENODEV;
1317
1318         /* Block netpoll from trying to do any rx path servicing.
1319          * If we don't do this there is a chance ndo_poll_controller
1320          * or ndo_poll may be running while we open the device
1321          */
1322         netpoll_poll_disable(dev);
1323
1324         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325         ret = notifier_to_errno(ret);
1326         if (ret)
1327                 return ret;
1328
1329         set_bit(__LINK_STATE_START, &dev->state);
1330
1331         if (ops->ndo_validate_addr)
1332                 ret = ops->ndo_validate_addr(dev);
1333
1334         if (!ret && ops->ndo_open)
1335                 ret = ops->ndo_open(dev);
1336
1337         netpoll_poll_enable(dev);
1338
1339         if (ret)
1340                 clear_bit(__LINK_STATE_START, &dev->state);
1341         else {
1342                 dev->flags |= IFF_UP;
1343                 dev_set_rx_mode(dev);
1344                 dev_activate(dev);
1345                 add_device_randomness(dev->dev_addr, dev->addr_len);
1346         }
1347
1348         return ret;
1349 }
1350
1351 /**
1352  *      dev_open        - prepare an interface for use.
1353  *      @dev:   device to open
1354  *
1355  *      Takes a device from down to up state. The device's private open
1356  *      function is invoked and then the multicast lists are loaded. Finally
1357  *      the device is moved into the up state and a %NETDEV_UP message is
1358  *      sent to the netdev notifier chain.
1359  *
1360  *      Calling this function on an active interface is a nop. On a failure
1361  *      a negative errno code is returned.
1362  */
1363 int dev_open(struct net_device *dev)
1364 {
1365         int ret;
1366
1367         if (dev->flags & IFF_UP)
1368                 return 0;
1369
1370         ret = __dev_open(dev);
1371         if (ret < 0)
1372                 return ret;
1373
1374         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375         call_netdevice_notifiers(NETDEV_UP, dev);
1376
1377         return ret;
1378 }
1379 EXPORT_SYMBOL(dev_open);
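/* Illustrative sketch, not part of the original file: dev_open() and
 * dev_close() expect the caller to hold RTNL (see ASSERT_RTNL() in
 * __dev_open()/__dev_close_many()). The helper name is hypothetical.
 */
#if 0
static int example_bounce_device(struct net_device *dev)
{
        int err;

        rtnl_lock();
        dev_close(dev);
        err = dev_open(dev);
        rtnl_unlock();

        return err;
}
#endif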
1380
1381 static int __dev_close_many(struct list_head *head)
1382 {
1383         struct net_device *dev;
1384
1385         ASSERT_RTNL();
1386         might_sleep();
1387
1388         list_for_each_entry(dev, head, close_list) {
1389                 /* Temporarily disable netpoll until the interface is down */
1390                 netpoll_poll_disable(dev);
1391
1392                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1393
1394                 clear_bit(__LINK_STATE_START, &dev->state);
1395
1396                 /* Synchronize to scheduled poll. We cannot touch the poll list;
1397                  * it can even be on a different cpu. So just clear netif_running().
1398                  *
1399                  * dev->stop() will invoke napi_disable() on all of its
1400                  * napi_struct instances on this device.
1401                  */
1402                 smp_mb__after_atomic(); /* Commit netif_running(). */
1403         }
1404
1405         dev_deactivate_many(head);
1406
1407         list_for_each_entry(dev, head, close_list) {
1408                 const struct net_device_ops *ops = dev->netdev_ops;
1409
1410                 /*
1411                  *      Call the device specific close. This cannot fail.
1412                  *      It is only done if the device is UP.
1413                  *
1414                  *      We allow it to be called even after a DETACH hot-plug
1415                  *      event.
1416                  */
1417                 if (ops->ndo_stop)
1418                         ops->ndo_stop(dev);
1419
1420                 dev->flags &= ~IFF_UP;
1421                 netpoll_poll_enable(dev);
1422         }
1423
1424         return 0;
1425 }
1426
1427 static int __dev_close(struct net_device *dev)
1428 {
1429         int retval;
1430         LIST_HEAD(single);
1431
1432         list_add(&dev->close_list, &single);
1433         retval = __dev_close_many(&single);
1434         list_del(&single);
1435
1436         return retval;
1437 }
1438
1439 int dev_close_many(struct list_head *head, bool unlink)
1440 {
1441         struct net_device *dev, *tmp;
1442
1443         /* Remove the devices that don't need to be closed */
1444         list_for_each_entry_safe(dev, tmp, head, close_list)
1445                 if (!(dev->flags & IFF_UP))
1446                         list_del_init(&dev->close_list);
1447
1448         __dev_close_many(head);
1449
1450         list_for_each_entry_safe(dev, tmp, head, close_list) {
1451                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1453                 if (unlink)
1454                         list_del_init(&dev->close_list);
1455         }
1456
1457         return 0;
1458 }
1459 EXPORT_SYMBOL(dev_close_many);
1460
1461 /**
1462  *      dev_close - shutdown an interface.
1463  *      @dev: device to shutdown
1464  *
1465  *      This function moves an active device into down state. A
1466  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468  *      chain.
1469  */
1470 int dev_close(struct net_device *dev)
1471 {
1472         if (dev->flags & IFF_UP) {
1473                 LIST_HEAD(single);
1474
1475                 list_add(&dev->close_list, &single);
1476                 dev_close_many(&single, true);
1477                 list_del(&single);
1478         }
1479         return 0;
1480 }
1481 EXPORT_SYMBOL(dev_close);
1482
1483
1484 /**
1485  *      dev_disable_lro - disable Large Receive Offload on a device
1486  *      @dev: device
1487  *
1488  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1489  *      called under RTNL.  This is needed if received packets may be
1490  *      forwarded to another interface.
1491  */
1492 void dev_disable_lro(struct net_device *dev)
1493 {
1494         struct net_device *lower_dev;
1495         struct list_head *iter;
1496
1497         dev->wanted_features &= ~NETIF_F_LRO;
1498         netdev_update_features(dev);
1499
1500         if (unlikely(dev->features & NETIF_F_LRO))
1501                 netdev_WARN(dev, "failed to disable LRO!\n");
1502
1503         netdev_for_each_lower_dev(dev, lower_dev, iter)
1504                 dev_disable_lro(lower_dev);
1505 }
1506 EXPORT_SYMBOL(dev_disable_lro);
1507
1508 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1509                                    struct net_device *dev)
1510 {
1511         struct netdev_notifier_info info;
1512
1513         netdev_notifier_info_init(&info, dev);
1514         return nb->notifier_call(nb, val, &info);
1515 }
1516
1517 static int dev_boot_phase = 1;
1518
1519 /**
1520  *      register_netdevice_notifier - register a network notifier block
1521  *      @nb: notifier
1522  *
1523  *      Register a notifier to be called when network device events occur.
1524  *      The notifier passed is linked into the kernel structures and must
1525  *      not be reused until it has been unregistered. A negative errno code
1526  *      is returned on a failure.
1527  *
1528  *      When registered, all registration and up events are replayed
1529  *      to the new notifier to allow the device to have a race-free
1530  *      view of the network device list.
1531  */
1532
1533 int register_netdevice_notifier(struct notifier_block *nb)
1534 {
1535         struct net_device *dev;
1536         struct net_device *last;
1537         struct net *net;
1538         int err;
1539
1540         rtnl_lock();
1541         err = raw_notifier_chain_register(&netdev_chain, nb);
1542         if (err)
1543                 goto unlock;
1544         if (dev_boot_phase)
1545                 goto unlock;
1546         for_each_net(net) {
1547                 for_each_netdev(net, dev) {
1548                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1549                         err = notifier_to_errno(err);
1550                         if (err)
1551                                 goto rollback;
1552
1553                         if (!(dev->flags & IFF_UP))
1554                                 continue;
1555
1556                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1557                 }
1558         }
1559
1560 unlock:
1561         rtnl_unlock();
1562         return err;
1563
1564 rollback:
1565         last = dev;
1566         for_each_net(net) {
1567                 for_each_netdev(net, dev) {
1568                         if (dev == last)
1569                                 goto outroll;
1570
1571                         if (dev->flags & IFF_UP) {
1572                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1573                                                         dev);
1574                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1575                         }
1576                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1577                 }
1578         }
1579
1580 outroll:
1581         raw_notifier_chain_unregister(&netdev_chain, nb);
1582         goto unlock;
1583 }
1584 EXPORT_SYMBOL(register_netdevice_notifier);
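/*
 * Editor's note: an illustrative sketch (not part of dev.c) of a client of
 * register_netdevice_notifier(); example_netdev_event() and example_notifier
 * are hypothetical names.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_notifier __maybe_unused = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_notifier) replays NETDEV_REGISTER and
 * NETDEV_UP for already-existing devices; pair it with
 * unregister_netdevice_notifier() on teardown.
 */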
1585
1586 /**
1587  *      unregister_netdevice_notifier - unregister a network notifier block
1588  *      @nb: notifier
1589  *
1590  *      Unregister a notifier previously registered by
1591  *      register_netdevice_notifier(). The notifier is unlinked from the
1592  *      kernel structures and may then be reused. A negative errno code
1593  *      is returned on a failure.
1594  *
1595  *      After unregistering, unregister and down device events are synthesized
1596  *      for all devices on the device list and sent to the removed notifier,
1597  *      removing the need for special-case cleanup code.
1598  */
1599
1600 int unregister_netdevice_notifier(struct notifier_block *nb)
1601 {
1602         struct net_device *dev;
1603         struct net *net;
1604         int err;
1605
1606         rtnl_lock();
1607         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608         if (err)
1609                 goto unlock;
1610
1611         for_each_net(net) {
1612                 for_each_netdev(net, dev) {
1613                         if (dev->flags & IFF_UP) {
1614                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615                                                         dev);
1616                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1617                         }
1618                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1619                 }
1620         }
1621 unlock:
1622         rtnl_unlock();
1623         return err;
1624 }
1625 EXPORT_SYMBOL(unregister_netdevice_notifier);
1626
1627 /**
1628  *      call_netdevice_notifiers_info - call all network notifier blocks
1629  *      @val: value passed unmodified to notifier function
1630  *      @dev: net_device pointer passed unmodified to notifier function
1631  *      @info: notifier information data
1632  *
1633  *      Call all network notifier blocks.  Parameters and return value
1634  *      are as for raw_notifier_call_chain().
1635  */
1636
1637 static int call_netdevice_notifiers_info(unsigned long val,
1638                                          struct net_device *dev,
1639                                          struct netdev_notifier_info *info)
1640 {
1641         ASSERT_RTNL();
1642         netdev_notifier_info_init(info, dev);
1643         return raw_notifier_call_chain(&netdev_chain, val, info);
1644 }
1645
1646 /**
1647  *      call_netdevice_notifiers - call all network notifier blocks
1648  *      @val: value passed unmodified to notifier function
1649  *      @dev: net_device pointer passed unmodified to notifier function
1650  *
1651  *      Call all network notifier blocks.  Parameters and return value
1652  *      are as for raw_notifier_call_chain().
1653  */
1654
1655 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1656 {
1657         struct netdev_notifier_info info;
1658
1659         return call_netdevice_notifiers_info(val, dev, &info);
1660 }
1661 EXPORT_SYMBOL(call_netdevice_notifiers);
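/*
 * Editor's note: an illustrative sketch (not part of dev.c). Core code
 * broadcasts events through call_netdevice_notifiers() while holding RTNL,
 * for example to announce a state change; example_announce_change() is a
 * hypothetical wrapper.
 */
static void __maybe_unused example_announce_change(struct net_device *dev)
{
	ASSERT_RTNL();
	call_netdevice_notifiers(NETDEV_CHANGE, dev);
}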
1662
1663 #ifdef CONFIG_NET_INGRESS
1664 static struct static_key ingress_needed __read_mostly;
1665
1666 void net_inc_ingress_queue(void)
1667 {
1668         static_key_slow_inc(&ingress_needed);
1669 }
1670 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1671
1672 void net_dec_ingress_queue(void)
1673 {
1674         static_key_slow_dec(&ingress_needed);
1675 }
1676 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1677 #endif
1678
1679 #ifdef CONFIG_NET_EGRESS
1680 static struct static_key egress_needed __read_mostly;
1681
1682 void net_inc_egress_queue(void)
1683 {
1684         static_key_slow_inc(&egress_needed);
1685 }
1686 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1687
1688 void net_dec_egress_queue(void)
1689 {
1690         static_key_slow_dec(&egress_needed);
1691 }
1692 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1693 #endif
1694
1695 static struct static_key netstamp_needed __read_mostly;
1696 #ifdef HAVE_JUMP_LABEL
1697 /* We are not allowed to call static_key_slow_dec() from irq context.
1698  * If net_disable_timestamp() is called from irq context, defer the
1699  * static_key_slow_dec() calls.
1700  */
1701 static atomic_t netstamp_needed_deferred;
1702 #endif
1703
1704 void net_enable_timestamp(void)
1705 {
1706 #ifdef HAVE_JUMP_LABEL
1707         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1708
1709         if (deferred) {
1710                 while (--deferred)
1711                         static_key_slow_dec(&netstamp_needed);
1712                 return;
1713         }
1714 #endif
1715         static_key_slow_inc(&netstamp_needed);
1716 }
1717 EXPORT_SYMBOL(net_enable_timestamp);
1718
1719 void net_disable_timestamp(void)
1720 {
1721 #ifdef HAVE_JUMP_LABEL
1722         if (in_interrupt()) {
1723                 atomic_inc(&netstamp_needed_deferred);
1724                 return;
1725         }
1726 #endif
1727         static_key_slow_dec(&netstamp_needed);
1728 }
1729 EXPORT_SYMBOL(net_disable_timestamp);
1730
1731 static inline void net_timestamp_set(struct sk_buff *skb)
1732 {
1733         skb->tstamp.tv64 = 0;
1734         if (static_key_false(&netstamp_needed))
1735                 __net_timestamp(skb);
1736 }
1737
1738 #define net_timestamp_check(COND, SKB)                  \
1739         if (static_key_false(&netstamp_needed)) {               \
1740                 if ((COND) && !(SKB)->tstamp.tv64)      \
1741                         __net_timestamp(SKB);           \
1742         }                                               \
1743
1744 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1745 {
1746         unsigned int len;
1747
1748         if (!(dev->flags & IFF_UP))
1749                 return false;
1750
1751         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1752         if (skb->len <= len)
1753                 return true;
1754
1755         /* If the skb is GSO, the length does not matter: the packet can
1756          * be forwarded now and segmented later.
1757          */
1758         if (skb_is_gso(skb))
1759                 return true;
1760
1761         return false;
1762 }
1763 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1764
1765 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1766 {
1767         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1768             unlikely(!is_skb_forwardable(dev, skb))) {
1769                 atomic_long_inc(&dev->rx_dropped);
1770                 kfree_skb(skb);
1771                 return NET_RX_DROP;
1772         }
1773
1774         skb_scrub_packet(skb, true);
1775         skb->priority = 0;
1776         skb->protocol = eth_type_trans(skb, dev);
1777         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1778
1779         return 0;
1780 }
1781 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1782
1783 /**
1784  * dev_forward_skb - loopback an skb to another netif
1785  *
1786  * @dev: destination network device
1787  * @skb: buffer to forward
1788  *
1789  * return values:
1790  *      NET_RX_SUCCESS  (no congestion)
1791  *      NET_RX_DROP     (packet was dropped, but freed)
1792  *
1793  * dev_forward_skb can be used for injecting an skb from the
1794  * start_xmit function of one device into the receive queue
1795  * of another device.
1796  *
1797  * The receiving device may be in another namespace, so
1798  * we have to clear all information in the skb that could
1799  * impact namespace isolation.
1800  */
1801 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1802 {
1803         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1804 }
1805 EXPORT_SYMBOL_GPL(dev_forward_skb);
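/*
 * Editor's note: a minimal sketch (not part of dev.c) of how a paired
 * virtual device might use dev_forward_skb() from its ndo_start_xmit().
 * example_get_peer() is a hypothetical helper returning the peer device.
 */
struct net_device *example_get_peer(struct net_device *dev);

static netdev_tx_t __maybe_unused example_xmit(struct sk_buff *skb,
					       struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);
	unsigned int len = skb->len;

	/* dev_forward_skb() scrubs the skb and injects it into peer's RX path;
	 * on NET_RX_DROP the skb has already been freed.
	 */
	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	} else {
		dev->stats.tx_dropped++;
	}
	return NETDEV_TX_OK;
}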
1806
1807 static inline int deliver_skb(struct sk_buff *skb,
1808                               struct packet_type *pt_prev,
1809                               struct net_device *orig_dev)
1810 {
1811         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1812                 return -ENOMEM;
1813         atomic_inc(&skb->users);
1814         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1815 }
1816
1817 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1818                                           struct packet_type **pt,
1819                                           struct net_device *orig_dev,
1820                                           __be16 type,
1821                                           struct list_head *ptype_list)
1822 {
1823         struct packet_type *ptype, *pt_prev = *pt;
1824
1825         list_for_each_entry_rcu(ptype, ptype_list, list) {
1826                 if (ptype->type != type)
1827                         continue;
1828                 if (pt_prev)
1829                         deliver_skb(skb, pt_prev, orig_dev);
1830                 pt_prev = ptype;
1831         }
1832         *pt = pt_prev;
1833 }
1834
1835 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1836 {
1837         if (!ptype->af_packet_priv || !skb->sk)
1838                 return false;
1839
1840         if (ptype->id_match)
1841                 return ptype->id_match(ptype, skb->sk);
1842         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1843                 return true;
1844
1845         return false;
1846 }
1847
1848 /*
1849  *      Support routine. Sends outgoing frames to any network
1850  *      taps currently in use.
1851  */
1852
1853 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1854 {
1855         struct packet_type *ptype;
1856         struct sk_buff *skb2 = NULL;
1857         struct packet_type *pt_prev = NULL;
1858         struct list_head *ptype_list = &ptype_all;
1859
1860         rcu_read_lock();
1861 again:
1862         list_for_each_entry_rcu(ptype, ptype_list, list) {
1863                 /* Never send packets back to the socket
1864                  * they originated from - MvS (miquels@drinkel.ow.org)
1865                  */
1866                 if (skb_loop_sk(ptype, skb))
1867                         continue;
1868
1869                 if (pt_prev) {
1870                         deliver_skb(skb2, pt_prev, skb->dev);
1871                         pt_prev = ptype;
1872                         continue;
1873                 }
1874
1875                 /* need to clone skb, done only once */
1876                 skb2 = skb_clone(skb, GFP_ATOMIC);
1877                 if (!skb2)
1878                         goto out_unlock;
1879
1880                 net_timestamp_set(skb2);
1881
1882                 /* The network header should already be set correctly by
1883                  * the sender, so the check below is only protection
1884                  * against buggy protocols.
1885                  */
1886                 skb_reset_mac_header(skb2);
1887
1888                 if (skb_network_header(skb2) < skb2->data ||
1889                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1890                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1891                                              ntohs(skb2->protocol),
1892                                              dev->name);
1893                         skb_reset_network_header(skb2);
1894                 }
1895
1896                 skb2->transport_header = skb2->network_header;
1897                 skb2->pkt_type = PACKET_OUTGOING;
1898                 pt_prev = ptype;
1899         }
1900
1901         if (ptype_list == &ptype_all) {
1902                 ptype_list = &dev->ptype_all;
1903                 goto again;
1904         }
1905 out_unlock:
1906         if (pt_prev)
1907                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1908         rcu_read_unlock();
1909 }
1910 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1911
1912 /**
1913  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1914  * @dev: Network device
1915  * @txq: number of queues available
1916  *
1917  * If real_num_tx_queues is changed the tc mappings may no longer be
1918  * valid. To resolve this, verify that the tc mapping remains valid and,
1919  * if not, reset the mapping to zero. Once no priorities map to an
1920  * offset/count pair, that pair is no longer used. In the worst case,
1921  * when TC0 is invalid, nothing can be done, so priority mappings are
1922  * disabled entirely. It is expected that drivers will fix this mapping
1923  * if they can before calling netif_set_real_num_tx_queues.
1924  */
1925 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1926 {
1927         int i;
1928         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1929
1930         /* If TC0 is invalidated disable TC mapping */
1931         if (tc->offset + tc->count > txq) {
1932                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1933                 dev->num_tc = 0;
1934                 return;
1935         }
1936
1937         /* Invalidated prio to tc mappings set to TC0 */
1938         for (i = 1; i < TC_BITMASK + 1; i++) {
1939                 int q = netdev_get_prio_tc_map(dev, i);
1940
1941                 tc = &dev->tc_to_txq[q];
1942                 if (tc->offset + tc->count > txq) {
1943                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1944                                 i, q);
1945                         netdev_set_prio_tc_map(dev, i, 0);
1946                 }
1947         }
1948 }
1949
1950 #ifdef CONFIG_XPS
1951 static DEFINE_MUTEX(xps_map_mutex);
1952 #define xmap_dereference(P)             \
1953         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1954
1955 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1956                                         int cpu, u16 index)
1957 {
1958         struct xps_map *map = NULL;
1959         int pos;
1960
1961         if (dev_maps)
1962                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1963
1964         for (pos = 0; map && pos < map->len; pos++) {
1965                 if (map->queues[pos] == index) {
1966                         if (map->len > 1) {
1967                                 map->queues[pos] = map->queues[--map->len];
1968                         } else {
1969                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1970                                 kfree_rcu(map, rcu);
1971                                 map = NULL;
1972                         }
1973                         break;
1974                 }
1975         }
1976
1977         return map;
1978 }
1979
1980 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1981 {
1982         struct xps_dev_maps *dev_maps;
1983         int cpu, i;
1984         bool active = false;
1985
1986         mutex_lock(&xps_map_mutex);
1987         dev_maps = xmap_dereference(dev->xps_maps);
1988
1989         if (!dev_maps)
1990                 goto out_no_maps;
1991
1992         for_each_possible_cpu(cpu) {
1993                 for (i = index; i < dev->num_tx_queues; i++) {
1994                         if (!remove_xps_queue(dev_maps, cpu, i))
1995                                 break;
1996                 }
1997                 if (i == dev->num_tx_queues)
1998                         active = true;
1999         }
2000
2001         if (!active) {
2002                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2003                 kfree_rcu(dev_maps, rcu);
2004         }
2005
2006         for (i = index; i < dev->num_tx_queues; i++)
2007                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2008                                              NUMA_NO_NODE);
2009
2010 out_no_maps:
2011         mutex_unlock(&xps_map_mutex);
2012 }
2013
2014 static struct xps_map *expand_xps_map(struct xps_map *map,
2015                                       int cpu, u16 index)
2016 {
2017         struct xps_map *new_map;
2018         int alloc_len = XPS_MIN_MAP_ALLOC;
2019         int i, pos;
2020
2021         for (pos = 0; map && pos < map->len; pos++) {
2022                 if (map->queues[pos] != index)
2023                         continue;
2024                 return map;
2025         }
2026
2027         /* Need to add queue to this CPU's existing map */
2028         if (map) {
2029                 if (pos < map->alloc_len)
2030                         return map;
2031
2032                 alloc_len = map->alloc_len * 2;
2033         }
2034
2035         /* Need to allocate a new map to store this queue on this CPU */
2036         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2037                                cpu_to_node(cpu));
2038         if (!new_map)
2039                 return NULL;
2040
2041         for (i = 0; i < pos; i++)
2042                 new_map->queues[i] = map->queues[i];
2043         new_map->alloc_len = alloc_len;
2044         new_map->len = pos;
2045
2046         return new_map;
2047 }
2048
2049 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2050                         u16 index)
2051 {
2052         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2053         struct xps_map *map, *new_map;
2054         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2055         int cpu, numa_node_id = -2;
2056         bool active = false;
2057
2058         mutex_lock(&xps_map_mutex);
2059
2060         dev_maps = xmap_dereference(dev->xps_maps);
2061
2062         /* allocate memory for queue storage */
2063         for_each_online_cpu(cpu) {
2064                 if (!cpumask_test_cpu(cpu, mask))
2065                         continue;
2066
2067                 if (!new_dev_maps)
2068                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2069                 if (!new_dev_maps) {
2070                         mutex_unlock(&xps_map_mutex);
2071                         return -ENOMEM;
2072                 }
2073
2074                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2075                                  NULL;
2076
2077                 map = expand_xps_map(map, cpu, index);
2078                 if (!map)
2079                         goto error;
2080
2081                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2082         }
2083
2084         if (!new_dev_maps)
2085                 goto out_no_new_maps;
2086
2087         for_each_possible_cpu(cpu) {
2088                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2089                         /* add queue to CPU maps */
2090                         int pos = 0;
2091
2092                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2093                         while ((pos < map->len) && (map->queues[pos] != index))
2094                                 pos++;
2095
2096                         if (pos == map->len)
2097                                 map->queues[map->len++] = index;
2098 #ifdef CONFIG_NUMA
2099                         if (numa_node_id == -2)
2100                                 numa_node_id = cpu_to_node(cpu);
2101                         else if (numa_node_id != cpu_to_node(cpu))
2102                                 numa_node_id = -1;
2103 #endif
2104                 } else if (dev_maps) {
2105                         /* fill in the new device map from the old device map */
2106                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2107                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2108                 }
2109
2110         }
2111
2112         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2113
2114         /* Cleanup old maps */
2115         if (dev_maps) {
2116                 for_each_possible_cpu(cpu) {
2117                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2118                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2119                         if (map && map != new_map)
2120                                 kfree_rcu(map, rcu);
2121                 }
2122
2123                 kfree_rcu(dev_maps, rcu);
2124         }
2125
2126         dev_maps = new_dev_maps;
2127         active = true;
2128
2129 out_no_new_maps:
2130         /* update Tx queue numa node */
2131         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2132                                      (numa_node_id >= 0) ? numa_node_id :
2133                                      NUMA_NO_NODE);
2134
2135         if (!dev_maps)
2136                 goto out_no_maps;
2137
2138         /* remove the queue from CPUs that are no longer in the mask */
2139         for_each_possible_cpu(cpu) {
2140                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2141                         continue;
2142
2143                 if (remove_xps_queue(dev_maps, cpu, index))
2144                         active = true;
2145         }
2146
2147         /* free map if not active */
2148         if (!active) {
2149                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2150                 kfree_rcu(dev_maps, rcu);
2151         }
2152
2153 out_no_maps:
2154         mutex_unlock(&xps_map_mutex);
2155
2156         return 0;
2157 error:
2158         /* remove any maps that we added */
2159         for_each_possible_cpu(cpu) {
2160                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2161                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2162                                  NULL;
2163                 if (new_map && new_map != map)
2164                         kfree(new_map);
2165         }
2166
2167         mutex_unlock(&xps_map_mutex);
2168
2169         kfree(new_dev_maps);
2170         return -ENOMEM;
2171 }
2172 EXPORT_SYMBOL(netif_set_xps_queue);
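/*
 * Editor's note: an illustrative sketch (not part of dev.c) of a driver
 * spreading its TX queues across online CPUs with netif_set_xps_queue();
 * example_setup_xps() is a hypothetical name.
 */
static void __maybe_unused example_setup_xps(struct net_device *dev)
{
	cpumask_var_t mask;
	int i;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		cpumask_clear(mask);
		cpumask_set_cpu(i % num_online_cpus(), mask);
		/* queue i may only be selected by the CPUs in @mask */
		netif_set_xps_queue(dev, mask, i);
	}
	free_cpumask_var(mask);
}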
2173
2174 #endif
2175 /*
2176  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2177  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2178  */
2179 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2180 {
2181         int rc;
2182
2183         if (txq < 1 || txq > dev->num_tx_queues)
2184                 return -EINVAL;
2185
2186         if (dev->reg_state == NETREG_REGISTERED ||
2187             dev->reg_state == NETREG_UNREGISTERING) {
2188                 ASSERT_RTNL();
2189
2190                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2191                                                   txq);
2192                 if (rc)
2193                         return rc;
2194
2195                 if (dev->num_tc)
2196                         netif_setup_tc(dev, txq);
2197
2198                 if (txq < dev->real_num_tx_queues) {
2199                         qdisc_reset_all_tx_gt(dev, txq);
2200 #ifdef CONFIG_XPS
2201                         netif_reset_xps_queues_gt(dev, txq);
2202 #endif
2203                 }
2204         }
2205
2206         dev->real_num_tx_queues = txq;
2207         return 0;
2208 }
2209 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2210
2211 #ifdef CONFIG_SYSFS
2212 /**
2213  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2214  *      @dev: Network device
2215  *      @rxq: Actual number of RX queues
2216  *
2217  *      This must be called either with the rtnl_lock held or before
2218  *      registration of the net device.  Returns 0 on success, or a
2219  *      negative error code.  If called before registration, it always
2220  *      succeeds.
2221  */
2222 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2223 {
2224         int rc;
2225
2226         if (rxq < 1 || rxq > dev->num_rx_queues)
2227                 return -EINVAL;
2228
2229         if (dev->reg_state == NETREG_REGISTERED) {
2230                 ASSERT_RTNL();
2231
2232                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2233                                                   rxq);
2234                 if (rc)
2235                         return rc;
2236         }
2237
2238         dev->real_num_rx_queues = rxq;
2239         return 0;
2240 }
2241 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2242 #endif
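/*
 * Editor's note: a minimal sketch (not part of dev.c) of how an ethtool
 * set_channels style handler might resize both queue sets under RTNL;
 * example_set_channels() is a hypothetical name.
 */
static int __maybe_unused example_set_channels(struct net_device *dev,
					       unsigned int count)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, count);
}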
2243
2244 /**
2245  * netif_get_num_default_rss_queues - default number of RSS queues
2246  *
2247  * This routine returns the default upper limit on the number of RSS
2248  * queues used by multiqueue devices.
2249  */
2250 int netif_get_num_default_rss_queues(void)
2251 {
2252         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2253 }
2254 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
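/*
 * Editor's note: an illustrative sketch (not part of dev.c) of a driver
 * capping its RSS ring count at probe time; max_hw_rings is a hypothetical
 * hardware limit.
 */
static unsigned int __maybe_unused
example_pick_rss_rings(unsigned int max_hw_rings)
{
	return min_t(unsigned int, max_hw_rings,
		     netif_get_num_default_rss_queues());
}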
2255
2256 static void __netif_reschedule(struct Qdisc *q)
2257 {
2258         struct softnet_data *sd;
2259         unsigned long flags;
2260
2261         local_irq_save(flags);
2262         sd = this_cpu_ptr(&softnet_data);
2263         q->next_sched = NULL;
2264         *sd->output_queue_tailp = q;
2265         sd->output_queue_tailp = &q->next_sched;
2266         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2267         local_irq_restore(flags);
2268 }
2269
2270 void __netif_schedule(struct Qdisc *q)
2271 {
2272         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2273                 __netif_reschedule(q);
2274 }
2275 EXPORT_SYMBOL(__netif_schedule);
2276
2277 struct dev_kfree_skb_cb {
2278         enum skb_free_reason reason;
2279 };
2280
2281 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2282 {
2283         return (struct dev_kfree_skb_cb *)skb->cb;
2284 }
2285
2286 void netif_schedule_queue(struct netdev_queue *txq)
2287 {
2288         rcu_read_lock();
2289         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2290                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2291
2292                 __netif_schedule(q);
2293         }
2294         rcu_read_unlock();
2295 }
2296 EXPORT_SYMBOL(netif_schedule_queue);
2297
2298 /**
2299  *      netif_wake_subqueue - allow sending packets on subqueue
2300  *      @dev: network device
2301  *      @queue_index: sub queue index
2302  *
2303  * Resume individual transmit queue of a device with multiple transmit queues.
2304  */
2305 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2306 {
2307         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2308
2309         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2310                 struct Qdisc *q;
2311
2312                 rcu_read_lock();
2313                 q = rcu_dereference(txq->qdisc);
2314                 __netif_schedule(q);
2315                 rcu_read_unlock();
2316         }
2317 }
2318 EXPORT_SYMBOL(netif_wake_subqueue);
2319
2320 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2321 {
2322         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2323                 struct Qdisc *q;
2324
2325                 rcu_read_lock();
2326                 q = rcu_dereference(dev_queue->qdisc);
2327                 __netif_schedule(q);
2328                 rcu_read_unlock();
2329         }
2330 }
2331 EXPORT_SYMBOL(netif_tx_wake_queue);
2332
2333 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2334 {
2335         unsigned long flags;
2336
2337         if (likely(atomic_read(&skb->users) == 1)) {
2338                 smp_rmb();
2339                 atomic_set(&skb->users, 0);
2340         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2341                 return;
2342         }
2343         get_kfree_skb_cb(skb)->reason = reason;
2344         local_irq_save(flags);
2345         skb->next = __this_cpu_read(softnet_data.completion_queue);
2346         __this_cpu_write(softnet_data.completion_queue, skb);
2347         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2348         local_irq_restore(flags);
2349 }
2350 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2351
2352 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2353 {
2354         if (in_irq() || irqs_disabled())
2355                 __dev_kfree_skb_irq(skb, reason);
2356         else
2357                 dev_kfree_skb(skb);
2358 }
2359 EXPORT_SYMBOL(__dev_kfree_skb_any);
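/*
 * Editor's note: a minimal sketch (not part of dev.c). Drivers whose TX
 * completion may run in hard-irq context free skbs through the _any()
 * helpers, which route to __dev_kfree_skb_irq() when needed;
 * example_tx_complete() is a hypothetical name.
 */
static void __maybe_unused example_tx_complete(struct sk_buff *skb, bool ok)
{
	if (ok)
		dev_consume_skb_any(skb);	/* successful transmit */
	else
		dev_kfree_skb_any(skb);		/* counts as a drop */
}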
2360
2361
2362 /**
2363  * netif_device_detach - mark device as removed
2364  * @dev: network device
2365  *
2366  * Mark device as removed from system and therefore no longer available.
2367  */
2368 void netif_device_detach(struct net_device *dev)
2369 {
2370         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2371             netif_running(dev)) {
2372                 netif_tx_stop_all_queues(dev);
2373         }
2374 }
2375 EXPORT_SYMBOL(netif_device_detach);
2376
2377 /**
2378  * netif_device_attach - mark device as attached
2379  * @dev: network device
2380  *
2381  * Mark device as attached to the system and restart it if needed.
2382  */
2383 void netif_device_attach(struct net_device *dev)
2384 {
2385         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2386             netif_running(dev)) {
2387                 netif_tx_wake_all_queues(dev);
2388                 __netdev_watchdog_up(dev);
2389         }
2390 }
2391 EXPORT_SYMBOL(netif_device_attach);
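/*
 * Editor's note: an illustrative sketch (not part of dev.c) of the common
 * suspend/resume pairing for these helpers; example_suspend() and
 * example_resume() are hypothetical PM callbacks.
 */
static int __maybe_unused example_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);	/* stop all TX queues if running */
	return 0;
}

static int __maybe_unused example_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_attach(dev);	/* wake queues and watchdog if running */
	return 0;
}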
2392
2393 /*
2394  * Returns a Tx hash based on the given packet descriptor and the number
2395  * of Tx queues to be used as a distribution range.
2396  */
2397 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2398                   unsigned int num_tx_queues)
2399 {
2400         u32 hash;
2401         u16 qoffset = 0;
2402         u16 qcount = num_tx_queues;
2403
2404         if (skb_rx_queue_recorded(skb)) {
2405                 hash = skb_get_rx_queue(skb);
2406                 while (unlikely(hash >= num_tx_queues))
2407                         hash -= num_tx_queues;
2408                 return hash;
2409         }
2410
2411         if (dev->num_tc) {
2412                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2413                 qoffset = dev->tc_to_txq[tc].offset;
2414                 qcount = dev->tc_to_txq[tc].count;
2415         }
2416
2417         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2418 }
2419 EXPORT_SYMBOL(__skb_tx_hash);
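/*
 * Editor's note: a minimal sketch (not part of dev.c). skb_tx_hash() is the
 * wrapper that feeds real_num_tx_queues into __skb_tx_hash();
 * example_pick_queue() is a hypothetical name.
 */
static u16 __maybe_unused example_pick_queue(struct net_device *dev,
					     struct sk_buff *skb)
{
	return skb_tx_hash(dev, skb);
}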
2420
2421 static void skb_warn_bad_offload(const struct sk_buff *skb)
2422 {
2423         static const netdev_features_t null_features = 0;
2424         struct net_device *dev = skb->dev;
2425         const char *name = "";
2426
2427         if (!net_ratelimit())
2428                 return;
2429
2430         if (dev) {
2431                 if (dev->dev.parent)
2432                         name = dev_driver_string(dev->dev.parent);
2433                 else
2434                         name = netdev_name(dev);
2435         }
2436         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2437              "gso_type=%d ip_summed=%d\n",
2438              name, dev ? &dev->features : &null_features,
2439              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2440              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2441              skb_shinfo(skb)->gso_type, skb->ip_summed);
2442 }
2443
2444 /*
2445  * Invalidate hardware checksum when packet is to be mangled, and
2446  * complete checksum manually on outgoing path.
2447  */
2448 int skb_checksum_help(struct sk_buff *skb)
2449 {
2450         __wsum csum;
2451         int ret = 0, offset;
2452
2453         if (skb->ip_summed == CHECKSUM_COMPLETE)
2454                 goto out_set_summed;
2455
2456         if (unlikely(skb_shinfo(skb)->gso_size)) {
2457                 skb_warn_bad_offload(skb);
2458                 return -EINVAL;
2459         }
2460
2461         /* Before computing a checksum, we should make sure no frag could
2462          * be modified by an external entity : checksum could be wrong.
2463          */
2464         if (skb_has_shared_frag(skb)) {
2465                 ret = __skb_linearize(skb);
2466                 if (ret)
2467                         goto out;
2468         }
2469
2470         offset = skb_checksum_start_offset(skb);
2471         BUG_ON(offset >= skb_headlen(skb));
2472         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2473
2474         offset += skb->csum_offset;
2475         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2476
2477         if (skb_cloned(skb) &&
2478             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2479                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2480                 if (ret)
2481                         goto out;
2482         }
2483
2484         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2485 out_set_summed:
2486         skb->ip_summed = CHECKSUM_NONE;
2487 out:
2488         return ret;
2489 }
2490 EXPORT_SYMBOL(skb_checksum_help);
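/*
 * Editor's note: an illustrative sketch (not part of dev.c) of the usual
 * driver fallback: if the hardware cannot offload a CHECKSUM_PARTIAL skb,
 * resolve it in software; example_resolve_csum() is a hypothetical name.
 */
static int __maybe_unused example_resolve_csum(struct sk_buff *skb,
					       bool hw_can_offload)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_offload)
		return skb_checksum_help(skb);
	return 0;
}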
2491
2492 /* __skb_csum_offload_chk - Driver helper function to determine if a device
2493  * with limited checksum offload capabilities is able to offload the checksum
2494  * for a given packet.
2495  *
2496  * Arguments:
2497  *   skb - sk_buff for the packet in question
2498  *   spec - contains the description of what the device can offload
2499  *   csum_encapped - returns true if the checksum being offloaded is
2500  *            encapsulated, that is, it is the checksum for a transport
2501  *            header in the inner headers.
2502  *   csum_help - when set, indicates that this helper should call
2503  *            skb_checksum_help if the offload checks fail
2504  *
2505  * Returns:
2506  *   true: Packet has passed the checksum checks and should be offloadable to
2507  *         the device (a driver may still need to check for additional
2508  *         restrictions of its device)
2509  *   false: Checksum is not offloadable. If csum_help was set then
2510  *         skb_checksum_help was called to resolve the checksum, for non-GSO
2511  *         packets whose IP protocol is not SCTP
2512  */
2513 bool __skb_csum_offload_chk(struct sk_buff *skb,
2514                             const struct skb_csum_offl_spec *spec,
2515                             bool *csum_encapped,
2516                             bool csum_help)
2517 {
2518         struct iphdr *iph;
2519         struct ipv6hdr *ipv6;
2520         void *nhdr;
2521         int protocol;
2522         u8 ip_proto;
2523
2524         if (skb->protocol == htons(ETH_P_8021Q) ||
2525             skb->protocol == htons(ETH_P_8021AD)) {
2526                 if (!spec->vlan_okay)
2527                         goto need_help;
2528         }
2529
2530         /* We check whether the checksum refers to a transport layer checksum in
2531          * the outermost header or an encapsulated transport layer checksum that
2532          * corresponds to the inner headers of the skb. If the checksum is for
2533          * something else in the packet we need help.
2534          */
2535         if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2536                 /* Non-encapsulated checksum */
2537                 protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2538                 nhdr = skb_network_header(skb);
2539                 *csum_encapped = false;
2540                 if (spec->no_not_encapped)
2541                         goto need_help;
2542         } else if (skb->encapsulation && spec->encap_okay &&
2543                    skb_checksum_start_offset(skb) ==
2544                    skb_inner_transport_offset(skb)) {
2545                 /* Encapsulated checksum */
2546                 *csum_encapped = true;
2547                 switch (skb->inner_protocol_type) {
2548                 case ENCAP_TYPE_ETHER:
2549                         protocol = eproto_to_ipproto(skb->inner_protocol);
2550                         break;
2551                 case ENCAP_TYPE_IPPROTO:
2552                         protocol = skb->inner_protocol;
2553                         break;
2554                 }
2555                 nhdr = skb_inner_network_header(skb);
2556         } else {
2557                 goto need_help;
2558         }
2559
2560         switch (protocol) {
2561         case IPPROTO_IP:
2562                 if (!spec->ipv4_okay)
2563                         goto need_help;
2564                 iph = nhdr;
2565                 ip_proto = iph->protocol;
2566                 if (iph->ihl != 5 && !spec->ip_options_okay)
2567                         goto need_help;
2568                 break;
2569         case IPPROTO_IPV6:
2570                 if (!spec->ipv6_okay)
2571                         goto need_help;
2572                 if (spec->no_encapped_ipv6 && *csum_encapped)
2573                         goto need_help;
2574                 ipv6 = nhdr;
2575                 nhdr += sizeof(*ipv6);
2576                 ip_proto = ipv6->nexthdr;
2577                 break;
2578         default:
2579                 goto need_help;
2580         }
2581
2582 ip_proto_again:
2583         switch (ip_proto) {
2584         case IPPROTO_TCP:
2585                 if (!spec->tcp_okay ||
2586                     skb->csum_offset != offsetof(struct tcphdr, check))
2587                         goto need_help;
2588                 break;
2589         case IPPROTO_UDP:
2590                 if (!spec->udp_okay ||
2591                     skb->csum_offset != offsetof(struct udphdr, check))
2592                         goto need_help;
2593                 break;
2594         case IPPROTO_SCTP:
2595                 if (!spec->sctp_okay ||
2596                     skb->csum_offset != offsetof(struct sctphdr, checksum))
2597                         goto cant_help;
2598                 break;
2599         case NEXTHDR_HOP:
2600         case NEXTHDR_ROUTING:
2601         case NEXTHDR_DEST: {
2602                 u8 *opthdr = nhdr;
2603
2604                 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2605                         goto need_help;
2606
2607                 ip_proto = opthdr[0];
2608                 nhdr += (opthdr[1] + 1) << 3;
2609
2610                 goto ip_proto_again;
2611         }
2612         default:
2613                 goto need_help;
2614         }
2615
2616         /* Passed the tests for offloading checksum */
2617         return true;
2618
2619 need_help:
2620         if (csum_help && !skb_shinfo(skb)->gso_size)
2621                 skb_checksum_help(skb);
2622 cant_help:
2623         return false;
2624 }
2625 EXPORT_SYMBOL(__skb_csum_offload_chk);
2626
2627 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628 {
2629         __be16 type = skb->protocol;
2630
2631         /* Tunnel gso handlers can set protocol to ethernet. */
2632         if (type == htons(ETH_P_TEB)) {
2633                 struct ethhdr *eth;
2634
2635                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636                         return 0;
2637
2638                 eth = (struct ethhdr *)skb_mac_header(skb);
2639                 type = eth->h_proto;
2640         }
2641
2642         return __vlan_get_protocol(skb, type, depth);
2643 }
2644
2645 /**
2646  *      skb_mac_gso_segment - mac layer segmentation handler.
2647  *      @skb: buffer to segment
2648  *      @features: features for the output path (see dev->features)
2649  */
2650 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651                                     netdev_features_t features)
2652 {
2653         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654         struct packet_offload *ptype;
2655         int vlan_depth = skb->mac_len;
2656         __be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658         if (unlikely(!type))
2659                 return ERR_PTR(-EINVAL);
2660
2661         __skb_pull(skb, vlan_depth);
2662
2663         rcu_read_lock();
2664         list_for_each_entry_rcu(ptype, &offload_base, list) {
2665                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2666                         segs = ptype->callbacks.gso_segment(skb, features);
2667                         break;
2668                 }
2669         }
2670         rcu_read_unlock();
2671
2672         __skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674         return segs;
2675 }
2676 EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679 /* openvswitch calls this on rx path, so we need a different check.
2680  */
2681 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682 {
2683         if (tx_path)
2684                 return skb->ip_summed != CHECKSUM_PARTIAL;
2685         else
2686                 return skb->ip_summed == CHECKSUM_NONE;
2687 }
2688
2689 /**
2690  *      __skb_gso_segment - Perform segmentation on skb.
2691  *      @skb: buffer to segment
2692  *      @features: features for the output path (see dev->features)
2693  *      @tx_path: whether it is called in TX path
2694  *
2695  *      This function segments the given skb and returns a list of segments.
2696  *
2697  *      It may return NULL if the skb requires no segmentation.  This is
2698  *      only possible when GSO is used for verifying header integrity.
2699  *
2700  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701  */
2702 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703                                   netdev_features_t features, bool tx_path)
2704 {
2705         if (unlikely(skb_needs_check(skb, tx_path))) {
2706                 int err;
2707
2708                 skb_warn_bad_offload(skb);
2709
2710                 err = skb_cow_head(skb, 0);
2711                 if (err < 0)
2712                         return ERR_PTR(err);
2713         }
2714
2715         /* Only report GSO partial support if it will enable us to
2716          * support segmentation on this frame without needing additional
2717          * work.
2718          */
2719         if (features & NETIF_F_GSO_PARTIAL) {
2720                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721                 struct net_device *dev = skb->dev;
2722
2723                 partial_features |= dev->features & dev->gso_partial_features;
2724                 if (!skb_gso_ok(skb, features | partial_features))
2725                         features &= ~NETIF_F_GSO_PARTIAL;
2726         }
2727
2728         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732         SKB_GSO_CB(skb)->encap_level = 0;
2733
2734         skb_reset_mac_header(skb);
2735         skb_reset_mac_len(skb);
2736
2737         return skb_mac_gso_segment(skb, features);
2738 }
2739 EXPORT_SYMBOL(__skb_gso_segment);
2740
2741 /* Take action when hardware reception checksum errors are detected. */
2742 #ifdef CONFIG_BUG
2743 void netdev_rx_csum_fault(struct net_device *dev)
2744 {
2745         if (net_ratelimit()) {
2746                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2747                 dump_stack();
2748         }
2749 }
2750 EXPORT_SYMBOL(netdev_rx_csum_fault);
2751 #endif
2752
2753 /* Actually, we should eliminate this check as soon as we know that:
2754  * 1. An IOMMU is present and allows us to map all the memory.
2755  * 2. No high memory really exists on this machine.
2756  */
2757
2758 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2759 {
2760 #ifdef CONFIG_HIGHMEM
2761         int i;
2762         if (!(dev->features & NETIF_F_HIGHDMA)) {
2763                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2764                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2765                         if (PageHighMem(skb_frag_page(frag)))
2766                                 return 1;
2767                 }
2768         }
2769
2770         if (PCI_DMA_BUS_IS_PHYS) {
2771                 struct device *pdev = dev->dev.parent;
2772
2773                 if (!pdev)
2774                         return 0;
2775                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2776                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2777                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2778                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2779                                 return 1;
2780                 }
2781         }
2782 #endif
2783         return 0;
2784 }
2785
2786 /* For MPLS offload requests, verify that we are testing the hardware MPLS
2787  * features instead of the standard features for the netdev.
2788  */
2789 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791                                            netdev_features_t features,
2792                                            __be16 type)
2793 {
2794         if (eth_p_mpls(type))
2795                 features &= skb->dev->mpls_features;
2796
2797         return features;
2798 }
2799 #else
2800 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801                                            netdev_features_t features,
2802                                            __be16 type)
2803 {
2804         return features;
2805 }
2806 #endif
2807
2808 static netdev_features_t harmonize_features(struct sk_buff *skb,
2809         netdev_features_t features)
2810 {
2811         int tmp;
2812         __be16 type;
2813
2814         type = skb_network_protocol(skb, &tmp);
2815         features = net_mpls_features(skb, features, type);
2816
2817         if (skb->ip_summed != CHECKSUM_NONE &&
2818             !can_checksum_protocol(features, type)) {
2819                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820         } else if (illegal_highdma(skb->dev, skb)) {
2821                 features &= ~NETIF_F_SG;
2822         }
2823
2824         return features;
2825 }
2826
2827 netdev_features_t passthru_features_check(struct sk_buff *skb,
2828                                           struct net_device *dev,
2829                                           netdev_features_t features)
2830 {
2831         return features;
2832 }
2833 EXPORT_SYMBOL(passthru_features_check);
2834
2835 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2836                                              struct net_device *dev,
2837                                              netdev_features_t features)
2838 {
2839         return vlan_features_check(skb, features);
2840 }
2841
2842 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843                                             struct net_device *dev,
2844                                             netdev_features_t features)
2845 {
2846         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848         if (gso_segs > dev->gso_max_segs)
2849                 return features & ~NETIF_F_GSO_MASK;
2850
2851         /* Support for GSO partial features requires software
2852          * intervention before we can actually process the packets,
2853          * so strip support for any partial features now; they can
2854          * be pulled back in after the frame has been partially
2855          * segmented.
2856          */
2857         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858                 features &= ~dev->gso_partial_features;
2859
2860         /* Make sure to clear the IPv4 ID mangling feature if the
2861          * IPv4 header has the potential to be fragmented.
2862          */
2863         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864                 struct iphdr *iph = skb->encapsulation ?
2865                                     inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867                 if (!(iph->frag_off & htons(IP_DF)))
2868                         features &= ~NETIF_F_TSO_MANGLEID;
2869         }
2870
2871         return features;
2872 }
2873
2874 netdev_features_t netif_skb_features(struct sk_buff *skb)
2875 {
2876         struct net_device *dev = skb->dev;
2877         netdev_features_t features = dev->features;
2878
2879         if (skb_is_gso(skb))
2880                 features = gso_features_check(skb, dev, features);
2881
2882         /* For encapsulation offload requests, verify that we are testing
2883          * the hardware encapsulation features instead of the standard
2884          * features for the netdev.
2885          */
2886         if (skb->encapsulation)
2887                 features &= dev->hw_enc_features;
2888
2889         if (skb_vlan_tagged(skb))
2890                 features = netdev_intersect_features(features,
2891                                                      dev->vlan_features |
2892                                                      NETIF_F_HW_VLAN_CTAG_TX |
2893                                                      NETIF_F_HW_VLAN_STAG_TX);
2894
2895         if (dev->netdev_ops->ndo_features_check)
2896                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897                                                                 features);
2898         else
2899                 features &= dflt_features_check(skb, dev, features);
2900
2901         return harmonize_features(skb, features);
2902 }
2903 EXPORT_SYMBOL(netif_skb_features);
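/*
 * Editor's note: a minimal sketch (not part of dev.c) of an
 * ndo_features_check() implementation as consumed above; the restriction on
 * encapsulated frames is a hypothetical hardware limit.
 */
static netdev_features_t __maybe_unused
example_features_check(struct sk_buff *skb, struct net_device *dev,
		       netdev_features_t features)
{
	/* hypothetical limit: no checksum offload for encapsulated frames */
	if (skb->encapsulation)
		features &= ~NETIF_F_CSUM_MASK;

	return vlan_features_check(skb, features);
}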
2904
2905 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906                     struct netdev_queue *txq, bool more)
2907 {
2908         unsigned int len;
2909         int rc;
2910
2911         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912                 dev_queue_xmit_nit(skb, dev);
2913
2914         len = skb->len;
2915         trace_net_dev_start_xmit(skb, dev);
2916         rc = netdev_start_xmit(skb, dev, txq, more);
2917         trace_net_dev_xmit(skb, rc, dev, len);
2918
2919         return rc;
2920 }
2921
2922 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923                                     struct netdev_queue *txq, int *ret)
2924 {
2925         struct sk_buff *skb = first;
2926         int rc = NETDEV_TX_OK;
2927
2928         while (skb) {
2929                 struct sk_buff *next = skb->next;
2930
2931                 skb->next = NULL;
2932                 rc = xmit_one(skb, dev, txq, next != NULL);
2933                 if (unlikely(!dev_xmit_complete(rc))) {
2934                         skb->next = next;
2935                         goto out;
2936                 }
2937
2938                 skb = next;
2939                 if (netif_xmit_stopped(txq) && skb) {
2940                         rc = NETDEV_TX_BUSY;
2941                         break;
2942                 }
2943         }
2944
2945 out:
2946         *ret = rc;
2947         return skb;
2948 }
2949
2950 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951                                           netdev_features_t features)
2952 {
2953         if (skb_vlan_tag_present(skb) &&
2954             !vlan_hw_offload_capable(features, skb->vlan_proto))
2955                 skb = __vlan_hwaccel_push_inside(skb);
2956         return skb;
2957 }
2958
2959 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2960 {
2961         netdev_features_t features;
2962
2963         features = netif_skb_features(skb);
2964         skb = validate_xmit_vlan(skb, features);
2965         if (unlikely(!skb))
2966                 goto out_null;
2967
2968         if (netif_needs_gso(skb, features)) {
2969                 struct sk_buff *segs;
2970
2971                 segs = skb_gso_segment(skb, features);
2972                 if (IS_ERR(segs)) {
2973                         goto out_kfree_skb;
2974                 } else if (segs) {
2975                         consume_skb(skb);
2976                         skb = segs;
2977                 }
2978         } else {
2979                 if (skb_needs_linearize(skb, features) &&
2980                     __skb_linearize(skb))
2981                         goto out_kfree_skb;
2982
2983                 /* If packet is not checksummed and device does not
2984                  * support checksumming for this protocol, complete
2985                  * checksumming here.
2986                  */
2987                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2988                         if (skb->encapsulation)
2989                                 skb_set_inner_transport_header(skb,
2990                                                                skb_checksum_start_offset(skb));
2991                         else
2992                                 skb_set_transport_header(skb,
2993                                                          skb_checksum_start_offset(skb));
2994                         if (!(features & NETIF_F_CSUM_MASK) &&
2995                             skb_checksum_help(skb))
2996                                 goto out_kfree_skb;
2997                 }
2998         }
2999
3000         return skb;
3001
3002 out_kfree_skb:
3003         kfree_skb(skb);
3004 out_null:
3005         atomic_long_inc(&dev->tx_dropped);
3006         return NULL;
3007 }
3008
3009 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010 {
3011         struct sk_buff *next, *head = NULL, *tail;
3012
3013         for (; skb != NULL; skb = next) {
3014                 next = skb->next;
3015                 skb->next = NULL;
3016
3017                 /* in case skb won't be segmented, point to itself */
3018                 skb->prev = skb;
3019
3020                 skb = validate_xmit_skb(skb, dev);
3021                 if (!skb)
3022                         continue;
3023
3024                 if (!head)
3025                         head = skb;
3026                 else
3027                         tail->next = skb;
3028                 /* If skb was segmented, skb->prev points to
3029                  * the last segment. If not, it still contains skb.
3030                  */
3031                 tail = skb->prev;
3032         }
3033         return head;
3034 }
3035
3036 static void qdisc_pkt_len_init(struct sk_buff *skb)
3037 {
3038         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3039
3040         qdisc_skb_cb(skb)->pkt_len = skb->len;
3041
3042         /* To get a more precise estimate of the bytes sent on the wire,
3043          * we add the header size of every segment to pkt_len.
3044          */
3045         if (shinfo->gso_size)  {
3046                 unsigned int hdr_len;
3047                 u16 gso_segs = shinfo->gso_segs;
3048
3049                 /* mac layer + network layer */
3050                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3051
3052                 /* + transport layer */
3053                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3054                         hdr_len += tcp_hdrlen(skb);
3055                 else
3056                         hdr_len += sizeof(struct udphdr);
3057
3058                 if (shinfo->gso_type & SKB_GSO_DODGY)
3059                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3060                                                 shinfo->gso_size);
3061
3062                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3063         }
3064 }
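/*
 * Editor's note: a worked example of the adjustment above, with assumed
 * numbers. A TCPv4 GSO skb with skb->len = 4410, gso_size = 1448 (so
 * gso_segs = 3) and an Ethernet + IPv4 + TCP header of hdr_len = 66 starts
 * with pkt_len = 4410 and ends with pkt_len = 4410 + (3 - 1) * 66 = 4542,
 * i.e. the headers of the two extra segments that will appear on the wire
 * are accounted for as well.
 */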
3065
3066 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3067                                  struct net_device *dev,
3068                                  struct netdev_queue *txq)
3069 {
3070         spinlock_t *root_lock = qdisc_lock(q);
3071         bool contended;
3072         int rc;
3073
3074         qdisc_calculate_pkt_len(skb, q);
3075         /*
3076          * Heuristic to force contended enqueues to serialize on a
3077          * separate lock before trying to get the qdisc main lock.
3078          * This permits the qdisc->running owner to get the lock more
3079          * often and dequeue packets faster.
3080          */
3081         contended = qdisc_is_running(q);
3082         if (unlikely(contended))
3083                 spin_lock(&q->busylock);
3084
3085         spin_lock(root_lock);
3086         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3087                 kfree_skb(skb);
3088                 rc = NET_XMIT_DROP;
3089         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3090                    qdisc_run_begin(q)) {
3091                 /*
3092                  * This is a work-conserving queue; there are no old skbs
3093                  * waiting to be sent out; and the qdisc is not running -
3094                  * xmit the skb directly.
3095                  */
3096
3097                 qdisc_bstats_update(q, skb);
3098
3099                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3100                         if (unlikely(contended)) {
3101                                 spin_unlock(&q->busylock);
3102                                 contended = false;
3103                         }
3104                         __qdisc_run(q);
3105                 } else
3106                         qdisc_run_end(q);
3107
3108                 rc = NET_XMIT_SUCCESS;
3109         } else {
3110                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3111                 if (qdisc_run_begin(q)) {
3112                         if (unlikely(contended)) {
3113                                 spin_unlock(&q->busylock);
3114                                 contended = false;
3115                         }
3116                         __qdisc_run(q);
3117                 }
3118         }
3119         spin_unlock(root_lock);
3120         if (unlikely(contended))
3121                 spin_unlock(&q->busylock);
3122         return rc;
3123 }
3124
3125 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3126 static void skb_update_prio(struct sk_buff *skb)
3127 {
3128         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3129
3130         if (!skb->priority && skb->sk && map) {
3131                 unsigned int prioidx =
3132                         sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3133
3134                 if (prioidx < map->priomap_len)
3135                         skb->priority = map->priomap[prioidx];
3136         }
3137 }
3138 #else
3139 #define skb_update_prio(skb)
3140 #endif
3141
3142 DEFINE_PER_CPU(int, xmit_recursion);
3143 EXPORT_SYMBOL(xmit_recursion);
3144
3145 #define RECURSION_LIMIT 10
3146
3147 /**
3148  *      dev_loopback_xmit - loop back @skb
3149  *      @net: network namespace this loopback is happening in
3150  *      @sk:  socket; needed so this function can serve as a netfilter okfn
3151  *      @skb: buffer to transmit
3152  */
3153 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3154 {
3155         skb_reset_mac_header(skb);
3156         __skb_pull(skb, skb_network_offset(skb));
3157         skb->pkt_type = PACKET_LOOPBACK;
3158         skb->ip_summed = CHECKSUM_UNNECESSARY;
3159         WARN_ON(!skb_dst(skb));
3160         skb_dst_force(skb);
3161         netif_rx_ni(skb);
3162         return 0;
3163 }
3164 EXPORT_SYMBOL(dev_loopback_xmit);
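/* A minimal usage sketch, not taken from this file: dev_loopback_xmit()
 * matches the netfilter okfn signature, so a caller that wants a local
 * copy of an outgoing packet can hand a clone of it (with a valid dst,
 * as the WARN_ON above requires) to the POST_ROUTING hook, e.g.:
 *
 *      NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb,
 *              NULL, newskb->dev, dev_loopback_xmit);
 *
 * where newskb is assumed to be an skb_clone() of the frame being sent.
 */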
3165
3166 #ifdef CONFIG_NET_EGRESS
3167 static struct sk_buff *
3168 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3169 {
3170         struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3171         struct tcf_result cl_res;
3172
3173         if (!cl)
3174                 return skb;
3175
3176         /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3177          * earlier by the caller.
3178          */
3179         qdisc_bstats_cpu_update(cl->q, skb);
3180
3181         switch (tc_classify(skb, cl, &cl_res, false)) {
3182         case TC_ACT_OK:
3183         case TC_ACT_RECLASSIFY:
3184                 skb->tc_index = TC_H_MIN(cl_res.classid);
3185                 break;
3186         case TC_ACT_SHOT:
3187                 qdisc_qstats_cpu_drop(cl->q);
3188                 *ret = NET_XMIT_DROP;
3189                 kfree_skb(skb);
3190                 return NULL;
3191         case TC_ACT_STOLEN:
3192         case TC_ACT_QUEUED:
3193                 *ret = NET_XMIT_SUCCESS;
3194                 consume_skb(skb);
3195                 return NULL;
3196         case TC_ACT_REDIRECT:
3197                 /* No need to push/pop skb's mac_header here on egress! */
3198                 skb_do_redirect(skb);
3199                 *ret = NET_XMIT_SUCCESS;
3200                 return NULL;
3201         default:
3202                 break;
3203         }
3204
3205         return skb;
3206 }
3207 #endif /* CONFIG_NET_EGRESS */
3208
3209 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3210 {
3211 #ifdef CONFIG_XPS
3212         struct xps_dev_maps *dev_maps;
3213         struct xps_map *map;
3214         int queue_index = -1;
3215
3216         rcu_read_lock();
3217         dev_maps = rcu_dereference(dev->xps_maps);
3218         if (dev_maps) {
3219                 map = rcu_dereference(
3220                     dev_maps->cpu_map[skb->sender_cpu - 1]);
3221                 if (map) {
3222                         if (map->len == 1)
3223                                 queue_index = map->queues[0];
3224                         else
3225                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3226                                                                            map->len)];
3227                         if (unlikely(queue_index >= dev->real_num_tx_queues))
3228                                 queue_index = -1;
3229                 }
3230         }
3231         rcu_read_unlock();
3232
3233         return queue_index;
3234 #else
3235         return -1;
3236 #endif
3237 }
3238
3239 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3240 {
3241         struct sock *sk = skb->sk;
3242         int queue_index = sk_tx_queue_get(sk);
3243
3244         if (queue_index < 0 || skb->ooo_okay ||
3245             queue_index >= dev->real_num_tx_queues) {
3246                 int new_index = get_xps_queue(dev, skb);
3247                 if (new_index < 0)
3248                         new_index = skb_tx_hash(dev, skb);
3249
3250                 if (queue_index != new_index && sk &&
3251                     sk_fullsock(sk) &&
3252                     rcu_access_pointer(sk->sk_dst_cache))
3253                         sk_tx_queue_set(sk, new_index);
3254
3255                 queue_index = new_index;
3256         }
3257
3258         return queue_index;
3259 }
3260
3261 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3262                                     struct sk_buff *skb,
3263                                     void *accel_priv)
3264 {
3265         int queue_index = 0;
3266
3267 #ifdef CONFIG_XPS
3268         u32 sender_cpu = skb->sender_cpu - 1;
3269
3270         if (sender_cpu >= (u32)NR_CPUS)
3271                 skb->sender_cpu = raw_smp_processor_id() + 1;
3272 #endif
3273
3274         if (dev->real_num_tx_queues != 1) {
3275                 const struct net_device_ops *ops = dev->netdev_ops;
3276                 if (ops->ndo_select_queue)
3277                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3278                                                             __netdev_pick_tx);
3279                 else
3280                         queue_index = __netdev_pick_tx(dev, skb);
3281
3282                 if (!accel_priv)
3283                         queue_index = netdev_cap_txqueue(dev, queue_index);
3284         }
3285
3286         skb_set_queue_mapping(skb, queue_index);
3287         return netdev_get_tx_queue(dev, queue_index);
3288 }
3289
3290 /**
3291  *      __dev_queue_xmit - transmit a buffer
3292  *      @skb: buffer to transmit
3293  *      @accel_priv: private data used for L2 forwarding offload
3294  *
3295  *      Queue a buffer for transmission to a network device. The caller must
3296  *      have set the device and priority and built the buffer before calling
3297  *      this function. The function can be called from an interrupt.
3298  *
3299  *      A negative errno code is returned on a failure. A success does not
3300  *      guarantee the frame will be transmitted as it may be dropped due
3301  *      to congestion or traffic shaping.
3302  *
3303  * -----------------------------------------------------------------------------------
3304  *      I notice this method can also return errors from the queue disciplines,
3305  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3306  *      be positive.
3307  *
3308  *      Regardless of the return value, the skb is consumed, so it is currently
3309  *      difficult to retry a send to this method.  (You can bump the ref count
3310  *      before sending to hold a reference for retry if you are careful.)
3311  *
3312  *      When calling this method, interrupts MUST be enabled.  This is because
3313  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3314  *          --BLG
3315  */
3316 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3317 {
3318         struct net_device *dev = skb->dev;
3319         struct netdev_queue *txq;
3320         struct Qdisc *q;
3321         int rc = -ENOMEM;
3322
3323         skb_reset_mac_header(skb);
3324
3325         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3326                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3327
3328         /* Disable soft irqs for various locks below. Also
3329          * stops preemption for RCU.
3330          */
3331         rcu_read_lock_bh();
3332
3333         skb_update_prio(skb);
3334
3335         qdisc_pkt_len_init(skb);
3336 #ifdef CONFIG_NET_CLS_ACT
3337         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3338 # ifdef CONFIG_NET_EGRESS
3339         if (static_key_false(&egress_needed)) {
3340                 skb = sch_handle_egress(skb, &rc, dev);
3341                 if (!skb)
3342                         goto out;
3343         }
3344 # endif
3345 #endif
3346         /* If device/qdisc don't need skb->dst, release it right now while
3347          * it's hot in this CPU's cache.
3348          */
3349         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3350                 skb_dst_drop(skb);
3351         else
3352                 skb_dst_force(skb);
3353
3354 #ifdef CONFIG_NET_SWITCHDEV
3355         /* Don't forward if offload device already forwarded */
3356         if (skb->offload_fwd_mark &&
3357             skb->offload_fwd_mark == dev->offload_fwd_mark) {
3358                 consume_skb(skb);
3359                 rc = NET_XMIT_SUCCESS;
3360                 goto out;
3361         }
3362 #endif
3363
3364         txq = netdev_pick_tx(dev, skb, accel_priv);
3365         q = rcu_dereference_bh(txq->qdisc);
3366
3367         trace_net_dev_queue(skb);
3368         if (q->enqueue) {
3369                 rc = __dev_xmit_skb(skb, q, dev, txq);
3370                 goto out;
3371         }
3372
3373         /* The device has no queue. Common case for software devices:
3374            loopback, all sorts of tunnels...
3375
3376            Really, it is unlikely that netif_tx_lock protection is necessary
3377            here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
3378            counters.)
3379            However, it is possible that they rely on the protection
3380            made by us here.
3381
3382            Check this and take the lock. It is not prone to deadlocks.
3383            Or just use the noqueue qdisc, it is even simpler 8)
3384          */
3385         if (dev->flags & IFF_UP) {
3386                 int cpu = smp_processor_id(); /* ok because BHs are off */
3387
3388                 if (txq->xmit_lock_owner != cpu) {
3389
3390                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3391                                 goto recursion_alert;
3392
3393                         skb = validate_xmit_skb(skb, dev);
3394                         if (!skb)
3395                                 goto out;
3396
3397                         HARD_TX_LOCK(dev, txq, cpu);
3398
3399                         if (!netif_xmit_stopped(txq)) {
3400                                 __this_cpu_inc(xmit_recursion);
3401                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3402                                 __this_cpu_dec(xmit_recursion);
3403                                 if (dev_xmit_complete(rc)) {
3404                                         HARD_TX_UNLOCK(dev, txq);
3405                                         goto out;
3406                                 }
3407                         }
3408                         HARD_TX_UNLOCK(dev, txq);
3409                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3410                                              dev->name);
3411                 } else {
3412                         /* Recursion is detected! It is possible,
3413                          * unfortunately
3414                          */
3415 recursion_alert:
3416                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3417                                              dev->name);
3418                 }
3419         }
3420
3421         rc = -ENETDOWN;
3422         rcu_read_unlock_bh();
3423
3424         atomic_long_inc(&dev->tx_dropped);
3425         kfree_skb_list(skb);
3426         return rc;
3427 out:
3428         rcu_read_unlock_bh();
3429         return rc;
3430 }
3431
3432 int dev_queue_xmit(struct sk_buff *skb)
3433 {
3434         return __dev_queue_xmit(skb, NULL);
3435 }
3436 EXPORT_SYMBOL(dev_queue_xmit);
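/* A minimal transmit sketch, assuming the caller has already built the
 * frame and chosen the device (variable names are illustrative only):
 *
 *      skb->dev = dev;
 *      skb->priority = TC_PRIO_CONTROL;
 *      ret = dev_queue_xmit(skb);
 *
 * The skb is consumed on both success and failure, and ret may be a
 * positive NET_XMIT_* code as well as a negative errno, so callers must
 * not assume plain 0/-errno semantics (see the comment above
 * __dev_queue_xmit()).
 */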
3437
3438 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3439 {
3440         return __dev_queue_xmit(skb, accel_priv);
3441 }
3442 EXPORT_SYMBOL(dev_queue_xmit_accel);
3443
3444
3445 /*=======================================================================
3446                         Receiver routines
3447   =======================================================================*/
3448
3449 int netdev_max_backlog __read_mostly = 1000;
3450 EXPORT_SYMBOL(netdev_max_backlog);
3451
3452 int netdev_tstamp_prequeue __read_mostly = 1;
3453 int netdev_budget __read_mostly = 300;
3454 int weight_p __read_mostly = 64;            /* old backlog weight */
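/* netdev_max_backlog, netdev_budget and weight_p above are tunable at run
 * time through the net.core sysctls netdev_max_backlog, netdev_budget and
 * dev_weight (see net/core/sysctl_net_core.c).
 */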
3455
3456 /* Called with irq disabled */
3457 static inline void ____napi_schedule(struct softnet_data *sd,
3458                                      struct napi_struct *napi)
3459 {
3460         list_add_tail(&napi->poll_list, &sd->poll_list);
3461         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3462 }
3463
3464 #ifdef CONFIG_RPS
3465
3466 /* One global table that all flow-based protocols share. */
3467 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3468 EXPORT_SYMBOL(rps_sock_flow_table);
3469 u32 rps_cpu_mask __read_mostly;
3470 EXPORT_SYMBOL(rps_cpu_mask);
3471
3472 struct static_key rps_needed __read_mostly;
3473 EXPORT_SYMBOL(rps_needed);
3474
3475 static struct rps_dev_flow *
3476 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477             struct rps_dev_flow *rflow, u16 next_cpu)
3478 {
3479         if (next_cpu < nr_cpu_ids) {
3480 #ifdef CONFIG_RFS_ACCEL
3481                 struct netdev_rx_queue *rxqueue;
3482                 struct rps_dev_flow_table *flow_table;
3483                 struct rps_dev_flow *old_rflow;
3484                 u32 flow_id;
3485                 u16 rxq_index;
3486                 int rc;
3487
3488                 /* Should we steer this flow to a different hardware queue? */
3489                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490                     !(dev->features & NETIF_F_NTUPLE))
3491                         goto out;
3492                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3493                 if (rxq_index == skb_get_rx_queue(skb))
3494                         goto out;
3495
3496                 rxqueue = dev->_rx + rxq_index;
3497                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498                 if (!flow_table)
3499                         goto out;
3500                 flow_id = skb_get_hash(skb) & flow_table->mask;
3501                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502                                                         rxq_index, flow_id);
3503                 if (rc < 0)
3504                         goto out;
3505                 old_rflow = rflow;
3506                 rflow = &flow_table->flows[flow_id];
3507                 rflow->filter = rc;
3508                 if (old_rflow->filter == rflow->filter)
3509                         old_rflow->filter = RPS_NO_FILTER;
3510         out:
3511 #endif
3512                 rflow->last_qtail =
3513                         per_cpu(softnet_data, next_cpu).input_queue_head;
3514         }
3515
3516         rflow->cpu = next_cpu;
3517         return rflow;
3518 }
3519
3520 /*
3521  * get_rps_cpu is called from netif_receive_skb and returns the target
3522  * CPU from the RPS map of the receiving queue for a given skb.
3523  * rcu_read_lock must be held on entry.
3524  */
3525 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526                        struct rps_dev_flow **rflowp)
3527 {
3528         const struct rps_sock_flow_table *sock_flow_table;
3529         struct netdev_rx_queue *rxqueue = dev->_rx;
3530         struct rps_dev_flow_table *flow_table;
3531         struct rps_map *map;
3532         int cpu = -1;
3533         u32 tcpu;
3534         u32 hash;
3535
3536         if (skb_rx_queue_recorded(skb)) {
3537                 u16 index = skb_get_rx_queue(skb);
3538
3539                 if (unlikely(index >= dev->real_num_rx_queues)) {
3540                         WARN_ONCE(dev->real_num_rx_queues > 1,
3541                                   "%s received packet on queue %u, but number "
3542                                   "of RX queues is %u\n",
3543                                   dev->name, index, dev->real_num_rx_queues);
3544                         goto done;
3545                 }
3546                 rxqueue += index;
3547         }
3548
3549         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552         map = rcu_dereference(rxqueue->rps_map);
3553         if (!flow_table && !map)
3554                 goto done;
3555
3556         skb_reset_network_header(skb);
3557         hash = skb_get_hash(skb);
3558         if (!hash)
3559                 goto done;
3560
3561         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562         if (flow_table && sock_flow_table) {
3563                 struct rps_dev_flow *rflow;
3564                 u32 next_cpu;
3565                 u32 ident;
3566
3567                 /* First check the global flow table for a match */
3568                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3569                 if ((ident ^ hash) & ~rps_cpu_mask)
3570                         goto try_rps;
3571
3572                 next_cpu = ident & rps_cpu_mask;
3573
3574                 /* OK, now we know there is a match,
3575                  * we can look at the local (per receive queue) flow table
3576                  */
3577                 rflow = &flow_table->flows[hash & flow_table->mask];
3578                 tcpu = rflow->cpu;
3579
3580                 /*
3581                  * If the desired CPU (where last recvmsg was done) is
3582                  * different from current CPU (one in the rx-queue flow
3583                  * table entry), switch if one of the following holds:
3584                  *   - Current CPU is unset (>= nr_cpu_ids).
3585                  *   - Current CPU is offline.
3586                  *   - The current CPU's queue tail has advanced beyond the
3587                  *     last packet that was enqueued using this table entry.
3588                  *     This guarantees that all previous packets for the flow
3589                  *     have been dequeued, thus preserving in order delivery.
3590                  */
3591                 if (unlikely(tcpu != next_cpu) &&
3592                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594                       rflow->last_qtail)) >= 0)) {
3595                         tcpu = next_cpu;
3596                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597                 }
3598
3599                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600                         *rflowp = rflow;
3601                         cpu = tcpu;
3602                         goto done;
3603                 }
3604         }
3605
3606 try_rps:
3607
3608         if (map) {
3609                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610                 if (cpu_online(tcpu)) {
3611                         cpu = tcpu;
3612                         goto done;
3613                 }
3614         }
3615
3616 done:
3617         return cpu;
3618 }
3619
3620 #ifdef CONFIG_RFS_ACCEL
3621
3622 /**
3623  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3624  * @dev: Device on which the filter was set
3625  * @rxq_index: RX queue index
3626  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3627  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3628  *
3629  * Drivers that implement ndo_rx_flow_steer() should periodically call
3630  * this function for each installed filter and remove the filters for
3631  * which it returns %true.
3632  */
3633 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3634                          u32 flow_id, u16 filter_id)
3635 {
3636         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3637         struct rps_dev_flow_table *flow_table;
3638         struct rps_dev_flow *rflow;
3639         bool expire = true;
3640         unsigned int cpu;
3641
3642         rcu_read_lock();
3643         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3644         if (flow_table && flow_id <= flow_table->mask) {
3645                 rflow = &flow_table->flows[flow_id];
3646                 cpu = ACCESS_ONCE(rflow->cpu);
3647                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3648                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3649                            rflow->last_qtail) <
3650                      (int)(10 * flow_table->mask)))
3651                         expire = false;
3652         }
3653         rcu_read_unlock();
3654         return expire;
3655 }
3656 EXPORT_SYMBOL(rps_may_expire_flow);
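/* A minimal driver-side sketch (the filters[] bookkeeping and
 * remove_hw_filter() are illustrative, not part of this API): a NIC driver
 * that installed filters through ndo_rx_flow_steer() polls them
 * periodically, e.g. from a work item:
 *
 *      for (i = 0; i < n_filters; i++) {
 *              if (rps_may_expire_flow(netdev, filters[i].rxq_index,
 *                                      filters[i].flow_id, i))
 *                      remove_hw_filter(adapter, i);
 *      }
 */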
3657
3658 #endif /* CONFIG_RFS_ACCEL */
3659
3660 /* Called from hardirq (IPI) context */
3661 static void rps_trigger_softirq(void *data)
3662 {
3663         struct softnet_data *sd = data;
3664
3665         ____napi_schedule(sd, &sd->backlog);
3666         sd->received_rps++;
3667 }
3668
3669 #endif /* CONFIG_RPS */
3670
3671 /*
3672  * Check if this softnet_data structure belongs to another CPU.
3673  * If yes, queue it to our IPI list and return 1;
3674  * if no, return 0.
3675  */
3676 static int rps_ipi_queued(struct softnet_data *sd)
3677 {
3678 #ifdef CONFIG_RPS
3679         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3680
3681         if (sd != mysd) {
3682                 sd->rps_ipi_next = mysd->rps_ipi_list;
3683                 mysd->rps_ipi_list = sd;
3684
3685                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3686                 return 1;
3687         }
3688 #endif /* CONFIG_RPS */
3689         return 0;
3690 }
3691
3692 #ifdef CONFIG_NET_FLOW_LIMIT
3693 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3694 #endif
3695
3696 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697 {
3698 #ifdef CONFIG_NET_FLOW_LIMIT
3699         struct sd_flow_limit *fl;
3700         struct softnet_data *sd;
3701         unsigned int old_flow, new_flow;
3702
3703         if (qlen < (netdev_max_backlog >> 1))
3704                 return false;
3705
3706         sd = this_cpu_ptr(&softnet_data);
3707
3708         rcu_read_lock();
3709         fl = rcu_dereference(sd->flow_limit);
3710         if (fl) {
3711                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712                 old_flow = fl->history[fl->history_head];
3713                 fl->history[fl->history_head] = new_flow;
3714
3715                 fl->history_head++;
3716                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
3718                 if (likely(fl->buckets[old_flow]))
3719                         fl->buckets[old_flow]--;
3720
3721                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722                         fl->count++;
3723                         rcu_read_unlock();
3724                         return true;
3725                 }
3726         }
3727         rcu_read_unlock();
3728 #endif
3729         return false;
3730 }
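/* The flow limit check above only kicks in once the backlog is more than
 * half full; it is enabled per CPU via the net.core.flow_limit_cpu_bitmap
 * sysctl, and the table size above is exposed as
 * net.core.flow_limit_table_len.
 */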
3731
3732 /*
3733  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3734  * queue (may be a remote CPU queue).
3735  */
3736 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737                               unsigned int *qtail)
3738 {
3739         struct softnet_data *sd;
3740         unsigned long flags;
3741         unsigned int qlen;
3742
3743         sd = &per_cpu(softnet_data, cpu);
3744
3745         local_irq_save(flags);
3746
3747         rps_lock(sd);
3748         if (!netif_running(skb->dev))
3749                 goto drop;
3750         qlen = skb_queue_len(&sd->input_pkt_queue);
3751         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3752                 if (qlen) {
3753 enqueue:
3754                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3755                         input_queue_tail_incr_save(sd, qtail);
3756                         rps_unlock(sd);
3757                         local_irq_restore(flags);
3758                         return NET_RX_SUCCESS;
3759                 }
3760
3761                 /* Schedule NAPI for the backlog device.
3762                  * We can use a non-atomic operation since we own the queue lock.
3763                  */
3764                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3765                         if (!rps_ipi_queued(sd))
3766                                 ____napi_schedule(sd, &sd->backlog);
3767                 }
3768                 goto enqueue;
3769         }
3770
3771 drop:
3772         sd->dropped++;
3773         rps_unlock(sd);
3774
3775         local_irq_restore(flags);
3776
3777         atomic_long_inc(&skb->dev->rx_dropped);
3778         kfree_skb(skb);
3779         return NET_RX_DROP;
3780 }
3781
3782 static int netif_rx_internal(struct sk_buff *skb)
3783 {
3784         int ret;
3785
3786         net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788         trace_netif_rx(skb);
3789 #ifdef CONFIG_RPS
3790         if (static_key_false(&rps_needed)) {
3791                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3792                 int cpu;
3793
3794                 preempt_disable();
3795                 rcu_read_lock();
3796
3797                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798                 if (cpu < 0)
3799                         cpu = smp_processor_id();
3800
3801                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803                 rcu_read_unlock();
3804                 preempt_enable();
3805         } else
3806 #endif
3807         {
3808                 unsigned int qtail;
3809                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810                 put_cpu();
3811         }
3812         return ret;
3813 }
3814
3815 /**
3816  *      netif_rx        -       post buffer to the network code
3817  *      @skb: buffer to post
3818  *
3819  *      This function receives a packet from a device driver and queues it for
3820  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3821  *      may be dropped during processing for congestion control or by the
3822  *      protocol layers.
3823  *
3824  *      return values:
3825  *      NET_RX_SUCCESS  (no congestion)
3826  *      NET_RX_DROP     (packet was dropped)
3827  *
3828  */
3829
3830 int netif_rx(struct sk_buff *skb)
3831 {
3832         trace_netif_rx_entry(skb);
3833
3834         return netif_rx_internal(skb);
3835 }
3836 EXPORT_SYMBOL(netif_rx);
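/* A minimal sketch of the classic non-NAPI receive path (rx_buf and
 * pkt_len are illustrative): a driver interrupt handler builds an skb and
 * hands it to the stack:
 *
 *      skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *      if (!skb)
 *              return;
 *      memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);
 *      skb->protocol = eth_type_trans(skb, dev);
 *      netif_rx(skb);
 *
 * Callers running in process context should use netif_rx_ni() below, so
 * that a raised NET_RX_SOFTIRQ gets a chance to run.
 */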
3837
3838 int netif_rx_ni(struct sk_buff *skb)
3839 {
3840         int err;
3841
3842         trace_netif_rx_ni_entry(skb);
3843
3844         preempt_disable();
3845         err = netif_rx_internal(skb);
3846         if (local_softirq_pending())
3847                 do_softirq();
3848         preempt_enable();
3849
3850         return err;
3851 }
3852 EXPORT_SYMBOL(netif_rx_ni);
3853
3854 static void net_tx_action(struct softirq_action *h)
3855 {
3856         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858         if (sd->completion_queue) {
3859                 struct sk_buff *clist;
3860
3861                 local_irq_disable();
3862                 clist = sd->completion_queue;
3863                 sd->completion_queue = NULL;
3864                 local_irq_enable();
3865
3866                 while (clist) {
3867                         struct sk_buff *skb = clist;
3868                         clist = clist->next;
3869
3870                         WARN_ON(atomic_read(&skb->users));
3871                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872                                 trace_consume_skb(skb);
3873                         else
3874                                 trace_kfree_skb(skb, net_tx_action);
3875
3876                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877                                 __kfree_skb(skb);
3878                         else
3879                                 __kfree_skb_defer(skb);
3880                 }
3881
3882                 __kfree_skb_flush();
3883         }
3884
3885         if (sd->output_queue) {
3886                 struct Qdisc *head;
3887
3888                 local_irq_disable();
3889                 head = sd->output_queue;
3890                 sd->output_queue = NULL;
3891                 sd->output_queue_tailp = &sd->output_queue;
3892                 local_irq_enable();
3893
3894                 while (head) {
3895                         struct Qdisc *q = head;
3896                         spinlock_t *root_lock;
3897
3898                         head = head->next_sched;
3899
3900                         root_lock = qdisc_lock(q);
3901                         spin_lock(root_lock);
3902                         /* We need to make sure head->next_sched is read
3903                          * before clearing __QDISC_STATE_SCHED
3904                          */
3905                         smp_mb__before_atomic();
3906                         clear_bit(__QDISC_STATE_SCHED, &q->state);
3907                         qdisc_run(q);
3908                         spin_unlock(root_lock);
3909                 }
3910         }
3911 }
3912
3913 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3914     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3915 /* This hook is defined here for ATM LANE */
3916 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3917                              unsigned char *addr) __read_mostly;
3918 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3919 #endif
3920
3921 static inline struct sk_buff *
3922 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3923                    struct net_device *orig_dev)
3924 {
3925 #ifdef CONFIG_NET_CLS_ACT
3926         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3927         struct tcf_result cl_res;
3928
3929         /* If there's at least one ingress present somewhere (so
3930          * we get here via enabled static key), remaining devices
3931          * that are not configured with an ingress qdisc will bail
3932          * out here.
3933          */
3934         if (!cl)
3935                 return skb;
3936         if (*pt_prev) {
3937                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3938                 *pt_prev = NULL;
3939         }
3940
3941         qdisc_skb_cb(skb)->pkt_len = skb->len;
3942         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3943         qdisc_bstats_cpu_update(cl->q, skb);
3944
3945         switch (tc_classify(skb, cl, &cl_res, false)) {
3946         case TC_ACT_OK:
3947         case TC_ACT_RECLASSIFY:
3948                 skb->tc_index = TC_H_MIN(cl_res.classid);
3949                 break;
3950         case TC_ACT_SHOT:
3951                 qdisc_qstats_cpu_drop(cl->q);
3952                 kfree_skb(skb);
3953                 return NULL;
3954         case TC_ACT_STOLEN:
3955         case TC_ACT_QUEUED:
3956                 consume_skb(skb);
3957                 return NULL;
3958         case TC_ACT_REDIRECT:
3959                 /* skb_mac_header check was done by cls/act_bpf, so
3960                  * we can safely push the L2 header back before
3961                  * redirecting to another netdev
3962                  */
3963                 __skb_push(skb, skb->mac_len);
3964                 skb_do_redirect(skb);
3965                 return NULL;
3966         default:
3967                 break;
3968         }
3969 #endif /* CONFIG_NET_CLS_ACT */
3970         return skb;
3971 }
3972
3973 /**
3974  *      netdev_rx_handler_register - register receive handler
3975  *      @dev: device to register a handler for
3976  *      @rx_handler: receive handler to register
3977  *      @rx_handler_data: data pointer that is used by rx handler
3978  *
3979  *      Register a receive handler for a device. This handler will then be
3980  *      called from __netif_receive_skb. A negative errno code is returned
3981  *      on a failure.
3982  *
3983  *      The caller must hold the rtnl_mutex.
3984  *
3985  *      For a general description of rx_handler, see enum rx_handler_result.
3986  */
3987 int netdev_rx_handler_register(struct net_device *dev,
3988                                rx_handler_func_t *rx_handler,
3989                                void *rx_handler_data)
3990 {
3991         ASSERT_RTNL();
3992
3993         if (dev->rx_handler)
3994                 return -EBUSY;
3995
3996         /* Note: rx_handler_data must be set before rx_handler */
3997         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3998         rcu_assign_pointer(dev->rx_handler, rx_handler);
3999
4000         return 0;
4001 }
4002 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
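/* A minimal registration sketch (my_handler, my_port and my_port_rx_count
 * are illustrative, not part of this API): an upper device such as a
 * bridge or bonding master attaches to a lower device under RTNL:
 *
 *      static rx_handler_result_t my_handler(struct sk_buff **pskb)
 *      {
 *              struct sk_buff *skb = *pskb;
 *              struct my_port *port =
 *                      rcu_dereference(skb->dev->rx_handler_data);
 *
 *              my_port_rx_count(port, skb->len);
 *              return RX_HANDLER_PASS;
 *      }
 *
 *      rtnl_lock();
 *      err = netdev_rx_handler_register(lower_dev, my_handler, my_port);
 *      rtnl_unlock();
 */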
4003
4004 /**
4005  *      netdev_rx_handler_unregister - unregister receive handler
4006  *      @dev: device to unregister a handler from
4007  *
4008  *      Unregister a receive handler from a device.
4009  *
4010  *      The caller must hold the rtnl_mutex.
4011  */
4012 void netdev_rx_handler_unregister(struct net_device *dev)
4013 {
4014
4015         ASSERT_RTNL();
4016         RCU_INIT_POINTER(dev->rx_handler, NULL);
4017         /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4018          * section is guaranteed to see a non-NULL rx_handler_data
4019          * as well.
4020          */
4021         synchronize_net();
4022         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4023 }
4024 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4025
4026 /*
4027  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4028  * the special handling of PFMEMALLOC skbs.
4029  */
4030 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4031 {
4032         switch (skb->protocol) {
4033         case htons(ETH_P_ARP):
4034         case htons(ETH_P_IP):
4035         case htons(ETH_P_IPV6):
4036         case htons(ETH_P_8021Q):
4037         case htons(ETH_P_8021AD):
4038                 return true;
4039         default:
4040                 return false;
4041         }
4042 }
4043
4044 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4045                              int *ret, struct net_device *orig_dev)
4046 {
4047 #ifdef CONFIG_NETFILTER_INGRESS
4048         if (nf_hook_ingress_active(skb)) {
4049                 if (*pt_prev) {
4050                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
4051                         *pt_prev = NULL;
4052                 }
4053
4054                 return nf_hook_ingress(skb);
4055         }
4056 #endif /* CONFIG_NETFILTER_INGRESS */
4057         return 0;
4058 }
4059
4060 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4061 {
4062         struct packet_type *ptype, *pt_prev;
4063         rx_handler_func_t *rx_handler;
4064         struct net_device *orig_dev;
4065         bool deliver_exact = false;
4066         int ret = NET_RX_DROP;
4067         __be16 type;
4068
4069         net_timestamp_check(!netdev_tstamp_prequeue, skb);
4070
4071         trace_netif_receive_skb(skb);
4072
4073         orig_dev = skb->dev;
4074
4075         skb_reset_network_header(skb);
4076         if (!skb_transport_header_was_set(skb))
4077                 skb_reset_transport_header(skb);
4078         skb_reset_mac_len(skb);
4079
4080         pt_prev = NULL;
4081
4082 another_round:
4083         skb->skb_iif = skb->dev->ifindex;
4084
4085         __this_cpu_inc(softnet_data.processed);
4086
4087         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4088             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4089                 skb = skb_vlan_untag(skb);
4090                 if (unlikely(!skb))
4091                         goto out;
4092         }
4093
4094 #ifdef CONFIG_NET_CLS_ACT
4095         if (skb->tc_verd & TC_NCLS) {
4096                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4097                 goto ncls;
4098         }
4099 #endif
4100
4101         if (pfmemalloc)
4102                 goto skip_taps;
4103
4104         list_for_each_entry_rcu(ptype, &ptype_all, list) {
4105                 if (pt_prev)
4106                         ret = deliver_skb(skb, pt_prev, orig_dev);
4107                 pt_prev = ptype;
4108         }
4109
4110         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4111                 if (pt_prev)
4112                         ret = deliver_skb(skb, pt_prev, orig_dev);
4113                 pt_prev = ptype;
4114         }
4115
4116 skip_taps:
4117 #ifdef CONFIG_NET_INGRESS
4118         if (static_key_false(&ingress_needed)) {
4119                 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4120                 if (!skb)
4121                         goto out;
4122
4123                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4124                         goto out;
4125         }
4126 #endif
4127 #ifdef CONFIG_NET_CLS_ACT
4128         skb->tc_verd = 0;
4129 ncls:
4130 #endif
4131         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4132                 goto drop;
4133
4134         if (skb_vlan_tag_present(skb)) {
4135                 if (pt_prev) {
4136                         ret = deliver_skb(skb, pt_prev, orig_dev);
4137                         pt_prev = NULL;
4138                 }
4139                 if (vlan_do_receive(&skb))
4140                         goto another_round;
4141                 else if (unlikely(!skb))
4142                         goto out;
4143         }
4144
4145         rx_handler = rcu_dereference(skb->dev->rx_handler);
4146         if (rx_handler) {
4147                 if (pt_prev) {
4148                         ret = deliver_skb(skb, pt_prev, orig_dev);
4149                         pt_prev = NULL;
4150                 }
4151                 switch (rx_handler(&skb)) {
4152                 case RX_HANDLER_CONSUMED:
4153                         ret = NET_RX_SUCCESS;
4154                         goto out;
4155                 case RX_HANDLER_ANOTHER:
4156                         goto another_round;
4157                 case RX_HANDLER_EXACT:
4158                         deliver_exact = true;
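                        /* fall through */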
4159                 case RX_HANDLER_PASS:
4160                         break;
4161                 default:
4162                         BUG();
4163                 }
4164         }
4165
4166         if (unlikely(skb_vlan_tag_present(skb))) {
4167                 if (skb_vlan_tag_get_id(skb))
4168                         skb->pkt_type = PACKET_OTHERHOST;
4169                 /* Note: we might in the future use prio bits
4170                  * and set skb->priority like in vlan_do_receive().
4171                  * For the time being, just ignore the Priority Code Point.
4172                  */
4173                 skb->vlan_tci = 0;
4174         }
4175
4176         type = skb->protocol;
4177
4178         /* deliver only exact match when indicated */
4179         if (likely(!deliver_exact)) {
4180                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4181                                        &ptype_base[ntohs(type) &
4182                                                    PTYPE_HASH_MASK]);
4183         }
4184
4185         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4186                                &orig_dev->ptype_specific);
4187
4188         if (unlikely(skb->dev != orig_dev)) {
4189                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4190                                        &skb->dev->ptype_specific);
4191         }
4192
4193         if (pt_prev) {
4194                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4195                         goto drop;
4196                 else
4197                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4198         } else {
4199 drop:
4200                 if (!deliver_exact)
4201                         atomic_long_inc(&skb->dev->rx_dropped);
4202                 else
4203                         atomic_long_inc(&skb->dev->rx_nohandler);
4204                 kfree_skb(skb);
4205                 /* Jamal, now you will not be able to escape explaining
4206                  * to me how you were going to use this. :-)
4207                  */
4208                 ret = NET_RX_DROP;
4209         }
4210
4211 out:
4212         return ret;
4213 }
4214
4215 static int __netif_receive_skb(struct sk_buff *skb)
4216 {
4217         int ret;
4218
4219         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4220                 unsigned long pflags = current->flags;
4221
4222                 /*
4223                  * PFMEMALLOC skbs are special, they should
4224                  * - be delivered to SOCK_MEMALLOC sockets only
4225                  * - stay away from userspace
4226                  * - have bounded memory usage
4227                  *
4228                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4229                  * context down to all allocation sites.
4230                  */
4231                 current->flags |= PF_MEMALLOC;
4232                 ret = __netif_receive_skb_core(skb, true);
4233                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4234         } else
4235                 ret = __netif_receive_skb_core(skb, false);
4236
4237         return ret;
4238 }
4239
4240 static int netif_receive_skb_internal(struct sk_buff *skb)
4241 {
4242         int ret;
4243
4244         net_timestamp_check(netdev_tstamp_prequeue, skb);
4245
4246         if (skb_defer_rx_timestamp(skb))
4247                 return NET_RX_SUCCESS;
4248
4249         rcu_read_lock();
4250
4251 #ifdef CONFIG_RPS
4252         if (static_key_false(&rps_needed)) {
4253                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4254                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4255
4256                 if (cpu >= 0) {
4257                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4258                         rcu_read_unlock();
4259                         return ret;
4260                 }
4261         }
4262 #endif
4263         ret = __netif_receive_skb(skb);
4264         rcu_read_unlock();
4265         return ret;
4266 }
4267
4268 /**
4269  *      netif_receive_skb - process receive buffer from network
4270  *      @skb: buffer to process
4271  *
4272  *      netif_receive_skb() is the main receive data processing function.
4273  *      It always succeeds. The buffer may be dropped during processing
4274  *      for congestion control or by the protocol layers.
4275  *
4276  *      This function may only be called from softirq context and interrupts
4277  *      should be enabled.
4278  *
4279  *      Return values (usually ignored):
4280  *      NET_RX_SUCCESS: no congestion
4281  *      NET_RX_DROP: packet was dropped
4282  */
4283 int netif_receive_skb(struct sk_buff *skb)
4284 {
4285         trace_netif_receive_skb_entry(skb);
4286
4287         return netif_receive_skb_internal(skb);
4288 }
4289 EXPORT_SYMBOL(netif_receive_skb);
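/* A minimal sketch of a NAPI driver that feeds frames to the stack without
 * GRO (ring and my_fetch_rx_frame() are illustrative): from the ->poll()
 * callback, with softirqs already disabled:
 *
 *      while (work_done < budget && (skb = my_fetch_rx_frame(ring))) {
 *              skb->protocol = eth_type_trans(skb, napi->dev);
 *              netif_receive_skb(skb);
 *              work_done++;
 *      }
 *
 * Drivers that want receive coalescing use napi_gro_receive() instead,
 * see below.
 */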
4290
4291 /* Network device is going away, flush any packets still pending
4292  * Called with irqs disabled.
4293  */
4294 static void flush_backlog(void *arg)
4295 {
4296         struct net_device *dev = arg;
4297         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4298         struct sk_buff *skb, *tmp;
4299
4300         rps_lock(sd);
4301         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4302                 if (skb->dev == dev) {
4303                         __skb_unlink(skb, &sd->input_pkt_queue);
4304                         kfree_skb(skb);
4305                         input_queue_head_incr(sd);
4306                 }
4307         }
4308         rps_unlock(sd);
4309
4310         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4311                 if (skb->dev == dev) {
4312                         __skb_unlink(skb, &sd->process_queue);
4313                         kfree_skb(skb);
4314                         input_queue_head_incr(sd);
4315                 }
4316         }
4317 }
4318
4319 static int napi_gro_complete(struct sk_buff *skb)
4320 {
4321         struct packet_offload *ptype;
4322         __be16 type = skb->protocol;
4323         struct list_head *head = &offload_base;
4324         int err = -ENOENT;
4325
4326         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4327
4328         if (NAPI_GRO_CB(skb)->count == 1) {
4329                 skb_shinfo(skb)->gso_size = 0;
4330                 goto out;
4331         }
4332
4333         rcu_read_lock();
4334         list_for_each_entry_rcu(ptype, head, list) {
4335                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4336                         continue;
4337
4338                 err = ptype->callbacks.gro_complete(skb, 0);
4339                 break;
4340         }
4341         rcu_read_unlock();
4342
4343         if (err) {
4344                 WARN_ON(&ptype->list == head);
4345                 kfree_skb(skb);
4346                 return NET_RX_SUCCESS;
4347         }
4348
4349 out:
4350         return netif_receive_skb_internal(skb);
4351 }
4352
4353 /* napi->gro_list contains packets ordered by age, with the
4354  * youngest packets at the head of the list.
4355  * Complete skbs in reverse order to reduce latencies.
4356  */
4357 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4358 {
4359         struct sk_buff *skb, *prev = NULL;
4360
4361         /* scan list and build reverse chain */
4362         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4363                 skb->prev = prev;
4364                 prev = skb;
4365         }
4366
4367         for (skb = prev; skb; skb = prev) {
4368                 skb->next = NULL;
4369
4370                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4371                         return;
4372
4373                 prev = skb->prev;
4374                 napi_gro_complete(skb);
4375                 napi->gro_count--;
4376         }
4377
4378         napi->gro_list = NULL;
4379 }
4380 EXPORT_SYMBOL(napi_gro_flush);
4381
4382 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4383 {
4384         struct sk_buff *p;
4385         unsigned int maclen = skb->dev->hard_header_len;
4386         u32 hash = skb_get_hash_raw(skb);
4387
4388         for (p = napi->gro_list; p; p = p->next) {
4389                 unsigned long diffs;
4390
4391                 NAPI_GRO_CB(p)->flush = 0;
4392
4393                 if (hash != skb_get_hash_raw(p)) {
4394                         NAPI_GRO_CB(p)->same_flow = 0;
4395                         continue;
4396                 }
4397
4398                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4399                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4400                 diffs |= skb_metadata_dst_cmp(p, skb);
4401                 if (maclen == ETH_HLEN)
4402                         diffs |= compare_ether_header(skb_mac_header(p),
4403                                                       skb_mac_header(skb));
4404                 else if (!diffs)
4405                         diffs = memcmp(skb_mac_header(p),
4406                                        skb_mac_header(skb),
4407                                        maclen);
4408                 NAPI_GRO_CB(p)->same_flow = !diffs;
4409         }
4410 }
4411
4412 static void skb_gro_reset_offset(struct sk_buff *skb)
4413 {
4414         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4415         const skb_frag_t *frag0 = &pinfo->frags[0];
4416
4417         NAPI_GRO_CB(skb)->data_offset = 0;
4418         NAPI_GRO_CB(skb)->frag0 = NULL;
4419         NAPI_GRO_CB(skb)->frag0_len = 0;
4420
4421         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4422             pinfo->nr_frags &&
4423             !PageHighMem(skb_frag_page(frag0))) {
4424                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4425                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4426         }
4427 }
4428
4429 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4430 {
4431         struct skb_shared_info *pinfo = skb_shinfo(skb);
4432
4433         BUG_ON(skb->end - skb->tail < grow);
4434
4435         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4436
4437         skb->data_len -= grow;
4438         skb->tail += grow;
4439
4440         pinfo->frags[0].page_offset += grow;
4441         skb_frag_size_sub(&pinfo->frags[0], grow);
4442
4443         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4444                 skb_frag_unref(skb, 0);
4445                 memmove(pinfo->frags, pinfo->frags + 1,
4446                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4447         }
4448 }
4449
4450 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4451 {
4452         struct sk_buff **pp = NULL;
4453         struct packet_offload *ptype;
4454         __be16 type = skb->protocol;
4455         struct list_head *head = &offload_base;
4456         int same_flow;
4457         enum gro_result ret;
4458         int grow;
4459
4460         if (!(skb->dev->features & NETIF_F_GRO))
4461                 goto normal;
4462
4463         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4464                 goto normal;
4465
4466         gro_list_prepare(napi, skb);
4467
4468         rcu_read_lock();
4469         list_for_each_entry_rcu(ptype, head, list) {
4470                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4471                         continue;
4472
4473                 skb_set_network_header(skb, skb_gro_offset(skb));
4474                 skb_reset_mac_len(skb);
4475                 NAPI_GRO_CB(skb)->same_flow = 0;
4476                 NAPI_GRO_CB(skb)->flush = 0;
4477                 NAPI_GRO_CB(skb)->free = 0;
4478                 NAPI_GRO_CB(skb)->encap_mark = 0;
4479                 NAPI_GRO_CB(skb)->is_fou = 0;
4480                 NAPI_GRO_CB(skb)->is_atomic = 1;
4481                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4482
4483                 /* Setup for GRO checksum validation */
4484                 switch (skb->ip_summed) {
4485                 case CHECKSUM_COMPLETE:
4486                         NAPI_GRO_CB(skb)->csum = skb->csum;
4487                         NAPI_GRO_CB(skb)->csum_valid = 1;
4488                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4489                         break;
4490                 case CHECKSUM_UNNECESSARY:
4491                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4492                         NAPI_GRO_CB(skb)->csum_valid = 0;
4493                         break;
4494                 default:
4495                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4496                         NAPI_GRO_CB(skb)->csum_valid = 0;
4497                 }
4498
4499                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4500                 break;
4501         }
4502         rcu_read_unlock();
4503
4504         if (&ptype->list == head)
4505                 goto normal;
4506
4507         same_flow = NAPI_GRO_CB(skb)->same_flow;
4508         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4509
4510         if (pp) {
4511                 struct sk_buff *nskb = *pp;
4512
4513                 *pp = nskb->next;
4514                 nskb->next = NULL;
4515                 napi_gro_complete(nskb);
4516                 napi->gro_count--;
4517         }
4518
4519         if (same_flow)
4520                 goto ok;
4521
4522         if (NAPI_GRO_CB(skb)->flush)
4523                 goto normal;
4524
4525         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4526                 struct sk_buff *nskb = napi->gro_list;
4527
4528                 /* locate the end of the list to select the 'oldest' flow */
4529                 while (nskb->next) {
4530                         pp = &nskb->next;
4531                         nskb = *pp;
4532                 }
4533                 *pp = NULL;
4534                 nskb->next = NULL;
4535                 napi_gro_complete(nskb);
4536         } else {
4537                 napi->gro_count++;
4538         }
4539         NAPI_GRO_CB(skb)->count = 1;
4540         NAPI_GRO_CB(skb)->age = jiffies;
4541         NAPI_GRO_CB(skb)->last = skb;
4542         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4543         skb->next = napi->gro_list;
4544         napi->gro_list = skb;
4545         ret = GRO_HELD;
4546
4547 pull:
4548         grow = skb_gro_offset(skb) - skb_headlen(skb);
4549         if (grow > 0)
4550                 gro_pull_from_frag0(skb, grow);
4551 ok:
4552         return ret;
4553
4554 normal:
4555         ret = GRO_NORMAL;
4556         goto pull;
4557 }
4558
4559 struct packet_offload *gro_find_receive_by_type(__be16 type)
4560 {
4561         struct list_head *offload_head = &offload_base;
4562         struct packet_offload *ptype;
4563
4564         list_for_each_entry_rcu(ptype, offload_head, list) {
4565                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4566                         continue;
4567                 return ptype;
4568         }
4569         return NULL;
4570 }
4571 EXPORT_SYMBOL(gro_find_receive_by_type);
4572
4573 struct packet_offload *gro_find_complete_by_type(__be16 type)
4574 {
4575         struct list_head *offload_head = &offload_base;
4576         struct packet_offload *ptype;
4577
4578         list_for_each_entry_rcu(ptype, offload_head, list) {
4579                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4580                         continue;
4581                 return ptype;
4582         }
4583         return NULL;
4584 }
4585 EXPORT_SYMBOL(gro_find_complete_by_type);
4586
4587 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4588 {
4589         switch (ret) {
4590         case GRO_NORMAL:
4591                 if (netif_receive_skb_internal(skb))
4592                         ret = GRO_DROP;
4593                 break;
4594
4595         case GRO_DROP:
4596                 kfree_skb(skb);
4597                 break;
4598
4599         case GRO_MERGED_FREE:
4600                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4601                         skb_dst_drop(skb);
4602                         kmem_cache_free(skbuff_head_cache, skb);
4603                 } else {
4604                         __kfree_skb(skb);
4605                 }
4606                 break;
4607
4608         case GRO_HELD:
4609         case GRO_MERGED:
4610                 break;
4611         }
4612
4613         return ret;
4614 }
4615
4616 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4617 {
4618         skb_mark_napi_id(skb, napi);
4619         trace_napi_gro_receive_entry(skb);
4620
4621         skb_gro_reset_offset(skb);
4622
4623         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4624 }
4625 EXPORT_SYMBOL(napi_gro_receive);
4626
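/*
 * A minimal, illustrative sketch (not part of dev.c): how a driver's NAPI
 * poll callback typically hands completed receive buffers to the GRO layer
 * via napi_gro_receive() above.  example_rx_next_skb() and the driver
 * private structure are hypothetical stand-ins for descriptor-ring
 * handling; everything else is the regular kernel API.
 */
#if 0	/* illustrative example only, not compiled */
static int example_rx_poll_gro(struct napi_struct *napi, int budget)
{
	int work = 0;
	struct sk_buff *skb;

	while (work < budget &&
	       (skb = example_rx_next_skb(napi->dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);	/* feeds dev_gro_receive() */
		work++;
	}
	return work;
}
#endif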
4627 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4628 {
4629         if (unlikely(skb->pfmemalloc)) {
4630                 consume_skb(skb);
4631                 return;
4632         }
4633         __skb_pull(skb, skb_headlen(skb));
4634         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4635         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4636         skb->vlan_tci = 0;
4637         skb->dev = napi->dev;
4638         skb->skb_iif = 0;
4639         skb->encapsulation = 0;
4640         skb_shinfo(skb)->gso_type = 0;
4641         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4642
4643         napi->skb = skb;
4644 }
4645
4646 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4647 {
4648         struct sk_buff *skb = napi->skb;
4649
4650         if (!skb) {
4651                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4652                 if (skb) {
4653                         napi->skb = skb;
4654                         skb_mark_napi_id(skb, napi);
4655                 }
4656         }
4657         return skb;
4658 }
4659 EXPORT_SYMBOL(napi_get_frags);
4660
4661 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4662                                       struct sk_buff *skb,
4663                                       gro_result_t ret)
4664 {
4665         switch (ret) {
4666         case GRO_NORMAL:
4667         case GRO_HELD:
4668                 __skb_push(skb, ETH_HLEN);
4669                 skb->protocol = eth_type_trans(skb, skb->dev);
4670                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4671                         ret = GRO_DROP;
4672                 break;
4673
4674         case GRO_DROP:
4675         case GRO_MERGED_FREE:
4676                 napi_reuse_skb(napi, skb);
4677                 break;
4678
4679         case GRO_MERGED:
4680                 break;
4681         }
4682
4683         return ret;
4684 }
4685
4686 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4687  * Drivers may call both napi_gro_frags() and napi_gro_receive(), so we
4688  * copy the Ethernet header into skb->data to have a common layout.
4689  */
4690 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4691 {
4692         struct sk_buff *skb = napi->skb;
4693         const struct ethhdr *eth;
4694         unsigned int hlen = sizeof(*eth);
4695
4696         napi->skb = NULL;
4697
4698         skb_reset_mac_header(skb);
4699         skb_gro_reset_offset(skb);
4700
4701         eth = skb_gro_header_fast(skb, 0);
4702         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4703                 eth = skb_gro_header_slow(skb, hlen, 0);
4704                 if (unlikely(!eth)) {
4705                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4706                                              __func__, napi->dev->name);
4707                         napi_reuse_skb(napi, skb);
4708                         return NULL;
4709                 }
4710         } else {
4711                 gro_pull_from_frag0(skb, hlen);
4712                 NAPI_GRO_CB(skb)->frag0 += hlen;
4713                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4714         }
4715         __skb_pull(skb, hlen);
4716
4717         /*
4718          * This works because the only protocols we care about don't require
4719          * special handling.
4720          * We'll fix it up properly in napi_frags_finish()
4721          */
4722         skb->protocol = eth->h_proto;
4723
4724         return skb;
4725 }
4726
4727 gro_result_t napi_gro_frags(struct napi_struct *napi)
4728 {
4729         struct sk_buff *skb = napi_frags_skb(napi);
4730
4731         if (!skb)
4732                 return GRO_DROP;
4733
4734         trace_napi_gro_frags_entry(skb);
4735
4736         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4737 }
4738 EXPORT_SYMBOL(napi_gro_frags);
4739
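/*
 * Illustrative sketch only: the napi_get_frags()/napi_gro_frags() pattern
 * used by drivers that receive directly into pages.  The page, offset and
 * length are assumed to come from a hypothetical RX descriptor; truesize
 * accounting is simplified to PAGE_SIZE.
 */
#if 0	/* illustrative example only, not compiled */
static void example_rx_page(struct napi_struct *napi, struct page *page,
			    unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		put_page(page);		/* no skb available, drop the buffer */
		return;
	}

	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);

	/* napi_frags_skb() will pull the Ethernet header from frag0 */
	napi_gro_frags(napi);
}
#endif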
4740 /* Compute the checksum from gro_offset and return the folded value
4741  * after adding in any pseudo checksum.
4742  */
4743 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4744 {
4745         __wsum wsum;
4746         __sum16 sum;
4747
4748         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4749
4750         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4751         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4752         if (likely(!sum)) {
4753                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4754                     !skb->csum_complete_sw)
4755                         netdev_rx_csum_fault(skb->dev);
4756         }
4757
4758         NAPI_GRO_CB(skb)->csum = wsum;
4759         NAPI_GRO_CB(skb)->csum_valid = 1;
4760
4761         return sum;
4762 }
4763 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4764
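/*
 * Illustrative sketch: how a protocol's gro_receive callback typically uses
 * the GRO checksum helpers that end up calling __skb_gro_checksum_complete()
 * above.  This mirrors the pattern of tcp4_gro_receive() in
 * net/ipv4/tcp_offload.c and is reproduced here only as an example.
 */
#if 0	/* illustrative example only, not compiled */
static struct sk_buff **example_tcp4_gro_receive(struct sk_buff **head,
						 struct sk_buff *skb)
{
	/* Don't bother verifying the checksum if we're going to flush anyway */
	if (!NAPI_GRO_CB(skb)->flush &&
	    skb_gro_checksum_validate(skb, IPPROTO_TCP,
				      inet_gro_compute_pseudo)) {
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}
	return tcp_gro_receive(head, skb);
}
#endif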
4765 /*
4766  * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4767  * Note: called with local irq disabled, but exits with local irq enabled.
4768  */
4769 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4770 {
4771 #ifdef CONFIG_RPS
4772         struct softnet_data *remsd = sd->rps_ipi_list;
4773
4774         if (remsd) {
4775                 sd->rps_ipi_list = NULL;
4776
4777                 local_irq_enable();
4778
4779                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4780                 while (remsd) {
4781                         struct softnet_data *next = remsd->rps_ipi_next;
4782
4783                         if (cpu_online(remsd->cpu))
4784                                 smp_call_function_single_async(remsd->cpu,
4785                                                            &remsd->csd);
4786                         remsd = next;
4787                 }
4788         } else
4789 #endif
4790                 local_irq_enable();
4791 }
4792
4793 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4794 {
4795 #ifdef CONFIG_RPS
4796         return sd->rps_ipi_list != NULL;
4797 #else
4798         return false;
4799 #endif
4800 }
4801
4802 static int process_backlog(struct napi_struct *napi, int quota)
4803 {
4804         int work = 0;
4805         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4806
4807         /* Check if we have pending IPIs; it is better to send them now
4808          * than to wait for net_rx_action() to end.
4809          */
4810         if (sd_has_rps_ipi_waiting(sd)) {
4811                 local_irq_disable();
4812                 net_rps_action_and_irq_enable(sd);
4813         }
4814
4815         napi->weight = weight_p;
4816         local_irq_disable();
4817         while (1) {
4818                 struct sk_buff *skb;
4819
4820                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4821                         rcu_read_lock();
4822                         local_irq_enable();
4823                         __netif_receive_skb(skb);
4824                         rcu_read_unlock();
4825                         local_irq_disable();
4826                         input_queue_head_incr(sd);
4827                         if (++work >= quota) {
4828                                 local_irq_enable();
4829                                 return work;
4830                         }
4831                 }
4832
4833                 rps_lock(sd);
4834                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4835                         /*
4836                          * Inline a custom version of __napi_complete().
4837                          * Only the current CPU owns and manipulates this napi,
4838                          * and NAPI_STATE_SCHED is the only possible flag set
4839                          * on backlog.
4840                          * We can use a plain write instead of clear_bit(),
4841                          * and we don't need an smp_mb() memory barrier.
4842                          */
4843                         napi->state = 0;
4844                         rps_unlock(sd);
4845
4846                         break;
4847                 }
4848
4849                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4850                                            &sd->process_queue);
4851                 rps_unlock(sd);
4852         }
4853         local_irq_enable();
4854
4855         return work;
4856 }
4857
4858 /**
4859  * __napi_schedule - schedule for receive
4860  * @n: entry to schedule
4861  *
4862  * The entry's receive function will be scheduled to run.
4863  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4864  */
4865 void __napi_schedule(struct napi_struct *n)
4866 {
4867         unsigned long flags;
4868
4869         local_irq_save(flags);
4870         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4871         local_irq_restore(flags);
4872 }
4873 EXPORT_SYMBOL(__napi_schedule);
4874
4875 /**
4876  * __napi_schedule_irqoff - schedule for receive
4877  * @n: entry to schedule
4878  *
4879  * Variant of __napi_schedule() assuming hard irqs are masked
4880  */
4881 void __napi_schedule_irqoff(struct napi_struct *n)
4882 {
4883         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4884 }
4885 EXPORT_SYMBOL(__napi_schedule_irqoff);
4886
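/*
 * Illustrative sketch: the canonical way a driver's hard-IRQ handler defers
 * RX work to NAPI using napi_schedule_prep() and __napi_schedule_irqoff()
 * above.  struct example_priv and example_mask_rx_irq() are hypothetical
 * driver internals.
 */
#if 0	/* illustrative example only, not compiled */
static irqreturn_t example_isr(int irq, void *data)
{
	struct example_priv *priv = data;

	if (napi_schedule_prep(&priv->napi)) {
		example_mask_rx_irq(priv);	/* stop further RX interrupts */
		__napi_schedule_irqoff(&priv->napi);
	}
	return IRQ_HANDLED;
}
#endif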
4887 void __napi_complete(struct napi_struct *n)
4888 {
4889         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4890
4891         list_del_init(&n->poll_list);
4892         smp_mb__before_atomic();
4893         clear_bit(NAPI_STATE_SCHED, &n->state);
4894 }
4895 EXPORT_SYMBOL(__napi_complete);
4896
4897 void napi_complete_done(struct napi_struct *n, int work_done)
4898 {
4899         unsigned long flags;
4900
4901         /*
4902          * Don't let napi dequeue from the cpu poll list
4903          * just in case it's running on a different cpu.
4904          */
4905         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4906                 return;
4907
4908         if (n->gro_list) {
4909                 unsigned long timeout = 0;
4910
4911                 if (work_done)
4912                         timeout = n->dev->gro_flush_timeout;
4913
4914                 if (timeout)
4915                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4916                                       HRTIMER_MODE_REL_PINNED);
4917                 else
4918                         napi_gro_flush(n, false);
4919         }
4920         if (likely(list_empty(&n->poll_list))) {
4921                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4922         } else {
4923                 /* If n->poll_list is not empty, we need to mask irqs */
4924                 local_irq_save(flags);
4925                 __napi_complete(n);
4926                 local_irq_restore(flags);
4927         }
4928 }
4929 EXPORT_SYMBOL(napi_complete_done);
4930
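/*
 * Illustrative sketch: a NAPI poll callback completing with
 * napi_complete_done() so that a non-zero dev->gro_flush_timeout (handled
 * above) can defer the GRO flush.  example_clean_rx() and
 * example_unmask_rx_irq() are hypothetical driver internals.
 */
#if 0	/* illustrative example only, not compiled */
static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv,
						 napi);
	int work = example_clean_rx(priv, budget);

	if (work < budget) {
		napi_complete_done(napi, work);
		example_unmask_rx_irq(priv);	/* re-arm the interrupt */
	}
	return work;
}
#endif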
4931 /* must be called under rcu_read_lock(), as we dont take a reference */
4932 static struct napi_struct *napi_by_id(unsigned int napi_id)
4933 {
4934         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4935         struct napi_struct *napi;
4936
4937         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4938                 if (napi->napi_id == napi_id)
4939                         return napi;
4940
4941         return NULL;
4942 }
4943
4944 #if defined(CONFIG_NET_RX_BUSY_POLL)
4945 #define BUSY_POLL_BUDGET 8
4946 bool sk_busy_loop(struct sock *sk, int nonblock)
4947 {
4948         unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4949         int (*busy_poll)(struct napi_struct *dev);
4950         struct napi_struct *napi;
4951         int rc = false;
4952
4953         rcu_read_lock();
4954
4955         napi = napi_by_id(sk->sk_napi_id);
4956         if (!napi)
4957                 goto out;
4958
4959         /* Note: the ndo_busy_poll method is optional as of linux-4.5 */
4960         busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4961
4962         do {
4963                 rc = 0;
4964                 local_bh_disable();
4965                 if (busy_poll) {
4966                         rc = busy_poll(napi);
4967                 } else if (napi_schedule_prep(napi)) {
4968                         void *have = netpoll_poll_lock(napi);
4969
4970                         if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4971                                 rc = napi->poll(napi, BUSY_POLL_BUDGET);
4972                                 trace_napi_poll(napi);
4973                                 if (rc == BUSY_POLL_BUDGET) {
4974                                         napi_complete_done(napi, rc);
4975                                         napi_schedule(napi);
4976                                 }
4977                         }
4978                         netpoll_poll_unlock(have);
4979                 }
4980                 if (rc > 0)
4981                         __NET_ADD_STATS(sock_net(sk),
4982                                         LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4983                 local_bh_enable();
4984
4985                 if (rc == LL_FLUSH_FAILED)
4986                         break; /* permanent failure */
4987
4988                 cpu_relax();
4989         } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4990                  !need_resched() && !busy_loop_timeout(end_time));
4991
4992         rc = !skb_queue_empty(&sk->sk_receive_queue);
4993 out:
4994         rcu_read_unlock();
4995         return rc;
4996 }
4997 EXPORT_SYMBOL(sk_busy_loop);
4998
4999 #endif /* CONFIG_NET_RX_BUSY_POLL */
5000
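/*
 * Illustrative sketch: how a socket receive path opts into busy polling.
 * This mirrors the checks used by tcp_recvmsg()/__skb_recv_datagram() and is
 * shown here only to illustrate sk_busy_loop() above.
 */
#if 0	/* illustrative example only, not compiled */
static void example_try_busy_poll(struct sock *sk, int flags)
{
	if (sk_can_busy_loop(sk) &&
	    skb_queue_empty(&sk->sk_receive_queue))
		sk_busy_loop(sk, flags & MSG_DONTWAIT);
}
#endif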
5001 void napi_hash_add(struct napi_struct *napi)
5002 {
5003         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5004             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5005                 return;
5006
5007         spin_lock(&napi_hash_lock);
5008
5009         /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5010         do {
5011                 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5012                         napi_gen_id = NR_CPUS + 1;
5013         } while (napi_by_id(napi_gen_id));
5014         napi->napi_id = napi_gen_id;
5015
5016         hlist_add_head_rcu(&napi->napi_hash_node,
5017                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5018
5019         spin_unlock(&napi_hash_lock);
5020 }
5021 EXPORT_SYMBOL_GPL(napi_hash_add);
5022
5023 /* Warning: the caller is responsible for making sure an RCU grace period
5024  * elapses before freeing the memory containing @napi
5025  */
5026 bool napi_hash_del(struct napi_struct *napi)
5027 {
5028         bool rcu_sync_needed = false;
5029
5030         spin_lock(&napi_hash_lock);
5031
5032         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5033                 rcu_sync_needed = true;
5034                 hlist_del_rcu(&napi->napi_hash_node);
5035         }
5036         spin_unlock(&napi_hash_lock);
5037         return rcu_sync_needed;
5038 }
5039 EXPORT_SYMBOL_GPL(napi_hash_del);
5040
5041 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5042 {
5043         struct napi_struct *napi;
5044
5045         napi = container_of(timer, struct napi_struct, timer);
5046         if (napi->gro_list)
5047                 napi_schedule(napi);
5048
5049         return HRTIMER_NORESTART;
5050 }
5051
5052 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5053                     int (*poll)(struct napi_struct *, int), int weight)
5054 {
5055         INIT_LIST_HEAD(&napi->poll_list);
5056         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5057         napi->timer.function = napi_watchdog;
5058         napi->gro_count = 0;
5059         napi->gro_list = NULL;
5060         napi->skb = NULL;
5061         napi->poll = poll;
5062         if (weight > NAPI_POLL_WEIGHT)
5063                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5064                             weight, dev->name);
5065         napi->weight = weight;
5066         list_add(&napi->dev_list, &dev->napi_list);
5067         napi->dev = dev;
5068 #ifdef CONFIG_NETPOLL
5069         spin_lock_init(&napi->poll_lock);
5070         napi->poll_owner = -1;
5071 #endif
5072         set_bit(NAPI_STATE_SCHED, &napi->state);
5073         napi_hash_add(napi);
5074 }
5075 EXPORT_SYMBOL(netif_napi_add);
5076
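/*
 * Illustrative sketch: typical NAPI registration in a driver probe routine,
 * pairing netif_napi_add() above with napi_enable() once the device is
 * opened.  struct example_priv and a poll callback like the example_poll()
 * sketch earlier are hypothetical.
 */
#if 0	/* illustrative example only, not compiled */
static void example_setup_napi(struct net_device *dev,
			       struct example_priv *priv)
{
	netif_napi_add(dev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);
}

static int example_open(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	napi_enable(&priv->napi);	/* clears NAPI_STATE_SCHED set above */
	return 0;
}
#endif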
5077 void napi_disable(struct napi_struct *n)
5078 {
5079         might_sleep();
5080         set_bit(NAPI_STATE_DISABLE, &n->state);
5081
5082         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5083                 msleep(1);
5084         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5085                 msleep(1);
5086
5087         hrtimer_cancel(&n->timer);
5088
5089         clear_bit(NAPI_STATE_DISABLE, &n->state);
5090 }
5091 EXPORT_SYMBOL(napi_disable);
5092
5093 /* Must be called in process context */
5094 void netif_napi_del(struct napi_struct *napi)
5095 {
5096         might_sleep();
5097         if (napi_hash_del(napi))
5098                 synchronize_net();
5099         list_del_init(&napi->dev_list);
5100         napi_free_frags(napi);
5101
5102         kfree_skb_list(napi->gro_list);
5103         napi->gro_list = NULL;
5104         napi->gro_count = 0;
5105 }
5106 EXPORT_SYMBOL(netif_napi_del);
5107
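/*
 * Illustrative sketch: the matching teardown order - quiesce polling with
 * napi_disable() before netif_napi_del() and before freeing the netdev.
 * struct example_priv is hypothetical.
 */
#if 0	/* illustrative example only, not compiled */
static void example_teardown_napi(struct example_priv *priv)
{
	napi_disable(&priv->napi);	/* waits for an in-flight poll */
	netif_napi_del(&priv->napi);	/* may need an RCU grace period */
}
#endif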
5108 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5109 {
5110         void *have;
5111         int work, weight;
5112
5113         list_del_init(&n->poll_list);
5114
5115         have = netpoll_poll_lock(n);
5116
5117         weight = n->weight;
5118
5119         /* This NAPI_STATE_SCHED test is for avoiding a race
5120          * with netpoll's poll_napi().  Only the entity which
5121          * obtains the lock and sees NAPI_STATE_SCHED set will
5122          * actually make the ->poll() call.  Therefore we avoid
5123          * accidentally calling ->poll() when NAPI is not scheduled.
5124          */
5125         work = 0;
5126         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5127                 work = n->poll(n, weight);
5128                 trace_napi_poll(n);
5129         }
5130
5131         WARN_ON_ONCE(work > weight);
5132
5133         if (likely(work < weight))
5134                 goto out_unlock;
5135
5136         /* Drivers must not modify the NAPI state if they
5137          * consume the entire weight.  In such cases this code
5138          * still "owns" the NAPI instance and therefore can
5139          * move the instance around on the list at-will.
5140          */
5141         if (unlikely(napi_disable_pending(n))) {
5142                 napi_complete(n);
5143                 goto out_unlock;
5144         }
5145
5146         if (n->gro_list) {
5147                 /* Flush packets that are too old.
5148                  * If HZ < 1000, flush all packets.
5149                  */
5150                 napi_gro_flush(n, HZ >= 1000);
5151         }
5152
5153         /* Some drivers may have called napi_schedule
5154          * prior to exhausting their budget.
5155          */
5156         if (unlikely(!list_empty(&n->poll_list))) {
5157                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5158                              n->dev ? n->dev->name : "backlog");
5159                 goto out_unlock;
5160         }
5161
5162         list_add_tail(&n->poll_list, repoll);
5163
5164 out_unlock:
5165         netpoll_poll_unlock(have);
5166
5167         return work;
5168 }
5169
5170 static void net_rx_action(struct softirq_action *h)
5171 {
5172         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5173         unsigned long time_limit = jiffies + 2;
5174         int budget = netdev_budget;
5175         LIST_HEAD(list);
5176         LIST_HEAD(repoll);
5177
5178         local_irq_disable();
5179         list_splice_init(&sd->poll_list, &list);
5180         local_irq_enable();
5181
5182         for (;;) {
5183                 struct napi_struct *n;
5184
5185                 if (list_empty(&list)) {
5186                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5187                                 return;
5188                         break;
5189                 }
5190
5191                 n = list_first_entry(&list, struct napi_struct, poll_list);
5192                 budget -= napi_poll(n, &repoll);
5193
5194                 /* If the softirq window is exhausted then punt.
5195                  * Allow this to run for 2 jiffies, which allows
5196                  * an average latency of 1.5/HZ.
5197                  */
5198                 if (unlikely(budget <= 0 ||
5199                              time_after_eq(jiffies, time_limit))) {
5200                         sd->time_squeeze++;
5201                         break;
5202                 }
5203         }
5204
5205         __kfree_skb_flush();
5206         local_irq_disable();
5207
5208         list_splice_tail_init(&sd->poll_list, &list);
5209         list_splice_tail(&repoll, &list);
5210         list_splice(&list, &sd->poll_list);
5211         if (!list_empty(&sd->poll_list))
5212                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5213
5214         net_rps_action_and_irq_enable(sd);
5215 }
5216
5217 struct netdev_adjacent {
5218         struct net_device *dev;
5219
5220         /* upper master flag; there can only be one master device per list */
5221         bool master;
5222
5223         /* counter for the number of times this device was added to us */
5224         u16 ref_nr;
5225
5226         /* private field for the users */
5227         void *private;
5228
5229         struct list_head list;
5230         struct rcu_head rcu;
5231 };
5232
5233 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5234                                                  struct list_head *adj_list)
5235 {
5236         struct netdev_adjacent *adj;
5237
5238         list_for_each_entry(adj, adj_list, list) {
5239                 if (adj->dev == adj_dev)
5240                         return adj;
5241         }
5242         return NULL;
5243 }
5244
5245 /**
5246  * netdev_has_upper_dev - Check if device is linked to an upper device
5247  * @dev: device
5248  * @upper_dev: upper device to check
5249  *
5250  * Find out if a device is linked to the specified upper device and return
5251  * true if it is. Note that this checks only the immediate upper device,
5252  * not through a complete stack of devices. The caller must hold the RTNL lock.
5253  */
5254 bool netdev_has_upper_dev(struct net_device *dev,
5255                           struct net_device *upper_dev)
5256 {
5257         ASSERT_RTNL();
5258
5259         return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5260 }
5261 EXPORT_SYMBOL(netdev_has_upper_dev);
5262
5263 /**
5264  * netdev_has_any_upper_dev - Check if device is linked to some device
5265  * @dev: device
5266  *
5267  * Find out if a device is linked to an upper device and return true in case
5268  * it is. The caller must hold the RTNL lock.
5269  */
5270 static bool netdev_has_any_upper_dev(struct net_device *dev)
5271 {
5272         ASSERT_RTNL();
5273
5274         return !list_empty(&dev->all_adj_list.upper);
5275 }
5276
5277 /**
5278  * netdev_master_upper_dev_get - Get master upper device
5279  * @dev: device
5280  *
5281  * Find a master upper device and return pointer to it or NULL in case
5282  * it's not there. The caller must hold the RTNL lock.
5283  */
5284 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5285 {
5286         struct netdev_adjacent *upper;
5287
5288         ASSERT_RTNL();
5289
5290         if (list_empty(&dev->adj_list.upper))
5291                 return NULL;
5292
5293         upper = list_first_entry(&dev->adj_list.upper,
5294                                  struct netdev_adjacent, list);
5295         if (likely(upper->master))
5296                 return upper->dev;
5297         return NULL;
5298 }
5299 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5300
5301 void *netdev_adjacent_get_private(struct list_head *adj_list)
5302 {
5303         struct netdev_adjacent *adj;
5304
5305         adj = list_entry(adj_list, struct netdev_adjacent, list);
5306
5307         return adj->private;
5308 }
5309 EXPORT_SYMBOL(netdev_adjacent_get_private);
5310
5311 /**
5312  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5313  * @dev: device
5314  * @iter: list_head ** of the current position
5315  *
5316  * Gets the next device from the dev's upper list, starting from iter
5317  * position. The caller must hold RCU read lock.
5318  */
5319 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5320                                                  struct list_head **iter)
5321 {
5322         struct netdev_adjacent *upper;
5323
5324         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5325
5326         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5327
5328         if (&upper->list == &dev->adj_list.upper)
5329                 return NULL;
5330
5331         *iter = &upper->list;
5332
5333         return upper->dev;
5334 }
5335 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5336
5337 /**
5338  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5339  * @dev: device
5340  * @iter: list_head ** of the current position
5341  *
5342  * Gets the next device from the dev's upper list, starting from iter
5343  * position. The caller must hold RCU read lock.
5344  */
5345 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5346                                                      struct list_head **iter)
5347 {
5348         struct netdev_adjacent *upper;
5349
5350         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5351
5352         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5353
5354         if (&upper->list == &dev->all_adj_list.upper)
5355                 return NULL;
5356
5357         *iter = &upper->list;
5358
5359         return upper->dev;
5360 }
5361 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5362
5363 /**
5364  * netdev_lower_get_next_private - Get the next ->private from the
5365  *                                 lower neighbour list
5366  * @dev: device
5367  * @iter: list_head ** of the current position
5368  *
5369  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5370  * list, starting from iter position. The caller must either hold the
5371  * RTNL lock or its own locking that guarantees that the neighbour lower
5372  * list will remain unchanged.
5373  */
5374 void *netdev_lower_get_next_private(struct net_device *dev,
5375                                     struct list_head **iter)
5376 {
5377         struct netdev_adjacent *lower;
5378
5379         lower = list_entry(*iter, struct netdev_adjacent, list);
5380
5381         if (&lower->list == &dev->adj_list.lower)
5382                 return NULL;
5383
5384         *iter = lower->list.next;
5385
5386         return lower->private;
5387 }
5388 EXPORT_SYMBOL(netdev_lower_get_next_private);
5389
5390 /**
5391  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5392  *                                     lower neighbour list, RCU
5393  *                                     variant
5394  * @dev: device
5395  * @iter: list_head ** of the current position
5396  *
5397  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5398  * list, starting from iter position. The caller must hold RCU read lock.
5399  */
5400 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5401                                         struct list_head **iter)
5402 {
5403         struct netdev_adjacent *lower;
5404
5405         WARN_ON_ONCE(!rcu_read_lock_held());
5406
5407         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5408
5409         if (&lower->list == &dev->adj_list.lower)
5410                 return NULL;
5411
5412         *iter = &lower->list;
5413
5414         return lower->private;
5415 }
5416 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5417
5418 /**
5419  * netdev_lower_get_next - Get the next device from the lower neighbour
5420  *                         list
5421  * @dev: device
5422  * @iter: list_head ** of the current position
5423  *
5424  * Gets the next net_device from the dev's lower neighbour
5425  * list, starting from iter position. The caller must hold RTNL lock or
5426  * its own locking that guarantees that the neighbour lower
5427  * list will remain unchanged.
5428  */
5429 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5430 {
5431         struct netdev_adjacent *lower;
5432
5433         lower = list_entry(*iter, struct netdev_adjacent, list);
5434
5435         if (&lower->list == &dev->adj_list.lower)
5436                 return NULL;
5437
5438         *iter = lower->list.next;
5439
5440         return lower->dev;
5441 }
5442 EXPORT_SYMBOL(netdev_lower_get_next);
5443
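/*
 * Illustrative sketch: walking the directly linked lower devices with the
 * netdev_for_each_lower_dev() iterator, which wraps netdev_lower_get_next()
 * above.  Shown only as an example; callers must hold RTNL or equivalent
 * locking as documented.
 */
#if 0	/* illustrative example only, not compiled */
static void example_dump_lower_devs(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;

	ASSERT_RTNL();
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_info(dev, "lower device: %s\n", lower->name);
}
#endif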
5444 /**
5445  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5446  *                                     lower neighbour list, RCU
5447  *                                     variant
5448  * @dev: device
5449  *
5450  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5451  * list. The caller must hold RCU read lock.
5452  */
5453 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5454 {
5455         struct netdev_adjacent *lower;
5456
5457         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5458                         struct netdev_adjacent, list);
5459         if (lower)
5460                 return lower->private;
5461         return NULL;
5462 }
5463 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5464
5465 /**
5466  * netdev_master_upper_dev_get_rcu - Get master upper device
5467  * @dev: device
5468  *
5469  * Find a master upper device and return pointer to it or NULL in case
5470  * it's not there. The caller must hold the RCU read lock.
5471  */
5472 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5473 {
5474         struct netdev_adjacent *upper;
5475
5476         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5477                                        struct netdev_adjacent, list);
5478         if (upper && likely(upper->master))
5479                 return upper->dev;
5480         return NULL;
5481 }
5482 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5483
5484 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5485                               struct net_device *adj_dev,
5486                               struct list_head *dev_list)
5487 {
5488         char linkname[IFNAMSIZ+7];
5489         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5490                 "upper_%s" : "lower_%s", adj_dev->name);
5491         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5492                                  linkname);
5493 }
5494 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5495                                char *name,
5496                                struct list_head *dev_list)
5497 {
5498         char linkname[IFNAMSIZ+7];
5499         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5500                 "upper_%s" : "lower_%s", name);
5501         sysfs_remove_link(&(dev->dev.kobj), linkname);
5502 }
5503
5504 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5505                                                  struct net_device *adj_dev,
5506                                                  struct list_head *dev_list)
5507 {
5508         return (dev_list == &dev->adj_list.upper ||
5509                 dev_list == &dev->adj_list.lower) &&
5510                 net_eq(dev_net(dev), dev_net(adj_dev));
5511 }
5512
5513 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5514                                         struct net_device *adj_dev,
5515                                         struct list_head *dev_list,
5516                                         void *private, bool master)
5517 {
5518         struct netdev_adjacent *adj;
5519         int ret;
5520
5521         adj = __netdev_find_adj(adj_dev, dev_list);
5522
5523         if (adj) {
5524                 adj->ref_nr++;
5525                 return 0;
5526         }
5527
5528         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5529         if (!adj)
5530                 return -ENOMEM;
5531
5532         adj->dev = adj_dev;
5533         adj->master = master;
5534         adj->ref_nr = 1;
5535         adj->private = private;
5536         dev_hold(adj_dev);
5537
5538         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5539                  adj_dev->name, dev->name, adj_dev->name);
5540
5541         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5542                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5543                 if (ret)
5544                         goto free_adj;
5545         }
5546
5547         /* Ensure that the master link is always the first item in the list. */
5548         if (master) {
5549                 ret = sysfs_create_link(&(dev->dev.kobj),
5550                                         &(adj_dev->dev.kobj), "master");
5551                 if (ret)
5552                         goto remove_symlinks;
5553
5554                 list_add_rcu(&adj->list, dev_list);
5555         } else {
5556                 list_add_tail_rcu(&adj->list, dev_list);
5557         }
5558
5559         return 0;
5560
5561 remove_symlinks:
5562         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5563                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5564 free_adj:
5565         kfree(adj);
5566         dev_put(adj_dev);
5567
5568         return ret;
5569 }
5570
5571 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5572                                          struct net_device *adj_dev,
5573                                          struct list_head *dev_list)
5574 {
5575         struct netdev_adjacent *adj;
5576
5577         adj = __netdev_find_adj(adj_dev, dev_list);
5578
5579         if (!adj) {
5580                 pr_err("tried to remove device %s from %s\n",
5581                        dev->name, adj_dev->name);
5582                 BUG();
5583         }
5584
5585         if (adj->ref_nr > 1) {
5586                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5587                          adj->ref_nr-1);
5588                 adj->ref_nr--;
5589                 return;
5590         }
5591
5592         if (adj->master)
5593                 sysfs_remove_link(&(dev->dev.kobj), "master");
5594
5595         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5596                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5597
5598         list_del_rcu(&adj->list);
5599         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5600                  adj_dev->name, dev->name, adj_dev->name);
5601         dev_put(adj_dev);
5602         kfree_rcu(adj, rcu);
5603 }
5604
5605 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5606                                             struct net_device *upper_dev,
5607                                             struct list_head *up_list,
5608                                             struct list_head *down_list,
5609                                             void *private, bool master)
5610 {
5611         int ret;
5612
5613         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5614                                            master);
5615         if (ret)
5616                 return ret;
5617
5618         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5619                                            false);
5620         if (ret) {
5621                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5622                 return ret;
5623         }
5624
5625         return 0;
5626 }
5627
5628 static int __netdev_adjacent_dev_link(struct net_device *dev,
5629                                       struct net_device *upper_dev)
5630 {
5631         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5632                                                 &dev->all_adj_list.upper,
5633                                                 &upper_dev->all_adj_list.lower,
5634                                                 NULL, false);
5635 }
5636
5637 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5638                                                struct net_device *upper_dev,
5639                                                struct list_head *up_list,
5640                                                struct list_head *down_list)
5641 {
5642         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5643         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5644 }
5645
5646 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5647                                          struct net_device *upper_dev)
5648 {
5649         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5650                                            &dev->all_adj_list.upper,
5651                                            &upper_dev->all_adj_list.lower);
5652 }
5653
5654 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5655                                                 struct net_device *upper_dev,
5656                                                 void *private, bool master)
5657 {
5658         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5659
5660         if (ret)
5661                 return ret;
5662
5663         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5664                                                &dev->adj_list.upper,
5665                                                &upper_dev->adj_list.lower,
5666                                                private, master);
5667         if (ret) {
5668                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5669                 return ret;
5670         }
5671
5672         return 0;
5673 }
5674
5675 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5676                                                    struct net_device *upper_dev)
5677 {
5678         __netdev_adjacent_dev_unlink(dev, upper_dev);
5679         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5680                                            &dev->adj_list.upper,
5681                                            &upper_dev->adj_list.lower);
5682 }
5683
5684 static int __netdev_upper_dev_link(struct net_device *dev,
5685                                    struct net_device *upper_dev, bool master,
5686                                    void *upper_priv, void *upper_info)
5687 {
5688         struct netdev_notifier_changeupper_info changeupper_info;
5689         struct netdev_adjacent *i, *j, *to_i, *to_j;
5690         int ret = 0;
5691
5692         ASSERT_RTNL();
5693
5694         if (dev == upper_dev)
5695                 return -EBUSY;
5696
5697         /* To prevent loops, check that dev is not an upper device of upper_dev. */
5698         if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5699                 return -EBUSY;
5700
5701         if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5702                 return -EEXIST;
5703
5704         if (master && netdev_master_upper_dev_get(dev))
5705                 return -EBUSY;
5706
5707         changeupper_info.upper_dev = upper_dev;
5708         changeupper_info.master = master;
5709         changeupper_info.linking = true;
5710         changeupper_info.upper_info = upper_info;
5711
5712         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5713                                             &changeupper_info.info);
5714         ret = notifier_to_errno(ret);
5715         if (ret)
5716                 return ret;
5717
5718         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5719                                                    master);
5720         if (ret)
5721                 return ret;
5722
5723         /* Now that we linked these devs, make all of upper_dev's
5724          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5725          * vice versa, and don't forget the devices themselves. All of these
5726          * links are non-neighbours.
5727          */
5728         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5729                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5730                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5731                                  i->dev->name, j->dev->name);
5732                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5733                         if (ret)
5734                                 goto rollback_mesh;
5735                 }
5736         }
5737
5738         /* add dev to every upper_dev's upper device */
5739         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5740                 pr_debug("linking %s's upper device %s with %s\n",
5741                          upper_dev->name, i->dev->name, dev->name);
5742                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5743                 if (ret)
5744                         goto rollback_upper_mesh;
5745         }
5746
5747         /* add upper_dev to every dev's lower device */
5748         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5749                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5750                          i->dev->name, upper_dev->name);
5751                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5752                 if (ret)
5753                         goto rollback_lower_mesh;
5754         }
5755
5756         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5757                                             &changeupper_info.info);
5758         ret = notifier_to_errno(ret);
5759         if (ret)
5760                 goto rollback_lower_mesh;
5761
5762         return 0;
5763
5764 rollback_lower_mesh:
5765         to_i = i;
5766         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5767                 if (i == to_i)
5768                         break;
5769                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5770         }
5771
5772         i = NULL;
5773
5774 rollback_upper_mesh:
5775         to_i = i;
5776         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5777                 if (i == to_i)
5778                         break;
5779                 __netdev_adjacent_dev_unlink(dev, i->dev);
5780         }
5781
5782         i = j = NULL;
5783
5784 rollback_mesh:
5785         to_i = i;
5786         to_j = j;
5787         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5788                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5789                         if (i == to_i && j == to_j)
5790                                 break;
5791                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5792                 }
5793                 if (i == to_i)
5794                         break;
5795         }
5796
5797         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5798
5799         return ret;
5800 }
5801
5802 /**
5803  * netdev_upper_dev_link - Add a link to the upper device
5804  * @dev: device
5805  * @upper_dev: new upper device
5806  *
5807  * Adds a link to device which is upper to this one. The caller must hold
5808  * the RTNL lock. On a failure a negative errno code is returned.
5809  * On success the reference counts are adjusted and the function
5810  * returns zero.
5811  */
5812 int netdev_upper_dev_link(struct net_device *dev,
5813                           struct net_device *upper_dev)
5814 {
5815         return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5816 }
5817 EXPORT_SYMBOL(netdev_upper_dev_link);
5818
5819 /**
5820  * netdev_master_upper_dev_link - Add a master link to the upper device
5821  * @dev: device
5822  * @upper_dev: new upper device
5823  * @upper_priv: upper device private
5824  * @upper_info: upper info to be passed down via notifier
5825  *
5826  * Adds a link to device which is upper to this one. In this case, only
5827  * one master upper device can be linked, although other non-master devices
5828  * might be linked as well. The caller must hold the RTNL lock.
5829  * On a failure a negative errno code is returned. On success the reference
5830  * counts are adjusted and the function returns zero.
5831  */
5832 int netdev_master_upper_dev_link(struct net_device *dev,
5833                                  struct net_device *upper_dev,
5834                                  void *upper_priv, void *upper_info)
5835 {
5836         return __netdev_upper_dev_link(dev, upper_dev, true,
5837                                        upper_priv, upper_info);
5838 }
5839 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5840
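/*
 * Illustrative sketch: how a bonding/team style master typically enslaves a
 * device with netdev_master_upper_dev_link() above and releases it with
 * netdev_upper_dev_unlink() below.  struct example_port is a hypothetical
 * per-slave structure passed as @upper_priv, which can later be retrieved
 * with netdev_lower_dev_get_private(master, slave).
 */
#if 0	/* illustrative example only, not compiled */
static int example_enslave(struct net_device *master,
			   struct net_device *slave,
			   struct example_port *port)
{
	int err;

	ASSERT_RTNL();
	err = netdev_master_upper_dev_link(slave, master, port, NULL);
	if (err)
		return err;

	/* ... program the hardware, copy the MAC address, etc. ... */
	return 0;
}

static void example_release(struct net_device *master,
			    struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(slave, master);
}
#endif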
5841 /**
5842  * netdev_upper_dev_unlink - Removes a link to upper device
5843  * @dev: device
5844  * @upper_dev: upper device to unlink
5845  *
5846  * Removes a link to device which is upper to this one. The caller must hold
5847  * the RTNL lock.
5848  */
5849 void netdev_upper_dev_unlink(struct net_device *dev,
5850                              struct net_device *upper_dev)
5851 {
5852         struct netdev_notifier_changeupper_info changeupper_info;
5853         struct netdev_adjacent *i, *j;
5854         ASSERT_RTNL();
5855
5856         changeupper_info.upper_dev = upper_dev;
5857         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5858         changeupper_info.linking = false;
5859
5860         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5861                                       &changeupper_info.info);
5862
5863         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5864
5865         /* Here is the tricky part. We must remove all dev's lower
5866          * devices from all upper_dev's upper devices and vice
5867          * versa, to maintain the graph relationship.
5868          */
5869         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5870                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5871                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5872
5873         /* also remove the devices themselves from the lower/upper
5874          * device lists
5875          */
5876         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5877                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5878
5879         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5880                 __netdev_adjacent_dev_unlink(dev, i->dev);
5881
5882         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5883                                       &changeupper_info.info);
5884 }
5885 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5886
5887 /**
5888  * netdev_bonding_info_change - Dispatch event about slave change
5889  * @dev: device
5890  * @bonding_info: info to dispatch
5891  *
5892  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5893  * The caller must hold the RTNL lock.
5894  */
5895 void netdev_bonding_info_change(struct net_device *dev,
5896                                 struct netdev_bonding_info *bonding_info)
5897 {
5898         struct netdev_notifier_bonding_info     info;
5899
5900         memcpy(&info.bonding_info, bonding_info,
5901                sizeof(struct netdev_bonding_info));
5902         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5903                                       &info.info);
5904 }
5905 EXPORT_SYMBOL(netdev_bonding_info_change);
5906
5907 static void netdev_adjacent_add_links(struct net_device *dev)
5908 {
5909         struct netdev_adjacent *iter;
5910
5911         struct net *net = dev_net(dev);
5912
5913         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5914                 if (!net_eq(net, dev_net(iter->dev)))
5915                         continue;
5916                 netdev_adjacent_sysfs_add(iter->dev, dev,
5917                                           &iter->dev->adj_list.lower);
5918                 netdev_adjacent_sysfs_add(dev, iter->dev,
5919                                           &dev->adj_list.upper);
5920         }
5921
5922         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5923                 if (!net_eq(net, dev_net(iter->dev)))
5924                         continue;
5925                 netdev_adjacent_sysfs_add(iter->dev, dev,
5926                                           &iter->dev->adj_list.upper);
5927                 netdev_adjacent_sysfs_add(dev, iter->dev,
5928                                           &dev->adj_list.lower);
5929         }
5930 }
5931
5932 static void netdev_adjacent_del_links(struct net_device *dev)
5933 {
5934         struct netdev_adjacent *iter;
5935
5936         struct net *net = dev_net(dev);
5937
5938         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5939                 if (!net_eq(net, dev_net(iter->dev)))
5940                         continue;
5941                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5942                                           &iter->dev->adj_list.lower);
5943                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5944                                           &dev->adj_list.upper);
5945         }
5946
5947         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5948                 if (!net_eq(net, dev_net(iter->dev)))
5949                         continue;
5950                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5951                                           &iter->dev->adj_list.upper);
5952                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5953                                           &dev->adj_list.lower);
5954         }
5955 }
5956
5957 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5958 {
5959         struct netdev_adjacent *iter;
5960
5961         struct net *net = dev_net(dev);
5962
5963         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5964                 if (!net_eq(net, dev_net(iter->dev)))
5965                         continue;
5966                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5967                                           &iter->dev->adj_list.lower);
5968                 netdev_adjacent_sysfs_add(iter->dev, dev,
5969                                           &iter->dev->adj_list.lower);
5970         }
5971
5972         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5973                 if (!net_eq(net, dev_net(iter->dev)))
5974                         continue;
5975                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5976                                           &iter->dev->adj_list.upper);
5977                 netdev_adjacent_sysfs_add(iter->dev, dev,
5978                                           &iter->dev->adj_list.upper);
5979         }
5980 }
5981
5982 void *netdev_lower_dev_get_private(struct net_device *dev,
5983                                    struct net_device *lower_dev)
5984 {
5985         struct netdev_adjacent *lower;
5986
5987         if (!lower_dev)
5988                 return NULL;
5989         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5990         if (!lower)
5991                 return NULL;
5992
5993         return lower->private;
5994 }
5995 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5996
5997
5998 int dev_get_nest_level(struct net_device *dev,
5999                        bool (*type_check)(const struct net_device *dev))
6000 {
6001         struct net_device *lower = NULL;
6002         struct list_head *iter;
6003         int max_nest = -1;
6004         int nest;
6005
6006         ASSERT_RTNL();
6007
6008         netdev_for_each_lower_dev(dev, lower, iter) {
6009                 nest = dev_get_nest_level(lower, type_check);
6010                 if (max_nest < nest)
6011                         max_nest = nest;
6012         }
6013
6014         if (type_check(dev))
6015                 max_nest++;
6016
6017         return max_nest;
6018 }
6019 EXPORT_SYMBOL(dev_get_nest_level);
6020
6021 /**
6022  * netdev_lower_state_changed - Dispatch event about lower device state change
6023  * @lower_dev: device
6024  * @lower_state_info: state to dispatch
6025  *
6026  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6027  * The caller must hold the RTNL lock.
6028  */
6029 void netdev_lower_state_changed(struct net_device *lower_dev,
6030                                 void *lower_state_info)
6031 {
6032         struct netdev_notifier_changelowerstate_info changelowerstate_info;
6033
6034         ASSERT_RTNL();
6035         changelowerstate_info.lower_state_info = lower_state_info;
6036         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6037                                       &changelowerstate_info.info);
6038 }
6039 EXPORT_SYMBOL(netdev_lower_state_changed);
6040
6041 static void dev_change_rx_flags(struct net_device *dev, int flags)
6042 {
6043         const struct net_device_ops *ops = dev->netdev_ops;
6044
6045         if (ops->ndo_change_rx_flags)
6046                 ops->ndo_change_rx_flags(dev, flags);
6047 }
6048
6049 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6050 {
6051         unsigned int old_flags = dev->flags;
6052         kuid_t uid;
6053         kgid_t gid;
6054
6055         ASSERT_RTNL();
6056
6057         dev->flags |= IFF_PROMISC;
6058         dev->promiscuity += inc;
6059         if (dev->promiscuity == 0) {
6060                 /*
6061                  * Avoid overflow.
6062                  * If inc causes an overflow, leave promisc untouched and return an error.
6063                  */
6064                 if (inc < 0)
6065                         dev->flags &= ~IFF_PROMISC;
6066                 else {
6067                         dev->promiscuity -= inc;
6068                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6069                                 dev->name);
6070                         return -EOVERFLOW;
6071                 }
6072         }
6073         if (dev->flags != old_flags) {
6074                 pr_info("device %s %s promiscuous mode\n",
6075                         dev->name,
6076                         dev->flags & IFF_PROMISC ? "entered" : "left");
6077                 if (audit_enabled) {
6078                         current_uid_gid(&uid, &gid);
6079                         audit_log(current->audit_context, GFP_ATOMIC,
6080                                 AUDIT_ANOM_PROMISCUOUS,
6081                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6082                                 dev->name, (dev->flags & IFF_PROMISC),
6083                                 (old_flags & IFF_PROMISC),
6084                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6085                                 from_kuid(&init_user_ns, uid),
6086                                 from_kgid(&init_user_ns, gid),
6087                                 audit_get_sessionid(current));
6088                 }
6089
6090                 dev_change_rx_flags(dev, IFF_PROMISC);
6091         }
6092         if (notify)
6093                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6094         return 0;
6095 }
6096
6097 /**
6098  *      dev_set_promiscuity     - update promiscuity count on a device
6099  *      @dev: device
6100  *      @inc: modifier
6101  *
6102  *      Add or remove promiscuity from a device. While the count in the device
6103  *      remains above zero the interface remains promiscuous. Once it hits zero
6104  *      the device reverts to normal filtering operation. A negative @inc
6105  *      value is used to drop promiscuity from the device.
6106  *      Return 0 if successful or a negative errno code on error.
6107  */
6108 int dev_set_promiscuity(struct net_device *dev, int inc)
6109 {
6110         unsigned int old_flags = dev->flags;
6111         int err;
6112
6113         err = __dev_set_promiscuity(dev, inc, true);
6114         if (err < 0)
6115                 return err;
6116         if (dev->flags != old_flags)
6117                 dev_set_rx_mode(dev);
6118         return err;
6119 }
6120 EXPORT_SYMBOL(dev_set_promiscuity);
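
/*
 * Illustrative sketch (hypothetical packet-capture style caller): users of
 * dev_set_promiscuity() keep their +1/-1 calls balanced and hold the RTNL
 * lock around them, as __dev_set_promiscuity() above asserts.
 */
static int __maybe_unused example_capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);
        rtnl_unlock();
        return err;
}

static void __maybe_unused example_capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);
        rtnl_unlock();
}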
6121
6122 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6123 {
6124         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6125
6126         ASSERT_RTNL();
6127
6128         dev->flags |= IFF_ALLMULTI;
6129         dev->allmulti += inc;
6130         if (dev->allmulti == 0) {
6131                 /*
6132                  * Avoid overflow.
6133                  * If inc would overflow the counter, leave allmulti untouched and return an error.
6134                  */
6135                 if (inc < 0)
6136                         dev->flags &= ~IFF_ALLMULTI;
6137                 else {
6138                         dev->allmulti -= inc;
6139                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6140                                 dev->name);
6141                         return -EOVERFLOW;
6142                 }
6143         }
6144         if (dev->flags ^ old_flags) {
6145                 dev_change_rx_flags(dev, IFF_ALLMULTI);
6146                 dev_set_rx_mode(dev);
6147                 if (notify)
6148                         __dev_notify_flags(dev, old_flags,
6149                                            dev->gflags ^ old_gflags);
6150         }
6151         return 0;
6152 }
6153
6154 /**
6155  *      dev_set_allmulti        - update allmulti count on a device
6156  *      @dev: device
6157  *      @inc: modifier
6158  *
6159  *      Add or remove reception of all multicast frames to a device. While the
6160  *      count in the device remains above zero the interface remains listening
6161  *      to all multicast frames. Once it hits zero the device reverts to normal
6162  *      filtering operation. A negative @inc value is used to drop the counter
6163  *      when releasing a resource needing all multicasts.
6164  *      Return 0 if successful or a negative errno code on error.
6165  */
6166
6167 int dev_set_allmulti(struct net_device *dev, int inc)
6168 {
6169         return __dev_set_allmulti(dev, inc, true);
6170 }
6171 EXPORT_SYMBOL(dev_set_allmulti);
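
/*
 * Illustrative sketch (hypothetical caller): a protocol that must see every
 * multicast frame while it is active takes one allmulti reference when it
 * starts and drops it again when it stops, keeping the counter balanced.
 */
static int __maybe_unused example_need_allmulti(struct net_device *dev, bool on)
{
        int err;

        rtnl_lock();
        err = dev_set_allmulti(dev, on ? 1 : -1);
        rtnl_unlock();
        return err;
}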
6172
6173 /*
6174  *      Upload unicast and multicast address lists to device and
6175  *      configure RX filtering. When the device doesn't support unicast
6176  *      filtering it is put in promiscuous mode while unicast addresses
6177  *      are present.
6178  */
6179 void __dev_set_rx_mode(struct net_device *dev)
6180 {
6181         const struct net_device_ops *ops = dev->netdev_ops;
6182
6183         /* dev_open will call this function so the list will stay sane. */
6184         if (!(dev->flags&IFF_UP))
6185                 return;
6186
6187         if (!netif_device_present(dev))
6188                 return;
6189
6190         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6191                 /* Unicast address changes may only happen under the rtnl,
6192                  * therefore calling __dev_set_promiscuity here is safe.
6193                  */
6194                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6195                         __dev_set_promiscuity(dev, 1, false);
6196                         dev->uc_promisc = true;
6197                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6198                         __dev_set_promiscuity(dev, -1, false);
6199                         dev->uc_promisc = false;
6200                 }
6201         }
6202
6203         if (ops->ndo_set_rx_mode)
6204                 ops->ndo_set_rx_mode(dev);
6205 }
6206
6207 void dev_set_rx_mode(struct net_device *dev)
6208 {
6209         netif_addr_lock_bh(dev);
6210         __dev_set_rx_mode(dev);
6211         netif_addr_unlock_bh(dev);
6212 }
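
/*
 * Illustrative sketch of the driver side (the 16-entry filter size is an
 * assumption, not a real device): an ndo_set_rx_mode() hook runs with the
 * address list lock held and typically rebuilds its RX filter from
 * dev->flags and the dev->uc/dev->mc lists, e.g. deciding here whether it
 * has to fall back to all-multicast reception.
 */
static bool __maybe_unused example_rx_filter_needs_allmulti(struct net_device *dev)
{
        struct netdev_hw_addr *ha;
        unsigned int mc_count = 0;

        netdev_for_each_mc_addr(ha, dev)
                mc_count++;     /* a real driver would program ha->addr here */

        /* Fall back if userspace asked for it or the (assumed) 16-entry
         * hardware filter cannot hold the whole list.
         */
        return (dev->flags & IFF_ALLMULTI) || mc_count > 16;
}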
6213
6214 /**
6215  *      dev_get_flags - get flags reported to userspace
6216  *      @dev: device
6217  *
6218  *      Get the combination of flag bits exported through APIs to userspace.
6219  */
6220 unsigned int dev_get_flags(const struct net_device *dev)
6221 {
6222         unsigned int flags;
6223
6224         flags = (dev->flags & ~(IFF_PROMISC |
6225                                 IFF_ALLMULTI |
6226                                 IFF_RUNNING |
6227                                 IFF_LOWER_UP |
6228                                 IFF_DORMANT)) |
6229                 (dev->gflags & (IFF_PROMISC |
6230                                 IFF_ALLMULTI));
6231
6232         if (netif_running(dev)) {
6233                 if (netif_oper_up(dev))
6234                         flags |= IFF_RUNNING;
6235                 if (netif_carrier_ok(dev))
6236                         flags |= IFF_LOWER_UP;
6237                 if (netif_dormant(dev))
6238                         flags |= IFF_DORMANT;
6239         }
6240
6241         return flags;
6242 }
6243 EXPORT_SYMBOL(dev_get_flags);
6244
6245 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6246 {
6247         unsigned int old_flags = dev->flags;
6248         int ret;
6249
6250         ASSERT_RTNL();
6251
6252         /*
6253          *      Set the flags on our device.
6254          */
6255
6256         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6257                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6258                                IFF_AUTOMEDIA)) |
6259                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6260                                     IFF_ALLMULTI));
6261
6262         /*
6263          *      Load in the correct multicast list now the flags have changed.
6264          */
6265
6266         if ((old_flags ^ flags) & IFF_MULTICAST)
6267                 dev_change_rx_flags(dev, IFF_MULTICAST);
6268
6269         dev_set_rx_mode(dev);
6270
6271         /*
6272          *      Have we downed the interface? We handle IFF_UP ourselves
6273          *      according to user attempts to set it, rather than blindly
6274          *      setting it.
6275          */
6276
6277         ret = 0;
6278         if ((old_flags ^ flags) & IFF_UP)
6279                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6280
6281         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6282                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6283                 unsigned int old_flags = dev->flags;
6284
6285                 dev->gflags ^= IFF_PROMISC;
6286
6287                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6288                         if (dev->flags != old_flags)
6289                                 dev_set_rx_mode(dev);
6290         }
6291
6292         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6293            is important. Some (broken) drivers set IFF_PROMISC when
6294            IFF_ALLMULTI is requested, without asking us and without reporting it.
6295          */
6296         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6297                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6298
6299                 dev->gflags ^= IFF_ALLMULTI;
6300                 __dev_set_allmulti(dev, inc, false);
6301         }
6302
6303         return ret;
6304 }
6305
6306 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6307                         unsigned int gchanges)
6308 {
6309         unsigned int changes = dev->flags ^ old_flags;
6310
6311         if (gchanges)
6312                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6313
6314         if (changes & IFF_UP) {
6315                 if (dev->flags & IFF_UP)
6316                         call_netdevice_notifiers(NETDEV_UP, dev);
6317                 else
6318                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6319         }
6320
6321         if (dev->flags & IFF_UP &&
6322             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6323                 struct netdev_notifier_change_info change_info;
6324
6325                 change_info.flags_changed = changes;
6326                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6327                                               &change_info.info);
6328         }
6329 }
6330
6331 /**
6332  *      dev_change_flags - change device settings
6333  *      @dev: device
6334  *      @flags: device state flags
6335  *
6336  *      Change settings on a device based on state flags. The flags are
6337  *      in the userspace exported format.
6338  */
6339 int dev_change_flags(struct net_device *dev, unsigned int flags)
6340 {
6341         int ret;
6342         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6343
6344         ret = __dev_change_flags(dev, flags);
6345         if (ret < 0)
6346                 return ret;
6347
6348         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6349         __dev_notify_flags(dev, old_flags, changes);
6350         return ret;
6351 }
6352 EXPORT_SYMBOL(dev_change_flags);
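
/*
 * Illustrative sketch (hypothetical in-kernel caller): bringing an interface
 * administratively up means OR-ing IFF_UP into the userspace-visible flags
 * from dev_get_flags() and handing the result to dev_change_flags(), all
 * under the RTNL lock.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
        unsigned int flags;
        int err;

        rtnl_lock();
        flags = dev_get_flags(dev);
        err = dev_change_flags(dev, flags | IFF_UP);
        rtnl_unlock();
        return err;
}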
6353
6354 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6355 {
6356         const struct net_device_ops *ops = dev->netdev_ops;
6357
6358         if (ops->ndo_change_mtu)
6359                 return ops->ndo_change_mtu(dev, new_mtu);
6360
6361         dev->mtu = new_mtu;
6362         return 0;
6363 }
6364
6365 /**
6366  *      dev_set_mtu - Change maximum transfer unit
6367  *      @dev: device
6368  *      @new_mtu: new transfer unit
6369  *
6370  *      Change the maximum transfer size of the network device.
6371  */
6372 int dev_set_mtu(struct net_device *dev, int new_mtu)
6373 {
6374         int err, orig_mtu;
6375
6376         if (new_mtu == dev->mtu)
6377                 return 0;
6378
6379         /*      MTU must not be negative.        */
6380         if (new_mtu < 0)
6381                 return -EINVAL;
6382
6383         if (!netif_device_present(dev))
6384                 return -ENODEV;
6385
6386         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6387         err = notifier_to_errno(err);
6388         if (err)
6389                 return err;
6390
6391         orig_mtu = dev->mtu;
6392         err = __dev_set_mtu(dev, new_mtu);
6393
6394         if (!err) {
6395                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6396                 err = notifier_to_errno(err);
6397                 if (err) {
6398                         /* setting mtu back and notifying everyone again,
6399                          * so that they have a chance to revert changes.
6400                          */
6401                         __dev_set_mtu(dev, orig_mtu);
6402                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6403                 }
6404         }
6405         return err;
6406 }
6407 EXPORT_SYMBOL(dev_set_mtu);
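
/*
 * Illustrative sketch (hypothetical caller, 9000 is an assumed jumbo-frame
 * MTU): an in-kernel MTU change only needs RTNL and an error check; a veto
 * from a NETDEV_CHANGEMTU listener has already been rolled back by
 * dev_set_mtu() itself.
 */
static int __maybe_unused example_set_jumbo_mtu(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);
        rtnl_unlock();
        if (err)
                netdev_warn(dev, "could not switch to jumbo frames: %d\n", err);
        return err;
}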
6408
6409 /**
6410  *      dev_set_group - Change group this device belongs to
6411  *      @dev: device
6412  *      @new_group: group this device should belong to
6413  */
6414 void dev_set_group(struct net_device *dev, int new_group)
6415 {
6416         dev->group = new_group;
6417 }
6418 EXPORT_SYMBOL(dev_set_group);
6419
6420 /**
6421  *      dev_set_mac_address - Change Media Access Control Address
6422  *      @dev: device
6423  *      @sa: new address
6424  *
6425  *      Change the hardware (MAC) address of the device
6426  */
6427 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6428 {
6429         const struct net_device_ops *ops = dev->netdev_ops;
6430         int err;
6431
6432         if (!ops->ndo_set_mac_address)
6433                 return -EOPNOTSUPP;
6434         if (sa->sa_family != dev->type)
6435                 return -EINVAL;
6436         if (!netif_device_present(dev))
6437                 return -ENODEV;
6438         err = ops->ndo_set_mac_address(dev, sa);
6439         if (err)
6440                 return err;
6441         dev->addr_assign_type = NET_ADDR_SET;
6442         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6443         add_device_randomness(dev->dev_addr, dev->addr_len);
6444         return 0;
6445 }
6446 EXPORT_SYMBOL(dev_set_mac_address);
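
/*
 * Illustrative sketch (hypothetical caller): the new address travels in a
 * struct sockaddr whose sa_family must match dev->type; for Ethernet that
 * is ARPHRD_ETHER with the six address bytes at the start of sa_data.
 */
static int __maybe_unused example_set_mac(struct net_device *dev,
                                          const u8 addr[ETH_ALEN])
{
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, addr, ETH_ALEN);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();
        return err;
}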
6447
6448 /**
6449  *      dev_change_carrier - Change device carrier
6450  *      @dev: device
6451  *      @new_carrier: new value
6452  *
6453  *      Change device carrier
6454  */
6455 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6456 {
6457         const struct net_device_ops *ops = dev->netdev_ops;
6458
6459         if (!ops->ndo_change_carrier)
6460                 return -EOPNOTSUPP;
6461         if (!netif_device_present(dev))
6462                 return -ENODEV;
6463         return ops->ndo_change_carrier(dev, new_carrier);
6464 }
6465 EXPORT_SYMBOL(dev_change_carrier);
6466
6467 /**
6468  *      dev_get_phys_port_id - Get device physical port ID
6469  *      @dev: device
6470  *      @ppid: port ID
6471  *
6472  *      Get device physical port ID
6473  */
6474 int dev_get_phys_port_id(struct net_device *dev,
6475                          struct netdev_phys_item_id *ppid)
6476 {
6477         const struct net_device_ops *ops = dev->netdev_ops;
6478
6479         if (!ops->ndo_get_phys_port_id)
6480                 return -EOPNOTSUPP;
6481         return ops->ndo_get_phys_port_id(dev, ppid);
6482 }
6483 EXPORT_SYMBOL(dev_get_phys_port_id);
6484
6485 /**
6486  *      dev_get_phys_port_name - Get device physical port name
6487  *      @dev: device
6488  *      @name: port name
6489  *      @len: limit of bytes to copy to name
6490  *
6491  *      Get device physical port name
6492  */
6493 int dev_get_phys_port_name(struct net_device *dev,
6494                            char *name, size_t len)
6495 {
6496         const struct net_device_ops *ops = dev->netdev_ops;
6497
6498         if (!ops->ndo_get_phys_port_name)
6499                 return -EOPNOTSUPP;
6500         return ops->ndo_get_phys_port_name(dev, name, len);
6501 }
6502 EXPORT_SYMBOL(dev_get_phys_port_name);
6503
6504 /**
6505  *      dev_change_proto_down - update protocol port state information
6506  *      @dev: device
6507  *      @proto_down: new value
6508  *
6509  *      This info can be used by switch drivers to set the phys state of the
6510  *      port.
6511  */
6512 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6513 {
6514         const struct net_device_ops *ops = dev->netdev_ops;
6515
6516         if (!ops->ndo_change_proto_down)
6517                 return -EOPNOTSUPP;
6518         if (!netif_device_present(dev))
6519                 return -ENODEV;
6520         return ops->ndo_change_proto_down(dev, proto_down);
6521 }
6522 EXPORT_SYMBOL(dev_change_proto_down);
6523
6524 /**
6525  *      dev_new_index   -       allocate an ifindex
6526  *      @net: the applicable net namespace
6527  *
6528  *      Returns a suitable unique value for a new device interface
6529  *      number.  The caller must hold the rtnl semaphore or the
6530  *      dev_base_lock to be sure it remains unique.
6531  */
6532 static int dev_new_index(struct net *net)
6533 {
6534         int ifindex = net->ifindex;
6535         for (;;) {
6536                 if (++ifindex <= 0)
6537                         ifindex = 1;
6538                 if (!__dev_get_by_index(net, ifindex))
6539                         return net->ifindex = ifindex;
6540         }
6541 }
6542
6543 /* Delayed registration/unregistration */
6544 static LIST_HEAD(net_todo_list);
6545 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6546
6547 static void net_set_todo(struct net_device *dev)
6548 {
6549         list_add_tail(&dev->todo_list, &net_todo_list);
6550         dev_net(dev)->dev_unreg_count++;
6551 }
6552
6553 static void rollback_registered_many(struct list_head *head)
6554 {
6555         struct net_device *dev, *tmp;
6556         LIST_HEAD(close_head);
6557
6558         BUG_ON(dev_boot_phase);
6559         ASSERT_RTNL();
6560
6561         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6562                 /* Some devices call us without ever having been
6563                  * registered, as part of initialization unwind.
6564                  * Remove those devices and proceed with the remaining ones.
6565                  */
6566                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6567                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6568                                  dev->name, dev);
6569
6570                         WARN_ON(1);
6571                         list_del(&dev->unreg_list);
6572                         continue;
6573                 }
6574                 dev->dismantle = true;
6575                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6576         }
6577
6578         /* If device is running, close it first. */
6579         list_for_each_entry(dev, head, unreg_list)
6580                 list_add_tail(&dev->close_list, &close_head);
6581         dev_close_many(&close_head, true);
6582
6583         list_for_each_entry(dev, head, unreg_list) {
6584                 /* And unlink it from device chain. */
6585                 unlist_netdevice(dev);
6586
6587                 dev->reg_state = NETREG_UNREGISTERING;
6588                 on_each_cpu(flush_backlog, dev, 1);
6589         }
6590
6591         synchronize_net();
6592
6593         list_for_each_entry(dev, head, unreg_list) {
6594                 struct sk_buff *skb = NULL;
6595
6596                 /* Shutdown queueing discipline. */
6597                 dev_shutdown(dev);
6598
6599
6600                 /* Notify protocols that we are about to destroy
6601                    this device. They should clean up all of their state.
6602                 */
6603                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6604
6605                 if (!dev->rtnl_link_ops ||
6606                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6607                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6608                                                      GFP_KERNEL);
6609
6610                 /*
6611                  *      Flush the unicast and multicast chains
6612                  */
6613                 dev_uc_flush(dev);
6614                 dev_mc_flush(dev);
6615
6616                 if (dev->netdev_ops->ndo_uninit)
6617                         dev->netdev_ops->ndo_uninit(dev);
6618
6619                 if (skb)
6620                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6621
6622                 /* The notifier chain MUST have detached us from all upper devices. */
6623                 WARN_ON(netdev_has_any_upper_dev(dev));
6624
6625                 /* Remove entries from kobject tree */
6626                 netdev_unregister_kobject(dev);
6627 #ifdef CONFIG_XPS
6628                 /* Remove XPS queueing entries */
6629                 netif_reset_xps_queues_gt(dev, 0);
6630 #endif
6631         }
6632
6633         synchronize_net();
6634
6635         list_for_each_entry(dev, head, unreg_list)
6636                 dev_put(dev);
6637 }
6638
6639 static void rollback_registered(struct net_device *dev)
6640 {
6641         LIST_HEAD(single);
6642
6643         list_add(&dev->unreg_list, &single);
6644         rollback_registered_many(&single);
6645         list_del(&single);
6646 }
6647
6648 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6649         struct net_device *upper, netdev_features_t features)
6650 {
6651         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6652         netdev_features_t feature;
6653         int feature_bit;
6654
6655         for_each_netdev_feature(&upper_disables, feature_bit) {
6656                 feature = __NETIF_F_BIT(feature_bit);
6657                 if (!(upper->wanted_features & feature)
6658                     && (features & feature)) {
6659                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6660                                    &feature, upper->name);
6661                         features &= ~feature;
6662                 }
6663         }
6664
6665         return features;
6666 }
6667
6668 static void netdev_sync_lower_features(struct net_device *upper,
6669         struct net_device *lower, netdev_features_t features)
6670 {
6671         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6672         netdev_features_t feature;
6673         int feature_bit;
6674
6675         for_each_netdev_feature(&upper_disables, feature_bit) {
6676                 feature = __NETIF_F_BIT(feature_bit);
6677                 if (!(features & feature) && (lower->features & feature)) {
6678                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6679                                    &feature, lower->name);
6680                         lower->wanted_features &= ~feature;
6681                         netdev_update_features(lower);
6682
6683                         if (unlikely(lower->features & feature))
6684                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6685                                             &feature, lower->name);
6686                 }
6687         }
6688 }
6689
6690 static netdev_features_t netdev_fix_features(struct net_device *dev,
6691         netdev_features_t features)
6692 {
6693         /* Fix illegal checksum combinations */
6694         if ((features & NETIF_F_HW_CSUM) &&
6695             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6696                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6697                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6698         }
6699
6700         /* TSO requires that SG is present as well. */
6701         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6702                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6703                 features &= ~NETIF_F_ALL_TSO;
6704         }
6705
6706         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6707                                         !(features & NETIF_F_IP_CSUM)) {
6708                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6709                 features &= ~NETIF_F_TSO;
6710                 features &= ~NETIF_F_TSO_ECN;
6711         }
6712
6713         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6714                                          !(features & NETIF_F_IPV6_CSUM)) {
6715                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6716                 features &= ~NETIF_F_TSO6;
6717         }
6718
6719         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6720         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6721                 features &= ~NETIF_F_TSO_MANGLEID;
6722
6723         /* TSO ECN requires that TSO is present as well. */
6724         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6725                 features &= ~NETIF_F_TSO_ECN;
6726
6727         /* Software GSO depends on SG. */
6728         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6729                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6730                 features &= ~NETIF_F_GSO;
6731         }
6732
6733         /* UFO needs SG and checksumming */
6734         if (features & NETIF_F_UFO) {
6735                 /* maybe split UFO into V4 and V6? */
6736                 if (!(features & NETIF_F_HW_CSUM) &&
6737                     ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6738                      (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6739                         netdev_dbg(dev,
6740                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6741                         features &= ~NETIF_F_UFO;
6742                 }
6743
6744                 if (!(features & NETIF_F_SG)) {
6745                         netdev_dbg(dev,
6746                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6747                         features &= ~NETIF_F_UFO;
6748                 }
6749         }
6750
6751         /* GSO partial features require GSO partial be set */
6752         if ((features & dev->gso_partial_features) &&
6753             !(features & NETIF_F_GSO_PARTIAL)) {
6754                 netdev_dbg(dev,
6755                            "Dropping partially supported GSO features since no GSO partial.\n");
6756                 features &= ~dev->gso_partial_features;
6757         }
6758
6759 #ifdef CONFIG_NET_RX_BUSY_POLL
6760         if (dev->netdev_ops->ndo_busy_poll)
6761                 features |= NETIF_F_BUSY_POLL;
6762         else
6763 #endif
6764                 features &= ~NETIF_F_BUSY_POLL;
6765
6766         return features;
6767 }
6768
6769 int __netdev_update_features(struct net_device *dev)
6770 {
6771         struct net_device *upper, *lower;
6772         netdev_features_t features;
6773         struct list_head *iter;
6774         int err = -1;
6775
6776         ASSERT_RTNL();
6777
6778         features = netdev_get_wanted_features(dev);
6779
6780         if (dev->netdev_ops->ndo_fix_features)
6781                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6782
6783         /* driver might be less strict about feature dependencies */
6784         features = netdev_fix_features(dev, features);
6785
6786         /* some features can't be enabled if they're off on an upper device */
6787         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6788                 features = netdev_sync_upper_features(dev, upper, features);
6789
6790         if (dev->features == features)
6791                 goto sync_lower;
6792
6793         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6794                 &dev->features, &features);
6795
6796         if (dev->netdev_ops->ndo_set_features)
6797                 err = dev->netdev_ops->ndo_set_features(dev, features);
6798         else
6799                 err = 0;
6800
6801         if (unlikely(err < 0)) {
6802                 netdev_err(dev,
6803                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6804                         err, &features, &dev->features);
6805                 /* return non-0 since some features might have changed and
6806                  * it's better to fire a spurious notification than miss it
6807                  */
6808                 return -1;
6809         }
6810
6811 sync_lower:
6812         /* some features must be disabled on lower devices when disabled
6813          * on an upper device (think: bonding master or bridge)
6814          */
6815         netdev_for_each_lower_dev(dev, lower, iter)
6816                 netdev_sync_lower_features(dev, lower, features);
6817
6818         if (!err)
6819                 dev->features = features;
6820
6821         return err < 0 ? 0 : 1;
6822 }
6823
6824 /**
6825  *      netdev_update_features - recalculate device features
6826  *      @dev: the device to check
6827  *
6828  *      Recalculate dev->features set and send notifications if it
6829  *      has changed. Should be called after driver or hardware dependent
6830  *      conditions might have changed that influence the features.
6831  */
6832 void netdev_update_features(struct net_device *dev)
6833 {
6834         if (__netdev_update_features(dev))
6835                 netdev_features_change(dev);
6836 }
6837 EXPORT_SYMBOL(netdev_update_features);
6838
6839 /**
6840  *      netdev_change_features - recalculate device features
6841  *      @dev: the device to check
6842  *
6843  *      Recalculate dev->features set and send notifications even
6844  *      if they have not changed. Should be called instead of
6845  *      netdev_update_features() if also dev->vlan_features might
6846  *      have changed to allow the changes to be propagated to stacked
6847  *      VLAN devices.
6848  */
6849 void netdev_change_features(struct net_device *dev)
6850 {
6851         __netdev_update_features(dev);
6852         netdev_features_change(dev);
6853 }
6854 EXPORT_SYMBOL(netdev_change_features);
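
/*
 * Illustrative sketch (hypothetical driver): when driver-private state that
 * its ndo_fix_features() hook consults has changed (for instance a mode in
 * which an offload can no longer be honoured), the driver re-runs the
 * feature negotiation above under RTNL; userspace is only notified if the
 * resulting dev->features actually changed.
 */
static void __maybe_unused example_reconfigure(struct net_device *dev)
{
        ASSERT_RTNL();

        /* ... update the private state consulted by ndo_fix_features() ... */

        netdev_update_features(dev);
}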
6855
6856 /**
6857  *      netif_stacked_transfer_operstate -      transfer operstate
6858  *      @rootdev: the root or lower level device to transfer state from
6859  *      @dev: the device to transfer operstate to
6860  *
6861  *      Transfer operational state from root to device. This is normally
6862  *      called when a stacking relationship exists between the root
6863  *      device and the device (a leaf device).
6864  */
6865 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6866                                         struct net_device *dev)
6867 {
6868         if (rootdev->operstate == IF_OPER_DORMANT)
6869                 netif_dormant_on(dev);
6870         else
6871                 netif_dormant_off(dev);
6872
6873         if (netif_carrier_ok(rootdev)) {
6874                 if (!netif_carrier_ok(dev))
6875                         netif_carrier_on(dev);
6876         } else {
6877                 if (netif_carrier_ok(dev))
6878                         netif_carrier_off(dev);
6879         }
6880 }
6881 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6882
6883 #ifdef CONFIG_SYSFS
6884 static int netif_alloc_rx_queues(struct net_device *dev)
6885 {
6886         unsigned int i, count = dev->num_rx_queues;
6887         struct netdev_rx_queue *rx;
6888         size_t sz = count * sizeof(*rx);
6889
6890         BUG_ON(count < 1);
6891
6892         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6893         if (!rx) {
6894                 rx = vzalloc(sz);
6895                 if (!rx)
6896                         return -ENOMEM;
6897         }
6898         dev->_rx = rx;
6899
6900         for (i = 0; i < count; i++)
6901                 rx[i].dev = dev;
6902         return 0;
6903 }
6904 #endif
6905
6906 static void netdev_init_one_queue(struct net_device *dev,
6907                                   struct netdev_queue *queue, void *_unused)
6908 {
6909         /* Initialize queue lock */
6910         spin_lock_init(&queue->_xmit_lock);
6911         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6912         queue->xmit_lock_owner = -1;
6913         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6914         queue->dev = dev;
6915 #ifdef CONFIG_BQL
6916         dql_init(&queue->dql, HZ);
6917 #endif
6918 }
6919
6920 static void netif_free_tx_queues(struct net_device *dev)
6921 {
6922         kvfree(dev->_tx);
6923 }
6924
6925 static int netif_alloc_netdev_queues(struct net_device *dev)
6926 {
6927         unsigned int count = dev->num_tx_queues;
6928         struct netdev_queue *tx;
6929         size_t sz = count * sizeof(*tx);
6930
6931         if (count < 1 || count > 0xffff)
6932                 return -EINVAL;
6933
6934         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6935         if (!tx) {
6936                 tx = vzalloc(sz);
6937                 if (!tx)
6938                         return -ENOMEM;
6939         }
6940         dev->_tx = tx;
6941
6942         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6943         spin_lock_init(&dev->tx_global_lock);
6944
6945         return 0;
6946 }
6947
6948 void netif_tx_stop_all_queues(struct net_device *dev)
6949 {
6950         unsigned int i;
6951
6952         for (i = 0; i < dev->num_tx_queues; i++) {
6953                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6954                 netif_tx_stop_queue(txq);
6955         }
6956 }
6957 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6958
6959 /**
6960  *      register_netdevice      - register a network device
6961  *      @dev: device to register
6962  *
6963  *      Take a completed network device structure and add it to the kernel
6964  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6965  *      chain. 0 is returned on success. A negative errno code is returned
6966  *      on a failure to set up the device, or if the name is a duplicate.
6967  *
6968  *      Callers must hold the rtnl semaphore. You may want
6969  *      register_netdev() instead of this.
6970  *
6971  *      BUGS:
6972  *      The locking appears insufficient to guarantee two parallel registers
6973  *      will not get the same name.
6974  */
6975
6976 int register_netdevice(struct net_device *dev)
6977 {
6978         int ret;
6979         struct net *net = dev_net(dev);
6980
6981         BUG_ON(dev_boot_phase);
6982         ASSERT_RTNL();
6983
6984         might_sleep();
6985
6986         /* When net_device structures are persistent, this will be fatal. */
6987         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6988         BUG_ON(!net);
6989
6990         spin_lock_init(&dev->addr_list_lock);
6991         netdev_set_addr_lockdep_class(dev);
6992
6993         ret = dev_get_valid_name(net, dev, dev->name);
6994         if (ret < 0)
6995                 goto out;
6996
6997         /* Init, if this function is available */
6998         if (dev->netdev_ops->ndo_init) {
6999                 ret = dev->netdev_ops->ndo_init(dev);
7000                 if (ret) {
7001                         if (ret > 0)
7002                                 ret = -EIO;
7003                         goto out;
7004                 }
7005         }
7006
7007         if (((dev->hw_features | dev->features) &
7008              NETIF_F_HW_VLAN_CTAG_FILTER) &&
7009             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7010              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7011                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7012                 ret = -EINVAL;
7013                 goto err_uninit;
7014         }
7015
7016         ret = -EBUSY;
7017         if (!dev->ifindex)
7018                 dev->ifindex = dev_new_index(net);
7019         else if (__dev_get_by_index(net, dev->ifindex))
7020                 goto err_uninit;
7021
7022         /* Transfer changeable features to wanted_features and enable
7023          * software offloads (GSO and GRO).
7024          */
7025         dev->hw_features |= NETIF_F_SOFT_FEATURES;
7026         dev->features |= NETIF_F_SOFT_FEATURES;
7027         dev->wanted_features = dev->features & dev->hw_features;
7028
7029         if (!(dev->flags & IFF_LOOPBACK))
7030                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7031
7032         /* If IPv4 TCP segmentation offload is supported we should also
7033          * allow the device to enable segmenting the frame with the option
7034          * of ignoring a static IP ID value.  This doesn't enable the
7035          * feature itself but allows the user to enable it later.
7036          */
7037         if (dev->hw_features & NETIF_F_TSO)
7038                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7039         if (dev->vlan_features & NETIF_F_TSO)
7040                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7041         if (dev->mpls_features & NETIF_F_TSO)
7042                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7043         if (dev->hw_enc_features & NETIF_F_TSO)
7044                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7045
7046         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7047          */
7048         dev->vlan_features |= NETIF_F_HIGHDMA;
7049
7050         /* Make NETIF_F_SG inheritable to tunnel devices.
7051          */
7052         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7053
7054         /* Make NETIF_F_SG inheritable to MPLS.
7055          */
7056         dev->mpls_features |= NETIF_F_SG;
7057
7058         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7059         ret = notifier_to_errno(ret);
7060         if (ret)
7061                 goto err_uninit;
7062
7063         ret = netdev_register_kobject(dev);
7064         if (ret)
7065                 goto err_uninit;
7066         dev->reg_state = NETREG_REGISTERED;
7067
7068         __netdev_update_features(dev);
7069
7070         /*
7071          *      Default initial state at registration is that the
7072          *      device is present.
7073          */
7074
7075         set_bit(__LINK_STATE_PRESENT, &dev->state);
7076
7077         linkwatch_init_dev(dev);
7078
7079         dev_init_scheduler(dev);
7080         dev_hold(dev);
7081         list_netdevice(dev);
7082         add_device_randomness(dev->dev_addr, dev->addr_len);
7083
7084         /* If the device has a permanent device address, the driver
7085          * should set dev_addr and leave addr_assign_type set to
7086          * NET_ADDR_PERM (the default value).
7087          */
7088         if (dev->addr_assign_type == NET_ADDR_PERM)
7089                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7090
7091         /* Notify protocols that a new device appeared. */
7092         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7093         ret = notifier_to_errno(ret);
7094         if (ret) {
7095                 rollback_registered(dev);
7096                 dev->reg_state = NETREG_UNREGISTERED;
7097         }
7098         /*
7099          *      Prevent userspace races by waiting until the network
7100          *      device is fully setup before sending notifications.
7101          *      device is fully set up before sending notifications.
7102         if (!dev->rtnl_link_ops ||
7103             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7104                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7105
7106 out:
7107         return ret;
7108
7109 err_uninit:
7110         if (dev->netdev_ops->ndo_uninit)
7111                 dev->netdev_ops->ndo_uninit(dev);
7112         goto out;
7113 }
7114 EXPORT_SYMBOL(register_netdevice);
7115
7116 /**
7117  *      init_dummy_netdev       - init a dummy network device for NAPI
7118  *      @dev: device to init
7119  *
7120  *      This takes a network device structure and initializes the minimum
7121  *      number of fields so it can be used to schedule NAPI polls without
7122  *      registering a full blown interface. This is to be used by drivers
7123  *      that need to tie several hardware interfaces to a single NAPI
7124  *      poll scheduler due to HW limitations.
7125  */
7126 int init_dummy_netdev(struct net_device *dev)
7127 {
7128         /* Clear everything. Note we don't initialize spinlocks
7129          * as they aren't supposed to be taken by any of the
7130          * NAPI code and this dummy netdev is supposed to be
7131          * only ever used for NAPI polls
7132          */
7133         memset(dev, 0, sizeof(struct net_device));
7134
7135         /* make sure we BUG if trying to hit standard
7136          * register/unregister code path
7137          */
7138         dev->reg_state = NETREG_DUMMY;
7139
7140         /* NAPI wants this */
7141         INIT_LIST_HEAD(&dev->napi_list);
7142
7143         /* a dummy interface is started by default */
7144         set_bit(__LINK_STATE_PRESENT, &dev->state);
7145         set_bit(__LINK_STATE_START, &dev->state);
7146
7147         /* Note: We don't allocate pcpu_refcnt for dummy devices,
7148          * because users of this 'device' don't need to change
7149          * its refcount.
7150          */
7151
7152         return 0;
7153 }
7154 EXPORT_SYMBOL_GPL(init_dummy_netdev);
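
/*
 * Illustrative sketch (hypothetical example_* names): a multi-port device
 * with a single interrupt and NAPI context embeds a dummy netdev purely so
 * that netif_napi_add() has something to attach the NAPI instance to.
 */
struct example_hw {
        struct net_device napi_dev;     /* dummy, never registered */
        struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
        int work_done = 0;

        /* ... process up to @budget packets for all ports ... */
        if (work_done < budget)
                napi_complete(napi);
        return work_done;
}

static void __maybe_unused example_hw_init(struct example_hw *hw)
{
        init_dummy_netdev(&hw->napi_dev);
        netif_napi_add(&hw->napi_dev, &hw->napi, example_poll, NAPI_POLL_WEIGHT);
        napi_enable(&hw->napi);
}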
7155
7156
7157 /**
7158  *      register_netdev - register a network device
7159  *      @dev: device to register
7160  *
7161  *      Take a completed network device structure and add it to the kernel
7162  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7163  *      chain. 0 is returned on success. A negative errno code is returned
7164  *      on a failure to set up the device, or if the name is a duplicate.
7165  *
7166  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
7167  *      and expands the device name if you passed a format string to
7168  *      alloc_netdev.
7169  */
7170 int register_netdev(struct net_device *dev)
7171 {
7172         int err;
7173
7174         rtnl_lock();
7175         err = register_netdevice(dev);
7176         rtnl_unlock();
7177         return err;
7178 }
7179 EXPORT_SYMBOL(register_netdev);
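
/*
 * Illustrative sketch (hypothetical Ethernet driver probe, assuming the
 * usual alloc_etherdev() helper): the common life cycle is allocate, fill
 * in the ops, register_netdev() -- which takes and releases RTNL itself --
 * and free_netdev() again if anything fails afterwards.
 */
static int __maybe_unused example_probe(void)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(0);
        if (!dev)
                return -ENOMEM;

        /* dev->netdev_ops = &example_netdev_ops;  (driver specific) */

        err = register_netdev(dev);
        if (err) {
                free_netdev(dev);
                return err;
        }
        return 0;
}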
7180
7181 int netdev_refcnt_read(const struct net_device *dev)
7182 {
7183         int i, refcnt = 0;
7184
7185         for_each_possible_cpu(i)
7186                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7187         return refcnt;
7188 }
7189 EXPORT_SYMBOL(netdev_refcnt_read);
7190
7191 /**
7192  * netdev_wait_allrefs - wait until all references are gone.
7193  * @dev: target net_device
7194  *
7195  * This is called when unregistering network devices.
7196  *
7197  * Any protocol or device that holds a reference should register
7198  * for netdevice notification, and cleanup and put back the
7199  * reference if they receive an UNREGISTER event.
7200  * We can get stuck here if buggy protocols don't correctly
7201  * call dev_put.
7202  */
7203 static void netdev_wait_allrefs(struct net_device *dev)
7204 {
7205         unsigned long rebroadcast_time, warning_time;
7206         int refcnt;
7207
7208         linkwatch_forget_dev(dev);
7209
7210         rebroadcast_time = warning_time = jiffies;
7211         refcnt = netdev_refcnt_read(dev);
7212
7213         while (refcnt != 0) {
7214                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7215                         rtnl_lock();
7216
7217                         /* Rebroadcast unregister notification */
7218                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7219
7220                         __rtnl_unlock();
7221                         rcu_barrier();
7222                         rtnl_lock();
7223
7224                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7225                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7226                                      &dev->state)) {
7227                                 /* We must not have linkwatch events
7228                                  * pending on unregister. If this
7229                                  * happens, we simply run the queue
7230                                  * unscheduled, resulting in a noop
7231                                  * for this device.
7232                                  */
7233                                 linkwatch_run_queue();
7234                         }
7235
7236                         __rtnl_unlock();
7237
7238                         rebroadcast_time = jiffies;
7239                 }
7240
7241                 msleep(250);
7242
7243                 refcnt = netdev_refcnt_read(dev);
7244
7245                 if (time_after(jiffies, warning_time + 10 * HZ)) {
7246                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7247                                  dev->name, refcnt);
7248                         warning_time = jiffies;
7249                 }
7250         }
7251 }
7252
7253 /* The sequence is:
7254  *
7255  *      rtnl_lock();
7256  *      ...
7257  *      register_netdevice(x1);
7258  *      register_netdevice(x2);
7259  *      ...
7260  *      unregister_netdevice(y1);
7261  *      unregister_netdevice(y2);
7262  *      ...
7263  *      rtnl_unlock();
7264  *      free_netdev(y1);
7265  *      free_netdev(y2);
7266  *
7267  * We are invoked by rtnl_unlock().
7268  * This allows us to deal with problems:
7269  * 1) We can delete sysfs objects which invoke hotplug
7270  *    without deadlocking with linkwatch via keventd.
7271  * 2) Since we run with the RTNL semaphore not held, we can sleep
7272  *    safely in order to wait for the netdev refcnt to drop to zero.
7273  *
7274  * We must not return until all unregister events added during
7275  * the interval the lock was held have been completed.
7276  */
7277 void netdev_run_todo(void)
7278 {
7279         struct list_head list;
7280
7281         /* Snapshot list, allow later requests */
7282         list_replace_init(&net_todo_list, &list);
7283
7284         __rtnl_unlock();
7285
7286
7287         /* Wait for rcu callbacks to finish before next phase */
7288         if (!list_empty(&list))
7289                 rcu_barrier();
7290
7291         while (!list_empty(&list)) {
7292                 struct net_device *dev
7293                         = list_first_entry(&list, struct net_device, todo_list);
7294                 list_del(&dev->todo_list);
7295
7296                 rtnl_lock();
7297                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7298                 __rtnl_unlock();
7299
7300                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7301                         pr_err("network todo '%s' but state %d\n",
7302                                dev->name, dev->reg_state);
7303                         dump_stack();
7304                         continue;
7305                 }
7306
7307                 dev->reg_state = NETREG_UNREGISTERED;
7308
7309                 netdev_wait_allrefs(dev);
7310
7311                 /* paranoia */
7312                 BUG_ON(netdev_refcnt_read(dev));
7313                 BUG_ON(!list_empty(&dev->ptype_all));
7314                 BUG_ON(!list_empty(&dev->ptype_specific));
7315                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7316                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7317                 WARN_ON(dev->dn_ptr);
7318
7319                 if (dev->destructor)
7320                         dev->destructor(dev);
7321
7322                 /* Report a network device has been unregistered */
7323                 rtnl_lock();
7324                 dev_net(dev)->dev_unreg_count--;
7325                 __rtnl_unlock();
7326                 wake_up(&netdev_unregistering_wq);
7327
7328                 /* Free network device */
7329                 kobject_put(&dev->dev.kobj);
7330         }
7331 }
7332
7333 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7334  * all the same fields in the same order as net_device_stats, with only
7335  * the type differing, but rtnl_link_stats64 may have additional fields
7336  * at the end for newer counters.
7337  */
7338 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7339                              const struct net_device_stats *netdev_stats)
7340 {
7341 #if BITS_PER_LONG == 64
7342         BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7343         memcpy(stats64, netdev_stats, sizeof(*stats64));
7344         /* zero out counters that only exist in rtnl_link_stats64 */
7345         memset((char *)stats64 + sizeof(*netdev_stats), 0,
7346                sizeof(*stats64) - sizeof(*netdev_stats));
7347 #else
7348         size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7349         const unsigned long *src = (const unsigned long *)netdev_stats;
7350         u64 *dst = (u64 *)stats64;
7351
7352         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7353         for (i = 0; i < n; i++)
7354                 dst[i] = src[i];
7355         /* zero out counters that only exist in rtnl_link_stats64 */
7356         memset((char *)stats64 + n * sizeof(u64), 0,
7357                sizeof(*stats64) - n * sizeof(u64));
7358 #endif
7359 }
7360 EXPORT_SYMBOL(netdev_stats_to_stats64);
7361
7362 /**
7363  *      dev_get_stats   - get network device statistics
7364  *      @dev: device to get statistics from
7365  *      @storage: place to store stats
7366  *
7367  *      Get network statistics from device. Return @storage.
7368  *      The device driver may provide its own method by setting
7369  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7370  *      otherwise the internal statistics structure is used.
7371  */
7372 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7373                                         struct rtnl_link_stats64 *storage)
7374 {
7375         const struct net_device_ops *ops = dev->netdev_ops;
7376
7377         if (ops->ndo_get_stats64) {
7378                 memset(storage, 0, sizeof(*storage));
7379                 ops->ndo_get_stats64(dev, storage);
7380         } else if (ops->ndo_get_stats) {
7381                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7382         } else {
7383                 netdev_stats_to_stats64(storage, &dev->stats);
7384         }
7385         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7386         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7387         storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7388         return storage;
7389 }
7390 EXPORT_SYMBOL(dev_get_stats);
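
/*
 * Illustrative sketch (hypothetical caller): the statistics are copied into
 * caller-provided storage, so an on-stack rtnl_link_stats64 is sufficient;
 * the caller only has to make sure @dev stays alive, e.g. by holding a
 * reference or RTNL.
 */
static u64 __maybe_unused example_rx_packets(struct net_device *dev)
{
        struct rtnl_link_stats64 storage;

        dev_get_stats(dev, &storage);
        return storage.rx_packets;
}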
7391
7392 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7393 {
7394         struct netdev_queue *queue = dev_ingress_queue(dev);
7395
7396 #ifdef CONFIG_NET_CLS_ACT
7397         if (queue)
7398                 return queue;
7399         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7400         if (!queue)
7401                 return NULL;
7402         netdev_init_one_queue(dev, queue, NULL);
7403         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7404         queue->qdisc_sleeping = &noop_qdisc;
7405         rcu_assign_pointer(dev->ingress_queue, queue);
7406 #endif
7407         return queue;
7408 }
7409
7410 static const struct ethtool_ops default_ethtool_ops;
7411
7412 void netdev_set_default_ethtool_ops(struct net_device *dev,
7413                                     const struct ethtool_ops *ops)
7414 {
7415         if (dev->ethtool_ops == &default_ethtool_ops)
7416                 dev->ethtool_ops = ops;
7417 }
7418 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7419
7420 void netdev_freemem(struct net_device *dev)
7421 {
7422         char *addr = (char *)dev - dev->padded;
7423
7424         kvfree(addr);
7425 }
7426
7427 /**
7428  *      alloc_netdev_mqs - allocate network device
7429  *      @sizeof_priv:           size of private data to allocate space for
7430  *      @name:                  device name format string
7431  *      @name_assign_type:      origin of device name
7432  *      @setup:                 callback to initialize device
7433  *      @txqs:                  the number of TX subqueues to allocate
7434  *      @rxqs:                  the number of RX subqueues to allocate
7435  *
7436  *      Allocates a struct net_device with private data area for driver use
7437  *      and performs basic initialization.  Also allocates subqueue structs
7438  *      for each queue on the device.
7439  */
7440 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7441                 unsigned char name_assign_type,
7442                 void (*setup)(struct net_device *),
7443                 unsigned int txqs, unsigned int rxqs)
7444 {
7445         struct net_device *dev;
7446         size_t alloc_size;
7447         struct net_device *p;
7448
7449         BUG_ON(strlen(name) >= sizeof(dev->name));
7450
7451         if (txqs < 1) {
7452                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7453                 return NULL;
7454         }
7455
7456 #ifdef CONFIG_SYSFS
7457         if (rxqs < 1) {
7458                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7459                 return NULL;
7460         }
7461 #endif
7462
7463         alloc_size = sizeof(struct net_device);
7464         if (sizeof_priv) {
7465                 /* ensure 32-byte alignment of private area */
7466                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7467                 alloc_size += sizeof_priv;
7468         }
7469         /* ensure 32-byte alignment of whole construct */
7470         alloc_size += NETDEV_ALIGN - 1;
7471
7472         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7473         if (!p)
7474                 p = vzalloc(alloc_size);
7475         if (!p)
7476                 return NULL;
7477
7478         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7479         dev->padded = (char *)dev - (char *)p;
7480
7481         dev->pcpu_refcnt = alloc_percpu(int);
7482         if (!dev->pcpu_refcnt)
7483                 goto free_dev;
7484
7485         if (dev_addr_init(dev))
7486                 goto free_pcpu;
7487
7488         dev_mc_init(dev);
7489         dev_uc_init(dev);
7490
7491         dev_net_set(dev, &init_net);
7492
7493         dev->gso_max_size = GSO_MAX_SIZE;
7494         dev->gso_max_segs = GSO_MAX_SEGS;
7495
7496         INIT_LIST_HEAD(&dev->napi_list);
7497         INIT_LIST_HEAD(&dev->unreg_list);
7498         INIT_LIST_HEAD(&dev->close_list);
7499         INIT_LIST_HEAD(&dev->link_watch_list);
7500         INIT_LIST_HEAD(&dev->adj_list.upper);
7501         INIT_LIST_HEAD(&dev->adj_list.lower);
7502         INIT_LIST_HEAD(&dev->all_adj_list.upper);
7503         INIT_LIST_HEAD(&dev->all_adj_list.lower);
7504         INIT_LIST_HEAD(&dev->ptype_all);
7505         INIT_LIST_HEAD(&dev->ptype_specific);
7506         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7507         setup(dev);
7508
7509         if (!dev->tx_queue_len) {
7510                 dev->priv_flags |= IFF_NO_QUEUE;
7511                 dev->tx_queue_len = 1;
7512         }
7513
7514         dev->num_tx_queues = txqs;
7515         dev->real_num_tx_queues = txqs;
7516         if (netif_alloc_netdev_queues(dev))
7517                 goto free_all;
7518
7519 #ifdef CONFIG_SYSFS
7520         dev->num_rx_queues = rxqs;
7521         dev->real_num_rx_queues = rxqs;
7522         if (netif_alloc_rx_queues(dev))
7523                 goto free_all;
7524 #endif
7525
7526         strcpy(dev->name, name);
7527         dev->name_assign_type = name_assign_type;
7528         dev->group = INIT_NETDEV_GROUP;
7529         if (!dev->ethtool_ops)
7530                 dev->ethtool_ops = &default_ethtool_ops;
7531
7532         nf_hook_ingress_init(dev);
7533
7534         return dev;
7535
7536 free_all:
7537         free_netdev(dev);
7538         return NULL;
7539
7540 free_pcpu:
7541         free_percpu(dev->pcpu_refcnt);
7542 free_dev:
7543         netdev_freemem(dev);
7544         return NULL;
7545 }
7546 EXPORT_SYMBOL(alloc_netdev_mqs);
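
/*
 * Illustrative sketch (hypothetical example_* names, ether_setup() assumed
 * from <linux/etherdevice.h>): a driver wanting four TX and four RX queues
 * passes a setup callback plus the queue counts, and reaches its private
 * area via netdev_priv().
 */
struct example_priv {
        int example_field;
};

static struct net_device * __maybe_unused example_alloc(void)
{
        struct net_device *dev;
        struct example_priv *priv;

        dev = alloc_netdev_mqs(sizeof(struct example_priv), "example%d",
                               NET_NAME_UNKNOWN, ether_setup, 4, 4);
        if (!dev)
                return NULL;

        priv = netdev_priv(dev);
        priv->example_field = 0;
        return dev;
}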
7547
7548 /**
7549  *      free_netdev - free network device
7550  *      @dev: device
7551  *
7552  *      This function does the last stage of destroying an allocated device
7553  *      interface. The reference to the device object is released.
7554  *      If this is the last reference then it will be freed.
7555  *      Must be called in process context.
7556  */
7557 void free_netdev(struct net_device *dev)
7558 {
7559         struct napi_struct *p, *n;
7560
7561         might_sleep();
7562         netif_free_tx_queues(dev);
7563 #ifdef CONFIG_SYSFS
7564         kvfree(dev->_rx);
7565 #endif
7566
7567         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7568
7569         /* Flush device addresses */
7570         dev_addr_flush(dev);
7571
7572         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7573                 netif_napi_del(p);
7574
7575         free_percpu(dev->pcpu_refcnt);
7576         dev->pcpu_refcnt = NULL;
7577
7578         /*  Compatibility with error handling in drivers */
7579         if (dev->reg_state == NETREG_UNINITIALIZED) {
7580                 netdev_freemem(dev);
7581                 return;
7582         }
7583
7584         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7585         dev->reg_state = NETREG_RELEASED;
7586
7587         /* will free via device release */
7588         put_device(&dev->dev);
7589 }
7590 EXPORT_SYMBOL(free_netdev);
7591
7592 /**
7593  *      synchronize_net -  Synchronize with packet receive processing
7594  *
7595  *      Wait for packets currently being received to be done.
7596  *      Does not block later packets from starting.
7597  */
7598 void synchronize_net(void)
7599 {
7600         might_sleep();
7601         if (rtnl_is_locked())
7602                 synchronize_rcu_expedited();
7603         else
7604                 synchronize_rcu();
7605 }
7606 EXPORT_SYMBOL(synchronize_net);
7607
7608 /**
7609  *      unregister_netdevice_queue - remove device from the kernel
7610  *      @dev: device
7611  *      @head: list
7612  *
7613  *      This function shuts down a device interface and removes it
7614  *      from the kernel tables.
7615  *      If @head is not NULL, the device is queued to be unregistered later.
7616  *
7617  *      Callers must hold the rtnl semaphore.  You may want
7618  *      unregister_netdev() instead of this.
7619  */
7620
7621 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7622 {
7623         ASSERT_RTNL();
7624
7625         if (head) {
7626                 list_move_tail(&dev->unreg_list, head);
7627         } else {
7628                 rollback_registered(dev);
7629                 /* Finish processing unregister after unlock */
7630                 net_set_todo(dev);
7631         }
7632 }
7633 EXPORT_SYMBOL(unregister_netdevice_queue);
7634
7635 /**
7636  *      unregister_netdevice_many - unregister many devices
7637  *      @head: list of devices
7638  *
7639  *  Note: As most callers use a stack-allocated list_head,
7640  *  we force a list_del() to make sure the stack won't be corrupted later.
7641  */
7642 void unregister_netdevice_many(struct list_head *head)
7643 {
7644         struct net_device *dev;
7645
7646         if (!list_empty(head)) {
7647                 rollback_registered_many(head);
7648                 list_for_each_entry(dev, head, unreg_list)
7649                         net_set_todo(dev);
7650                 list_del(head);
7651         }
7652 }
7653 EXPORT_SYMBOL(unregister_netdevice_many);
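
/*
 * Batching sketch (illustrative; dev1/dev2 are hypothetical): queue
 * several devices under a single rtnl hold and tear them down together,
 * the way rtnl_link_ops->dellink() implementations do:
 *
 *      LIST_HEAD(list);
 *
 *      rtnl_lock();
 *      unregister_netdevice_queue(dev1, &list);
 *      unregister_netdevice_queue(dev2, &list);
 *      unregister_netdevice_many(&list);
 *      rtnl_unlock();
 */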
7654
7655 /**
7656  *      unregister_netdev - remove device from the kernel
7657  *      @dev: device
7658  *
7659  *      This function shuts down a device interface and removes it
7660  *      from the kernel tables.
7661  *
7662  *      This is just a wrapper for unregister_netdevice that takes
7663  *      the rtnl semaphore.  In general you want to use this and not
7664  *      unregister_netdevice.
7665  */
7666 void unregister_netdev(struct net_device *dev)
7667 {
7668         rtnl_lock();
7669         unregister_netdevice(dev);
7670         rtnl_unlock();
7671 }
7672 EXPORT_SYMBOL(unregister_netdev);
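
/*
 * Common driver teardown sketch (illustrative): unregister_netdev() takes
 * the rtnl lock itself, and free_netdev() then releases the device once
 * the last reference is gone:
 *
 *      unregister_netdev(dev);
 *      free_netdev(dev);
 */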
7673
7674 /**
7675  *      dev_change_net_namespace - move device to a different network namespace
7676  *      @dev: device
7677  *      @net: network namespace
7678  *      @pat: If not NULL, name pattern to try if the current device name
7679  *            is already taken in the destination network namespace.
7680  *
7681  *      This function shuts down a device interface and moves it
7682  *      to a new network namespace. On success 0 is returned, on
7683  *      failure a negative errno code is returned.
7684  *
7685  *      Callers must hold the rtnl semaphore.
7686  */
7687
7688 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7689 {
7690         int err;
7691
7692         ASSERT_RTNL();
7693
7694         /* Don't allow namespace local devices to be moved. */
7695         err = -EINVAL;
7696         if (dev->features & NETIF_F_NETNS_LOCAL)
7697                 goto out;
7698
7699         /* Ensure the device has been registered */
7700         if (dev->reg_state != NETREG_REGISTERED)
7701                 goto out;
7702
7703         /* Get out if there is nothing to do */
7704         err = 0;
7705         if (net_eq(dev_net(dev), net))
7706                 goto out;
7707
7708         /* Pick the destination device name, and ensure
7709          * we can use it in the destination network namespace.
7710          */
7711         err = -EEXIST;
7712         if (__dev_get_by_name(net, dev->name)) {
7713                 /* We get here if we can't use the current device name */
7714                 if (!pat)
7715                         goto out;
7716                 if (dev_get_valid_name(net, dev, pat) < 0)
7717                         goto out;
7718         }
7719
7720         /*
7721          * And now a mini version of register_netdevice() and unregister_netdevice().
7722          */
7723
7724         /* If device is running close it first. */
7725         dev_close(dev);
7726
7727         /* And unlink it from device chain */
7728         err = -ENODEV;
7729         unlist_netdevice(dev);
7730
7731         synchronize_net();
7732
7733         /* Shutdown queueing discipline. */
7734         dev_shutdown(dev);
7735
7736         /* Notify protocols that we are about to destroy
7737            this device; they should clean up all of their state.
7738
7739            Note that dev->reg_state stays at NETREG_REGISTERED.
7740            This is deliberate: this way 8021q and macvlan know
7741            the device is just moving and can keep their slaves up.
7742         */
7743         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7744         rcu_barrier();
7745         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7746         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7747
7748         /*
7749          *      Flush the unicast and multicast chains
7750          */
7751         dev_uc_flush(dev);
7752         dev_mc_flush(dev);
7753
7754         /* Send a netdev-removed uevent to the old namespace */
7755         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7756         netdev_adjacent_del_links(dev);
7757
7758         /* Actually switch the network namespace */
7759         dev_net_set(dev, net);
7760
7761         /* If there is an ifindex conflict assign a new one */
7762         if (__dev_get_by_index(net, dev->ifindex))
7763                 dev->ifindex = dev_new_index(net);
7764
7765         /* Send a netdev-add uevent to the new namespace */
7766         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7767         netdev_adjacent_add_links(dev);
7768
7769         /* Fixup kobjects */
7770         err = device_rename(&dev->dev, dev->name);
7771         WARN_ON(err);
7772
7773         /* Add the device back in the hashes */
7774         list_netdevice(dev);
7775
7776         /* Notify protocols, that a new device appeared. */
7777         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7778
7779         /*
7780          *      Prevent userspace races by waiting until the network
7781          *      device is fully setup before sending notifications.
7782          */
7783         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7784
7785         synchronize_net();
7786         err = 0;
7787 out:
7788         return err;
7789 }
7790 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
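
/*
 * Illustrative call under rtnl ("target_net" is hypothetical): move a
 * device into another namespace, falling back to a "dev%d" name if its
 * current name is already taken there:
 *
 *      rtnl_lock();
 *      err = dev_change_net_namespace(dev, target_net, "dev%d");
 *      rtnl_unlock();
 *
 * default_device_exit() below uses this same pattern when a namespace is
 * dismantled.
 */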
7791
7792 static int dev_cpu_callback(struct notifier_block *nfb,
7793                             unsigned long action,
7794                             void *ocpu)
7795 {
7796         struct sk_buff **list_skb;
7797         struct sk_buff *skb;
7798         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7799         struct softnet_data *sd, *oldsd;
7800
7801         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7802                 return NOTIFY_OK;
7803
7804         local_irq_disable();
7805         cpu = smp_processor_id();
7806         sd = &per_cpu(softnet_data, cpu);
7807         oldsd = &per_cpu(softnet_data, oldcpu);
7808
7809         /* Find end of our completion_queue. */
7810         list_skb = &sd->completion_queue;
7811         while (*list_skb)
7812                 list_skb = &(*list_skb)->next;
7813         /* Append completion queue from offline CPU. */
7814         *list_skb = oldsd->completion_queue;
7815         oldsd->completion_queue = NULL;
7816
7817         /* Append output queue from offline CPU. */
7818         if (oldsd->output_queue) {
7819                 *sd->output_queue_tailp = oldsd->output_queue;
7820                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7821                 oldsd->output_queue = NULL;
7822                 oldsd->output_queue_tailp = &oldsd->output_queue;
7823         }
7824         /* Append NAPI poll list from offline CPU, with one exception:
7825          * process_backlog() must be called by the CPU owning the per-cpu
7826          * backlog.  We properly handle process_queue & input_pkt_queue later.
7827          */
7828         while (!list_empty(&oldsd->poll_list)) {
7829                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7830                                                             struct napi_struct,
7831                                                             poll_list);
7832
7833                 list_del_init(&napi->poll_list);
7834                 if (napi->poll == process_backlog)
7835                         napi->state = 0;
7836                 else
7837                         ____napi_schedule(sd, napi);
7838         }
7839
7840         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7841         local_irq_enable();
7842
7843         /* Process offline CPU's input_pkt_queue */
7844         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7845                 netif_rx_ni(skb);
7846                 input_queue_head_incr(oldsd);
7847         }
7848         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7849                 netif_rx_ni(skb);
7850                 input_queue_head_incr(oldsd);
7851         }
7852
7853         return NOTIFY_OK;
7854 }
7855
7856
7857 /**
7858  *      netdev_increment_features - increment feature set by one
7859  *      @all: current feature set
7860  *      @one: new feature set
7861  *      @mask: mask feature set
7862  *
7863  *      Computes a new feature set after adding a device with feature set
7864  *      @one to the master device with current feature set @all.  Will not
7865  *      enable anything that is off in @mask. Returns the new feature set.
7866  */
7867 netdev_features_t netdev_increment_features(netdev_features_t all,
7868         netdev_features_t one, netdev_features_t mask)
7869 {
7870         if (mask & NETIF_F_HW_CSUM)
7871                 mask |= NETIF_F_CSUM_MASK;
7872         mask |= NETIF_F_VLAN_CHALLENGED;
7873
7874         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7875         all &= one | ~NETIF_F_ALL_FOR_ALL;
7876
7877         /* If one device supports hw checksumming, set for all. */
7878         if (all & NETIF_F_HW_CSUM)
7879                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7880
7881         return all;
7882 }
7883 EXPORT_SYMBOL(netdev_increment_features);
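
/*
 * Sketch of the intended aggregation loop (illustrative; "slave" and
 * "slaves" are hypothetical lower devices, and the starting value is an
 * assumption modeled on how bridging masters recompute their features):
 * a master folds in each lower device's feature set one at a time:
 *
 *      netdev_features_t features = mask & ~NETIF_F_ONE_FOR_ALL;
 *
 *      list_for_each_entry(slave, &slaves, list)
 *              features = netdev_increment_features(features,
 *                                                   slave->dev->features,
 *                                                   mask);
 */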
7884
7885 static struct hlist_head * __net_init netdev_create_hash(void)
7886 {
7887         int i;
7888         struct hlist_head *hash;
7889
7890         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7891         if (hash != NULL)
7892                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7893                         INIT_HLIST_HEAD(&hash[i]);
7894
7895         return hash;
7896 }
7897
7898 /* Initialize per network namespace state */
7899 static int __net_init netdev_init(struct net *net)
7900 {
7901         if (net != &init_net)
7902                 INIT_LIST_HEAD(&net->dev_base_head);
7903
7904         net->dev_name_head = netdev_create_hash();
7905         if (net->dev_name_head == NULL)
7906                 goto err_name;
7907
7908         net->dev_index_head = netdev_create_hash();
7909         if (net->dev_index_head == NULL)
7910                 goto err_idx;
7911
7912         return 0;
7913
7914 err_idx:
7915         kfree(net->dev_name_head);
7916 err_name:
7917         return -ENOMEM;
7918 }
7919
7920 /**
7921  *      netdev_drivername - network driver for the device
7922  *      @dev: network device
7923  *
7924  *      Determine network driver for device.
7925  */
7926 const char *netdev_drivername(const struct net_device *dev)
7927 {
7928         const struct device_driver *driver;
7929         const struct device *parent;
7930         const char *empty = "";
7931
7932         parent = dev->dev.parent;
7933         if (!parent)
7934                 return empty;
7935
7936         driver = parent->driver;
7937         if (driver && driver->name)
7938                 return driver->name;
7939         return empty;
7940 }
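
/*
 * Illustrative use of netdev_drivername(): it never returns NULL, so it
 * can be passed straight to a format string, e.g.
 *
 *      netdev_info(dev, "driver: %s\n", netdev_drivername(dev));
 */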
7941
7942 static void __netdev_printk(const char *level, const struct net_device *dev,
7943                             struct va_format *vaf)
7944 {
7945         if (dev && dev->dev.parent) {
7946                 dev_printk_emit(level[1] - '0',
7947                                 dev->dev.parent,
7948                                 "%s %s %s%s: %pV",
7949                                 dev_driver_string(dev->dev.parent),
7950                                 dev_name(dev->dev.parent),
7951                                 netdev_name(dev), netdev_reg_state(dev),
7952                                 vaf);
7953         } else if (dev) {
7954                 printk("%s%s%s: %pV",
7955                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7956         } else {
7957                 printk("%s(NULL net_device): %pV", level, vaf);
7958         }
7959 }
7960
7961 void netdev_printk(const char *level, const struct net_device *dev,
7962                    const char *format, ...)
7963 {
7964         struct va_format vaf;
7965         va_list args;
7966
7967         va_start(args, format);
7968
7969         vaf.fmt = format;
7970         vaf.va = &args;
7971
7972         __netdev_printk(level, dev, &vaf);
7973
7974         va_end(args);
7975 }
7976 EXPORT_SYMBOL(netdev_printk);
7977
7978 #define define_netdev_printk_level(func, level)                 \
7979 void func(const struct net_device *dev, const char *fmt, ...)   \
7980 {                                                               \
7981         struct va_format vaf;                                   \
7982         va_list args;                                           \
7983                                                                 \
7984         va_start(args, fmt);                                    \
7985                                                                 \
7986         vaf.fmt = fmt;                                          \
7987         vaf.va = &args;                                         \
7988                                                                 \
7989         __netdev_printk(level, dev, &vaf);                      \
7990                                                                 \
7991         va_end(args);                                           \
7992 }                                                               \
7993 EXPORT_SYMBOL(func);
7994
7995 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7996 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7997 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7998 define_netdev_printk_level(netdev_err, KERN_ERR);
7999 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8000 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8001 define_netdev_printk_level(netdev_info, KERN_INFO);
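
/*
 * The helpers generated above are used like dev_err()/dev_info() but tag
 * the message with the driver, bus device and interface name, e.g.
 * (illustrative; "txq" is a hypothetical queue index):
 *
 *      netdev_warn(dev, "link is down\n");
 *      netdev_err(dev, "tx timeout on queue %u\n", txq);
 */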
8002
8003 static void __net_exit netdev_exit(struct net *net)
8004 {
8005         kfree(net->dev_name_head);
8006         kfree(net->dev_index_head);
8007 }
8008
8009 static struct pernet_operations __net_initdata netdev_net_ops = {
8010         .init = netdev_init,
8011         .exit = netdev_exit,
8012 };
8013
8014 static void __net_exit default_device_exit(struct net *net)
8015 {
8016         struct net_device *dev, *aux;
8017         /*
8018          * Push all migratable network devices back to the
8019          * initial network namespace
8020          */
8021         rtnl_lock();
8022         for_each_netdev_safe(net, dev, aux) {
8023                 int err;
8024                 char fb_name[IFNAMSIZ];
8025
8026                 /* Ignore unmovable devices (e.g. loopback) */
8027                 if (dev->features & NETIF_F_NETNS_LOCAL)
8028                         continue;
8029
8030                 /* Leave virtual devices for the generic cleanup */
8031                 if (dev->rtnl_link_ops)
8032                         continue;
8033
8034                 /* Push remaining network devices to init_net */
8035                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8036                 err = dev_change_net_namespace(dev, &init_net, fb_name);
8037                 if (err) {
8038                         pr_emerg("%s: failed to move %s to init_net: %d\n",
8039                                  __func__, dev->name, err);
8040                         BUG();
8041                 }
8042         }
8043         rtnl_unlock();
8044 }
8045
8046 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8047 {
8048         /* Return with the rtnl_lock held when there are no network
8049          * devices unregistering in any network namespace in net_list.
8050          */
8051         struct net *net;
8052         bool unregistering;
8053         DEFINE_WAIT_FUNC(wait, woken_wake_function);
8054
8055         add_wait_queue(&netdev_unregistering_wq, &wait);
8056         for (;;) {
8057                 unregistering = false;
8058                 rtnl_lock();
8059                 list_for_each_entry(net, net_list, exit_list) {
8060                         if (net->dev_unreg_count > 0) {
8061                                 unregistering = true;
8062                                 break;
8063                         }
8064                 }
8065                 if (!unregistering)
8066                         break;
8067                 __rtnl_unlock();
8068
8069                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8070         }
8071         remove_wait_queue(&netdev_unregistering_wq, &wait);
8072 }
8073
8074 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8075 {
8076         /* At exit all network devices must be removed from a network
8077          * namespace.  Do this in the reverse order of registration.
8078          * Do this across as many network namespaces as possible to
8079          * improve batching efficiency.
8080          */
8081         struct net_device *dev;
8082         struct net *net;
8083         LIST_HEAD(dev_kill_list);
8084
8085         /* To prevent network device cleanup code from dereferencing
8086          * loopback devices or network devices that have been freed,
8087          * wait here for all pending unregistrations to complete
8088          * before unregistering the loopback device and allowing the
8089          * network namespace to be freed.
8090          *
8091          * The netdev todo list containing all network device
8092          * unregistrations that happen in default_device_exit_batch
8093          * will run in the rtnl_unlock() at the end of
8094          * default_device_exit_batch.
8095          */
8096         rtnl_lock_unregistering(net_list);
8097         list_for_each_entry(net, net_list, exit_list) {
8098                 for_each_netdev_reverse(net, dev) {
8099                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8100                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8101                         else
8102                                 unregister_netdevice_queue(dev, &dev_kill_list);
8103                 }
8104         }
8105         unregister_netdevice_many(&dev_kill_list);
8106         rtnl_unlock();
8107 }
8108
8109 static struct pernet_operations __net_initdata default_device_ops = {
8110         .exit = default_device_exit,
8111         .exit_batch = default_device_exit_batch,
8112 };
8113
8114 /*
8115  *      Initialize the DEV module. At boot time this walks the device list and
8116  *      unhooks any devices that fail to initialise (normally hardware not
8117  *      present) and leaves us with a valid list of present and active devices.
8118  *
8119  */
8120
8121 /*
8122  *       This is called single-threaded during boot, so there is no need
8123  *       to take the rtnl semaphore.
8124  */
8125 static int __init net_dev_init(void)
8126 {
8127         int i, rc = -ENOMEM;
8128
8129         BUG_ON(!dev_boot_phase);
8130
8131         if (dev_proc_init())
8132                 goto out;
8133
8134         if (netdev_kobject_init())
8135                 goto out;
8136
8137         INIT_LIST_HEAD(&ptype_all);
8138         for (i = 0; i < PTYPE_HASH_SIZE; i++)
8139                 INIT_LIST_HEAD(&ptype_base[i]);
8140
8141         INIT_LIST_HEAD(&offload_base);
8142
8143         if (register_pernet_subsys(&netdev_net_ops))
8144                 goto out;
8145
8146         /*
8147          *      Initialise the packet receive queues.
8148          */
8149
8150         for_each_possible_cpu(i) {
8151                 struct softnet_data *sd = &per_cpu(softnet_data, i);
8152
8153                 skb_queue_head_init(&sd->input_pkt_queue);
8154                 skb_queue_head_init(&sd->process_queue);
8155                 INIT_LIST_HEAD(&sd->poll_list);
8156                 sd->output_queue_tailp = &sd->output_queue;
8157 #ifdef CONFIG_RPS
8158                 sd->csd.func = rps_trigger_softirq;
8159                 sd->csd.info = sd;
8160                 sd->cpu = i;
8161 #endif
8162
8163                 sd->backlog.poll = process_backlog;
8164                 sd->backlog.weight = weight_p;
8165         }
8166
8167         dev_boot_phase = 0;
8168
8169         /* The loopback device is special: if any other network device
8170          * is present in a network namespace, the loopback device must
8171          * be present too.  Since we now dynamically allocate and free
8172          * the loopback device, ensure this invariant is maintained by
8173          * keeping the loopback device as the first device on the
8174          * list of network devices, ensuring the loopback device
8175          * is the first device that appears and the last network device
8176          * that disappears.
8177          */
8178         if (register_pernet_device(&loopback_net_ops))
8179                 goto out;
8180
8181         if (register_pernet_device(&default_device_ops))
8182                 goto out;
8183
8184         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8185         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8186
8187         hotcpu_notifier(dev_cpu_callback, 0);
8188         dst_subsys_init();
8189         rc = 0;
8190 out:
8191         return rc;
8192 }
8193
8194 subsys_initcall(net_dev_init);