2ab0922af0b430f0cf6e6e0b88099cffba3d698f
[cascardo/linux.git] / drivers / net / vxlan.c
1 /*
2  * VXLAN: Virtual eXtensible Local Area Network
3  *
4  * Copyright (c) 2012-2013 Vyatta Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13 #include <linux/kernel.h>
14 #include <linux/types.h>
15 #include <linux/module.h>
16 #include <linux/errno.h>
17 #include <linux/slab.h>
18 #include <linux/skbuff.h>
19 #include <linux/rculist.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/ip.h>
23 #include <linux/udp.h>
24 #include <linux/igmp.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/hash.h>
29 #include <linux/ethtool.h>
30 #include <net/arp.h>
31 #include <net/ndisc.h>
32 #include <net/ip.h>
33 #include <net/ip_tunnels.h>
34 #include <net/icmp.h>
35 #include <net/udp.h>
36 #include <net/udp_tunnel.h>
37 #include <net/rtnetlink.h>
38 #include <net/route.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/vxlan.h>
44 #include <net/protocol.h>
45 #include <net/udp_tunnel.h>
46 #if IS_ENABLED(CONFIG_IPV6)
47 #include <net/ipv6.h>
48 #include <net/addrconf.h>
49 #include <net/ip6_tunnel.h>
50 #include <net/ip6_checksum.h>
51 #endif
52
53 #define VXLAN_VERSION   "0.1"
54
55 #define PORT_HASH_BITS  8
56 #define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
57 #define VNI_HASH_BITS   10
58 #define VNI_HASH_SIZE   (1<<VNI_HASH_BITS)
59 #define FDB_HASH_BITS   8
60 #define FDB_HASH_SIZE   (1<<FDB_HASH_BITS)
61 #define FDB_AGE_DEFAULT 300 /* 5 min */
62 #define FDB_AGE_INTERVAL (10 * HZ)      /* rescan interval */
63
64 #define VXLAN_N_VID     (1u << 24)
65 #define VXLAN_VID_MASK  (VXLAN_N_VID - 1)
66 #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
67
68 #define VXLAN_FLAGS 0x08000000  /* struct vxlanhdr.vx_flags required value. */
69
70 /* UDP port for VXLAN traffic.
71  * The IANA assigned port is 4789, but the Linux default is 8472
72  * for compatibility with early adopters.
73  */
74 static unsigned short vxlan_port __read_mostly = 8472;
75 module_param_named(udp_port, vxlan_port, ushort, 0444);
76 MODULE_PARM_DESC(udp_port, "Destination UDP port");
77
78 static bool log_ecn_error = true;
79 module_param(log_ecn_error, bool, 0644);
80 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
81
82 static int vxlan_net_id;
83
84 static const u8 all_zeros_mac[ETH_ALEN];
85
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;			/* all vxlan devices in this netns */
	struct hlist_head sock_list[PORT_HASH_SIZE];	/* vxlan_sock's, hashed by UDP port (see vs_head) */
	spinlock_t        sock_lock;			/* NOTE(review): presumably serializes sock_list updates — confirm at use sites */
};
92
/* IPv4 or IPv6 endpoint address; sa.sa_family selects which view is valid */
union vxlan_addr {
	struct sockaddr_in sin;
	struct sockaddr_in6 sin6;
	struct sockaddr sa;
};
98
/* Remote destination for a forwarding entry */
struct vxlan_rdst {
	union vxlan_addr         remote_ip;	/* tunnel endpoint address */
	__be16                   remote_port;	/* UDP destination port */
	u32                      remote_vni;	/* VXLAN network identifier */
	u32                      remote_ifindex; /* optional egress device */
	struct list_head         list;		/* member of vxlan_fdb::remotes */
	struct rcu_head          rcu;
};
107
/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head   rcu;
	unsigned long     updated;	/* jiffies */
	unsigned long     used;		/* jiffies of last lookup (see vxlan_find_mac) */
	struct list_head  remotes;	/* list of vxlan_rdst, never empty */
	u16               state;	/* see ndm_state */
	u8                flags;	/* see ndm_flags */
	u8                eth_addr[ETH_ALEN];
};
119
/* Pseudo network device */
struct vxlan_dev {
	struct hlist_node hlist;	/* vni hash table */
	struct list_head  next;		/* vxlan's per namespace list */
	struct vxlan_sock *vn_sock;	/* listening socket */
	struct net_device *dev;
	struct net        *net;		/* netns for packet i/o */
	struct vxlan_rdst default_dst;	/* default destination */
	union vxlan_addr  saddr;	/* source address */
	__be16            dst_port;	/* default UDP destination port */
	__u16             port_min;	/* source port range */
	__u16             port_max;
	__u8              tos;		/* TOS override */
	__u8              ttl;
	u32               flags;	/* VXLAN_F_* in vxlan.h */

	struct work_struct sock_work;	/* deferred socket setup (vxlan_sock_work) */
	struct work_struct igmp_join;
	struct work_struct igmp_leave;

	unsigned long     age_interval;	/* FDB aging period */
	struct timer_list age_timer;
	spinlock_t        hash_lock;	/* protects fdb_head and addrcnt */
	unsigned int      addrcnt;	/* current number of FDB entries */
	unsigned int      addrmax;	/* 0 = unlimited (see vxlan_fdb_create) */

	struct hlist_head fdb_head[FDB_HASH_SIZE];
};
148
149 /* salt for hash table */
150 static u32 vxlan_salt __read_mostly;
151 static struct workqueue_struct *vxlan_wq;
152
153 static void vxlan_sock_work(struct work_struct *work);
154
155 #if IS_ENABLED(CONFIG_IPV6)
156 static inline
157 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
158 {
159        if (a->sa.sa_family != b->sa.sa_family)
160                return false;
161        if (a->sa.sa_family == AF_INET6)
162                return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
163        else
164                return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
165 }
166
167 static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
168 {
169        if (ipa->sa.sa_family == AF_INET6)
170                return ipv6_addr_any(&ipa->sin6.sin6_addr);
171        else
172                return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
173 }
174
175 static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
176 {
177        if (ipa->sa.sa_family == AF_INET6)
178                return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
179        else
180                return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
181 }
182
183 static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
184 {
185        if (nla_len(nla) >= sizeof(struct in6_addr)) {
186                nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr));
187                ip->sa.sa_family = AF_INET6;
188                return 0;
189        } else if (nla_len(nla) >= sizeof(__be32)) {
190                ip->sin.sin_addr.s_addr = nla_get_be32(nla);
191                ip->sa.sa_family = AF_INET;
192                return 0;
193        } else {
194                return -EAFNOSUPPORT;
195        }
196 }
197
198 static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
199                              const union vxlan_addr *ip)
200 {
201        if (ip->sa.sa_family == AF_INET6)
202                return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr);
203        else
204                return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
205 }
206
207 #else /* !CONFIG_IPV6 */
208
209 static inline
210 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
211 {
212        return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
213 }
214
215 static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
216 {
217        return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
218 }
219
220 static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
221 {
222        return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
223 }
224
225 static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
226 {
227        if (nla_len(nla) >= sizeof(struct in6_addr)) {
228                return -EAFNOSUPPORT;
229        } else if (nla_len(nla) >= sizeof(__be32)) {
230                ip->sin.sin_addr.s_addr = nla_get_be32(nla);
231                ip->sa.sa_family = AF_INET;
232                return 0;
233        } else {
234                return -EAFNOSUPPORT;
235        }
236 }
237
238 static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
239                              const union vxlan_addr *ip)
240 {
241        return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
242 }
243 #endif
244
245 /* Virtual Network hash table head */
246 static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
247 {
248         return &vs->vni_list[hash_32(id, VNI_HASH_BITS)];
249 }
250
251 /* Socket hash table head */
252 static inline struct hlist_head *vs_head(struct net *net, __be16 port)
253 {
254         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
255
256         return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
257 }
258
259 /* First remote destination for a forwarding entry.
260  * Guaranteed to be non-NULL because remotes are never deleted.
261  */
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 * RCU variant: caller must be in an RCU read-side critical section.
 */
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}
266
/* As first_remote_rcu(), but for callers serialized by RTNL instead of RCU. */
static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}
271
272 /* Find VXLAN socket based on network namespace, address family and UDP port */
273 static struct vxlan_sock *vxlan_find_sock(struct net *net,
274                                           sa_family_t family, __be16 port)
275 {
276         struct vxlan_sock *vs;
277
278         hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
279                 if (inet_sk(vs->sock->sk)->inet_sport == port &&
280                     inet_sk(vs->sock->sk)->sk.sk_family == family)
281                         return vs;
282         }
283         return NULL;
284 }
285
286 static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id)
287 {
288         struct vxlan_dev *vxlan;
289
290         hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
291                 if (vxlan->default_dst.remote_vni == id)
292                         return vxlan;
293         }
294
295         return NULL;
296 }
297
298 /* Look up VNI in a per net namespace table */
299 static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id,
300                                         sa_family_t family, __be16 port)
301 {
302         struct vxlan_sock *vs;
303
304         vs = vxlan_find_sock(net, family, port);
305         if (!vs)
306                 return NULL;
307
308         return vxlan_vs_find_vni(vs, id);
309 }
310
311 /* Fill in neighbour message in skbuff. */
312 static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
313                           const struct vxlan_fdb *fdb,
314                           u32 portid, u32 seq, int type, unsigned int flags,
315                           const struct vxlan_rdst *rdst)
316 {
317         unsigned long now = jiffies;
318         struct nda_cacheinfo ci;
319         struct nlmsghdr *nlh;
320         struct ndmsg *ndm;
321         bool send_ip, send_eth;
322
323         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
324         if (nlh == NULL)
325                 return -EMSGSIZE;
326
327         ndm = nlmsg_data(nlh);
328         memset(ndm, 0, sizeof(*ndm));
329
330         send_eth = send_ip = true;
331
332         if (type == RTM_GETNEIGH) {
333                 ndm->ndm_family = AF_INET;
334                 send_ip = !vxlan_addr_any(&rdst->remote_ip);
335                 send_eth = !is_zero_ether_addr(fdb->eth_addr);
336         } else
337                 ndm->ndm_family = AF_BRIDGE;
338         ndm->ndm_state = fdb->state;
339         ndm->ndm_ifindex = vxlan->dev->ifindex;
340         ndm->ndm_flags = fdb->flags;
341         ndm->ndm_type = RTN_UNICAST;
342
343         if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
344                 goto nla_put_failure;
345
346         if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
347                 goto nla_put_failure;
348
349         if (rdst->remote_port && rdst->remote_port != vxlan->dst_port &&
350             nla_put_be16(skb, NDA_PORT, rdst->remote_port))
351                 goto nla_put_failure;
352         if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
353             nla_put_u32(skb, NDA_VNI, rdst->remote_vni))
354                 goto nla_put_failure;
355         if (rdst->remote_ifindex &&
356             nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
357                 goto nla_put_failure;
358
359         ci.ndm_used      = jiffies_to_clock_t(now - fdb->used);
360         ci.ndm_confirmed = 0;
361         ci.ndm_updated   = jiffies_to_clock_t(now - fdb->updated);
362         ci.ndm_refcnt    = 0;
363
364         if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
365                 goto nla_put_failure;
366
367         return nlmsg_end(skb, nlh);
368
369 nla_put_failure:
370         nlmsg_cancel(skb, nlh);
371         return -EMSGSIZE;
372 }
373
/* Worst-case payload size of a message built by vxlan_fdb_info();
 * NDA_DST is sized for IPv6, the larger of the two families.
 */
static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}
384
/* Send an RTNLGRP_NEIGH notification for one (fdb, rdst) pair.
 * Best effort: on allocation or fill failure the error is recorded on the
 * netlink group via rtnl_set_sk_err() rather than propagated.
 */
static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			     struct vxlan_rdst *rd, int type)
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
410
/* Notify userspace (RTM_GETNEIGH) of a miss for an IP address: a synthetic
 * stale entry with no MAC is reported so a daemon can resolve the mapping.
 */
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
		.remote_ip = *ipa, /* goes to NDA_DST */
		.remote_vni = VXLAN_N_VID,
	};

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
}
424
/* Notify userspace (RTM_GETNEIGH) of a miss for a MAC address: a synthetic
 * stale entry with no remote IP is reported (zero remote suppresses NDA_DST).
 */
static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = { };

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
}
436
437 /* Hash Ethernet address */
/* Hash Ethernet address.
 * Loads 64 bits starting at the 6-byte MAC and shifts away the 16 bits
 * that lie beyond it (high bits on little-endian, low bits on big-endian)
 * before hashing.  NOTE(review): this reads 2 bytes past the address and
 * assumes they are mapped/readable — true for eth_addr embedded in
 * struct vxlan_fdb, but callers must not pass a bare 6-byte buffer at the
 * end of an allocation.
 */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
#else
	value <<= 16;
#endif
	return hash_64(value, FDB_HASH_BITS);
}
450
451 /* Hash chain to use given mac address */
452 static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
453                                                 const u8 *mac)
454 {
455         return &vxlan->fdb_head[eth_hash(mac)];
456 }
457
458 /* Look up Ethernet address in forwarding table */
459 static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
460                                         const u8 *mac)
461 {
462         struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
463         struct vxlan_fdb *f;
464
465         hlist_for_each_entry_rcu(f, head, hlist) {
466                 if (ether_addr_equal(mac, f->eth_addr))
467                         return f;
468         }
469
470         return NULL;
471 }
472
473 static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
474                                         const u8 *mac)
475 {
476         struct vxlan_fdb *f;
477
478         f = __vxlan_find_mac(vxlan, mac);
479         if (f)
480                 f->used = jiffies;
481
482         return f;
483 }
484
485 /* caller should hold vxlan->hash_lock */
486 static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
487                                               union vxlan_addr *ip, __be16 port,
488                                               __u32 vni, __u32 ifindex)
489 {
490         struct vxlan_rdst *rd;
491
492         list_for_each_entry(rd, &f->remotes, list) {
493                 if (vxlan_addr_equal(&rd->remote_ip, ip) &&
494                     rd->remote_port == port &&
495                     rd->remote_vni == vni &&
496                     rd->remote_ifindex == ifindex)
497                         return rd;
498         }
499
500         return NULL;
501 }
502
503 /* Replace destination of unicast mac */
504 static int vxlan_fdb_replace(struct vxlan_fdb *f,
505                              union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
506 {
507         struct vxlan_rdst *rd;
508
509         rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
510         if (rd)
511                 return 0;
512
513         rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
514         if (!rd)
515                 return 0;
516         rd->remote_ip = *ip;
517         rd->remote_port = port;
518         rd->remote_vni = vni;
519         rd->remote_ifindex = ifindex;
520         return 1;
521 }
522
523 /* Add/update destinations for multicast */
/* Add/update destinations for multicast.
 * Returns 1 and stores the new remote in *rdp when a destination was
 * appended, 0 if it already existed, -ENOBUFS on allocation failure.
 * Caller should hold vxlan->hash_lock.
 */
static int vxlan_fdb_append(struct vxlan_fdb *f,
			    union vxlan_addr *ip, __be16 port, __u32 vni,
			    __u32 ifindex, struct vxlan_rdst **rdp)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;

	/* RCU insertion so readers iterating f->remotes see a valid list. */
	list_add_tail_rcu(&rd->list, &f->remotes);

	*rdp = rd;
	return 1;
}
547
/* GRO receive handler for the VXLAN UDP tunnel: aggregates packets whose
 * VXLAN headers (notably the VNI) match, then hands the inner Ethernet
 * frame to eth_gro_receive().  Returns the list eth_gro_receive() returns,
 * or NULL with the flush flag set if the VXLAN header can't be read.
 */
static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff *p, **pp = NULL;
	struct vxlanhdr *vh, *vh2;
	unsigned int hlen, off_vx;
	int flush = 1;

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh   = skb_gro_header_fast(skb, off_vx);
	if (skb_gro_header_hard(skb, hlen)) {
		/* Header not in the linear area; pull it in. */
		vh = skb_gro_header_slow(skb, hlen, off_vx);
		if (unlikely(!vh))
			goto out;
	}

	flush = 0;

	/* Mark held packets with a different VNI as not the same flow. */
	for (p = *head; p; p = p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
		if (vh->vx_vni != vh2->vx_vni) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

	/* Advance past the VXLAN header, keeping the checksum consistent. */
	skb_gro_pull(skb, sizeof(struct vxlanhdr));
	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));
	pp = eth_gro_receive(head, skb);

out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
586
/* GRO completion: finalize the UDP tunnel header, then complete the inner
 * Ethernet frame that starts just past the VXLAN header.
 */
static int vxlan_gro_complete(struct sk_buff *skb, int nhoff)
{
	udp_tunnel_gro_complete(skb, nhoff);

	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
}
593
594 /* Notify netdevs that UDP port started listening */
/* Notify netdevs that UDP port started listening.
 * Registers GRO offload (IPv4 sockets only) and invokes each device's
 * ndo_add_vxlan_port callback so hardware can set up RX offloads.
 */
static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
{
	struct net_device *dev;
	struct sock *sk = vs->sock->sk;
	struct net *net = sock_net(sk);
	sa_family_t sa_family = sk->sk_family;
	__be16 port = inet_sk(sk)->inet_sport;
	int err;

	if (sa_family == AF_INET) {
		err = udp_add_offload(&vs->udp_offloads);
		if (err)
			/* Non-fatal: the port still works without offload. */
			pr_warn("vxlan: udp_add_offload failed with status %d\n", err);
	}

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (dev->netdev_ops->ndo_add_vxlan_port)
			dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family,
							    port);
	}
	rcu_read_unlock();
}
618
619 /* Notify netdevs that UDP port is no more listening */
/* Notify netdevs that UDP port is no more listening.
 * Mirror of vxlan_notify_add_rx_port(): tells devices to drop their RX
 * offload state, then unregisters the GRO offload (IPv4 sockets only).
 */
static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
{
	struct net_device *dev;
	struct sock *sk = vs->sock->sk;
	struct net *net = sock_net(sk);
	sa_family_t sa_family = sk->sk_family;
	__be16 port = inet_sk(sk)->inet_sport;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (dev->netdev_ops->ndo_del_vxlan_port)
			dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family,
							    port);
	}
	rcu_read_unlock();

	if (sa_family == AF_INET)
		udp_del_offload(&vs->udp_offloads);
}
639
640 /* Add new entry to forwarding table -- assumes lock held */
/* Add new entry to forwarding table -- assumes lock held.
 * Creates or updates the FDB entry for 'mac' according to the netlink
 * NLM_F_* request flags, appending/replacing remote destinations as
 * needed and sending an RTM_NEWNEIGH notification when anything changed.
 * Caller must hold vxlan->hash_lock.
 */
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __u16 flags,
			    __be16 port, __u32 vni, __u32 ifindex,
			    __u8 ndm_flags)
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int notify = 0;

	f = __vxlan_find_mac(vxlan, mac);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}
		/* Refresh state/flags on the existing entry. */
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
		}
		if (f->flags != ndm_flags) {
			f->flags = ndm_flags;
			f->updated = jiffies;
			notify = 1;
		}
		if ((flags & NLM_F_REPLACE)) {
			/* Only change unicasts */
			if (!(is_multicast_ether_addr(f->eth_addr) ||
			     is_zero_ether_addr(f->eth_addr))) {
				int rc = vxlan_fdb_replace(f, ip, port, vni,
							   ifindex);

				if (rc < 0)
					return rc;
				notify |= rc;
			} else
				return -EOPNOTSUPP;
		}
		/* Multicast and all-zeros entries may accumulate remotes. */
		if ((flags & NLM_F_APPEND) &&
		    (is_multicast_ether_addr(f->eth_addr) ||
		     is_zero_ether_addr(f->eth_addr))) {
			int rc = vxlan_fdb_append(f, ip, port, vni, ifindex,
						  &rd);

			if (rc < 0)
				return rc;
			notify |= rc;
		}
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		/* addrmax == 0 means no limit on table size. */
		if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
			return -ENOSPC;

		/* Disallow replace to add a multicast entry */
		if ((flags & NLM_F_REPLACE) &&
		    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
			return -EOPNOTSUPP;

		netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
		f = kmalloc(sizeof(*f), GFP_ATOMIC);
		if (!f)
			return -ENOMEM;

		notify = 1;
		f->state = state;
		f->flags = ndm_flags;
		f->updated = f->used = jiffies;
		INIT_LIST_HEAD(&f->remotes);
		memcpy(f->eth_addr, mac, ETH_ALEN);

		vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);

		++vxlan->addrcnt;
		hlist_add_head_rcu(&f->hlist,
				   vxlan_fdb_head(vxlan, mac));
	}

	if (notify) {
		/* State/flag-only updates have no new remote; report the
		 * first one.
		 */
		if (rd == NULL)
			rd = first_remote_rtnl(f);
		vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH);
	}

	return 0;
}
730
/* RCU callback: free an FDB entry and all of its remote destinations
 * after the grace period has elapsed.
 */
static void vxlan_fdb_free(struct rcu_head *head)
{
	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
	struct vxlan_rdst *rd, *nd;

	list_for_each_entry_safe(rd, nd, &f->remotes, list)
		kfree(rd);
	kfree(f);
}
740
/* Remove an FDB entry: notify userspace, unlink it under RCU and defer
 * the actual free to vxlan_fdb_free().  Caller must hold vxlan->hash_lock.
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
{
	netdev_dbg(vxlan->dev,
		    "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;
	vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH);

	hlist_del_rcu(&f->hlist);
	call_rcu(&f->rcu, vxlan_fdb_free);
}
752
/* Parse the optional FDB netlink attributes (NDA_DST/PORT/VNI/IFINDEX),
 * filling in device defaults for any attribute that is absent.
 * Returns 0 on success, -EINVAL on malformed attributes,
 * -EADDRNOTAVAIL if the requested ifindex does not exist.
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex)
{
	struct net *net = dev_net(vxlan->dev);
	int err;

	if (tb[NDA_DST]) {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err)
			return err;
	} else {
		/* No destination given: use a wildcard of the same family
		 * as the device's default remote.
		 */
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;
		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	}

	if (tb[NDA_PORT]) {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	} else {
		*port = vxlan->dst_port;
	}

	if (tb[NDA_VNI]) {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = nla_get_u32(tb[NDA_VNI]);
	} else {
		*vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_IFINDEX]) {
		struct net_device *tdev;

		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);
		/* Validate that the device exists; RTNL guards against
		 * concurrent unregister during this call.
		 */
		tdev = __dev_get_by_index(net, *ifindex);
		if (!tdev)
			return -EADDRNOTAVAIL;
	} else {
		*ifindex = 0;
	}

	return 0;
}
807
808 /* Add static entry (via netlink) */
809 static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
810                          struct net_device *dev,
811                          const unsigned char *addr, u16 vid, u16 flags)
812 {
813         struct vxlan_dev *vxlan = netdev_priv(dev);
814         /* struct net *net = dev_net(vxlan->dev); */
815         union vxlan_addr ip;
816         __be16 port;
817         u32 vni, ifindex;
818         int err;
819
820         if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
821                 pr_info("RTM_NEWNEIGH with invalid state %#x\n",
822                         ndm->ndm_state);
823                 return -EINVAL;
824         }
825
826         if (tb[NDA_DST] == NULL)
827                 return -EINVAL;
828
829         err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex);
830         if (err)
831                 return err;
832
833         if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
834                 return -EAFNOSUPPORT;
835
836         spin_lock_bh(&vxlan->hash_lock);
837         err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags,
838                                port, vni, ifindex, ndm->ndm_flags);
839         spin_unlock_bh(&vxlan->hash_lock);
840
841         return err;
842 }
843
844 /* Delete entry (via netlink) */
/* Delete entry (via netlink).
 * ndo_fdb_del handler: removes one remote destination if a specific one
 * was addressed (and others remain), otherwise destroys the whole entry.
 * Returns 0 on success, -ENOENT if nothing matched.
 */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr, u16 vid)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	struct vxlan_rdst *rd = NULL;
	union vxlan_addr ip;
	__be16 port;
	u32 vni, ifindex;
	int err;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex);
	if (err)
		return err;

	err = -ENOENT;

	spin_lock_bh(&vxlan->hash_lock);
	f = vxlan_find_mac(vxlan, addr);
	if (!f)
		goto out;

	/* A non-wildcard IP addresses one specific remote destination. */
	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
		if (!rd)
			goto out;
	}

	err = 0;

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
		list_del_rcu(&rd->list);
		vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH);
		kfree_rcu(rd, rcu);
		goto out;
	}

	vxlan_fdb_destroy(vxlan, f);

out:
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}
893
894 /* Dump forwarding table */
895 static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
896                           struct net_device *dev,
897                           struct net_device *filter_dev, int idx)
898 {
899         struct vxlan_dev *vxlan = netdev_priv(dev);
900         unsigned int h;
901
902         for (h = 0; h < FDB_HASH_SIZE; ++h) {
903                 struct vxlan_fdb *f;
904                 int err;
905
906                 hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
907                         struct vxlan_rdst *rd;
908
909                         if (idx < cb->args[0])
910                                 goto skip;
911
912                         list_for_each_entry_rcu(rd, &f->remotes, list) {
913                                 err = vxlan_fdb_info(skb, vxlan, f,
914                                                      NETLINK_CB(cb->skb).portid,
915                                                      cb->nlh->nlmsg_seq,
916                                                      RTM_NEWNEIGH,
917                                                      NLM_F_MULTI, rd);
918                                 if (err < 0)
919                                         goto out;
920                         }
921 skip:
922                         ++idx;
923                 }
924         }
925 out:
926         return idx;
927 }
928
929 /* Watch incoming packets to learn mapping between Ethernet address
930  * and Tunnel endpoint.
931  * Return true if packet is bogus and should be droppped.
932  */
933 static bool vxlan_snoop(struct net_device *dev,
934                         union vxlan_addr *src_ip, const u8 *src_mac)
935 {
936         struct vxlan_dev *vxlan = netdev_priv(dev);
937         struct vxlan_fdb *f;
938
939         f = vxlan_find_mac(vxlan, src_mac);
940         if (likely(f)) {
941                 struct vxlan_rdst *rdst = first_remote_rcu(f);
942
943                 if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip)))
944                         return false;
945
946                 /* Don't migrate static entries, drop packets */
947                 if (f->state & NUD_NOARP)
948                         return true;
949
950                 if (net_ratelimit())
951                         netdev_info(dev,
952                                     "%pM migrated from %pIS to %pIS\n",
953                                     src_mac, &rdst->remote_ip, &src_ip);
954
955                 rdst->remote_ip = *src_ip;
956                 f->updated = jiffies;
957                 vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH);
958         } else {
959                 /* learned new entry */
960                 spin_lock(&vxlan->hash_lock);
961
962                 /* close off race between vxlan_flush and incoming packets */
963                 if (netif_running(dev))
964                         vxlan_fdb_create(vxlan, src_mac, src_ip,
965                                          NUD_REACHABLE,
966                                          NLM_F_EXCL|NLM_F_CREATE,
967                                          vxlan->dst_port,
968                                          vxlan->default_dst.remote_vni,
969                                          0, NTF_SELF);
970                 spin_unlock(&vxlan->hash_lock);
971         }
972
973         return false;
974 }
975
976 /* See if multicast group is already in use by other ID */
977 static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
978 {
979         struct vxlan_dev *vxlan;
980
981         /* The vxlan_sock is only used by dev, leaving group has
982          * no effect on other vxlan devices.
983          */
984         if (atomic_read(&dev->vn_sock->refcnt) == 1)
985                 return false;
986
987         list_for_each_entry(vxlan, &vn->vxlan_list, next) {
988                 if (!netif_running(vxlan->dev) || vxlan == dev)
989                         continue;
990
991                 if (vxlan->vn_sock != dev->vn_sock)
992                         continue;
993
994                 if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
995                                       &dev->default_dst.remote_ip))
996                         continue;
997
998                 if (vxlan->default_dst.remote_ifindex !=
999                     dev->default_dst.remote_ifindex)
1000                         continue;
1001
1002                 return true;
1003         }
1004
1005         return false;
1006 }
1007
/* Take a reference on a vxlan_sock; paired with vxlan_sock_release() */
static void vxlan_sock_hold(struct vxlan_sock *vs)
{
        atomic_inc(&vs->refcnt);
}
1012
/* Drop a reference on a vxlan_sock.  On the last reference, unhash the
 * socket so no new users can find it, notify offload-capable drivers
 * that the UDP rx port is going away, and defer the actual teardown to
 * the del_work workqueue item.
 */
void vxlan_sock_release(struct vxlan_sock *vs)
{
        struct sock *sk = vs->sock->sk;
        struct net *net = sock_net(sk);
        struct vxlan_net *vn = net_generic(net, vxlan_net_id);

        if (!atomic_dec_and_test(&vs->refcnt))
                return;

        /* sock_lock protects the per-netns socket hash list */
        spin_lock(&vn->sock_lock);
        hlist_del_rcu(&vs->hlist);
        vxlan_notify_del_rx_port(vs);
        spin_unlock(&vn->sock_lock);

        queue_work(vxlan_wq, &vs->del_work);
}
EXPORT_SYMBOL_GPL(vxlan_sock_release);
1030
/* Callback to update multicast group membership when first VNI on
 * multicast address is brought up
 * Done as workqueue because ip_mc_join_group acquires RTNL.
 */
static void vxlan_igmp_join(struct work_struct *work)
{
        struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join);
        struct vxlan_sock *vs = vxlan->vn_sock;
        struct sock *sk = vs->sock->sk;
        union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
        int ifindex = vxlan->default_dst.remote_ifindex;

        lock_sock(sk);
        if (ip->sa.sa_family == AF_INET) {
                struct ip_mreqn mreq = {
                        .imr_multiaddr.s_addr   = ip->sin.sin_addr.s_addr,
                        .imr_ifindex            = ifindex,
                };

                /* NOTE(review): join failure is silently ignored; this
                 * work item has no way to report errors to the opener.
                 */
                ip_mc_join_group(sk, &mreq);
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
                                             &ip->sin6.sin6_addr);
#endif
        }
        release_sock(sk);

        /* Drop the socket and device references presumably taken when
         * this work was queued — see the scheduling site.
         */
        vxlan_sock_release(vs);
        dev_put(vxlan->dev);
}
1062
/* Inverse of vxlan_igmp_join when last VNI is brought down */
static void vxlan_igmp_leave(struct work_struct *work)
{
        struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave);
        struct vxlan_sock *vs = vxlan->vn_sock;
        struct sock *sk = vs->sock->sk;
        union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
        int ifindex = vxlan->default_dst.remote_ifindex;

        lock_sock(sk);
        if (ip->sa.sa_family == AF_INET) {
                struct ip_mreqn mreq = {
                        .imr_multiaddr.s_addr   = ip->sin.sin_addr.s_addr,
                        .imr_ifindex            = ifindex,
                };

                /* Leave result ignored; nothing useful to do on failure */
                ip_mc_leave_group(sk, &mreq);
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
                                             &ip->sin6.sin6_addr);
#endif
        }

        release_sock(sk);

        /* Drop the socket and device references presumably taken when
         * this work was queued — see the scheduling site.
         */
        vxlan_sock_release(vs);
        dev_put(vxlan->dev);
}
1092
/* Callback from net/ipv4/udp.c to receive packets.
 * Returns 0 when the skb was consumed (accepted or dropped), 1 to tell
 * the UDP layer this is not a VXLAN packet and should be delivered up
 * the normal socket path.
 */
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
        struct vxlan_sock *vs;
        struct vxlanhdr *vxh;

        /* Need Vxlan and inner Ethernet header to be present */
        if (!pskb_may_pull(skb, VXLAN_HLEN))
                goto error;

        /* Return packets with reserved bits set */
        vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
        if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
            (vxh->vx_vni & htonl(0xff))) {
                netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
                           ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
                goto error;
        }

        /* Strip outer headers; the inner frame is Ethernet (TEB) */
        if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
                goto drop;

        /* Socket may be mid-teardown; user data cleared under RCU */
        vs = rcu_dereference_sk_user_data(sk);
        if (!vs)
                goto drop;

        vs->rcv(vs, skb, vxh->vx_vni);
        return 0;

drop:
        /* Consume bad packet */
        kfree_skb(skb);
        return 0;

error:
        /* Return non vxlan pkt */
        return 1;
}
1131
/* Per-socket receive handler: demultiplex by VNI, learn the source
 * endpoint if configured, decapsulate ECN and hand the inner Ethernet
 * frame to the stack.  Consumes the skb in all cases.
 */
static void vxlan_rcv(struct vxlan_sock *vs,
                      struct sk_buff *skb, __be32 vx_vni)
{
        struct iphdr *oip = NULL;
        struct ipv6hdr *oip6 = NULL;
        struct vxlan_dev *vxlan;
        struct pcpu_sw_netstats *stats;
        union vxlan_addr saddr;
        __u32 vni;
        int err = 0;
        union vxlan_addr *remote_ip;

        /* VNI occupies the upper 24 bits of the vxlan header field */
        vni = ntohl(vx_vni) >> 8;
        /* Is this VNI defined? */
        vxlan = vxlan_vs_find_vni(vs, vni);
        if (!vxlan)
                goto drop;

        remote_ip = &vxlan->default_dst.remote_ip;
        skb_reset_mac_header(skb);
        skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
        skb->protocol = eth_type_trans(skb, vxlan->dev);
        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

        /* Ignore packet loops (and multicast echo) */
        if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
                goto drop;

        /* Re-examine inner Ethernet packet */
        if (remote_ip->sa.sa_family == AF_INET) {
                oip = ip_hdr(skb);
                saddr.sin.sin_addr.s_addr = oip->saddr;
                saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                oip6 = ipv6_hdr(skb);
                saddr.sin6.sin6_addr = oip6->saddr;
                saddr.sa.sa_family = AF_INET6;
#endif
        }
        /* NOTE(review): with CONFIG_IPV6 disabled and a non-AF_INET
         * remote, saddr would be left uninitialized here — presumably
         * such a configuration cannot be created; verify at setup.
         */

        if ((vxlan->flags & VXLAN_F_LEARN) &&
            vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
                goto drop;

        skb_reset_network_header(skb);

        /* Propagate outer-header ECN into the inner packet; err > 1
         * means the combination is invalid and the packet must go.
         */
        if (oip6)
                err = IP6_ECN_decapsulate(oip6, skb);
        if (oip)
                err = IP_ECN_decapsulate(oip, skb);

        if (unlikely(err)) {
                if (log_ecn_error) {
                        if (oip6)
                                net_info_ratelimited("non-ECT from %pI6\n",
                                                     &oip6->saddr);
                        if (oip)
                                net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                                     &oip->saddr, oip->tos);
                }
                if (err > 1) {
                        ++vxlan->dev->stats.rx_frame_errors;
                        ++vxlan->dev->stats.rx_errors;
                        goto drop;
                }
        }

        stats = this_cpu_ptr(vxlan->dev->tstats);
        u64_stats_update_begin(&stats->syncp);
        stats->rx_packets++;
        stats->rx_bytes += skb->len;
        u64_stats_update_end(&stats->syncp);

        netif_rx(skb);

        return;
drop:
        /* Consume bad packet */
        kfree_skb(skb);
}
1213
/* ARP proxy: answer ARP requests locally from the neighbour cache
 * instead of flooding them over the tunnel.  Consumes the skb and
 * always returns NETDEV_TX_OK.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct arphdr *parp;
        u8 *arpptr, *sha;
        __be32 sip, tip;
        struct neighbour *n;

        if (dev->flags & IFF_NOARP)
                goto out;

        if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
                dev->stats.tx_dropped++;
                goto out;
        }
        parp = arp_hdr(skb);

        /* Only handle plain Ethernet/IPv4 ARP requests */
        if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
             parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
            parp->ar_pro != htons(ETH_P_IP) ||
            parp->ar_op != htons(ARPOP_REQUEST) ||
            parp->ar_hln != dev->addr_len ||
            parp->ar_pln != 4)
                goto out;
        /* Walk the variable-length ARP payload: sha, sip, tha, tip */
        arpptr = (u8 *)parp + sizeof(struct arphdr);
        sha = arpptr;
        arpptr += dev->addr_len;        /* sha */
        memcpy(&sip, arpptr, sizeof(sip));
        arpptr += sizeof(sip);
        arpptr += dev->addr_len;        /* tha */
        memcpy(&tip, arpptr, sizeof(tip));

        if (ipv4_is_loopback(tip) ||
            ipv4_is_multicast(tip))
                goto out;

        n = neigh_lookup(&arp_tbl, &tip, dev);

        if (n) {
                struct vxlan_fdb *f;
                struct sk_buff  *reply;

                /* Only answer from confirmed neighbour entries */
                if (!(n->nud_state & NUD_CONNECTED)) {
                        neigh_release(n);
                        goto out;
                }

                f = vxlan_find_mac(vxlan, n->ha);
                if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
                        /* bridge-local neighbor */
                        neigh_release(n);
                        goto out;
                }

                reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
                                n->ha, sha);

                neigh_release(n);

                if (reply == NULL)
                        goto out;

                /* Inject the reply locally as if it arrived on dev */
                skb_reset_mac_header(reply);
                __skb_pull(reply, skb_network_offset(reply));
                reply->ip_summed = CHECKSUM_UNNECESSARY;
                reply->pkt_type = PACKET_HOST;

                if (netif_rx_ni(reply) == NET_RX_DROP)
                        dev->stats.rx_dropped++;
        } else if (vxlan->flags & VXLAN_F_L3MISS) {
                /* Unknown target: let userspace resolve it */
                union vxlan_addr ipa = {
                        .sin.sin_addr.s_addr = tip,
                        .sin.sin_family = AF_INET,
                };

                vxlan_ip_miss(dev, &ipa);
        }
out:
        consume_skb(skb);
        return NETDEV_TX_OK;
}
1295
1296 #if IS_ENABLED(CONFIG_IPV6)
1297 static struct sk_buff *vxlan_na_create(struct sk_buff *request,
1298         struct neighbour *n, bool isrouter)
1299 {
1300         struct net_device *dev = request->dev;
1301         struct sk_buff *reply;
1302         struct nd_msg *ns, *na;
1303         struct ipv6hdr *pip6;
1304         u8 *daddr;
1305         int na_olen = 8; /* opt hdr + ETH_ALEN for target */
1306         int ns_olen;
1307         int i, len;
1308
1309         if (dev == NULL)
1310                 return NULL;
1311
1312         len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
1313                 sizeof(*na) + na_olen + dev->needed_tailroom;
1314         reply = alloc_skb(len, GFP_ATOMIC);
1315         if (reply == NULL)
1316                 return NULL;
1317
1318         reply->protocol = htons(ETH_P_IPV6);
1319         reply->dev = dev;
1320         skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
1321         skb_push(reply, sizeof(struct ethhdr));
1322         skb_set_mac_header(reply, 0);
1323
1324         ns = (struct nd_msg *)skb_transport_header(request);
1325
1326         daddr = eth_hdr(request)->h_source;
1327         ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns);
1328         for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
1329                 if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
1330                         daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
1331                         break;
1332                 }
1333         }
1334
1335         /* Ethernet header */
1336         ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
1337         ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
1338         eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
1339         reply->protocol = htons(ETH_P_IPV6);
1340
1341         skb_pull(reply, sizeof(struct ethhdr));
1342         skb_set_network_header(reply, 0);
1343         skb_put(reply, sizeof(struct ipv6hdr));
1344
1345         /* IPv6 header */
1346
1347         pip6 = ipv6_hdr(reply);
1348         memset(pip6, 0, sizeof(struct ipv6hdr));
1349         pip6->version = 6;
1350         pip6->priority = ipv6_hdr(request)->priority;
1351         pip6->nexthdr = IPPROTO_ICMPV6;
1352         pip6->hop_limit = 255;
1353         pip6->daddr = ipv6_hdr(request)->saddr;
1354         pip6->saddr = *(struct in6_addr *)n->primary_key;
1355
1356         skb_pull(reply, sizeof(struct ipv6hdr));
1357         skb_set_transport_header(reply, 0);
1358
1359         na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);
1360
1361         /* Neighbor Advertisement */
1362         memset(na, 0, sizeof(*na)+na_olen);
1363         na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
1364         na->icmph.icmp6_router = isrouter;
1365         na->icmph.icmp6_override = 1;
1366         na->icmph.icmp6_solicited = 1;
1367         na->target = ns->target;
1368         ether_addr_copy(&na->opt[2], n->ha);
1369         na->opt[0] = ND_OPT_TARGET_LL_ADDR;
1370         na->opt[1] = na_olen >> 3;
1371
1372         na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
1373                 &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
1374                 csum_partial(na, sizeof(*na)+na_olen, 0));
1375
1376         pip6->payload_len = htons(sizeof(*na)+na_olen);
1377
1378         skb_push(reply, sizeof(struct ipv6hdr));
1379
1380         reply->ip_summed = CHECKSUM_UNNECESSARY;
1381
1382         return reply;
1383 }
1384
/* NDP proxy (IPv6 analogue of arp_reduce): answer Neighbour
 * Solicitations locally from the neighbour cache.  Consumes the skb
 * and always returns NETDEV_TX_OK.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct nd_msg *msg;
        const struct ipv6hdr *iphdr;
        const struct in6_addr *saddr, *daddr;
        struct neighbour *n;
        struct inet6_dev *in6_dev;

        in6_dev = __in6_dev_get(dev);
        if (!in6_dev)
                goto out;

        iphdr = ipv6_hdr(skb);
        saddr = &iphdr->saddr;
        daddr = &iphdr->daddr;

        /* Only handle Neighbour Solicitations */
        msg = (struct nd_msg *)skb_transport_header(skb);
        if (msg->icmph.icmp6_code != 0 ||
            msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
                goto out;

        if (ipv6_addr_loopback(daddr) ||
            ipv6_addr_is_multicast(&msg->target))
                goto out;

        n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

        if (n) {
                struct vxlan_fdb *f;
                struct sk_buff *reply;

                /* Only answer from confirmed neighbour entries */
                if (!(n->nud_state & NUD_CONNECTED)) {
                        neigh_release(n);
                        goto out;
                }

                f = vxlan_find_mac(vxlan, n->ha);
                if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
                        /* bridge-local neighbor */
                        neigh_release(n);
                        goto out;
                }

                /* Advertise the router flag if the FDB entry says so */
                reply = vxlan_na_create(skb, n,
                                        !!(f ? f->flags & NTF_ROUTER : 0));

                neigh_release(n);

                if (reply == NULL)
                        goto out;

                /* Inject the advertisement locally */
                if (netif_rx_ni(reply) == NET_RX_DROP)
                        dev->stats.rx_dropped++;

        } else if (vxlan->flags & VXLAN_F_L3MISS) {
                /* Unknown target: let userspace resolve it */
                union vxlan_addr ipa = {
                        .sin6.sin6_addr = msg->target,
                        .sin6.sin6_family = AF_INET6,
                };

                vxlan_ip_miss(dev, &ipa);
        }

out:
        consume_skb(skb);
        return NETDEV_TX_OK;
}
1453 #endif
1454
/* Route short-circuit (RSC): if the destination MAC differs from the
 * neighbour cache's answer for the inner IP destination, rewrite the
 * Ethernet header so the frame goes straight to the real next hop.
 * Returns true when the header was rewritten.
 */
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct neighbour *n;

        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
                return false;

        n = NULL;
        switch (ntohs(eth_hdr(skb)->h_proto)) {
        case ETH_P_IP:
        {
                struct iphdr *pip;

                if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                        return false;
                pip = ip_hdr(skb);
                n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
                if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
                        /* Unknown destination: let userspace resolve it */
                        union vxlan_addr ipa = {
                                .sin.sin_addr.s_addr = pip->daddr,
                                .sin.sin_family = AF_INET,
                        };

                        vxlan_ip_miss(dev, &ipa);
                        return false;
                }

                break;
        }
#if IS_ENABLED(CONFIG_IPV6)
        case ETH_P_IPV6:
        {
                struct ipv6hdr *pip6;

                if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
                        return false;
                pip6 = ipv6_hdr(skb);
                n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
                if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
                        /* Unknown destination: let userspace resolve it */
                        union vxlan_addr ipa = {
                                .sin6.sin6_addr = pip6->daddr,
                                .sin6.sin6_family = AF_INET6,
                        };

                        vxlan_ip_miss(dev, &ipa);
                        return false;
                }

                break;
        }
#endif
        default:
                return false;
        }

        if (n) {
                bool diff;

                diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
                if (diff) {
                        /* Old destination becomes the source; neighbour's
                         * MAC becomes the new destination.
                         */
                        memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
                                dev->addr_len);
                        memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
                }
                neigh_release(n);
                return diff;
        }

        return false;
}
1526
1527 #if IS_ENABLED(CONFIG_IPV6)
/* Encapsulate @skb in VXLAN-over-UDP-over-IPv6 and transmit it along
 * @dst.  On error the skb has been freed and @dst released; on success
 * ownership of both passes to the UDP tunnel layer.
 */
static int vxlan6_xmit_skb(struct vxlan_sock *vs,
                           struct dst_entry *dst, struct sk_buff *skb,
                           struct net_device *dev, struct in6_addr *saddr,
                           struct in6_addr *daddr, __u8 prio, __u8 ttl,
                           __be16 src_port, __be16 dst_port, __be32 vni,
                           bool xnet)
{
        struct vxlanhdr *vxh;
        int min_headroom;
        int err;
        /* Checksum the outer UDP header unless the socket disables it */
        bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk);

        skb = udp_tunnel_handle_offloads(skb, udp_sum);
        if (IS_ERR(skb)) {
                err = -EINVAL;
                goto err;
        }

        skb_scrub_packet(skb, xnet);

        min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
                        + VXLAN_HLEN + sizeof(struct ipv6hdr)
                        + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);

        /* Need space for new headers (invalidates iph ptr) */
        err = skb_cow_head(skb, min_headroom);
        if (unlikely(err)) {
                kfree_skb(skb);
                goto err;
        }

        /* Push any hw-accelerated VLAN tag back into the payload */
        skb = vlan_hwaccel_push_inside(skb);
        if (WARN_ON(!skb)) {
                err = -ENOMEM;
                goto err;
        }

        vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
        vxh->vx_flags = htonl(VXLAN_FLAGS);
        vxh->vx_vni = vni;

        skb_set_inner_protocol(skb, htons(ETH_P_TEB));

        udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio,
                             ttl, src_port, dst_port);
        return 0;
err:
        dst_release(dst);
        return err;
}
1578 #endif
1579
/* Encapsulate @skb in VXLAN-over-UDP-over-IPv4 and transmit it along
 * route @rt.  Returns a negative errno on failure (skb already freed)
 * or the value of udp_tunnel_xmit_skb() on success.
 */
int vxlan_xmit_skb(struct vxlan_sock *vs,
                   struct rtable *rt, struct sk_buff *skb,
                   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
                   __be16 src_port, __be16 dst_port, __be32 vni, bool xnet)
{
        struct vxlanhdr *vxh;
        int min_headroom;
        int err;
        /* Checksum the outer UDP header unless the socket disables it */
        bool udp_sum = !vs->sock->sk->sk_no_check_tx;

        skb = udp_tunnel_handle_offloads(skb, udp_sum);
        if (IS_ERR(skb))
                return PTR_ERR(skb);

        min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
                        + VXLAN_HLEN + sizeof(struct iphdr)
                        + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);

        /* Need space for new headers (invalidates iph ptr) */
        err = skb_cow_head(skb, min_headroom);
        if (unlikely(err)) {
                kfree_skb(skb);
                return err;
        }

        /* Push any hw-accelerated VLAN tag back into the payload */
        skb = vlan_hwaccel_push_inside(skb);
        if (WARN_ON(!skb))
                return -ENOMEM;

        vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
        vxh->vx_flags = htonl(VXLAN_FLAGS);
        vxh->vx_vni = vni;

        skb_set_inner_protocol(skb, htons(ETH_P_TEB));

        return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
                                   ttl, df, src_port, dst_port, xnet);
}
EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
1619
/* Bypass encapsulation if the destination is local: hand the frame
 * directly to the destination vxlan device as if it had been received
 * over the tunnel, updating tx stats on the source and rx stats on the
 * destination.
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
                               struct vxlan_dev *dst_vxlan)
{
        struct pcpu_sw_netstats *tx_stats, *rx_stats;
        union vxlan_addr loopback;
        union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
        struct net_device *dev = skb->dev;
        int len = skb->len;

        tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
        rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
        skb->pkt_type = PACKET_HOST;
        skb->encapsulation = 0;
        skb->dev = dst_vxlan->dev;
        __skb_pull(skb, skb_network_offset(skb));

        /* Synthesize a loopback source endpoint matching the remote's
         * address family, used only for FDB learning below.
         * NOTE(review): with CONFIG_IPV6 disabled and a non-AF_INET
         * remote, loopback would be left uninitialized — presumably
         * that configuration cannot exist; verify at setup.
         */
        if (remote_ip->sa.sa_family == AF_INET) {
                loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
                loopback.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                loopback.sin6.sin6_addr = in6addr_loopback;
                loopback.sa.sa_family =  AF_INET6;
#endif
        }

        if (dst_vxlan->flags & VXLAN_F_LEARN)
                vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source);

        u64_stats_update_begin(&tx_stats->syncp);
        tx_stats->tx_packets++;
        tx_stats->tx_bytes += len;
        u64_stats_update_end(&tx_stats->syncp);

        if (netif_rx(skb) == NET_RX_SUCCESS) {
                u64_stats_update_begin(&rx_stats->syncp);
                rx_stats->rx_packets++;
                rx_stats->rx_bytes += len;
                u64_stats_update_end(&rx_stats->syncp);
        } else {
                dev->stats.rx_dropped++;
        }
}
1664
1665 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
1666                            struct vxlan_rdst *rdst, bool did_rsc)
1667 {
1668         struct vxlan_dev *vxlan = netdev_priv(dev);
1669         struct rtable *rt = NULL;
1670         const struct iphdr *old_iph;
1671         struct flowi4 fl4;
1672         union vxlan_addr *dst;
1673         __be16 src_port = 0, dst_port;
1674         u32 vni;
1675         __be16 df = 0;
1676         __u8 tos, ttl;
1677         int err;
1678
1679         dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
1680         vni = rdst->remote_vni;
1681         dst = &rdst->remote_ip;
1682
1683         if (vxlan_addr_any(dst)) {
1684                 if (did_rsc) {
1685                         /* short-circuited back to local bridge */
1686                         vxlan_encap_bypass(skb, vxlan, vxlan);
1687                         return;
1688                 }
1689                 goto drop;
1690         }
1691
1692         old_iph = ip_hdr(skb);
1693
1694         ttl = vxlan->ttl;
1695         if (!ttl && vxlan_addr_multicast(dst))
1696                 ttl = 1;
1697
1698         tos = vxlan->tos;
1699         if (tos == 1)
1700                 tos = ip_tunnel_get_dsfield(old_iph, skb);
1701
1702         src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->port_min,
1703                                      vxlan->port_max, true);
1704
1705         if (dst->sa.sa_family == AF_INET) {
1706                 memset(&fl4, 0, sizeof(fl4));
1707                 fl4.flowi4_oif = rdst->remote_ifindex;
1708                 fl4.flowi4_tos = RT_TOS(tos);
1709                 fl4.daddr = dst->sin.sin_addr.s_addr;
1710                 fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr;
1711
1712                 rt = ip_route_output_key(vxlan->net, &fl4);
1713                 if (IS_ERR(rt)) {
1714                         netdev_dbg(dev, "no route to %pI4\n",
1715                                    &dst->sin.sin_addr.s_addr);
1716                         dev->stats.tx_carrier_errors++;
1717                         goto tx_error;
1718                 }
1719
1720                 if (rt->dst.dev == dev) {
1721                         netdev_dbg(dev, "circular route to %pI4\n",
1722                                    &dst->sin.sin_addr.s_addr);
1723                         dev->stats.collisions++;
1724                         goto rt_tx_error;
1725                 }
1726
1727                 /* Bypass encapsulation if the destination is local */
1728                 if (rt->rt_flags & RTCF_LOCAL &&
1729                     !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
1730                         struct vxlan_dev *dst_vxlan;
1731
1732                         ip_rt_put(rt);
1733                         dst_vxlan = vxlan_find_vni(vxlan->net, vni,
1734                                                    dst->sa.sa_family, dst_port);
1735                         if (!dst_vxlan)
1736                                 goto tx_error;
1737                         vxlan_encap_bypass(skb, vxlan, dst_vxlan);
1738                         return;
1739                 }
1740
1741                 tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
1742                 ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
1743
1744                 err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb,
1745                                      fl4.saddr, dst->sin.sin_addr.s_addr,
1746                                      tos, ttl, df, src_port, dst_port,
1747                                      htonl(vni << 8),
1748                                      !net_eq(vxlan->net, dev_net(vxlan->dev)));
1749                 if (err < 0) {
1750                         /* skb is already freed. */
1751                         skb = NULL;
1752                         goto rt_tx_error;
1753                 }
1754
1755                 iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
1756 #if IS_ENABLED(CONFIG_IPV6)
1757         } else {
1758                 struct sock *sk = vxlan->vn_sock->sock->sk;
1759                 struct dst_entry *ndst;
1760                 struct flowi6 fl6;
1761                 u32 flags;
1762
1763                 memset(&fl6, 0, sizeof(fl6));
1764                 fl6.flowi6_oif = rdst->remote_ifindex;
1765                 fl6.daddr = dst->sin6.sin6_addr;
1766                 fl6.saddr = vxlan->saddr.sin6.sin6_addr;
1767                 fl6.flowi6_proto = IPPROTO_UDP;
1768
1769                 if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) {
1770                         netdev_dbg(dev, "no route to %pI6\n",
1771                                    &dst->sin6.sin6_addr);
1772                         dev->stats.tx_carrier_errors++;
1773                         goto tx_error;
1774                 }
1775
1776                 if (ndst->dev == dev) {
1777                         netdev_dbg(dev, "circular route to %pI6\n",
1778                                    &dst->sin6.sin6_addr);
1779                         dst_release(ndst);
1780                         dev->stats.collisions++;
1781                         goto tx_error;
1782                 }
1783
1784                 /* Bypass encapsulation if the destination is local */
1785                 flags = ((struct rt6_info *)ndst)->rt6i_flags;
1786                 if (flags & RTF_LOCAL &&
1787                     !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
1788                         struct vxlan_dev *dst_vxlan;
1789
1790                         dst_release(ndst);
1791                         dst_vxlan = vxlan_find_vni(vxlan->net, vni,
1792                                                    dst->sa.sa_family, dst_port);
1793                         if (!dst_vxlan)
1794                                 goto tx_error;
1795                         vxlan_encap_bypass(skb, vxlan, dst_vxlan);
1796                         return;
1797                 }
1798
1799                 ttl = ttl ? : ip6_dst_hoplimit(ndst);
1800
1801                 err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb,
1802                                       dev, &fl6.saddr, &fl6.daddr, 0, ttl,
1803                                       src_port, dst_port, htonl(vni << 8),
1804                                       !net_eq(vxlan->net, dev_net(vxlan->dev)));
1805 #endif
1806         }
1807
1808         return;
1809
1810 drop:
1811         dev->stats.tx_dropped++;
1812         goto tx_free;
1813
1814 rt_tx_error:
1815         ip_rt_put(rt);
1816 tx_error:
1817         dev->stats.tx_errors++;
1818 tx_free:
1819         dev_kfree_skb(skb);
1820 }
1821
/* Transmit local packets over Vxlan
 *
 * Outer IP header inherits ECN and DF from inner header.
 * Outer UDP destination is the VXLAN assigned port.
 *           source port is based on hash of flow
 *
 * Looks up the destination MAC in the FDB and fans the packet out to
 * every remote destination recorded for that entry (cloning the skb for
 * all but the last one).  Returns NETDEV_TX_OK in all cases; drops are
 * accounted in dev->stats.
 */
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct ethhdr *eth;
	bool did_rsc = false;
	struct vxlan_rdst *rdst, *fdst = NULL;
	struct vxlan_fdb *f;

	skb_reset_mac_header(skb);
	eth = eth_hdr(skb);

	/* In proxy mode, answer ARP requests and IPv6 neighbour
	 * solicitations locally instead of flooding them into the overlay.
	 */
	if ((vxlan->flags & VXLAN_F_PROXY)) {
		if (ntohs(eth->h_proto) == ETH_P_ARP)
			return arp_reduce(dev, skb);
#if IS_ENABLED(CONFIG_IPV6)
		else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
			 pskb_may_pull(skb, sizeof(struct ipv6hdr)
				       + sizeof(struct nd_msg)) &&
			 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
				struct nd_msg *msg;

				msg = (struct nd_msg *)skb_transport_header(skb);
				if (msg->icmph.icmp6_code == 0 &&
				    msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
					return neigh_reduce(dev, skb);
		}
		/* pskb_may_pull() above may reallocate skb->head; reload
		 * the Ethernet header pointer before using it again.
		 */
		eth = eth_hdr(skb);
#endif
	}

	f = vxlan_find_mac(vxlan, eth->h_dest);
	did_rsc = false;

	/* Route short-circuit: if the entry is flagged as a router and RSC
	 * is enabled, rewrite the destination and redo the FDB lookup.
	 */
	if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) &&
	    (ntohs(eth->h_proto) == ETH_P_IP ||
	     ntohs(eth->h_proto) == ETH_P_IPV6)) {
		did_rsc = route_shortcircuit(dev, skb);
		if (did_rsc)
			f = vxlan_find_mac(vxlan, eth->h_dest);
	}

	if (f == NULL) {
		/* Unknown destination: fall back to the all-zeros default
		 * entry; if that is missing too, notify userspace of the
		 * miss (if L2MISS is on) and drop.
		 */
		f = vxlan_find_mac(vxlan, all_zeros_mac);
		if (f == NULL) {
			if ((vxlan->flags & VXLAN_F_L2MISS) &&
			    !is_multicast_ether_addr(eth->h_dest))
				vxlan_fdb_miss(vxlan, eth->h_dest);

			dev->stats.tx_dropped++;
			kfree_skb(skb);
			return NETDEV_TX_OK;
		}
	}

	/* Send a clone to every remote except the first; the original skb
	 * is kept for the first remote so one clone is saved.
	 */
	list_for_each_entry_rcu(rdst, &f->remotes, list) {
		struct sk_buff *skb1;

		if (!fdst) {
			fdst = rdst;
			continue;
		}
		skb1 = skb_clone(skb, GFP_ATOMIC);
		if (skb1)
			vxlan_xmit_one(skb1, dev, rdst, did_rsc);
	}

	if (fdst)
		vxlan_xmit_one(skb, dev, fdst, did_rsc);
	else
		kfree_skb(skb);
	return NETDEV_TX_OK;
}
1900
/* Walk the forwarding table and purge stale entries.
 *
 * Timer callback (runs in softirq context); @arg is the vxlan_dev.
 * Re-arms itself for either the fixed rescan interval or the earliest
 * pending expiry, whichever comes first.
 */
static void vxlan_cleanup(unsigned long arg)
{
	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	/* Device went down; vxlan_open() restarts the timer. */
	if (!netif_running(vxlan->dev))
		return;

	spin_lock_bh(&vxlan->hash_lock);
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

			/* Static (permanent) entries are never aged out. */
			if (f->state & NUD_PERMANENT)
				continue;

			/* f->used is refreshed on every lookup hit. */
			timeout = f->used + vxlan->age_interval * HZ;
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
				vxlan_fdb_destroy(vxlan, f);
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
	}
	spin_unlock_bh(&vxlan->hash_lock);

	mod_timer(&vxlan->age_timer, next_timer);
}
1937
/* Bind @vxlan to UDP socket @vs and hash the device into the socket's
 * per-VNI list so incoming packets can find it.  Readers traverse the
 * list under RCU; callers serialize against other writers.
 */
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
{
	__u32 vni = vxlan->default_dst.remote_vni;

	vxlan->vn_sock = vs;
	hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
}
1945
/* Setup stats when device is created.
 *
 * ndo_init callback: allocates per-CPU stats and attaches the device to
 * a UDP socket for its destination port — reusing an existing socket
 * when one is already bound, otherwise deferring creation to a
 * workqueue (socket creation may sleep and must happen outside RTNL).
 * Returns 0 or -ENOMEM.
 */
static int vxlan_init(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs;
	bool ipv6 = vxlan->flags & VXLAN_F_IPV6;

	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	spin_lock(&vn->sock_lock);
	vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
			     vxlan->dst_port);
	/* atomic_add_unless() refuses a socket whose refcount already hit
	 * zero, i.e. one that is mid-teardown.
	 */
	if (vs && atomic_add_unless(&vs->refcnt, 1, 0)) {
		/* If we have a socket with same port already, reuse it */
		vxlan_vs_add_dev(vs, vxlan);
	} else {
		/* otherwise make new socket outside of RTNL */
		dev_hold(dev);	/* dropped by vxlan_sock_work() */
		queue_work(vxlan_wq, &vxlan->sock_work);
	}
	spin_unlock(&vn->sock_lock);

	return 0;
}
1973
1974 static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan)
1975 {
1976         struct vxlan_fdb *f;
1977
1978         spin_lock_bh(&vxlan->hash_lock);
1979         f = __vxlan_find_mac(vxlan, all_zeros_mac);
1980         if (f)
1981                 vxlan_fdb_destroy(vxlan, f);
1982         spin_unlock_bh(&vxlan->hash_lock);
1983 }
1984
/* ndo_uninit callback: undo vxlan_init().  Removes the default FDB
 * entry, drops the device's reference on its UDP socket (if one was
 * ever bound) and frees the per-CPU stats.
 */
static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_sock *vs = vxlan->vn_sock;

	vxlan_fdb_delete_default(vxlan);

	/* vn_sock may still be NULL if the deferred sock_work never ran
	 * or failed to create a socket.
	 */
	if (vs)
		vxlan_sock_release(vs);
	free_percpu(dev->tstats);
}
1996
/* Start ageing timer and join group when device is brought up.
 *
 * ndo_open callback.  Returns -ENOTCONN if the deferred socket creation
 * has not completed (or failed), otherwise 0.
 */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_sock *vs = vxlan->vn_sock;

	/* socket hasn't been created */
	if (!vs)
		return -ENOTCONN;

	/* Multicast default destination: join the group from a workqueue
	 * (may sleep).  Hold socket and device refs for the work item.
	 */
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
		vxlan_sock_hold(vs);
		dev_hold(dev);
		queue_work(vxlan_wq, &vxlan->igmp_join);
	}

	/* age_interval == 0 disables FDB ageing entirely. */
	if (vxlan->age_interval)
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

	return 0;
}
2018
2019 /* Purge the forwarding table */
2020 static void vxlan_flush(struct vxlan_dev *vxlan)
2021 {
2022         unsigned int h;
2023
2024         spin_lock_bh(&vxlan->hash_lock);
2025         for (h = 0; h < FDB_HASH_SIZE; ++h) {
2026                 struct hlist_node *p, *n;
2027                 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
2028                         struct vxlan_fdb *f
2029                                 = container_of(p, struct vxlan_fdb, hlist);
2030                         /* the all_zeros_mac entry is deleted at vxlan_uninit */
2031                         if (!is_zero_ether_addr(f->eth_addr))
2032                                 vxlan_fdb_destroy(vxlan, f);
2033                 }
2034         }
2035         spin_unlock_bh(&vxlan->hash_lock);
2036 }
2037
/* Cleanup timer and forwarding table on shutdown.
 *
 * ndo_stop callback: schedules an IGMP leave if this was the last
 * device in the net using the multicast group, stops the ageing timer
 * and flushes all learned FDB entries.  Always returns 0.
 */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs = vxlan->vn_sock;

	/* Refs taken here are dropped by the igmp_leave work item. */
	if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
	    !vxlan_group_used(vn, vxlan)) {
		vxlan_sock_hold(vs);
		dev_hold(dev);
		queue_work(vxlan_wq, &vxlan->igmp_leave);
	}

	del_timer_sync(&vxlan->age_timer);

	vxlan_flush(vxlan);

	return 0;
}
2058
/* Stub, nothing needs to be done.
 * ndo_set_rx_mode must exist but a tunnel device has no RX filters.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
2063
2064 static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
2065 {
2066         struct vxlan_dev *vxlan = netdev_priv(dev);
2067         struct vxlan_rdst *dst = &vxlan->default_dst;
2068         struct net_device *lowerdev;
2069         int max_mtu;
2070
2071         lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex);
2072         if (lowerdev == NULL)
2073                 return eth_change_mtu(dev, new_mtu);
2074
2075         if (dst->remote_ip.sa.sa_family == AF_INET6)
2076                 max_mtu = lowerdev->mtu - VXLAN6_HEADROOM;
2077         else
2078                 max_mtu = lowerdev->mtu - VXLAN_HEADROOM;
2079
2080         if (new_mtu < 68 || new_mtu > max_mtu)
2081                 return -EINVAL;
2082
2083         dev->mtu = new_mtu;
2084         return 0;
2085 }
2086
/* Netdevice operations for a VXLAN tunnel device. */
static const struct net_device_ops vxlan_netdev_ops = {
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
	.ndo_change_mtu		= vxlan_change_mtu,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_dump		= vxlan_fdb_dump,
};
2102
/* Info for udev, that this is a virtual tunnel endpoint
 * (exposed as the device type "vxlan" in sysfs).
 */
static struct device_type vxlan_type = {
	.name = "vxlan",
};
2107
/* Calls the ndo_add_vxlan_port of the caller in order to
 * supply the listening VXLAN udp ports. Callers are expected
 * to implement the ndo_add_vxlan_port.
 *
 * Walks every open VXLAN socket in @dev's netns under sock_lock and
 * reports its bound UDP port and address family to the driver, so
 * hardware offloads can be programmed.
 */
void vxlan_get_rx_port(struct net_device *dev)
{
	struct vxlan_sock *vs;
	struct net *net = dev_net(dev);
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	sa_family_t sa_family;
	__be16 port;
	unsigned int i;

	spin_lock(&vn->sock_lock);
	for (i = 0; i < PORT_HASH_SIZE; ++i) {
		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
			port = inet_sk(vs->sock->sk)->inet_sport;
			sa_family = vs->sock->sk->sk_family;
			dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family,
							    port);
		}
	}
	spin_unlock(&vn->sock_lock);
}
EXPORT_SYMBOL_GPL(vxlan_get_rx_port);
2133
/* Initialize the device structure.
 *
 * rtnl_link_ops setup callback; runs before vxlan_newlink() has parsed
 * any attributes, so it configures Ethernet defaults, feature flags,
 * work items, the ageing timer and the FDB hash table.
 */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;

	eth_hw_addr_random(dev);
	ether_setup(dev);
	/* Reserve encapsulation headroom; IPv6 outer headers need more. */
	if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6)
		dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM;
	else
		dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM;

	dev->netdev_ops = &vxlan_netdev_ops;
	dev->destructor = free_netdev;
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->tx_queue_len = 0;
	dev->features	|= NETIF_F_LLTX;
	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features	|= NETIF_F_RXCSUM;
	dev->features	|= NETIF_F_GSO_SOFTWARE;

	dev->vlan_features = dev->features;
	dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
	dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
	netif_keep_dst(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

	INIT_LIST_HEAD(&vxlan->next);
	spin_lock_init(&vxlan->hash_lock);
	INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join);
	INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave);
	INIT_WORK(&vxlan->sock_work, vxlan_sock_work);

	/* Deferrable: FDB ageing has no hard deadline, let the timer
	 * piggyback on other wakeups.
	 */
	init_timer_deferrable(&vxlan->age_timer);
	vxlan->age_timer.function = vxlan_cleanup;
	vxlan->age_timer.data = (unsigned long) vxlan;

	vxlan->dst_port = htons(vxlan_port);

	vxlan->dev = dev;

	for (h = 0; h < FDB_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
}
2182
/* Netlink attribute policy: per-attribute type/length constraints for
 * the IFLA_VXLAN_* configuration attributes parsed by vxlan_newlink().
 */
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
	[IFLA_VXLAN_PROXY]	= { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
};
2205
2206 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
2207 {
2208         if (tb[IFLA_ADDRESS]) {
2209                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
2210                         pr_debug("invalid link address (not ethernet)\n");
2211                         return -EINVAL;
2212                 }
2213
2214                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
2215                         pr_debug("invalid all zero ethernet address\n");
2216                         return -EADDRNOTAVAIL;
2217                 }
2218         }
2219
2220         if (!data)
2221                 return -EINVAL;
2222
2223         if (data[IFLA_VXLAN_ID]) {
2224                 __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
2225                 if (id >= VXLAN_VID_MASK)
2226                         return -ERANGE;
2227         }
2228
2229         if (data[IFLA_VXLAN_PORT_RANGE]) {
2230                 const struct ifla_vxlan_port_range *p
2231                         = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
2232
2233                 if (ntohs(p->high) < ntohs(p->low)) {
2234                         pr_debug("port range %u .. %u not valid\n",
2235                                  ntohs(p->low), ntohs(p->high));
2236                         return -EINVAL;
2237                 }
2238         }
2239
2240         return 0;
2241 }
2242
2243 static void vxlan_get_drvinfo(struct net_device *netdev,
2244                               struct ethtool_drvinfo *drvinfo)
2245 {
2246         strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
2247         strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
2248 }
2249
/* ethtool operations: only drvinfo and link state are supported. */
static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_drvinfo	= vxlan_get_drvinfo,
	.get_link	= ethtool_op_get_link,
};
2254
/* Deferred teardown of a vxlan_sock: close the UDP socket, then free
 * the structure after an RCU grace period, since receive-path readers
 * may still hold RCU references to it.
 */
static void vxlan_del_work(struct work_struct *work)
{
	struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
	udp_tunnel_sock_release(vs->sock);
	kfree_rcu(vs, rcu);
}
2261
2262 static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
2263                                         __be16 port, u32 flags)
2264 {
2265         struct socket *sock;
2266         struct udp_port_cfg udp_conf;
2267         int err;
2268
2269         memset(&udp_conf, 0, sizeof(udp_conf));
2270
2271         if (ipv6) {
2272                 udp_conf.family = AF_INET6;
2273                 udp_conf.use_udp6_tx_checksums =
2274                     !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
2275                 udp_conf.use_udp6_rx_checksums =
2276                     !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
2277         } else {
2278                 udp_conf.family = AF_INET;
2279                 udp_conf.local_ip.s_addr = INADDR_ANY;
2280                 udp_conf.use_udp_checksums =
2281                     !!(flags & VXLAN_F_UDP_CSUM);
2282         }
2283
2284         udp_conf.local_udp_port = port;
2285
2286         /* Open UDP socket */
2287         err = udp_sock_create(net, &udp_conf, &sock);
2288         if (err < 0)
2289                 return ERR_PTR(err);
2290
2291         return sock;
2292 }
2293
2294 /* Create new listen socket if needed */
2295 static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
2296                                               vxlan_rcv_t *rcv, void *data,
2297                                               u32 flags)
2298 {
2299         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
2300         struct vxlan_sock *vs;
2301         struct socket *sock;
2302         unsigned int h;
2303         bool ipv6 = !!(flags & VXLAN_F_IPV6);
2304         struct udp_tunnel_sock_cfg tunnel_cfg;
2305
2306         vs = kzalloc(sizeof(*vs), GFP_KERNEL);
2307         if (!vs)
2308                 return ERR_PTR(-ENOMEM);
2309
2310         for (h = 0; h < VNI_HASH_SIZE; ++h)
2311                 INIT_HLIST_HEAD(&vs->vni_list[h]);
2312
2313         INIT_WORK(&vs->del_work, vxlan_del_work);
2314
2315         sock = vxlan_create_sock(net, ipv6, port, flags);
2316         if (IS_ERR(sock)) {
2317                 kfree(vs);
2318                 return ERR_CAST(sock);
2319         }
2320
2321         vs->sock = sock;
2322         atomic_set(&vs->refcnt, 1);
2323         vs->rcv = rcv;
2324         vs->data = data;
2325
2326         /* Initialize the vxlan udp offloads structure */
2327         vs->udp_offloads.port = port;
2328         vs->udp_offloads.callbacks.gro_receive  = vxlan_gro_receive;
2329         vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete;
2330
2331         spin_lock(&vn->sock_lock);
2332         hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
2333         vxlan_notify_add_rx_port(vs);
2334         spin_unlock(&vn->sock_lock);
2335
2336         /* Mark socket as an encapsulation socket. */
2337         tunnel_cfg.sk_user_data = vs;
2338         tunnel_cfg.encap_type = 1;
2339         tunnel_cfg.encap_rcv = vxlan_udp_encap_recv;
2340         tunnel_cfg.encap_destroy = NULL;
2341
2342         setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
2343
2344         return vs;
2345 }
2346
/* Obtain a vxlan_sock for @port: create a new one, or — unless
 * @no_share — take a reference on an existing socket with a matching
 * receive callback.  Returns the socket or an ERR_PTR.
 */
struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
				  vxlan_rcv_t *rcv, void *data,
				  bool no_share, u32 flags)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	bool ipv6 = flags & VXLAN_F_IPV6;

	vs = vxlan_socket_create(net, port, rcv, data, flags);
	if (!IS_ERR(vs))
		return vs;

	if (no_share)	/* Return error if sharing is not allowed. */
		return vs;

	/* Creation failed (e.g. port already bound): try to share the
	 * existing socket.  Sharing requires the same rcv callback, and
	 * atomic_add_unless() refuses a socket whose refcount already
	 * dropped to zero (mid-teardown).
	 */
	spin_lock(&vn->sock_lock);
	vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
	if (vs && ((vs->rcv != rcv) ||
		   !atomic_add_unless(&vs->refcnt, 1, 0)))
			vs = ERR_PTR(-EBUSY);
	spin_unlock(&vn->sock_lock);

	/* No socket found at all after the failed create. */
	if (!vs)
		vs = ERR_PTR(-EINVAL);

	return vs;
}
EXPORT_SYMBOL_GPL(vxlan_sock_add);
2375
/* Scheduled at device creation to bind to a socket.
 *
 * Runs outside RTNL; drops the device reference taken by the scheduler
 * (vxlan_init()) when done.
 */
static void vxlan_sock_work(struct work_struct *work)
{
	struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, sock_work);
	struct net *net = vxlan->net;
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	__be16 port = vxlan->dst_port;
	struct vxlan_sock *nvs;

	nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags);
	spin_lock(&vn->sock_lock);
	if (!IS_ERR(nvs))
		vxlan_vs_add_dev(nvs, vxlan);
	spin_unlock(&vn->sock_lock);

	/* NOTE(review): an error from vxlan_sock_add() is silently
	 * dropped here; the device then has no socket and vxlan_open()
	 * returns -ENOTCONN.
	 */
	dev_put(vxlan->dev);
}
2393
/* rtnl_link_ops newlink callback: parse the IFLA_VXLAN_* attributes
 * into the vxlan_dev, reject duplicate (VNI, family, port) tuples,
 * install the default-destination FDB entry and register the device.
 * Returns 0 or a negative errno.
 */
static int vxlan_newlink(struct net *net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	__u32 vni;
	int err;
	bool use_ipv6 = false;

	/* The VNI is the only mandatory attribute. */
	if (!data[IFLA_VXLAN_ID])
		return -EINVAL;

	/* NOTE(review): the device is bound to dev_net(dev) while the
	 * duplicate-VNI check below uses the @net argument — confirm the
	 * two always match for this tree.
	 */
	vxlan->net = dev_net(dev);

	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
	dst->remote_vni = vni;

	/* Unless IPv6 is explicitly requested, assume IPv4 */
	dst->remote_ip.sa.sa_family = AF_INET;
	if (data[IFLA_VXLAN_GROUP]) {
		dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
	} else if (data[IFLA_VXLAN_GROUP6]) {
		if (!IS_ENABLED(CONFIG_IPV6))
			return -EPFNOSUPPORT;

		nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6],
			   sizeof(struct in6_addr));
		dst->remote_ip.sa.sa_family = AF_INET6;
		use_ipv6 = true;
	}

	/* Optional local (source) address, IPv4 or IPv6. */
	if (data[IFLA_VXLAN_LOCAL]) {
		vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
		vxlan->saddr.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_LOCAL6]) {
		if (!IS_ENABLED(CONFIG_IPV6))
			return -EPFNOSUPPORT;

		/* TODO: respect scope id */
		nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6],
			   sizeof(struct in6_addr));
		vxlan->saddr.sa.sa_family = AF_INET6;
		use_ipv6 = true;
	}

	/* Optional lower device: derive MTU and headroom from it. */
	if (data[IFLA_VXLAN_LINK] &&
	    (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
		struct net_device *lowerdev
			 = __dev_get_by_index(net, dst->remote_ifindex);

		if (!lowerdev) {
			pr_info("ifindex %d does not exist\n", dst->remote_ifindex);
			return -ENODEV;
		}

#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);
			if (idev && idev->cnf.disable_ipv6) {
				pr_info("IPv6 is disabled via sysctl\n");
				return -EPERM;
			}
			vxlan->flags |= VXLAN_F_IPV6;
		}
#endif

		/* Only default the MTU when userspace did not set one. */
		if (!tb[IFLA_MTU])
			dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);

		dev->needed_headroom = lowerdev->hard_header_len +
				       (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
	} else if (use_ipv6)
		vxlan->flags |= VXLAN_F_IPV6;

	if (data[IFLA_VXLAN_TOS])
		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);

	if (data[IFLA_VXLAN_TTL])
		vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);

	/* Learning is on by default; only an explicit 0 disables it. */
	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
		vxlan->flags |= VXLAN_F_LEARN;

	if (data[IFLA_VXLAN_AGEING])
		vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
	else
		vxlan->age_interval = FDB_AGE_DEFAULT;

	if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
		vxlan->flags |= VXLAN_F_PROXY;

	if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
		vxlan->flags |= VXLAN_F_RSC;

	if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
		vxlan->flags |= VXLAN_F_L2MISS;

	if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
		vxlan->flags |= VXLAN_F_L3MISS;

	if (data[IFLA_VXLAN_LIMIT])
		vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
		vxlan->port_min = ntohs(p->low);
		vxlan->port_max = ntohs(p->high);
	}

	if (data[IFLA_VXLAN_PORT])
		vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);

	if (data[IFLA_VXLAN_UDP_CSUM] && nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
		vxlan->flags |= VXLAN_F_UDP_CSUM;

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] &&
	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
		vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] &&
	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
		vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;

	/* Refuse a second device with the same (VNI, family, port). */
	if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET,
			   vxlan->dst_port)) {
		pr_info("duplicate VNI %u\n", vni);
		return -EEXIST;
	}

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
	if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
				       &vxlan->default_dst.remote_ip,
				       NUD_REACHABLE|NUD_PERMANENT,
				       NLM_F_EXCL|NLM_F_CREATE,
				       vxlan->dst_port,
				       vxlan->default_dst.remote_vni,
				       vxlan->default_dst.remote_ifindex,
				       NTF_SELF);
		if (err)
			return err;
	}

	err = register_netdevice(dev);
	if (err) {
		/* Undo the default FDB entry created above. */
		vxlan_fdb_delete_default(vxlan);
		return err;
	}

	list_add(&vxlan->next, &vn->vxlan_list);

	return 0;
}
2551
/* Remove a vxlan device: unhash it from the per-netns VNI hash table,
 * drop it from the namespace's vxlan_list, and queue the netdev on
 * @head for batched unregistration by the caller (rtnl lock held).
 */
static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);

	/* sock_lock serializes hash-table updates against writers;
	 * hlist_del_rcu() lets concurrent RCU readers finish safely.
	 * The hlist may be unhashed already if the device never had a
	 * socket bound (e.g. it was never brought up).
	 */
	spin_lock(&vn->sock_lock);
	if (!hlist_unhashed(&vxlan->hlist))
		hlist_del_rcu(&vxlan->hlist);
	spin_unlock(&vn->sock_lock);

	list_del(&vxlan->next);
	unregister_netdevice_queue(dev, head);
}
2565
2566 static size_t vxlan_get_size(const struct net_device *dev)
2567 {
2568
2569         return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
2570                 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
2571                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
2572                 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
2573                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
2574                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
2575                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
2576                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
2577                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
2578                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
2579                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
2580                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
2581                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
2582                 nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
2583                 nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
2584                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
2585                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
2586                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
2587                 0;
2588 }
2589
/* Dump the device configuration into an RTM_NEWLINK message.  The set
 * of attributes emitted must stay in sync with vxlan_get_size() above.
 * Returns 0 on success or -EMSGSIZE if the skb ran out of tailroom.
 */
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct vxlan_rdst *dst = &vxlan->default_dst;
	/* UDP source-port range used for flow entropy, in network order */
	struct ifla_vxlan_port_range ports = {
		.low =  htons(vxlan->port_min),
		.high = htons(vxlan->port_max),
	};

	if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni))
		goto nla_put_failure;

	/* Default remote: IPv4 and IPv6 use distinct attributes. */
	if (!vxlan_addr_any(&dst->remote_ip)) {
		if (dst->remote_ip.sa.sa_family == AF_INET) {
			if (nla_put_be32(skb, IFLA_VXLAN_GROUP,
					 dst->remote_ip.sin.sin_addr.s_addr))
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr),
				    &dst->remote_ip.sin6.sin6_addr))
				goto nla_put_failure;
#endif
		}
	}

	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
		goto nla_put_failure;

	/* Local (source) address, if one was configured. */
	if (!vxlan_addr_any(&vxlan->saddr)) {
		if (vxlan->saddr.sa.sa_family == AF_INET) {
			if (nla_put_be32(skb, IFLA_VXLAN_LOCAL,
					 vxlan->saddr.sin.sin_addr.s_addr))
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr),
				    &vxlan->saddr.sin6.sin6_addr))
				goto nla_put_failure;
#endif
		}
	}

	/* Scalar options; feature flags are reported as 0/1 bytes. */
	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
			!!(vxlan->flags & VXLAN_F_LEARN)) ||
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
			!!(vxlan->flags & VXLAN_F_PROXY)) ||
	    nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
			!!(vxlan->flags & VXLAN_F_UDP_CSUM)) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)))
		goto nla_put_failure;

	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
2663
/* rtnetlink glue: lets "ip link add ... type vxlan" create/configure/
 * destroy vxlan devices and dump their settings.
 */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= "vxlan",
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,
	.priv_size	= sizeof(struct vxlan_dev),
	.setup		= vxlan_setup,
	.validate	= vxlan_validate,
	.newlink	= vxlan_newlink,
	.dellink	= vxlan_dellink,
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
};
2676
2677 static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
2678                                              struct net_device *dev)
2679 {
2680         struct vxlan_dev *vxlan, *next;
2681         LIST_HEAD(list_kill);
2682
2683         list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
2684                 struct vxlan_rdst *dst = &vxlan->default_dst;
2685
2686                 /* In case we created vxlan device with carrier
2687                  * and we loose the carrier due to module unload
2688                  * we also need to remove vxlan device. In other
2689                  * cases, it's not necessary and remote_ifindex
2690                  * is 0 here, so no matches.
2691                  */
2692                 if (dst->remote_ifindex == dev->ifindex)
2693                         vxlan_dellink(vxlan->dev, &list_kill);
2694         }
2695
2696         unregister_netdevice_many(&list_kill);
2697 }
2698
2699 static int vxlan_lowerdev_event(struct notifier_block *unused,
2700                                 unsigned long event, void *ptr)
2701 {
2702         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
2703         struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
2704
2705         if (event == NETDEV_UNREGISTER)
2706                 vxlan_handle_lowerdev_unregister(vn, dev);
2707
2708         return NOTIFY_DONE;
2709 }
2710
/* Registered in vxlan_init_module() to watch lower-device removals. */
static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call = vxlan_lowerdev_event,
};
2714
2715 static __net_init int vxlan_init_net(struct net *net)
2716 {
2717         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
2718         unsigned int h;
2719
2720         INIT_LIST_HEAD(&vn->vxlan_list);
2721         spin_lock_init(&vn->sock_lock);
2722
2723         for (h = 0; h < PORT_HASH_SIZE; ++h)
2724                 INIT_HLIST_HEAD(&vn->sock_list[h]);
2725
2726         return 0;
2727 }
2728
2729 static void __net_exit vxlan_exit_net(struct net *net)
2730 {
2731         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
2732         struct vxlan_dev *vxlan, *next;
2733         struct net_device *dev, *aux;
2734         LIST_HEAD(list);
2735
2736         rtnl_lock();
2737         for_each_netdev_safe(net, dev, aux)
2738                 if (dev->rtnl_link_ops == &vxlan_link_ops)
2739                         unregister_netdevice_queue(dev, &list);
2740
2741         list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
2742                 /* If vxlan->dev is in the same netns, it has already been added
2743                  * to the list by the previous loop.
2744                  */
2745                 if (!net_eq(dev_net(vxlan->dev), net))
2746                         unregister_netdevice_queue(dev, &list);
2747         }
2748
2749         unregister_netdevice_many(&list);
2750         rtnl_unlock();
2751 }
2752
/* Per-netns init/exit hooks; .id/.size let the core allocate a
 * struct vxlan_net for every network namespace.
 */
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
	.exit = vxlan_exit_net,
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};
2759
2760 static int __init vxlan_init_module(void)
2761 {
2762         int rc;
2763
2764         vxlan_wq = alloc_workqueue("vxlan", 0, 0);
2765         if (!vxlan_wq)
2766                 return -ENOMEM;
2767
2768         get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
2769
2770         rc = register_pernet_subsys(&vxlan_net_ops);
2771         if (rc)
2772                 goto out1;
2773
2774         rc = register_netdevice_notifier(&vxlan_notifier_block);
2775         if (rc)
2776                 goto out2;
2777
2778         rc = rtnl_link_register(&vxlan_link_ops);
2779         if (rc)
2780                 goto out3;
2781
2782         return 0;
2783 out3:
2784         unregister_netdevice_notifier(&vxlan_notifier_block);
2785 out2:
2786         unregister_pernet_subsys(&vxlan_net_ops);
2787 out1:
2788         destroy_workqueue(vxlan_wq);
2789         return rc;
2790 }
2791 late_initcall(vxlan_init_module);
2792
/* Module exit: reverse of vxlan_init_module().  Unregister the link
 * ops first so no new devices can be created, then the notifier, then
 * tear down the workqueue and per-netns state.
 */
static void __exit vxlan_cleanup_module(void)
{
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_netdevice_notifier(&vxlan_notifier_block);
	destroy_workqueue(vxlan_wq);
	unregister_pernet_subsys(&vxlan_net_ops);
	/* rcu_barrier() is called by netns */
}
module_exit(vxlan_cleanup_module);
2802
/* Module metadata; the RTNL alias lets "ip link add type vxlan"
 * auto-load this module.
 */
MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("vxlan");