63e745aadab6466b2e32e083c99ad31af39243c4
[cascardo/linux.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 #include <net/udp.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
/* Install @dst (with cached source address @saddr) into one per-cpu
 * dst cache slot.  The previous entry is swapped out atomically with
 * xchg() and then released, so a concurrent reader never observes a
 * freed dst pointer.
 */
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	/* Take a reference on behalf of the cache before publishing. */
	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	/* NOTE(review): saddr is stored after the dst is published, so a
	 * racing reader could briefly see the new dst with the old saddr
	 * — confirm callers tolerate this.
	 */
	idst->saddr = saddr;
}
82
/* Update the dst cache slot of the current CPU.
 * noinline presumably keeps this slow path out of callers' fast
 * paths — TODO confirm the original motivation.
 */
static noinline void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}
88
/* Drop the cached route on the current CPU only. */
static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}
93
/* Invalidate the cached route on every possible CPU, e.g. after a
 * tunnel parameter change makes the cached routes stale.
 */
void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
102
/* Fetch this CPU's cached route, taking a reference for the caller.
 * Returns NULL (and clears the cache) if the slot is empty, the dst's
 * refcount already hit zero, or the dst failed its validity check.
 * On success *saddr is set to the source address cached with the dst.
 */
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	/* A zero refcount means the dst is being torn down: don't use it. */
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		/* Obsolete dst entries must be revalidated via ops->check(). */
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
126
127 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
128                                 __be16 flags, __be32 key)
129 {
130         if (p->i_flags & TUNNEL_KEY) {
131                 if (flags & TUNNEL_KEY)
132                         return key == p->i_key;
133                 else
134                         /* key expected, none present */
135                         return false;
136         } else
137                 return !(flags & TUNNEL_KEY);
138 }
139
140 /* Fallback tunnel: no source, no destination, no key, no options
141
142    Tunnel hash table:
143    We require exact key match i.e. if a key is present in packet
144    it will match only tunnel with the same key; if it is not present,
145    it will match only keyless tunnel.
146
   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
150 */
/* Look up the receiving tunnel for an incoming packet.  Four passes of
 * decreasing specificity are tried; within each pass an entry whose
 * link (underlying device index) matches is returned immediately,
 * otherwise the first link-mismatched hit is remembered in @cand as a
 * fallback.  Runs under RCU (caller must hold rcu_read_lock()).
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact (local, remote) address match. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote matches, tunnel has a wildcard (zero) source. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 use the bucket for a wildcard remote. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: tunnel bound to our local address with no remote, or a
	 * multicast destination equal to the packet's local address.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 4: key-only match on fully wildcarded tunnels; skipped when
	 * the caller explicitly forbids key-based matching.
	 */
	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* Last resort: the per-netns fallback device, if it is up. */
	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);


	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
240
241 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
242                                     struct ip_tunnel_parm *parms)
243 {
244         unsigned int h;
245         __be32 remote;
246         __be32 i_key = parms->i_key;
247
248         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
249                 remote = parms->iph.daddr;
250         else
251                 remote = 0;
252
253         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
254                 i_key = 0;
255
256         h = ip_tunnel_hash(i_key, remote);
257         return &itn->tunnels[h];
258 }
259
260 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
261 {
262         struct hlist_head *head = ip_bucket(itn, &t->parms);
263
264         hlist_add_head_rcu(&t->hash_node, head);
265 }
266
/* Unhash tunnel @t (RCU-safe removal; readers may still see it until
 * a grace period elapses).
 */
static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}
271
/* Find a configured tunnel exactly matching @parms (addresses, link,
 * device type and key semantics).  Returns NULL if none exists.
 * Unlike ip_tunnel_lookup() this is a configuration-time exact match,
 * not a best-effort receive lookup.
 */
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}
294
/* Allocate and register a new tunnel net_device in @net.
 * If @parms names the device that name is used, otherwise a
 * "<kind>%d" template is built for automatic numbering.
 * Returns the device or an ERR_PTR on failure.  Caller must hold RTNL.
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		/* Leave room for the "%d" suffix and the terminator. */
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
340
/* Build an IPv4 flow key for tunnel route lookups.
 * The memset matters: flowi4 is used as a lookup key, so all fields
 * (including ones not set below) must be zeroed deterministically.
 */
static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}
354
/* Bind the tunnel to an underlying output device and compute a
 * suitable MTU.  For point-to-point tunnels the route to the remote
 * endpoint is resolved (and cached); otherwise parms.link is used.
 * Also updates needed_headroom and iflink.  Returns the derived MTU,
 * clamped to the IPv4 minimum of 68 bytes.
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			/* Prime the per-cpu dst cache with this route. */
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found: fall back to the explicitly configured link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	/* 68 is the minimum IPv4 MTU (RFC 791). */
	if (mtu < 68)
		mtu = 68;

	return mtu;
}
402
/* Create, bind and hash a new tunnel device from @parms, using the
 * fallback device's rtnl_link_ops as the template.  Returns the new
 * tunnel or an ERR_PTR.
 */
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}
421
/* Receive path: validate a decapsulated packet against the tunnel's
 * checksum/sequence expectations, decode ECN, update stats and hand
 * the skb to GRO.  Consumes the skb in all cases; always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* The packet's TUNNEL_CSUM flag must agree with the tunnel config. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Drop packets that lack a sequence number or arrive out of order. */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* err > 1 means the inner/outer ECN combination is invalid. */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub packet state when crossing network namespaces. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
490
/* Return the extra header length required by encapsulation config @e:
 * 0 for no encapsulation, -EINVAL for an unknown/unregistered type,
 * otherwise whatever the registered handler reports.
 */
static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	const struct ip_tunnel_encap_ops *ops;
	int hlen = -EINVAL;

	if (e->type == TUNNEL_ENCAP_NONE)
		return 0;

	if (e->type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[e->type]);
	if (likely(ops && ops->encap_hlen))
		hlen = ops->encap_hlen(e);
	rcu_read_unlock();

	return hlen;
}
510
/* Registry of encapsulation handlers, indexed by encap type and
 * protected by RCU; populated via ip_tunnel_encap_add_ops().
 */
const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
513
/* Register encapsulation handler @ops for slot @num.
 * The cmpxchg() only succeeds if the slot was empty, so a second
 * registration for the same slot fails with -1; returns 0 on success.
 */
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
522
/* Unregister handler @ops from slot @num.  Fails (-1) unless the slot
 * currently holds exactly @ops.  synchronize_net() runs regardless of
 * the outcome so any in-flight RCU readers finish before we return.
 */
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
537
/* Apply encapsulation parameters @ipencap to tunnel @t and recompute
 * the total header length.  On an invalid type the encap config is
 * left zeroed (the memset runs before validation) and the negative
 * errno from ip_encap_hlen() is returned.
 */
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
560
/* Build the encapsulation header for @skb according to the tunnel's
 * encap config.  The handler may rewrite *protocol and adjust @fl4
 * (e.g. ports).  Returns 0 if no encap is configured, a negative
 * errno if the handler is missing or fails.
 */
int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);
579
/* Path-MTU handling for the transmit path: derive the usable inner
 * MTU from the outer route, propagate it to the skb's dst, and send
 * an ICMP/ICMPv6 "too big" back if a non-GSO packet exceeds it.
 * Returns 0 to continue transmission, -E2BIG if the packet was
 * rejected on MTU grounds.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	/* With DF set the outer route caps the MTU; otherwise use the
	 * inner dst's MTU (or the device MTU as a fallback).
	 */
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the reduced MTU on host routes (or tunnels with a
		 * fixed unicast endpoint) so future lookups see it.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
627
/* Core tunnel transmit path: resolve the outer destination (including
 * the NBMA case where it comes from the inner packet's route), pick
 * TOS/TTL/DF for the outer header, find or reuse a cached route,
 * enforce PMTU, make headroom and hand off to iptunnel_xmit().
 * Consumes @skb on both success and error.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	/* "connected" tunnels (fixed remote) may use the per-cpu dst cache. */
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: derive the outer destination from the inner
		 * packet's routing information.
		 */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible IPv6 addresses carry a usable
			 * embedded IPv4 destination.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	/* tos bit 0 set means "inherit TOS from the inner packet". */
	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	/* Try the cached route first; fall back to a full lookup. */
	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	/* Routing back to ourselves would loop the packet forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Rate-limited link-failure feedback after recent ICMP errors. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	/* ttl == 0 means "inherit from the inner packet". */
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* Propagate the inner DF bit to the outer header for IPv4. */
	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
792
/* Apply new parameters @p to an existing tunnel @t.  The tunnel is
 * unhashed before the hash-relevant fields (addresses, keys) change
 * and re-added afterwards so it lands in the correct bucket.  The
 * dst cache is invalidated at the end since the route may now differ.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose the endpoints as L2 addresses. */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	/* A changed underlay link requires rebinding (and maybe a new MTU). */
	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}
825
/* Legacy ioctl interface for tunnel management (SIOC{GET,ADD,CHG,DEL}TUNNEL).
 * Add/change/delete require CAP_NET_ADMIN in the tunnel's user namespace.
 * On SIOCGETTUNNEL the found parameters are copied back into *p.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, look up by the supplied parms. */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed TTL implies PMTU discovery, hence DF. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			/* Ignore stale key values when keying is disabled. */
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* The matched tunnel must be this device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Changing addresses must not flip the
				 * device between p2p and broadcast modes.
				 */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			/* The fallback device itself cannot be deleted. */
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
928
929 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
930 {
931         struct ip_tunnel *tunnel = netdev_priv(dev);
932         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
933
934         if (new_mtu < 68 ||
935             new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
936                 return -EINVAL;
937         dev->mtu = new_mtu;
938         return 0;
939 }
940 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
941
/* Netdev destructor: release all per-tunnel resources and the device
 * itself.  Installed as dev->destructor in ip_tunnel_init(), so the
 * core invokes it once the device is fully unregistered.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	/* Tear down in reverse order of allocation in ip_tunnel_init(). */
	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
951
952 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
953 {
954         struct ip_tunnel *tunnel = netdev_priv(dev);
955         struct ip_tunnel_net *itn;
956
957         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
958
959         if (itn->fb_tunnel_dev != dev) {
960                 ip_tunnel_del(netdev_priv(dev));
961                 unregister_netdevice_queue(dev, head);
962         }
963 }
964 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
965
/* Per-netns initialization for one tunnel type: reset the tunnel hash
 * table and, when @ops is supplied, create the fallback ("fb") device.
 *
 * @net:           namespace being initialized
 * @ip_tnl_net_id: pernet id locating this type's ip_tunnel_net
 * @ops:           rtnl_link_ops for the fallback device, or NULL when
 *                 the tunnel type has no fallback device
 * @devname:       name for the fallback device (may be NULL)
 *
 * Returns 0 on success or the error from fallback device creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	/* No link ops means this type runs without a fallback device. */
	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1000
/* Queue every tunnel of the given type for unregistration on @head.
 * Two passes are needed: the netdev walk only covers devices currently
 * in the fallback device's namespace, while the hash walk catches
 * tunnels whose device was moved to a different namespace.
 * Caller holds RTNL (see ip_tunnel_delete_net()).
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	/* Pass 1: all devices of this link type living in @net,
	 * including the fallback device itself.
	 */
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	/* Pass 2: hashed tunnels whose device now lives elsewhere. */
	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1025
1026 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1027 {
1028         LIST_HEAD(list);
1029
1030         rtnl_lock();
1031         ip_tunnel_destroy(itn, &list, ops);
1032         unregister_netdevice_many(&list);
1033         rtnl_unlock();
1034 }
1035 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1036
1037 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1038                       struct ip_tunnel_parm *p)
1039 {
1040         struct ip_tunnel *nt;
1041         struct net *net = dev_net(dev);
1042         struct ip_tunnel_net *itn;
1043         int mtu;
1044         int err;
1045
1046         nt = netdev_priv(dev);
1047         itn = net_generic(net, nt->ip_tnl_net_id);
1048
1049         if (ip_tunnel_find(itn, p, dev->type))
1050                 return -EEXIST;
1051
1052         nt->net = net;
1053         nt->parms = *p;
1054         err = register_netdevice(dev);
1055         if (err)
1056                 goto out;
1057
1058         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1059                 eth_hw_addr_random(dev);
1060
1061         mtu = ip_tunnel_bind_dev(dev);
1062         if (!tb[IFLA_MTU])
1063                 dev->mtu = mtu;
1064
1065         ip_tunnel_add(itn, nt);
1066
1067 out:
1068         return err;
1069 }
1070 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1071
1072 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1073                          struct ip_tunnel_parm *p)
1074 {
1075         struct ip_tunnel *t;
1076         struct ip_tunnel *tunnel = netdev_priv(dev);
1077         struct net *net = tunnel->net;
1078         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1079
1080         if (dev == itn->fb_tunnel_dev)
1081                 return -EINVAL;
1082
1083         t = ip_tunnel_find(itn, p, dev->type);
1084
1085         if (t) {
1086                 if (t->dev != dev)
1087                         return -EEXIST;
1088         } else {
1089                 t = tunnel;
1090
1091                 if (dev->type != ARPHRD_ETHER) {
1092                         unsigned int nflags = 0;
1093
1094                         if (ipv4_is_multicast(p->iph.daddr))
1095                                 nflags = IFF_BROADCAST;
1096                         else if (p->iph.daddr)
1097                                 nflags = IFF_POINTOPOINT;
1098
1099                         if ((dev->flags ^ nflags) &
1100                             (IFF_POINTOPOINT | IFF_BROADCAST))
1101                                 return -EINVAL;
1102                 }
1103         }
1104
1105         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1106         return 0;
1107 }
1108 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1109
1110 int ip_tunnel_init(struct net_device *dev)
1111 {
1112         struct ip_tunnel *tunnel = netdev_priv(dev);
1113         struct iphdr *iph = &tunnel->parms.iph;
1114         int err;
1115
1116         dev->destructor = ip_tunnel_dev_free;
1117         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1118         if (!dev->tstats)
1119                 return -ENOMEM;
1120
1121         tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1122         if (!tunnel->dst_cache) {
1123                 free_percpu(dev->tstats);
1124                 return -ENOMEM;
1125         }
1126
1127         err = gro_cells_init(&tunnel->gro_cells, dev);
1128         if (err) {
1129                 free_percpu(tunnel->dst_cache);
1130                 free_percpu(dev->tstats);
1131                 return err;
1132         }
1133
1134         tunnel->dev = dev;
1135         tunnel->net = dev_net(dev);
1136         strcpy(tunnel->parms.name, dev->name);
1137         iph->version            = 4;
1138         iph->ihl                = 5;
1139
1140         return 0;
1141 }
1142 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1143
1144 void ip_tunnel_uninit(struct net_device *dev)
1145 {
1146         struct ip_tunnel *tunnel = netdev_priv(dev);
1147         struct net *net = tunnel->net;
1148         struct ip_tunnel_net *itn;
1149
1150         itn = net_generic(net, tunnel->ip_tnl_net_id);
1151         /* fb_tunnel_dev will be unregisted in net-exit call. */
1152         if (itn->fb_tunnel_dev != dev)
1153                 ip_tunnel_del(netdev_priv(dev));
1154
1155         ip_tunnel_dst_reset_all(tunnel);
1156 }
1157 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1158
/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	/* Record the pernet id so later code can locate this tunnel
	 * type's per-namespace state via net_generic().
	 */
	struct ip_tunnel *tunnel = netdev_priv(dev);
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1166
1167 MODULE_LICENSE("GPL");