Merge branch 'akpm' (patches from Andrew)
[cascardo/linux.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
85 /* Fallback tunnel: no source, no destination, no key, no options
86
87    Tunnel hash table:
88    We require exact key match i.e. if a key is present in packet
89    it will match only tunnel with the same key; if it is not present,
90    it will match only keyless tunnel.
91
92    All keysless packets, if not matched configured keyless tunnels
93    will match fallback tunnel.
94    Given src, dst and key, find appropriate for input tunnel.
95 */
96 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
97                                    int link, __be16 flags,
98                                    __be32 remote, __be32 local,
99                                    __be32 key)
100 {
101         unsigned int hash;
102         struct ip_tunnel *t, *cand = NULL;
103         struct hlist_head *head;
104
105         hash = ip_tunnel_hash(key, remote);
106         head = &itn->tunnels[hash];
107
108         hlist_for_each_entry_rcu(t, head, hash_node) {
109                 if (local != t->parms.iph.saddr ||
110                     remote != t->parms.iph.daddr ||
111                     !(t->dev->flags & IFF_UP))
112                         continue;
113
114                 if (!ip_tunnel_key_match(&t->parms, flags, key))
115                         continue;
116
117                 if (t->parms.link == link)
118                         return t;
119                 else
120                         cand = t;
121         }
122
123         hlist_for_each_entry_rcu(t, head, hash_node) {
124                 if (remote != t->parms.iph.daddr ||
125                     t->parms.iph.saddr != 0 ||
126                     !(t->dev->flags & IFF_UP))
127                         continue;
128
129                 if (!ip_tunnel_key_match(&t->parms, flags, key))
130                         continue;
131
132                 if (t->parms.link == link)
133                         return t;
134                 else if (!cand)
135                         cand = t;
136         }
137
138         hash = ip_tunnel_hash(key, 0);
139         head = &itn->tunnels[hash];
140
141         hlist_for_each_entry_rcu(t, head, hash_node) {
142                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
143                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
144                         continue;
145
146                 if (!(t->dev->flags & IFF_UP))
147                         continue;
148
149                 if (!ip_tunnel_key_match(&t->parms, flags, key))
150                         continue;
151
152                 if (t->parms.link == link)
153                         return t;
154                 else if (!cand)
155                         cand = t;
156         }
157
158         if (flags & TUNNEL_NO_KEY)
159                 goto skip_key_lookup;
160
161         hlist_for_each_entry_rcu(t, head, hash_node) {
162                 if (t->parms.i_key != key ||
163                     t->parms.iph.saddr != 0 ||
164                     t->parms.iph.daddr != 0 ||
165                     !(t->dev->flags & IFF_UP))
166                         continue;
167
168                 if (t->parms.link == link)
169                         return t;
170                 else if (!cand)
171                         cand = t;
172         }
173
174 skip_key_lookup:
175         if (cand)
176                 return cand;
177
178         t = rcu_dereference(itn->collect_md_tun);
179         if (t)
180                 return t;
181
182         if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
183                 return netdev_priv(itn->fb_tunnel_dev);
184
185         return NULL;
186 }
187 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190                                     struct ip_tunnel_parm *parms)
191 {
192         unsigned int h;
193         __be32 remote;
194         __be32 i_key = parms->i_key;
195
196         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197                 remote = parms->iph.daddr;
198         else
199                 remote = 0;
200
201         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202                 i_key = 0;
203
204         h = ip_tunnel_hash(i_key, remote);
205         return &itn->tunnels[h];
206 }
207
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210         struct hlist_head *head = ip_bucket(itn, &t->parms);
211
212         if (t->collect_md)
213                 rcu_assign_pointer(itn->collect_md_tun, t);
214         hlist_add_head_rcu(&t->hash_node, head);
215 }
216
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219         if (t->collect_md)
220                 rcu_assign_pointer(itn->collect_md_tun, NULL);
221         hlist_del_init_rcu(&t->hash_node);
222 }
223
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225                                         struct ip_tunnel_parm *parms,
226                                         int type)
227 {
228         __be32 remote = parms->iph.daddr;
229         __be32 local = parms->iph.saddr;
230         __be32 key = parms->i_key;
231         __be16 flags = parms->i_flags;
232         int link = parms->link;
233         struct ip_tunnel *t = NULL;
234         struct hlist_head *head = ip_bucket(itn, parms);
235
236         hlist_for_each_entry_rcu(t, head, hash_node) {
237                 if (local == t->parms.iph.saddr &&
238                     remote == t->parms.iph.daddr &&
239                     link == t->parms.link &&
240                     type == t->dev->type &&
241                     ip_tunnel_key_match(&t->parms, flags, key))
242                         break;
243         }
244         return t;
245 }
246
247 static struct net_device *__ip_tunnel_create(struct net *net,
248                                              const struct rtnl_link_ops *ops,
249                                              struct ip_tunnel_parm *parms)
250 {
251         int err;
252         struct ip_tunnel *tunnel;
253         struct net_device *dev;
254         char name[IFNAMSIZ];
255
256         if (parms->name[0])
257                 strlcpy(name, parms->name, IFNAMSIZ);
258         else {
259                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
260                         err = -E2BIG;
261                         goto failed;
262                 }
263                 strlcpy(name, ops->kind, IFNAMSIZ);
264                 strncat(name, "%d", 2);
265         }
266
267         ASSERT_RTNL();
268         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
269         if (!dev) {
270                 err = -ENOMEM;
271                 goto failed;
272         }
273         dev_net_set(dev, net);
274
275         dev->rtnl_link_ops = ops;
276
277         tunnel = netdev_priv(dev);
278         tunnel->parms = *parms;
279         tunnel->net = net;
280
281         err = register_netdevice(dev);
282         if (err)
283                 goto failed_free;
284
285         return dev;
286
287 failed_free:
288         free_netdev(dev);
289 failed:
290         return ERR_PTR(err);
291 }
292
293 static inline void init_tunnel_flow(struct flowi4 *fl4,
294                                     int proto,
295                                     __be32 daddr, __be32 saddr,
296                                     __be32 key, __u8 tos, int oif)
297 {
298         memset(fl4, 0, sizeof(*fl4));
299         fl4->flowi4_oif = oif;
300         fl4->daddr = daddr;
301         fl4->saddr = saddr;
302         fl4->flowi4_tos = tos;
303         fl4->flowi4_proto = proto;
304         fl4->fl4_gre_key = key;
305 }
306
307 static int ip_tunnel_bind_dev(struct net_device *dev)
308 {
309         struct net_device *tdev = NULL;
310         struct ip_tunnel *tunnel = netdev_priv(dev);
311         const struct iphdr *iph;
312         int hlen = LL_MAX_HEADER;
313         int mtu = ETH_DATA_LEN;
314         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
315
316         iph = &tunnel->parms.iph;
317
318         /* Guess output device to choose reasonable mtu and needed_headroom */
319         if (iph->daddr) {
320                 struct flowi4 fl4;
321                 struct rtable *rt;
322
323                 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
324                                  iph->saddr, tunnel->parms.o_key,
325                                  RT_TOS(iph->tos), tunnel->parms.link);
326                 rt = ip_route_output_key(tunnel->net, &fl4);
327
328                 if (!IS_ERR(rt)) {
329                         tdev = rt->dst.dev;
330                         ip_rt_put(rt);
331                 }
332                 if (dev->type != ARPHRD_ETHER)
333                         dev->flags |= IFF_POINTOPOINT;
334
335                 dst_cache_reset(&tunnel->dst_cache);
336         }
337
338         if (!tdev && tunnel->parms.link)
339                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
340
341         if (tdev) {
342                 hlen = tdev->hard_header_len + tdev->needed_headroom;
343                 mtu = tdev->mtu;
344         }
345
346         dev->needed_headroom = t_hlen + hlen;
347         mtu -= (dev->hard_header_len + t_hlen);
348
349         if (mtu < 68)
350                 mtu = 68;
351
352         return mtu;
353 }
354
355 static struct ip_tunnel *ip_tunnel_create(struct net *net,
356                                           struct ip_tunnel_net *itn,
357                                           struct ip_tunnel_parm *parms)
358 {
359         struct ip_tunnel *nt;
360         struct net_device *dev;
361
362         BUG_ON(!itn->fb_tunnel_dev);
363         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
364         if (IS_ERR(dev))
365                 return ERR_CAST(dev);
366
367         dev->mtu = ip_tunnel_bind_dev(dev);
368
369         nt = netdev_priv(dev);
370         ip_tunnel_add(itn, nt);
371         return nt;
372 }
373
374 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
375                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
376                   bool log_ecn_error)
377 {
378         struct pcpu_sw_netstats *tstats;
379         const struct iphdr *iph = ip_hdr(skb);
380         int err;
381
382 #ifdef CONFIG_NET_IPGRE_BROADCAST
383         if (ipv4_is_multicast(iph->daddr)) {
384                 tunnel->dev->stats.multicast++;
385                 skb->pkt_type = PACKET_BROADCAST;
386         }
387 #endif
388
389         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
390              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
391                 tunnel->dev->stats.rx_crc_errors++;
392                 tunnel->dev->stats.rx_errors++;
393                 goto drop;
394         }
395
396         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
397                 if (!(tpi->flags&TUNNEL_SEQ) ||
398                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
399                         tunnel->dev->stats.rx_fifo_errors++;
400                         tunnel->dev->stats.rx_errors++;
401                         goto drop;
402                 }
403                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
404         }
405
406         skb_reset_network_header(skb);
407
408         err = IP_ECN_decapsulate(iph, skb);
409         if (unlikely(err)) {
410                 if (log_ecn_error)
411                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
412                                         &iph->saddr, iph->tos);
413                 if (err > 1) {
414                         ++tunnel->dev->stats.rx_frame_errors;
415                         ++tunnel->dev->stats.rx_errors;
416                         goto drop;
417                 }
418         }
419
420         tstats = this_cpu_ptr(tunnel->dev->tstats);
421         u64_stats_update_begin(&tstats->syncp);
422         tstats->rx_packets++;
423         tstats->rx_bytes += skb->len;
424         u64_stats_update_end(&tstats->syncp);
425
426         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
427
428         if (tunnel->dev->type == ARPHRD_ETHER) {
429                 skb->protocol = eth_type_trans(skb, tunnel->dev);
430                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
431         } else {
432                 skb->dev = tunnel->dev;
433         }
434
435         if (tun_dst)
436                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
437
438         gro_cells_receive(&tunnel->gro_cells, skb);
439         return 0;
440
441 drop:
442         kfree_skb(skb);
443         return 0;
444 }
445 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
446
447 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
448                             unsigned int num)
449 {
450         if (num >= MAX_IPTUN_ENCAP_OPS)
451                 return -ERANGE;
452
453         return !cmpxchg((const struct ip_tunnel_encap_ops **)
454                         &iptun_encaps[num],
455                         NULL, ops) ? 0 : -1;
456 }
457 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
458
459 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
460                             unsigned int num)
461 {
462         int ret;
463
464         if (num >= MAX_IPTUN_ENCAP_OPS)
465                 return -ERANGE;
466
467         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
468                        &iptun_encaps[num],
469                        ops, NULL) == ops) ? 0 : -1;
470
471         synchronize_net();
472
473         return ret;
474 }
475 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
476
477 int ip_tunnel_encap_setup(struct ip_tunnel *t,
478                           struct ip_tunnel_encap *ipencap)
479 {
480         int hlen;
481
482         memset(&t->encap, 0, sizeof(t->encap));
483
484         hlen = ip_encap_hlen(ipencap);
485         if (hlen < 0)
486                 return hlen;
487
488         t->encap.type = ipencap->type;
489         t->encap.sport = ipencap->sport;
490         t->encap.dport = ipencap->dport;
491         t->encap.flags = ipencap->flags;
492
493         t->encap_hlen = hlen;
494         t->hlen = t->encap_hlen + t->tun_hlen;
495
496         return 0;
497 }
498 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
499
500 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
501                             struct rtable *rt, __be16 df,
502                             const struct iphdr *inner_iph)
503 {
504         struct ip_tunnel *tunnel = netdev_priv(dev);
505         int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
506         int mtu;
507
508         if (df)
509                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
510                                         - sizeof(struct iphdr) - tunnel->hlen;
511         else
512                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
513
514         if (skb_dst(skb))
515                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
516
517         if (skb->protocol == htons(ETH_P_IP)) {
518                 if (!skb_is_gso(skb) &&
519                     (inner_iph->frag_off & htons(IP_DF)) &&
520                     mtu < pkt_size) {
521                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
522                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
523                         return -E2BIG;
524                 }
525         }
526 #if IS_ENABLED(CONFIG_IPV6)
527         else if (skb->protocol == htons(ETH_P_IPV6)) {
528                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
529
530                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
531                            mtu >= IPV6_MIN_MTU) {
532                         if ((tunnel->parms.iph.daddr &&
533                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
534                             rt6->rt6i_dst.plen == 128) {
535                                 rt6->rt6i_flags |= RTF_MODIFIED;
536                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
537                         }
538                 }
539
540                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
541                                         mtu < pkt_size) {
542                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
543                         return -E2BIG;
544                 }
545         }
546 #endif
547         return 0;
548 }
549
550 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
551 {
552         struct ip_tunnel *tunnel = netdev_priv(dev);
553         u32 headroom = sizeof(struct iphdr);
554         struct ip_tunnel_info *tun_info;
555         const struct ip_tunnel_key *key;
556         const struct iphdr *inner_iph;
557         struct rtable *rt;
558         struct flowi4 fl4;
559         __be16 df = 0;
560         u8 tos, ttl;
561
562         tun_info = skb_tunnel_info(skb);
563         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
564                      ip_tunnel_info_af(tun_info) != AF_INET))
565                 goto tx_error;
566         key = &tun_info->key;
567         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
568         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
569         tos = key->tos;
570         if (tos == 1) {
571                 if (skb->protocol == htons(ETH_P_IP))
572                         tos = inner_iph->tos;
573                 else if (skb->protocol == htons(ETH_P_IPV6))
574                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
575         }
576         init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
577                          RT_TOS(tos), tunnel->parms.link);
578         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
579                 goto tx_error;
580         rt = ip_route_output_key(tunnel->net, &fl4);
581         if (IS_ERR(rt)) {
582                 dev->stats.tx_carrier_errors++;
583                 goto tx_error;
584         }
585         if (rt->dst.dev == dev) {
586                 ip_rt_put(rt);
587                 dev->stats.collisions++;
588                 goto tx_error;
589         }
590         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
591         ttl = key->ttl;
592         if (ttl == 0) {
593                 if (skb->protocol == htons(ETH_P_IP))
594                         ttl = inner_iph->ttl;
595                 else if (skb->protocol == htons(ETH_P_IPV6))
596                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
597                 else
598                         ttl = ip4_dst_hoplimit(&rt->dst);
599         }
600         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
601                 df = htons(IP_DF);
602         else if (skb->protocol == htons(ETH_P_IP))
603                 df = inner_iph->frag_off & htons(IP_DF);
604         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
605         if (headroom > dev->needed_headroom)
606                 dev->needed_headroom = headroom;
607
608         if (skb_cow_head(skb, dev->needed_headroom)) {
609                 ip_rt_put(rt);
610                 goto tx_dropped;
611         }
612         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
613                       key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
614         return;
615 tx_error:
616         dev->stats.tx_errors++;
617         goto kfree;
618 tx_dropped:
619         dev->stats.tx_dropped++;
620 kfree:
621         kfree_skb(skb);
622 }
623 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
624
625 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
626                     const struct iphdr *tnl_params, u8 protocol)
627 {
628         struct ip_tunnel *tunnel = netdev_priv(dev);
629         const struct iphdr *inner_iph;
630         struct flowi4 fl4;
631         u8     tos, ttl;
632         __be16 df;
633         struct rtable *rt;              /* Route to the other host */
634         unsigned int max_headroom;      /* The extra header space needed */
635         __be32 dst;
636         bool connected;
637
638         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
639         connected = (tunnel->parms.iph.daddr != 0);
640
641         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
642
643         dst = tnl_params->daddr;
644         if (dst == 0) {
645                 /* NBMA tunnel */
646
647                 if (!skb_dst(skb)) {
648                         dev->stats.tx_fifo_errors++;
649                         goto tx_error;
650                 }
651
652                 if (skb->protocol == htons(ETH_P_IP)) {
653                         rt = skb_rtable(skb);
654                         dst = rt_nexthop(rt, inner_iph->daddr);
655                 }
656 #if IS_ENABLED(CONFIG_IPV6)
657                 else if (skb->protocol == htons(ETH_P_IPV6)) {
658                         const struct in6_addr *addr6;
659                         struct neighbour *neigh;
660                         bool do_tx_error_icmp;
661                         int addr_type;
662
663                         neigh = dst_neigh_lookup(skb_dst(skb),
664                                                  &ipv6_hdr(skb)->daddr);
665                         if (!neigh)
666                                 goto tx_error;
667
668                         addr6 = (const struct in6_addr *)&neigh->primary_key;
669                         addr_type = ipv6_addr_type(addr6);
670
671                         if (addr_type == IPV6_ADDR_ANY) {
672                                 addr6 = &ipv6_hdr(skb)->daddr;
673                                 addr_type = ipv6_addr_type(addr6);
674                         }
675
676                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
677                                 do_tx_error_icmp = true;
678                         else {
679                                 do_tx_error_icmp = false;
680                                 dst = addr6->s6_addr32[3];
681                         }
682                         neigh_release(neigh);
683                         if (do_tx_error_icmp)
684                                 goto tx_error_icmp;
685                 }
686 #endif
687                 else
688                         goto tx_error;
689
690                 connected = false;
691         }
692
693         tos = tnl_params->tos;
694         if (tos & 0x1) {
695                 tos &= ~0x1;
696                 if (skb->protocol == htons(ETH_P_IP)) {
697                         tos = inner_iph->tos;
698                         connected = false;
699                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
700                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
701                         connected = false;
702                 }
703         }
704
705         init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
706                          tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
707
708         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
709                 goto tx_error;
710
711         rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
712                          NULL;
713
714         if (!rt) {
715                 rt = ip_route_output_key(tunnel->net, &fl4);
716
717                 if (IS_ERR(rt)) {
718                         dev->stats.tx_carrier_errors++;
719                         goto tx_error;
720                 }
721                 if (connected)
722                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
723                                           fl4.saddr);
724         }
725
726         if (rt->dst.dev == dev) {
727                 ip_rt_put(rt);
728                 dev->stats.collisions++;
729                 goto tx_error;
730         }
731
732         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
733                 ip_rt_put(rt);
734                 goto tx_error;
735         }
736
737         if (tunnel->err_count > 0) {
738                 if (time_before(jiffies,
739                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
740                         tunnel->err_count--;
741
742                         dst_link_failure(skb);
743                 } else
744                         tunnel->err_count = 0;
745         }
746
747         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
748         ttl = tnl_params->ttl;
749         if (ttl == 0) {
750                 if (skb->protocol == htons(ETH_P_IP))
751                         ttl = inner_iph->ttl;
752 #if IS_ENABLED(CONFIG_IPV6)
753                 else if (skb->protocol == htons(ETH_P_IPV6))
754                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
755 #endif
756                 else
757                         ttl = ip4_dst_hoplimit(&rt->dst);
758         }
759
760         df = tnl_params->frag_off;
761         if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
762                 df |= (inner_iph->frag_off&htons(IP_DF));
763
764         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
765                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
766         if (max_headroom > dev->needed_headroom)
767                 dev->needed_headroom = max_headroom;
768
769         if (skb_cow_head(skb, dev->needed_headroom)) {
770                 ip_rt_put(rt);
771                 dev->stats.tx_dropped++;
772                 kfree_skb(skb);
773                 return;
774         }
775
776         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
777                       df, !net_eq(tunnel->net, dev_net(dev)));
778         return;
779
780 #if IS_ENABLED(CONFIG_IPV6)
781 tx_error_icmp:
782         dst_link_failure(skb);
783 #endif
784 tx_error:
785         dev->stats.tx_errors++;
786         kfree_skb(skb);
787 }
788 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
789
790 static void ip_tunnel_update(struct ip_tunnel_net *itn,
791                              struct ip_tunnel *t,
792                              struct net_device *dev,
793                              struct ip_tunnel_parm *p,
794                              bool set_mtu)
795 {
796         ip_tunnel_del(itn, t);
797         t->parms.iph.saddr = p->iph.saddr;
798         t->parms.iph.daddr = p->iph.daddr;
799         t->parms.i_key = p->i_key;
800         t->parms.o_key = p->o_key;
801         if (dev->type != ARPHRD_ETHER) {
802                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
803                 memcpy(dev->broadcast, &p->iph.daddr, 4);
804         }
805         ip_tunnel_add(itn, t);
806
807         t->parms.iph.ttl = p->iph.ttl;
808         t->parms.iph.tos = p->iph.tos;
809         t->parms.iph.frag_off = p->iph.frag_off;
810
811         if (t->parms.link != p->link) {
812                 int mtu;
813
814                 t->parms.link = p->link;
815                 mtu = ip_tunnel_bind_dev(dev);
816                 if (set_mtu)
817                         dev->mtu = mtu;
818         }
819         dst_cache_reset(&t->dst_cache);
820         netdev_state_change(dev);
821 }
822
823 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
824 {
825         int err = 0;
826         struct ip_tunnel *t = netdev_priv(dev);
827         struct net *net = t->net;
828         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
829
830         BUG_ON(!itn->fb_tunnel_dev);
831         switch (cmd) {
832         case SIOCGETTUNNEL:
833                 if (dev == itn->fb_tunnel_dev) {
834                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
835                         if (!t)
836                                 t = netdev_priv(dev);
837                 }
838                 memcpy(p, &t->parms, sizeof(*p));
839                 break;
840
841         case SIOCADDTUNNEL:
842         case SIOCCHGTUNNEL:
843                 err = -EPERM;
844                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
845                         goto done;
846                 if (p->iph.ttl)
847                         p->iph.frag_off |= htons(IP_DF);
848                 if (!(p->i_flags & VTI_ISVTI)) {
849                         if (!(p->i_flags & TUNNEL_KEY))
850                                 p->i_key = 0;
851                         if (!(p->o_flags & TUNNEL_KEY))
852                                 p->o_key = 0;
853                 }
854
855                 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
856
857                 if (cmd == SIOCADDTUNNEL) {
858                         if (!t) {
859                                 t = ip_tunnel_create(net, itn, p);
860                                 err = PTR_ERR_OR_ZERO(t);
861                                 break;
862                         }
863
864                         err = -EEXIST;
865                         break;
866                 }
867                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
868                         if (t) {
869                                 if (t->dev != dev) {
870                                         err = -EEXIST;
871                                         break;
872                                 }
873                         } else {
874                                 unsigned int nflags = 0;
875
876                                 if (ipv4_is_multicast(p->iph.daddr))
877                                         nflags = IFF_BROADCAST;
878                                 else if (p->iph.daddr)
879                                         nflags = IFF_POINTOPOINT;
880
881                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
882                                         err = -EINVAL;
883                                         break;
884                                 }
885
886                                 t = netdev_priv(dev);
887                         }
888                 }
889
890                 if (t) {
891                         err = 0;
892                         ip_tunnel_update(itn, t, dev, p, true);
893                 } else {
894                         err = -ENOENT;
895                 }
896                 break;
897
898         case SIOCDELTUNNEL:
899                 err = -EPERM;
900                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
901                         goto done;
902
903                 if (dev == itn->fb_tunnel_dev) {
904                         err = -ENOENT;
905                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
906                         if (!t)
907                                 goto done;
908                         err = -EPERM;
909                         if (t == netdev_priv(itn->fb_tunnel_dev))
910                                 goto done;
911                         dev = t->dev;
912                 }
913                 unregister_netdevice(dev);
914                 err = 0;
915                 break;
916
917         default:
918                 err = -EINVAL;
919         }
920
921 done:
922         return err;
923 }
924 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
925
926 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
927 {
928         struct ip_tunnel *tunnel = netdev_priv(dev);
929         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
930         int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
931
932         if (new_mtu < 68)
933                 return -EINVAL;
934
935         if (new_mtu > max_mtu) {
936                 if (strict)
937                         return -EINVAL;
938
939                 new_mtu = max_mtu;
940         }
941
942         dev->mtu = new_mtu;
943         return 0;
944 }
945 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
946
947 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
948 {
949         return __ip_tunnel_change_mtu(dev, new_mtu, true);
950 }
951 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
952
953 static void ip_tunnel_dev_free(struct net_device *dev)
954 {
955         struct ip_tunnel *tunnel = netdev_priv(dev);
956
957         gro_cells_destroy(&tunnel->gro_cells);
958         dst_cache_destroy(&tunnel->dst_cache);
959         free_percpu(dev->tstats);
960         free_netdev(dev);
961 }
962
963 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
964 {
965         struct ip_tunnel *tunnel = netdev_priv(dev);
966         struct ip_tunnel_net *itn;
967
968         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
969
970         if (itn->fb_tunnel_dev != dev) {
971                 ip_tunnel_del(itn, netdev_priv(dev));
972                 unregister_netdevice_queue(dev, head);
973         }
974 }
975 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
976
977 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
978 {
979         struct ip_tunnel *tunnel = netdev_priv(dev);
980
981         return tunnel->net;
982 }
983 EXPORT_SYMBOL(ip_tunnel_get_link_net);
984
985 int ip_tunnel_get_iflink(const struct net_device *dev)
986 {
987         struct ip_tunnel *tunnel = netdev_priv(dev);
988
989         return tunnel->parms.link;
990 }
991 EXPORT_SYMBOL(ip_tunnel_get_iflink);
992
993 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
994                                   struct rtnl_link_ops *ops, char *devname)
995 {
996         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
997         struct ip_tunnel_parm parms;
998         unsigned int i;
999
1000         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1001                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1002
1003         if (!ops) {
1004                 itn->fb_tunnel_dev = NULL;
1005                 return 0;
1006         }
1007
1008         memset(&parms, 0, sizeof(parms));
1009         if (devname)
1010                 strlcpy(parms.name, devname, IFNAMSIZ);
1011
1012         rtnl_lock();
1013         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1014         /* FB netdevice is special: we have one, and only one per netns.
1015          * Allowing to move it to another netns is clearly unsafe.
1016          */
1017         if (!IS_ERR(itn->fb_tunnel_dev)) {
1018                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1019                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1020                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1021         }
1022         rtnl_unlock();
1023
1024         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1025 }
1026 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1027
1028 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1029                               struct rtnl_link_ops *ops)
1030 {
1031         struct net *net = dev_net(itn->fb_tunnel_dev);
1032         struct net_device *dev, *aux;
1033         int h;
1034
1035         for_each_netdev_safe(net, dev, aux)
1036                 if (dev->rtnl_link_ops == ops)
1037                         unregister_netdevice_queue(dev, head);
1038
1039         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1040                 struct ip_tunnel *t;
1041                 struct hlist_node *n;
1042                 struct hlist_head *thead = &itn->tunnels[h];
1043
1044                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1045                         /* If dev is in the same netns, it has already
1046                          * been added to the list by the previous loop.
1047                          */
1048                         if (!net_eq(dev_net(t->dev), net))
1049                                 unregister_netdevice_queue(t->dev, head);
1050         }
1051 }
1052
1053 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1054 {
1055         LIST_HEAD(list);
1056
1057         rtnl_lock();
1058         ip_tunnel_destroy(itn, &list, ops);
1059         unregister_netdevice_many(&list);
1060         rtnl_unlock();
1061 }
1062 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1063
1064 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1065                       struct ip_tunnel_parm *p)
1066 {
1067         struct ip_tunnel *nt;
1068         struct net *net = dev_net(dev);
1069         struct ip_tunnel_net *itn;
1070         int mtu;
1071         int err;
1072
1073         nt = netdev_priv(dev);
1074         itn = net_generic(net, nt->ip_tnl_net_id);
1075
1076         if (nt->collect_md) {
1077                 if (rtnl_dereference(itn->collect_md_tun))
1078                         return -EEXIST;
1079         } else {
1080                 if (ip_tunnel_find(itn, p, dev->type))
1081                         return -EEXIST;
1082         }
1083
1084         nt->net = net;
1085         nt->parms = *p;
1086         err = register_netdevice(dev);
1087         if (err)
1088                 goto out;
1089
1090         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1091                 eth_hw_addr_random(dev);
1092
1093         mtu = ip_tunnel_bind_dev(dev);
1094         if (!tb[IFLA_MTU])
1095                 dev->mtu = mtu;
1096
1097         ip_tunnel_add(itn, nt);
1098 out:
1099         return err;
1100 }
1101 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1102
1103 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1104                          struct ip_tunnel_parm *p)
1105 {
1106         struct ip_tunnel *t;
1107         struct ip_tunnel *tunnel = netdev_priv(dev);
1108         struct net *net = tunnel->net;
1109         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1110
1111         if (dev == itn->fb_tunnel_dev)
1112                 return -EINVAL;
1113
1114         t = ip_tunnel_find(itn, p, dev->type);
1115
1116         if (t) {
1117                 if (t->dev != dev)
1118                         return -EEXIST;
1119         } else {
1120                 t = tunnel;
1121
1122                 if (dev->type != ARPHRD_ETHER) {
1123                         unsigned int nflags = 0;
1124
1125                         if (ipv4_is_multicast(p->iph.daddr))
1126                                 nflags = IFF_BROADCAST;
1127                         else if (p->iph.daddr)
1128                                 nflags = IFF_POINTOPOINT;
1129
1130                         if ((dev->flags ^ nflags) &
1131                             (IFF_POINTOPOINT | IFF_BROADCAST))
1132                                 return -EINVAL;
1133                 }
1134         }
1135
1136         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1137         return 0;
1138 }
1139 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1140
1141 int ip_tunnel_init(struct net_device *dev)
1142 {
1143         struct ip_tunnel *tunnel = netdev_priv(dev);
1144         struct iphdr *iph = &tunnel->parms.iph;
1145         int err;
1146
1147         dev->destructor = ip_tunnel_dev_free;
1148         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1149         if (!dev->tstats)
1150                 return -ENOMEM;
1151
1152         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1153         if (err) {
1154                 free_percpu(dev->tstats);
1155                 return err;
1156         }
1157
1158         err = gro_cells_init(&tunnel->gro_cells, dev);
1159         if (err) {
1160                 dst_cache_destroy(&tunnel->dst_cache);
1161                 free_percpu(dev->tstats);
1162                 return err;
1163         }
1164
1165         tunnel->dev = dev;
1166         tunnel->net = dev_net(dev);
1167         strcpy(tunnel->parms.name, dev->name);
1168         iph->version            = 4;
1169         iph->ihl                = 5;
1170
1171         if (tunnel->collect_md) {
1172                 dev->features |= NETIF_F_NETNS_LOCAL;
1173                 netif_keep_dst(dev);
1174         }
1175         return 0;
1176 }
1177 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1178
1179 void ip_tunnel_uninit(struct net_device *dev)
1180 {
1181         struct ip_tunnel *tunnel = netdev_priv(dev);
1182         struct net *net = tunnel->net;
1183         struct ip_tunnel_net *itn;
1184
1185         itn = net_generic(net, tunnel->ip_tnl_net_id);
1186         /* fb_tunnel_dev will be unregisted in net-exit call. */
1187         if (itn->fb_tunnel_dev != dev)
1188                 ip_tunnel_del(itn, netdev_priv(dev));
1189
1190         dst_cache_reset(&tunnel->dst_cache);
1191 }
1192 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1193
1194 /* Do least required initialization, rest of init is done in tunnel_init call */
1195 void ip_tunnel_setup(struct net_device *dev, int net_id)
1196 {
1197         struct ip_tunnel *tunnel = netdev_priv(dev);
1198         tunnel->ip_tnl_net_id = net_id;
1199 }
1200 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1201
1202 MODULE_LICENSE("GPL");