ipv4: Cache dst in tunnels
[cascardo/linux.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57
58 #if IS_ENABLED(CONFIG_IPV6)
59 #include <net/ipv6.h>
60 #include <net/ip6_fib.h>
61 #include <net/ip6_route.h>
62 #endif
63
64 static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
65                                    __be32 key, __be32 remote)
66 {
67         return hash_32((__force u32)key ^ (__force u32)remote,
68                          IP_TNL_HASH_BITS);
69 }
70
/* Replace the tunnel's cached output route with @dst, taking over the
 * caller's reference (callers pass dst_clone() results).
 *
 * DST_NOCACHE routes must not be stored, so they are demoted to NULL,
 * i.e. the cache is cleared instead.  dst_lock serializes writers;
 * readers use RCU (see tunnel_dst_get), hence rcu_assign_pointer for
 * publication.  rcu_dereference_raw is safe for the old value because
 * we hold the lock.  The reference on the previously cached dst is
 * dropped.
 */
static inline void __tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst && (dst->flags & DST_NOCACHE))
		dst = NULL;

	spin_lock_bh(&t->dst_lock);
	old_dst = rcu_dereference_raw(t->dst_cache);
	rcu_assign_pointer(t->dst_cache, dst);
	dst_release(old_dst);
	spin_unlock_bh(&t->dst_lock);
}
84
/* Publish @dst as the tunnel's cached route; consumes the caller's
 * reference (see __tunnel_dst_set).
 */
static inline void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
{
	__tunnel_dst_set(t, dst);
}
89
/* Invalidate the cached route, dropping its reference if one was held. */
static inline void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL);
}
94
/* Fetch the cached route under RCU.  Returns it with a reference held
 * (the caller must release it), or NULL when nothing is cached.
 */
static inline struct dst_entry *tunnel_dst_get(struct ip_tunnel *t)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(t->dst_cache);
	if (dst)
		dst_hold(dst);
	rcu_read_unlock();
	return dst;
}
106
107 struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie)
108 {
109         struct dst_entry *dst = tunnel_dst_get(t);
110
111         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
112                 tunnel_dst_reset(t);
113                 return NULL;
114         }
115
116         return dst;
117 }
118
119 /* Often modified stats are per cpu, other are shared (netdev->stats) */
120 struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
121                                                 struct rtnl_link_stats64 *tot)
122 {
123         int i;
124
125         for_each_possible_cpu(i) {
126                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
127                 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
128                 unsigned int start;
129
130                 do {
131                         start = u64_stats_fetch_begin_bh(&tstats->syncp);
132                         rx_packets = tstats->rx_packets;
133                         tx_packets = tstats->tx_packets;
134                         rx_bytes = tstats->rx_bytes;
135                         tx_bytes = tstats->tx_bytes;
136                 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
137
138                 tot->rx_packets += rx_packets;
139                 tot->tx_packets += tx_packets;
140                 tot->rx_bytes   += rx_bytes;
141                 tot->tx_bytes   += tx_bytes;
142         }
143
144         tot->multicast = dev->stats.multicast;
145
146         tot->rx_crc_errors = dev->stats.rx_crc_errors;
147         tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
148         tot->rx_length_errors = dev->stats.rx_length_errors;
149         tot->rx_frame_errors = dev->stats.rx_frame_errors;
150         tot->rx_errors = dev->stats.rx_errors;
151
152         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
153         tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
154         tot->tx_dropped = dev->stats.tx_dropped;
155         tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
156         tot->tx_errors = dev->stats.tx_errors;
157
158         tot->collisions  = dev->stats.collisions;
159
160         return tot;
161 }
162 EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
163
164 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
165                                 __be16 flags, __be32 key)
166 {
167         if (p->i_flags & TUNNEL_KEY) {
168                 if (flags & TUNNEL_KEY)
169                         return key == p->i_key;
170                 else
171                         /* key expected, none present */
172                         return false;
173         } else
174                 return !(flags & TUNNEL_KEY);
175 }
176
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keysless packets, if not matched configured keyless tunnels
   will match fallback tunnel.
   Given src, dst and key, find appropriate for input tunnel.

   Lookup proceeds from most to least specific match, remembering the
   best non-link-matching tunnel as a candidate; caller must be in an
   RCU read-side section.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(itn, key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: fully specified tunnels (local + remote + key). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		/* Exact link match wins; otherwise keep as candidate. */
		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote + key, any local address. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 3: wildcard-remote bucket; match on our local address,
	 * or on a multicast destination configured as the tunnel's
	 * "remote".
	 */
	hash = ip_tunnel_hash(itn, key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: keyed packets against the wildcard bucket, matched on
	 * the key alone.
	 * NOTE(review): this leg compares i_key without checking
	 * t->parms.i_flags & TUNNEL_KEY -- verify that a keyless tunnel
	 * with a stale i_key value cannot be matched here.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* Last resort: the per-netns fallback device, if it is up. */
	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);


	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
273
274 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
275                                     struct ip_tunnel_parm *parms)
276 {
277         unsigned int h;
278         __be32 remote;
279
280         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
281                 remote = parms->iph.daddr;
282         else
283                 remote = 0;
284
285         h = ip_tunnel_hash(itn, parms->i_key, remote);
286         return &itn->tunnels[h];
287 }
288
/* Insert tunnel @t into its hash bucket (RCU-safe for concurrent
 * lookups; the bucket is derived from t->parms).
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}
295
/* Unhash tunnel @t (RCU-safe; concurrent lookups may still see it
 * until a grace period elapses).
 */
static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}
300
/* Exact-match lookup of a configured tunnel by local/remote address,
 * key, bound link and device type; used by the ioctl configuration
 * paths.  Returns NULL when no tunnel matches exactly.
 */
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	/* t is NULL if the list is exhausted without a match */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;
	}
	return t;
}
322
323 static struct net_device *__ip_tunnel_create(struct net *net,
324                                              const struct rtnl_link_ops *ops,
325                                              struct ip_tunnel_parm *parms)
326 {
327         int err;
328         struct ip_tunnel *tunnel;
329         struct net_device *dev;
330         char name[IFNAMSIZ];
331
332         if (parms->name[0])
333                 strlcpy(name, parms->name, IFNAMSIZ);
334         else {
335                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
336                         err = -E2BIG;
337                         goto failed;
338                 }
339                 strlcpy(name, ops->kind, IFNAMSIZ);
340                 strncat(name, "%d", 2);
341         }
342
343         ASSERT_RTNL();
344         dev = alloc_netdev(ops->priv_size, name, ops->setup);
345         if (!dev) {
346                 err = -ENOMEM;
347                 goto failed;
348         }
349         dev_net_set(dev, net);
350
351         dev->rtnl_link_ops = ops;
352
353         tunnel = netdev_priv(dev);
354         tunnel->parms = *parms;
355         tunnel->net = net;
356
357         err = register_netdevice(dev);
358         if (err)
359                 goto failed_free;
360
361         return dev;
362
363 failed_free:
364         free_netdev(dev);
365 failed:
366         return ERR_PTR(err);
367 }
368
369 static inline void init_tunnel_flow(struct flowi4 *fl4,
370                                     int proto,
371                                     __be32 daddr, __be32 saddr,
372                                     __be32 key, __u8 tos, int oif)
373 {
374         memset(fl4, 0, sizeof(*fl4));
375         fl4->flowi4_oif = oif;
376         fl4->daddr = daddr;
377         fl4->saddr = saddr;
378         fl4->flowi4_tos = tos;
379         fl4->flowi4_proto = proto;
380         fl4->fl4_gre_key = key;
381 }
382
/* Guess the underlying device for @dev in order to size
 * needed_headroom and compute a sane MTU.  For tunnels with a fixed
 * destination, a route lookup is done and the result is also stored
 * in the tunnel's dst cache.  Returns the suggested MTU (>= 68).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			/* The cache takes its own reference via dst_clone() */
			tunnel_dst_set(tunnel, dst_clone(&rt->dst));
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found: fall back to the explicitly bound link, if any */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	/* 68 is the minimum IPv4 MTU per RFC 791 */
	if (mtu < 68)
		mtu = 68;

	return mtu;
}
430
431 static struct ip_tunnel *ip_tunnel_create(struct net *net,
432                                           struct ip_tunnel_net *itn,
433                                           struct ip_tunnel_parm *parms)
434 {
435         struct ip_tunnel *nt, *fbt;
436         struct net_device *dev;
437
438         BUG_ON(!itn->fb_tunnel_dev);
439         fbt = netdev_priv(itn->fb_tunnel_dev);
440         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
441         if (IS_ERR(dev))
442                 return NULL;
443
444         dev->mtu = ip_tunnel_bind_dev(dev);
445
446         nt = netdev_priv(dev);
447         ip_tunnel_add(itn, nt);
448         return nt;
449 }
450
/* Common receive path for IPv4 tunnels: validate the decapsulated
 * packet against the tunnel's checksum/sequence configuration, handle
 * ECN, update stats and hand the skb to GRO.  Always consumes the skb
 * and returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_tstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Checksum presence in the packet must match the tunnel config */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Sequenced tunnels drop unsequenced and out-of-order packets */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* err > 1 means invalid ECN combination: log (if asked) and drop */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub skb state when the packet crosses a netns boundary */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
520
/* Check the packet against the tunnel path MTU, propagate the MTU to
 * the inner dst, and signal ICMP / ICMPv6 "too big" back to the sender
 * when the packet does not fit.  Returns 0 when the packet may be
 * transmitted, -E2BIG after an error has been signalled.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	/* With DF set, derive the inner MTU from the outer route minus
	 * all encapsulation overhead; otherwise use the inner dst/dev MTU.
	 */
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Only non-GSO packets with DF honoured trigger
		 * ICMP_FRAG_NEEDED.
		 */
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the lowered MTU on host routes or fixed unicast
		 * tunnels so subsequent traffic sees it.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
568
569 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
570                     const struct iphdr *tnl_params, const u8 protocol)
571 {
572         struct ip_tunnel *tunnel = netdev_priv(dev);
573         const struct iphdr *inner_iph;
574         struct flowi4 fl4;
575         u8     tos, ttl;
576         __be16 df;
577         struct rtable *rt = NULL;       /* Route to the other host */
578         unsigned int max_headroom;      /* The extra header space needed */
579         __be32 dst;
580         int err;
581         bool connected = true;
582
583         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
584
585         dst = tnl_params->daddr;
586         if (dst == 0) {
587                 /* NBMA tunnel */
588
589                 if (skb_dst(skb) == NULL) {
590                         dev->stats.tx_fifo_errors++;
591                         goto tx_error;
592                 }
593
594                 if (skb->protocol == htons(ETH_P_IP)) {
595                         rt = skb_rtable(skb);
596                         dst = rt_nexthop(rt, inner_iph->daddr);
597                 }
598 #if IS_ENABLED(CONFIG_IPV6)
599                 else if (skb->protocol == htons(ETH_P_IPV6)) {
600                         const struct in6_addr *addr6;
601                         struct neighbour *neigh;
602                         bool do_tx_error_icmp;
603                         int addr_type;
604
605                         neigh = dst_neigh_lookup(skb_dst(skb),
606                                                  &ipv6_hdr(skb)->daddr);
607                         if (neigh == NULL)
608                                 goto tx_error;
609
610                         addr6 = (const struct in6_addr *)&neigh->primary_key;
611                         addr_type = ipv6_addr_type(addr6);
612
613                         if (addr_type == IPV6_ADDR_ANY) {
614                                 addr6 = &ipv6_hdr(skb)->daddr;
615                                 addr_type = ipv6_addr_type(addr6);
616                         }
617
618                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
619                                 do_tx_error_icmp = true;
620                         else {
621                                 do_tx_error_icmp = false;
622                                 dst = addr6->s6_addr32[3];
623                         }
624                         neigh_release(neigh);
625                         if (do_tx_error_icmp)
626                                 goto tx_error_icmp;
627                 }
628 #endif
629                 else
630                         goto tx_error;
631
632                 connected = false;
633         }
634
635         tos = tnl_params->tos;
636         if (tos & 0x1) {
637                 tos &= ~0x1;
638                 if (skb->protocol == htons(ETH_P_IP)) {
639                         tos = inner_iph->tos;
640                         connected = false;
641                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
642                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
643                         connected = false;
644                 }
645         }
646
647         init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
648                          tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
649
650         if (connected)
651                 rt = (struct rtable *)tunnel_dst_check(tunnel, 0);
652
653         if (!rt) {
654                 rt = ip_route_output_key(tunnel->net, &fl4);
655
656                 if (IS_ERR(rt)) {
657                         dev->stats.tx_carrier_errors++;
658                         goto tx_error;
659                 }
660                 if (connected)
661                         tunnel_dst_set(tunnel, dst_clone(&rt->dst));
662         }
663
664         if (rt->dst.dev == dev) {
665                 ip_rt_put(rt);
666                 dev->stats.collisions++;
667                 goto tx_error;
668         }
669
670         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
671                 ip_rt_put(rt);
672                 goto tx_error;
673         }
674
675         if (tunnel->err_count > 0) {
676                 if (time_before(jiffies,
677                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
678                         tunnel->err_count--;
679
680                         dst_link_failure(skb);
681                 } else
682                         tunnel->err_count = 0;
683         }
684
685         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
686         ttl = tnl_params->ttl;
687         if (ttl == 0) {
688                 if (skb->protocol == htons(ETH_P_IP))
689                         ttl = inner_iph->ttl;
690 #if IS_ENABLED(CONFIG_IPV6)
691                 else if (skb->protocol == htons(ETH_P_IPV6))
692                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
693 #endif
694                 else
695                         ttl = ip4_dst_hoplimit(&rt->dst);
696         }
697
698         df = tnl_params->frag_off;
699         if (skb->protocol == htons(ETH_P_IP))
700                 df |= (inner_iph->frag_off&htons(IP_DF));
701
702         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
703                         + rt->dst.header_len;
704         if (max_headroom > dev->needed_headroom)
705                 dev->needed_headroom = max_headroom;
706
707         if (skb_cow_head(skb, dev->needed_headroom)) {
708                 dev->stats.tx_dropped++;
709                 dev_kfree_skb(skb);
710                 return;
711         }
712
713         err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
714                             tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
715         iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
716
717         return;
718
719 #if IS_ENABLED(CONFIG_IPV6)
720 tx_error_icmp:
721         dst_link_failure(skb);
722 #endif
723 tx_error:
724         dev->stats.tx_errors++;
725         dev_kfree_skb(skb);
726 }
727 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
728
/* Apply new parameters @p to existing tunnel @t (caller holds RTNL).
 * The tunnel is unhashed and re-added because saddr/daddr/i_key
 * determine its bucket; a link change re-binds the device (optionally
 * updating the MTU), and the cached route is always invalidated.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose the endpoints as the
		 * device's hardware and broadcast addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	tunnel_dst_reset(t);
	netdev_state_change(dev);
}
761
/* Legacy ioctl configuration interface (SIOC{GET,ADD,CHG,DEL}TUNNEL),
 * shared by the IPv4 tunnel drivers.  @p is the parameter block copied
 * from/to user space by the caller.  Returns 0 or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, look up by the given params;
		 * otherwise report this device's own parameters.
		 */
		t = NULL;
		if (dev == itn->fb_tunnel_dev)
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed TTL implies PMTU discovery, hence DF */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* Ignore keys the flags say are absent */
		if (!(p->i_flags&TUNNEL_KEY))
			p->i_key = 0;
		if (!(p->o_flags&TUNNEL_KEY))
			p->o_key = 0;

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL))
			t = ip_tunnel_create(net, itn, p);

		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Params may not collide with another device */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Changing this device: the new params must
				 * keep its broadcast/pointopoint nature.
				 */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			/* Deleting via the fallback device: resolve the
			 * target by params; the fallback itself may not
			 * be deleted this way.
			 */
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
855
856 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
857 {
858         struct ip_tunnel *tunnel = netdev_priv(dev);
859         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
860
861         if (new_mtu < 68 ||
862             new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
863                 return -EINVAL;
864         dev->mtu = new_mtu;
865         return 0;
866 }
867 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
868
/* Device destructor: release GRO cells and per-cpu stats, then free
 * the netdev itself.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
877
878 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
879 {
880         struct ip_tunnel *tunnel = netdev_priv(dev);
881         struct ip_tunnel_net *itn;
882
883         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
884
885         if (itn->fb_tunnel_dev != dev) {
886                 ip_tunnel_del(netdev_priv(dev));
887                 unregister_netdevice_queue(dev, head);
888         }
889 }
890 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
891
/* Per-netns initialisation for one tunnel type: set up the hash table
 * in the netns-generic ip_tunnel_net slot and, when @ops is non-NULL,
 * create the per-netns fallback device named @devname and hash it.
 * Returns 0 on success or the error from fallback device creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	/* Tunnel types without a fallback device pass ops == NULL. */
	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	/* Creation and publication of the fallback device happen under
	 * RTNL; itn->fb_tunnel_dev may hold an ERR_PTR on failure, which
	 * callers detect via the PTR_RET() below.
	 */
	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_RET(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
925
926 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
927                               struct rtnl_link_ops *ops)
928 {
929         struct net *net = dev_net(itn->fb_tunnel_dev);
930         struct net_device *dev, *aux;
931         int h;
932
933         for_each_netdev_safe(net, dev, aux)
934                 if (dev->rtnl_link_ops == ops)
935                         unregister_netdevice_queue(dev, head);
936
937         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
938                 struct ip_tunnel *t;
939                 struct hlist_node *n;
940                 struct hlist_head *thead = &itn->tunnels[h];
941
942                 hlist_for_each_entry_safe(t, n, thead, hash_node)
943                         /* If dev is in the same netns, it has already
944                          * been added to the list by the previous loop.
945                          */
946                         if (!net_eq(dev_net(t->dev), net))
947                                 unregister_netdevice_queue(t->dev, head);
948         }
949 }
950
951 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
952 {
953         LIST_HEAD(list);
954
955         rtnl_lock();
956         ip_tunnel_destroy(itn, &list, ops);
957         unregister_netdevice_many(&list);
958         rtnl_unlock();
959 }
960 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
961
962 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
963                       struct ip_tunnel_parm *p)
964 {
965         struct ip_tunnel *nt;
966         struct net *net = dev_net(dev);
967         struct ip_tunnel_net *itn;
968         int mtu;
969         int err;
970
971         nt = netdev_priv(dev);
972         itn = net_generic(net, nt->ip_tnl_net_id);
973
974         if (ip_tunnel_find(itn, p, dev->type))
975                 return -EEXIST;
976
977         nt->net = net;
978         nt->parms = *p;
979         err = register_netdevice(dev);
980         if (err)
981                 goto out;
982
983         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
984                 eth_hw_addr_random(dev);
985
986         mtu = ip_tunnel_bind_dev(dev);
987         if (!tb[IFLA_MTU])
988                 dev->mtu = mtu;
989
990         ip_tunnel_add(itn, nt);
991
992 out:
993         return err;
994 }
995 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
996
997 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
998                          struct ip_tunnel_parm *p)
999 {
1000         struct ip_tunnel *t;
1001         struct ip_tunnel *tunnel = netdev_priv(dev);
1002         struct net *net = tunnel->net;
1003         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1004
1005         if (dev == itn->fb_tunnel_dev)
1006                 return -EINVAL;
1007
1008         t = ip_tunnel_find(itn, p, dev->type);
1009
1010         if (t) {
1011                 if (t->dev != dev)
1012                         return -EEXIST;
1013         } else {
1014                 t = tunnel;
1015
1016                 if (dev->type != ARPHRD_ETHER) {
1017                         unsigned int nflags = 0;
1018
1019                         if (ipv4_is_multicast(p->iph.daddr))
1020                                 nflags = IFF_BROADCAST;
1021                         else if (p->iph.daddr)
1022                                 nflags = IFF_POINTOPOINT;
1023
1024                         if ((dev->flags ^ nflags) &
1025                             (IFF_POINTOPOINT | IFF_BROADCAST))
1026                                 return -EINVAL;
1027                 }
1028         }
1029
1030         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1031         return 0;
1032 }
1033 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1034
1035 int ip_tunnel_init(struct net_device *dev)
1036 {
1037         struct ip_tunnel *tunnel = netdev_priv(dev);
1038         struct iphdr *iph = &tunnel->parms.iph;
1039         int i, err;
1040
1041         dev->destructor = ip_tunnel_dev_free;
1042         dev->tstats = alloc_percpu(struct pcpu_tstats);
1043         if (!dev->tstats)
1044                 return -ENOMEM;
1045
1046         for_each_possible_cpu(i) {
1047                 struct pcpu_tstats *ipt_stats;
1048                 ipt_stats = per_cpu_ptr(dev->tstats, i);
1049                 u64_stats_init(&ipt_stats->syncp);
1050         }
1051
1052         err = gro_cells_init(&tunnel->gro_cells, dev);
1053         if (err) {
1054                 free_percpu(dev->tstats);
1055                 return err;
1056         }
1057
1058         tunnel->dev = dev;
1059         tunnel->net = dev_net(dev);
1060         strcpy(tunnel->parms.name, dev->name);
1061         iph->version            = 4;
1062         iph->ihl                = 5;
1063
1064         tunnel->dst_cache = NULL;
1065         spin_lock_init(&tunnel->dst_lock);
1066
1067         return 0;
1068 }
1069 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1070
1071 void ip_tunnel_uninit(struct net_device *dev)
1072 {
1073         struct ip_tunnel *tunnel = netdev_priv(dev);
1074         struct net *net = tunnel->net;
1075         struct ip_tunnel_net *itn;
1076
1077         itn = net_generic(net, tunnel->ip_tnl_net_id);
1078         /* fb_tunnel_dev will be unregisted in net-exit call. */
1079         if (itn->fb_tunnel_dev != dev)
1080                 ip_tunnel_del(netdev_priv(dev));
1081
1082         tunnel_dst_reset(tunnel);
1083 }
1084 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1085
/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	/* Record which netns-generic slot holds this tunnel type's
	 * ip_tunnel_net; everything else is set up in ip_tunnel_init().
	 */
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1093
1094 MODULE_LICENSE("GPL");