ip_gre: comments change
[cascardo/linux.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48
49 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54
55 /*
56    Problems & solutions
57    --------------------
58
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is a good
68    solution, but it supposes maintaining a new variable in ALL
69    skb, even if no tunneling is used.
70
71    Current solution: xmit_recursion breaks dead loops. This is a percpu
72    counter, since when we enter the first ndo_xmit(), cpu migration is
73    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
74
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
84      would be even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88
89    Hence, if we want that OSPF worked or traceroute said something reasonable,
90    we should search for another solution.
91
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
94    taking into account fragmentation. To be short, it is not a solution at all.
95
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    quickly degrades to a value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108
109
110
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117
118    Alexey Kuznetsov.
119  */
120
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125
126 /* Fallback tunnel: no source, no destination, no key, no options */
127
128 #define HASH_SIZE  16
129
130 static int ipgre_net_id __read_mostly;
131 struct ipgre_net {
132         struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
133
134         struct net_device *fb_tunnel_dev;
135 };
136
137 /* Tunnel hash table */
138
139 /*
140    4 hash tables:
141
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150
151    All keyless packets, if they do not match a configured keyless tunnel,
152    will match the fallback tunnel.
153  */
154
/*
 * Fold a 32-bit value (an address or a key) into a 4-bit bucket index.
 *
 * The argument is parenthesized so that an expression argument is cast
 * and shifted as a whole.  Note that the argument is evaluated twice, so
 * it must not have side effects.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
156
/* Shorthand for the four tables, named after the endpoints a tunnel has set. */
#define tunnels_wc	tunnels[0]	/* (*,*)          */
#define tunnels_l	tunnels[1]	/* (*,local)      */
#define tunnels_r	tunnels[2]	/* (remote,*)     */
#define tunnels_r_l	tunnels[3]	/* (remote,local) */
161 /*
162  * Locking : hash tables are protected by RCU and RTNL
163  */
164
/*
 * Walk one hash chain beginning at 'start'.
 * The cursor is the caller's own local variable named 't'; it is not a
 * macro parameter, so 't' must be declared in the enclosing scope.
 */
#define for_each_ip_tunnel_rcu(start)		\
	for (t = rcu_dereference(start);	\
	     t;					\
	     t = rcu_dereference(t->next))
167
/*
 * Frequently updated counters are kept per cpu; the remaining statistics
 * are shared and live in netdev->stats.
 */
struct pcpu_tstats {
	unsigned long	rx_packets, rx_bytes;	/* receive side  */
	unsigned long	tx_packets, tx_bytes;	/* transmit side */
};
175
176 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177 {
178         struct pcpu_tstats sum = { 0 };
179         int i;
180
181         for_each_possible_cpu(i) {
182                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184                 sum.rx_packets += tstats->rx_packets;
185                 sum.rx_bytes   += tstats->rx_bytes;
186                 sum.tx_packets += tstats->tx_packets;
187                 sum.tx_bytes   += tstats->tx_bytes;
188         }
189         dev->stats.rx_packets = sum.rx_packets;
190         dev->stats.rx_bytes   = sum.rx_bytes;
191         dev->stats.tx_packets = sum.tx_packets;
192         dev->stats.tx_bytes   = sum.tx_bytes;
193         return &dev->stats;
194 }
195
196 /* Given src, dst and key, find appropriate for input tunnel. */
197
198 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
199                                               __be32 remote, __be32 local,
200                                               __be32 key, __be16 gre_proto)
201 {
202         struct net *net = dev_net(dev);
203         int link = dev->ifindex;
204         unsigned int h0 = HASH(remote);
205         unsigned int h1 = HASH(key);
206         struct ip_tunnel *t, *cand = NULL;
207         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
208         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
209                        ARPHRD_ETHER : ARPHRD_IPGRE;
210         int score, cand_score = 4;
211
212         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
213                 if (local != t->parms.iph.saddr ||
214                     remote != t->parms.iph.daddr ||
215                     key != t->parms.i_key ||
216                     !(t->dev->flags & IFF_UP))
217                         continue;
218
219                 if (t->dev->type != ARPHRD_IPGRE &&
220                     t->dev->type != dev_type)
221                         continue;
222
223                 score = 0;
224                 if (t->parms.link != link)
225                         score |= 1;
226                 if (t->dev->type != dev_type)
227                         score |= 2;
228                 if (score == 0)
229                         return t;
230
231                 if (score < cand_score) {
232                         cand = t;
233                         cand_score = score;
234                 }
235         }
236
237         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
238                 if (remote != t->parms.iph.daddr ||
239                     key != t->parms.i_key ||
240                     !(t->dev->flags & IFF_UP))
241                         continue;
242
243                 if (t->dev->type != ARPHRD_IPGRE &&
244                     t->dev->type != dev_type)
245                         continue;
246
247                 score = 0;
248                 if (t->parms.link != link)
249                         score |= 1;
250                 if (t->dev->type != dev_type)
251                         score |= 2;
252                 if (score == 0)
253                         return t;
254
255                 if (score < cand_score) {
256                         cand = t;
257                         cand_score = score;
258                 }
259         }
260
261         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
262                 if ((local != t->parms.iph.saddr &&
263                      (local != t->parms.iph.daddr ||
264                       !ipv4_is_multicast(local))) ||
265                     key != t->parms.i_key ||
266                     !(t->dev->flags & IFF_UP))
267                         continue;
268
269                 if (t->dev->type != ARPHRD_IPGRE &&
270                     t->dev->type != dev_type)
271                         continue;
272
273                 score = 0;
274                 if (t->parms.link != link)
275                         score |= 1;
276                 if (t->dev->type != dev_type)
277                         score |= 2;
278                 if (score == 0)
279                         return t;
280
281                 if (score < cand_score) {
282                         cand = t;
283                         cand_score = score;
284                 }
285         }
286
287         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
288                 if (t->parms.i_key != key ||
289                     !(t->dev->flags & IFF_UP))
290                         continue;
291
292                 if (t->dev->type != ARPHRD_IPGRE &&
293                     t->dev->type != dev_type)
294                         continue;
295
296                 score = 0;
297                 if (t->parms.link != link)
298                         score |= 1;
299                 if (t->dev->type != dev_type)
300                         score |= 2;
301                 if (score == 0)
302                         return t;
303
304                 if (score < cand_score) {
305                         cand = t;
306                         cand_score = score;
307                 }
308         }
309
310         if (cand != NULL)
311                 return cand;
312
313         dev = ign->fb_tunnel_dev;
314         if (dev->flags & IFF_UP)
315                 return netdev_priv(dev);
316
317         return NULL;
318 }
319
320 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
321                 struct ip_tunnel_parm *parms)
322 {
323         __be32 remote = parms->iph.daddr;
324         __be32 local = parms->iph.saddr;
325         __be32 key = parms->i_key;
326         unsigned int h = HASH(key);
327         int prio = 0;
328
329         if (local)
330                 prio |= 1;
331         if (remote && !ipv4_is_multicast(remote)) {
332                 prio |= 2;
333                 h ^= HASH(remote);
334         }
335
336         return &ign->tunnels[prio][h];
337 }
338
339 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
340                 struct ip_tunnel *t)
341 {
342         return __ipgre_bucket(ign, &t->parms);
343 }
344
345 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
346 {
347         struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
348
349         rcu_assign_pointer(t->next, rtnl_dereference(*tp));
350         rcu_assign_pointer(*tp, t);
351 }
352
353 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
354 {
355         struct ip_tunnel __rcu **tp;
356         struct ip_tunnel *iter;
357
358         for (tp = ipgre_bucket(ign, t);
359              (iter = rtnl_dereference(*tp)) != NULL;
360              tp = &iter->next) {
361                 if (t == iter) {
362                         rcu_assign_pointer(*tp, t->next);
363                         break;
364                 }
365         }
366 }
367
368 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
369                                            struct ip_tunnel_parm *parms,
370                                            int type)
371 {
372         __be32 remote = parms->iph.daddr;
373         __be32 local = parms->iph.saddr;
374         __be32 key = parms->i_key;
375         int link = parms->link;
376         struct ip_tunnel *t;
377         struct ip_tunnel __rcu **tp;
378         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
379
380         for (tp = __ipgre_bucket(ign, parms);
381              (t = rtnl_dereference(*tp)) != NULL;
382              tp = &t->next)
383                 if (local == t->parms.iph.saddr &&
384                     remote == t->parms.iph.daddr &&
385                     key == t->parms.i_key &&
386                     link == t->parms.link &&
387                     type == t->dev->type)
388                         break;
389
390         return t;
391 }
392
393 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
394                 struct ip_tunnel_parm *parms, int create)
395 {
396         struct ip_tunnel *t, *nt;
397         struct net_device *dev;
398         char name[IFNAMSIZ];
399         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
400
401         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
402         if (t || !create)
403                 return t;
404
405         if (parms->name[0])
406                 strlcpy(name, parms->name, IFNAMSIZ);
407         else
408                 sprintf(name, "gre%%d");
409
410         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
411         if (!dev)
412           return NULL;
413
414         dev_net_set(dev, net);
415
416         if (strchr(name, '%')) {
417                 if (dev_alloc_name(dev, name) < 0)
418                         goto failed_free;
419         }
420
421         nt = netdev_priv(dev);
422         nt->parms = *parms;
423         dev->rtnl_link_ops = &ipgre_link_ops;
424
425         dev->mtu = ipgre_tunnel_bind_dev(dev);
426
427         if (register_netdevice(dev) < 0)
428                 goto failed_free;
429
430         dev_hold(dev);
431         ipgre_tunnel_link(ign, nt);
432         return nt;
433
434 failed_free:
435         free_netdev(dev);
436         return NULL;
437 }
438
439 static void ipgre_tunnel_uninit(struct net_device *dev)
440 {
441         struct net *net = dev_net(dev);
442         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
443
444         ipgre_tunnel_unlink(ign, netdev_priv(dev));
445         dev_put(dev);
446 }
447
448
449 static void ipgre_err(struct sk_buff *skb, u32 info)
450 {
451
452 /* All the routers (except for Linux) return only
453    8 bytes of packet payload. It means, that precise relaying of
454    ICMP in the real Internet is absolutely infeasible.
455
456    Moreover, Cisco "wise men" put GRE key to the third word
457    in GRE header. It makes impossible maintaining even soft state for keyed
458    GRE tunnels with enabled checksum. Tell them "thank you".
459
460    Well, I wonder, rfc1812 was written by Cisco employee,
461    what the hell these idiots break standrads established
462    by themself???
463  */
464
465         struct iphdr *iph = (struct iphdr *)skb->data;
466         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
467         int grehlen = (iph->ihl<<2) + 4;
468         const int type = icmp_hdr(skb)->type;
469         const int code = icmp_hdr(skb)->code;
470         struct ip_tunnel *t;
471         __be16 flags;
472
473         flags = p[0];
474         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
475                 if (flags&(GRE_VERSION|GRE_ROUTING))
476                         return;
477                 if (flags&GRE_KEY) {
478                         grehlen += 4;
479                         if (flags&GRE_CSUM)
480                                 grehlen += 4;
481                 }
482         }
483
484         /* If only 8 bytes returned, keyed message will be dropped here */
485         if (skb_headlen(skb) < grehlen)
486                 return;
487
488         switch (type) {
489         default:
490         case ICMP_PARAMETERPROB:
491                 return;
492
493         case ICMP_DEST_UNREACH:
494                 switch (code) {
495                 case ICMP_SR_FAILED:
496                 case ICMP_PORT_UNREACH:
497                         /* Impossible event. */
498                         return;
499                 case ICMP_FRAG_NEEDED:
500                         /* Soft state for pmtu is maintained by IP core. */
501                         return;
502                 default:
503                         /* All others are translated to HOST_UNREACH.
504                            rfc2003 contains "deep thoughts" about NET_UNREACH,
505                            I believe they are just ether pollution. --ANK
506                          */
507                         break;
508                 }
509                 break;
510         case ICMP_TIME_EXCEEDED:
511                 if (code != ICMP_EXC_TTL)
512                         return;
513                 break;
514         }
515
516         rcu_read_lock();
517         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
518                                 flags & GRE_KEY ?
519                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
520                                 p[1]);
521         if (t == NULL || t->parms.iph.daddr == 0 ||
522             ipv4_is_multicast(t->parms.iph.daddr))
523                 goto out;
524
525         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
526                 goto out;
527
528         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
529                 t->err_count++;
530         else
531                 t->err_count = 1;
532         t->err_time = jiffies;
533 out:
534         rcu_read_unlock();
535 }
536
537 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
538 {
539         if (INET_ECN_is_ce(iph->tos)) {
540                 if (skb->protocol == htons(ETH_P_IP)) {
541                         IP_ECN_set_ce(ip_hdr(skb));
542                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
543                         IP6_ECN_set_ce(ipv6_hdr(skb));
544                 }
545         }
546 }
547
548 static inline u8
549 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
550 {
551         u8 inner = 0;
552         if (skb->protocol == htons(ETH_P_IP))
553                 inner = old_iph->tos;
554         else if (skb->protocol == htons(ETH_P_IPV6))
555                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
556         return INET_ECN_encapsulate(tos, inner);
557 }
558
559 static int ipgre_rcv(struct sk_buff *skb)
560 {
561         struct iphdr *iph;
562         u8     *h;
563         __be16    flags;
564         __sum16   csum = 0;
565         __be32 key = 0;
566         u32    seqno = 0;
567         struct ip_tunnel *tunnel;
568         int    offset = 4;
569         __be16 gre_proto;
570
571         if (!pskb_may_pull(skb, 16))
572                 goto drop_nolock;
573
574         iph = ip_hdr(skb);
575         h = skb->data;
576         flags = *(__be16*)h;
577
578         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
579                 /* - Version must be 0.
580                    - We do not support routing headers.
581                  */
582                 if (flags&(GRE_VERSION|GRE_ROUTING))
583                         goto drop_nolock;
584
585                 if (flags&GRE_CSUM) {
586                         switch (skb->ip_summed) {
587                         case CHECKSUM_COMPLETE:
588                                 csum = csum_fold(skb->csum);
589                                 if (!csum)
590                                         break;
591                                 /* fall through */
592                         case CHECKSUM_NONE:
593                                 skb->csum = 0;
594                                 csum = __skb_checksum_complete(skb);
595                                 skb->ip_summed = CHECKSUM_COMPLETE;
596                         }
597                         offset += 4;
598                 }
599                 if (flags&GRE_KEY) {
600                         key = *(__be32*)(h + offset);
601                         offset += 4;
602                 }
603                 if (flags&GRE_SEQ) {
604                         seqno = ntohl(*(__be32*)(h + offset));
605                         offset += 4;
606                 }
607         }
608
609         gre_proto = *(__be16 *)(h + 2);
610
611         rcu_read_lock();
612         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
613                                           iph->saddr, iph->daddr, key,
614                                           gre_proto))) {
615                 struct pcpu_tstats *tstats;
616
617                 secpath_reset(skb);
618
619                 skb->protocol = gre_proto;
620                 /* WCCP version 1 and 2 protocol decoding.
621                  * - Change protocol to IP
622                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
623                  */
624                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
625                         skb->protocol = htons(ETH_P_IP);
626                         if ((*(h + offset) & 0xF0) != 0x40)
627                                 offset += 4;
628                 }
629
630                 skb->mac_header = skb->network_header;
631                 __pskb_pull(skb, offset);
632                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
633                 skb->pkt_type = PACKET_HOST;
634 #ifdef CONFIG_NET_IPGRE_BROADCAST
635                 if (ipv4_is_multicast(iph->daddr)) {
636                         /* Looped back packet, drop it! */
637                         if (skb_rtable(skb)->fl.iif == 0)
638                                 goto drop;
639                         tunnel->dev->stats.multicast++;
640                         skb->pkt_type = PACKET_BROADCAST;
641                 }
642 #endif
643
644                 if (((flags&GRE_CSUM) && csum) ||
645                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
646                         tunnel->dev->stats.rx_crc_errors++;
647                         tunnel->dev->stats.rx_errors++;
648                         goto drop;
649                 }
650                 if (tunnel->parms.i_flags&GRE_SEQ) {
651                         if (!(flags&GRE_SEQ) ||
652                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
653                                 tunnel->dev->stats.rx_fifo_errors++;
654                                 tunnel->dev->stats.rx_errors++;
655                                 goto drop;
656                         }
657                         tunnel->i_seqno = seqno + 1;
658                 }
659
660                 /* Warning: All skb pointers will be invalidated! */
661                 if (tunnel->dev->type == ARPHRD_ETHER) {
662                         if (!pskb_may_pull(skb, ETH_HLEN)) {
663                                 tunnel->dev->stats.rx_length_errors++;
664                                 tunnel->dev->stats.rx_errors++;
665                                 goto drop;
666                         }
667
668                         iph = ip_hdr(skb);
669                         skb->protocol = eth_type_trans(skb, tunnel->dev);
670                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
671                 }
672
673                 tstats = this_cpu_ptr(tunnel->dev->tstats);
674                 tstats->rx_packets++;
675                 tstats->rx_bytes += skb->len;
676
677                 __skb_tunnel_rx(skb, tunnel->dev);
678
679                 skb_reset_network_header(skb);
680                 ipgre_ecn_decapsulate(iph, skb);
681
682                 if (netif_rx(skb) == NET_RX_DROP)
683                         tunnel->dev->stats.rx_dropped++;
684
685                 rcu_read_unlock();
686                 return 0;
687         }
688         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
689
690 drop:
691         rcu_read_unlock();
692 drop_nolock:
693         kfree_skb(skb);
694         return 0;
695 }
696
697 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
698 {
699         struct ip_tunnel *tunnel = netdev_priv(dev);
700         struct pcpu_tstats *tstats;
701         struct iphdr  *old_iph = ip_hdr(skb);
702         struct iphdr  *tiph;
703         u8     tos;
704         __be16 df;
705         struct rtable *rt;                      /* Route to the other host */
706         struct net_device *tdev;                /* Device to other host */
707         struct iphdr  *iph;                     /* Our new IP header */
708         unsigned int max_headroom;              /* The extra header space needed */
709         int    gre_hlen;
710         __be32 dst;
711         int    mtu;
712
713         if (dev->type == ARPHRD_ETHER)
714                 IPCB(skb)->flags = 0;
715
716         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
717                 gre_hlen = 0;
718                 tiph = (struct iphdr *)skb->data;
719         } else {
720                 gre_hlen = tunnel->hlen;
721                 tiph = &tunnel->parms.iph;
722         }
723
724         if ((dst = tiph->daddr) == 0) {
725                 /* NBMA tunnel */
726
727                 if (skb_dst(skb) == NULL) {
728                         dev->stats.tx_fifo_errors++;
729                         goto tx_error;
730                 }
731
732                 if (skb->protocol == htons(ETH_P_IP)) {
733                         rt = skb_rtable(skb);
734                         if ((dst = rt->rt_gateway) == 0)
735                                 goto tx_error_icmp;
736                 }
737 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
738                 else if (skb->protocol == htons(ETH_P_IPV6)) {
739                         struct in6_addr *addr6;
740                         int addr_type;
741                         struct neighbour *neigh = skb_dst(skb)->neighbour;
742
743                         if (neigh == NULL)
744                                 goto tx_error;
745
746                         addr6 = (struct in6_addr *)&neigh->primary_key;
747                         addr_type = ipv6_addr_type(addr6);
748
749                         if (addr_type == IPV6_ADDR_ANY) {
750                                 addr6 = &ipv6_hdr(skb)->daddr;
751                                 addr_type = ipv6_addr_type(addr6);
752                         }
753
754                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
755                                 goto tx_error_icmp;
756
757                         dst = addr6->s6_addr32[3];
758                 }
759 #endif
760                 else
761                         goto tx_error;
762         }
763
764         tos = tiph->tos;
765         if (tos == 1) {
766                 tos = 0;
767                 if (skb->protocol == htons(ETH_P_IP))
768                         tos = old_iph->tos;
769                 else if (skb->protocol == htons(ETH_P_IPV6))
770                         tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
771         }
772
773         {
774                 struct flowi fl = {
775                         .oif = tunnel->parms.link,
776                         .nl_u = {
777                                 .ip4_u = {
778                                         .daddr = dst,
779                                         .saddr = tiph->saddr,
780                                         .tos = RT_TOS(tos)
781                                 }
782                         },
783                         .proto = IPPROTO_GRE
784                 }
785 ;
786                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
787                         dev->stats.tx_carrier_errors++;
788                         goto tx_error;
789                 }
790         }
791         tdev = rt->dst.dev;
792
793         if (tdev == dev) {
794                 ip_rt_put(rt);
795                 dev->stats.collisions++;
796                 goto tx_error;
797         }
798
799         df = tiph->frag_off;
800         if (df)
801                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
802         else
803                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
804
805         if (skb_dst(skb))
806                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
807
808         if (skb->protocol == htons(ETH_P_IP)) {
809                 df |= (old_iph->frag_off&htons(IP_DF));
810
811                 if ((old_iph->frag_off&htons(IP_DF)) &&
812                     mtu < ntohs(old_iph->tot_len)) {
813                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
814                         ip_rt_put(rt);
815                         goto tx_error;
816                 }
817         }
818 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
819         else if (skb->protocol == htons(ETH_P_IPV6)) {
820                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
821
822                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
823                         if ((tunnel->parms.iph.daddr &&
824                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
825                             rt6->rt6i_dst.plen == 128) {
826                                 rt6->rt6i_flags |= RTF_MODIFIED;
827                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
828                         }
829                 }
830
831                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
832                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
833                         ip_rt_put(rt);
834                         goto tx_error;
835                 }
836         }
837 #endif
838
839         if (tunnel->err_count > 0) {
840                 if (time_before(jiffies,
841                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
842                         tunnel->err_count--;
843
844                         dst_link_failure(skb);
845                 } else
846                         tunnel->err_count = 0;
847         }
848
849         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
850
851         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
852             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
853                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
854                 if (max_headroom > dev->needed_headroom)
855                         dev->needed_headroom = max_headroom;
856                 if (!new_skb) {
857                         ip_rt_put(rt);
858                         dev->stats.tx_dropped++;
859                         dev_kfree_skb(skb);
860                         return NETDEV_TX_OK;
861                 }
862                 if (skb->sk)
863                         skb_set_owner_w(new_skb, skb->sk);
864                 dev_kfree_skb(skb);
865                 skb = new_skb;
866                 old_iph = ip_hdr(skb);
867         }
868
869         skb_reset_transport_header(skb);
870         skb_push(skb, gre_hlen);
871         skb_reset_network_header(skb);
872         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
873         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
874                               IPSKB_REROUTED);
875         skb_dst_drop(skb);
876         skb_dst_set(skb, &rt->dst);
877
878         /*
879          *      Push down and install the IPIP header.
880          */
881
882         iph                     =       ip_hdr(skb);
883         iph->version            =       4;
884         iph->ihl                =       sizeof(struct iphdr) >> 2;
885         iph->frag_off           =       df;
886         iph->protocol           =       IPPROTO_GRE;
887         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
888         iph->daddr              =       rt->rt_dst;
889         iph->saddr              =       rt->rt_src;
890
891         if ((iph->ttl = tiph->ttl) == 0) {
892                 if (skb->protocol == htons(ETH_P_IP))
893                         iph->ttl = old_iph->ttl;
894 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
895                 else if (skb->protocol == htons(ETH_P_IPV6))
896                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
897 #endif
898                 else
899                         iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
900         }
901
902         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
903         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
904                                    htons(ETH_P_TEB) : skb->protocol;
905
906         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
907                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
908
909                 if (tunnel->parms.o_flags&GRE_SEQ) {
910                         ++tunnel->o_seqno;
911                         *ptr = htonl(tunnel->o_seqno);
912                         ptr--;
913                 }
914                 if (tunnel->parms.o_flags&GRE_KEY) {
915                         *ptr = tunnel->parms.o_key;
916                         ptr--;
917                 }
918                 if (tunnel->parms.o_flags&GRE_CSUM) {
919                         *ptr = 0;
920                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
921                 }
922         }
923
924         nf_reset(skb);
925         tstats = this_cpu_ptr(dev->tstats);
926         __IPTUNNEL_XMIT(tstats, &dev->stats);
927         return NETDEV_TX_OK;
928
929 tx_error_icmp:
930         dst_link_failure(skb);
931
932 tx_error:
933         dev->stats.tx_errors++;
934         dev_kfree_skb(skb);
935         return NETDEV_TX_OK;
936 }
937
938 static int ipgre_tunnel_bind_dev(struct net_device *dev)
939 {
940         struct net_device *tdev = NULL;
941         struct ip_tunnel *tunnel;
942         struct iphdr *iph;
943         int hlen = LL_MAX_HEADER;
944         int mtu = ETH_DATA_LEN;
945         int addend = sizeof(struct iphdr) + 4;
946
947         tunnel = netdev_priv(dev);
948         iph = &tunnel->parms.iph;
949
950         /* Guess output device to choose reasonable mtu and needed_headroom */
951
952         if (iph->daddr) {
953                 struct flowi fl = {
954                         .oif = tunnel->parms.link,
955                         .nl_u = {
956                                 .ip4_u = {
957                                         .daddr = iph->daddr,
958                                         .saddr = iph->saddr,
959                                         .tos = RT_TOS(iph->tos)
960                                 }
961                         },
962                         .proto = IPPROTO_GRE
963                 };
964                 struct rtable *rt;
965
966                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
967                         tdev = rt->dst.dev;
968                         ip_rt_put(rt);
969                 }
970
971                 if (dev->type != ARPHRD_ETHER)
972                         dev->flags |= IFF_POINTOPOINT;
973         }
974
975         if (!tdev && tunnel->parms.link)
976                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
977
978         if (tdev) {
979                 hlen = tdev->hard_header_len + tdev->needed_headroom;
980                 mtu = tdev->mtu;
981         }
982         dev->iflink = tunnel->parms.link;
983
984         /* Precalculate GRE options length */
985         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
986                 if (tunnel->parms.o_flags&GRE_CSUM)
987                         addend += 4;
988                 if (tunnel->parms.o_flags&GRE_KEY)
989                         addend += 4;
990                 if (tunnel->parms.o_flags&GRE_SEQ)
991                         addend += 4;
992         }
993         dev->needed_headroom = addend + hlen;
994         mtu -= dev->hard_header_len + addend;
995
996         if (mtu < 68)
997                 mtu = 68;
998
999         tunnel->hlen = addend;
1000
1001         return mtu;
1002 }
1003
1004 static int
1005 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1006 {
1007         int err = 0;
1008         struct ip_tunnel_parm p;
1009         struct ip_tunnel *t;
1010         struct net *net = dev_net(dev);
1011         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1012
1013         switch (cmd) {
1014         case SIOCGETTUNNEL:
1015                 t = NULL;
1016                 if (dev == ign->fb_tunnel_dev) {
1017                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1018                                 err = -EFAULT;
1019                                 break;
1020                         }
1021                         t = ipgre_tunnel_locate(net, &p, 0);
1022                 }
1023                 if (t == NULL)
1024                         t = netdev_priv(dev);
1025                 memcpy(&p, &t->parms, sizeof(p));
1026                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1027                         err = -EFAULT;
1028                 break;
1029
1030         case SIOCADDTUNNEL:
1031         case SIOCCHGTUNNEL:
1032                 err = -EPERM;
1033                 if (!capable(CAP_NET_ADMIN))
1034                         goto done;
1035
1036                 err = -EFAULT;
1037                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1038                         goto done;
1039
1040                 err = -EINVAL;
1041                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1042                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1043                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1044                         goto done;
1045                 if (p.iph.ttl)
1046                         p.iph.frag_off |= htons(IP_DF);
1047
1048                 if (!(p.i_flags&GRE_KEY))
1049                         p.i_key = 0;
1050                 if (!(p.o_flags&GRE_KEY))
1051                         p.o_key = 0;
1052
1053                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1054
1055                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1056                         if (t != NULL) {
1057                                 if (t->dev != dev) {
1058                                         err = -EEXIST;
1059                                         break;
1060                                 }
1061                         } else {
1062                                 unsigned int nflags = 0;
1063
1064                                 t = netdev_priv(dev);
1065
1066                                 if (ipv4_is_multicast(p.iph.daddr))
1067                                         nflags = IFF_BROADCAST;
1068                                 else if (p.iph.daddr)
1069                                         nflags = IFF_POINTOPOINT;
1070
1071                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1072                                         err = -EINVAL;
1073                                         break;
1074                                 }
1075                                 ipgre_tunnel_unlink(ign, t);
1076                                 t->parms.iph.saddr = p.iph.saddr;
1077                                 t->parms.iph.daddr = p.iph.daddr;
1078                                 t->parms.i_key = p.i_key;
1079                                 t->parms.o_key = p.o_key;
1080                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1081                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1082                                 ipgre_tunnel_link(ign, t);
1083                                 netdev_state_change(dev);
1084                         }
1085                 }
1086
1087                 if (t) {
1088                         err = 0;
1089                         if (cmd == SIOCCHGTUNNEL) {
1090                                 t->parms.iph.ttl = p.iph.ttl;
1091                                 t->parms.iph.tos = p.iph.tos;
1092                                 t->parms.iph.frag_off = p.iph.frag_off;
1093                                 if (t->parms.link != p.link) {
1094                                         t->parms.link = p.link;
1095                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1096                                         netdev_state_change(dev);
1097                                 }
1098                         }
1099                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1100                                 err = -EFAULT;
1101                 } else
1102                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1103                 break;
1104
1105         case SIOCDELTUNNEL:
1106                 err = -EPERM;
1107                 if (!capable(CAP_NET_ADMIN))
1108                         goto done;
1109
1110                 if (dev == ign->fb_tunnel_dev) {
1111                         err = -EFAULT;
1112                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1113                                 goto done;
1114                         err = -ENOENT;
1115                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1116                                 goto done;
1117                         err = -EPERM;
1118                         if (t == netdev_priv(ign->fb_tunnel_dev))
1119                                 goto done;
1120                         dev = t->dev;
1121                 }
1122                 unregister_netdevice(dev);
1123                 err = 0;
1124                 break;
1125
1126         default:
1127                 err = -EINVAL;
1128         }
1129
1130 done:
1131         return err;
1132 }
1133
1134 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1135 {
1136         struct ip_tunnel *tunnel = netdev_priv(dev);
1137         if (new_mtu < 68 ||
1138             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1139                 return -EINVAL;
1140         dev->mtu = new_mtu;
1141         return 0;
1142 }
1143
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1153
1154    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1155    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1156
1157    ping -t 255 224.66.66.66
1158
1159    If nobody answers, mbone does not work.
1160
1161    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1162    ip addr add 10.66.66.<somewhat>/24 dev Universe
1163    ifconfig Universe up
1164    ifconfig Universe add fe80::<Your_real_addr>/10
1165    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1166    ftp 10.66.66.66
1167    ...
1168    ftp fec0:6666:6666::193.233.7.65
1169    ...
1170
1171  */
1172
1173 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1174                         unsigned short type,
1175                         const void *daddr, const void *saddr, unsigned int len)
1176 {
1177         struct ip_tunnel *t = netdev_priv(dev);
1178         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1179         __be16 *p = (__be16*)(iph+1);
1180
1181         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1182         p[0]            = t->parms.o_flags;
1183         p[1]            = htons(type);
1184
1185         /*
1186          *      Set the source hardware address.
1187          */
1188
1189         if (saddr)
1190                 memcpy(&iph->saddr, saddr, 4);
1191         if (daddr)
1192                 memcpy(&iph->daddr, daddr, 4);
1193         if (iph->daddr)
1194                 return t->hlen;
1195
1196         return -t->hlen;
1197 }
1198
1199 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1200 {
1201         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1202         memcpy(haddr, &iph->saddr, 4);
1203         return 4;
1204 }
1205
1206 static const struct header_ops ipgre_header_ops = {
1207         .create = ipgre_header,
1208         .parse  = ipgre_header_parse,
1209 };
1210
1211 #ifdef CONFIG_NET_IPGRE_BROADCAST
1212 static int ipgre_open(struct net_device *dev)
1213 {
1214         struct ip_tunnel *t = netdev_priv(dev);
1215
1216         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1217                 struct flowi fl = {
1218                         .oif = t->parms.link,
1219                         .nl_u = {
1220                                 .ip4_u = {
1221                                         .daddr = t->parms.iph.daddr,
1222                                         .saddr = t->parms.iph.saddr,
1223                                         .tos = RT_TOS(t->parms.iph.tos)
1224                                 }
1225                         },
1226                         .proto = IPPROTO_GRE
1227                 };
1228                 struct rtable *rt;
1229
1230                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1231                         return -EADDRNOTAVAIL;
1232                 dev = rt->dst.dev;
1233                 ip_rt_put(rt);
1234                 if (__in_dev_get_rtnl(dev) == NULL)
1235                         return -EADDRNOTAVAIL;
1236                 t->mlink = dev->ifindex;
1237                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1238         }
1239         return 0;
1240 }
1241
1242 static int ipgre_close(struct net_device *dev)
1243 {
1244         struct ip_tunnel *t = netdev_priv(dev);
1245
1246         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1247                 struct in_device *in_dev;
1248                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1249                 if (in_dev) {
1250                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1251                         in_dev_put(in_dev);
1252                 }
1253         }
1254         return 0;
1255 }
1256
1257 #endif
1258
1259 static const struct net_device_ops ipgre_netdev_ops = {
1260         .ndo_init               = ipgre_tunnel_init,
1261         .ndo_uninit             = ipgre_tunnel_uninit,
1262 #ifdef CONFIG_NET_IPGRE_BROADCAST
1263         .ndo_open               = ipgre_open,
1264         .ndo_stop               = ipgre_close,
1265 #endif
1266         .ndo_start_xmit         = ipgre_tunnel_xmit,
1267         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1268         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1269         .ndo_get_stats          = ipgre_get_stats,
1270 };
1271
1272 static void ipgre_dev_free(struct net_device *dev)
1273 {
1274         free_percpu(dev->tstats);
1275         free_netdev(dev);
1276 }
1277
1278 static void ipgre_tunnel_setup(struct net_device *dev)
1279 {
1280         dev->netdev_ops         = &ipgre_netdev_ops;
1281         dev->destructor         = ipgre_dev_free;
1282
1283         dev->type               = ARPHRD_IPGRE;
1284         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1285         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1286         dev->flags              = IFF_NOARP;
1287         dev->iflink             = 0;
1288         dev->addr_len           = 4;
1289         dev->features           |= NETIF_F_NETNS_LOCAL;
1290         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1291 }
1292
1293 static int ipgre_tunnel_init(struct net_device *dev)
1294 {
1295         struct ip_tunnel *tunnel;
1296         struct iphdr *iph;
1297
1298         tunnel = netdev_priv(dev);
1299         iph = &tunnel->parms.iph;
1300
1301         tunnel->dev = dev;
1302         strcpy(tunnel->parms.name, dev->name);
1303
1304         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1305         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1306
1307         if (iph->daddr) {
1308 #ifdef CONFIG_NET_IPGRE_BROADCAST
1309                 if (ipv4_is_multicast(iph->daddr)) {
1310                         if (!iph->saddr)
1311                                 return -EINVAL;
1312                         dev->flags = IFF_BROADCAST;
1313                         dev->header_ops = &ipgre_header_ops;
1314                 }
1315 #endif
1316         } else
1317                 dev->header_ops = &ipgre_header_ops;
1318
1319         dev->tstats = alloc_percpu(struct pcpu_tstats);
1320         if (!dev->tstats)
1321                 return -ENOMEM;
1322
1323         return 0;
1324 }
1325
1326 static void ipgre_fb_tunnel_init(struct net_device *dev)
1327 {
1328         struct ip_tunnel *tunnel = netdev_priv(dev);
1329         struct iphdr *iph = &tunnel->parms.iph;
1330         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1331
1332         tunnel->dev = dev;
1333         strcpy(tunnel->parms.name, dev->name);
1334
1335         iph->version            = 4;
1336         iph->protocol           = IPPROTO_GRE;
1337         iph->ihl                = 5;
1338         tunnel->hlen            = sizeof(struct iphdr) + 4;
1339
1340         dev_hold(dev);
1341         rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
1342 }
1343
1344
1345 static const struct gre_protocol ipgre_protocol = {
1346         .handler     = ipgre_rcv,
1347         .err_handler = ipgre_err,
1348 };
1349
1350 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1351 {
1352         int prio;
1353
1354         for (prio = 0; prio < 4; prio++) {
1355                 int h;
1356                 for (h = 0; h < HASH_SIZE; h++) {
1357                         struct ip_tunnel *t;
1358
1359                         t = rtnl_dereference(ign->tunnels[prio][h]);
1360
1361                         while (t != NULL) {
1362                                 unregister_netdevice_queue(t->dev, head);
1363                                 t = rtnl_dereference(t->next);
1364                         }
1365                 }
1366         }
1367 }
1368
1369 static int __net_init ipgre_init_net(struct net *net)
1370 {
1371         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1372         int err;
1373
1374         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1375                                            ipgre_tunnel_setup);
1376         if (!ign->fb_tunnel_dev) {
1377                 err = -ENOMEM;
1378                 goto err_alloc_dev;
1379         }
1380         dev_net_set(ign->fb_tunnel_dev, net);
1381
1382         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1383         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1384
1385         if ((err = register_netdev(ign->fb_tunnel_dev)))
1386                 goto err_reg_dev;
1387
1388         return 0;
1389
1390 err_reg_dev:
1391         free_netdev(ign->fb_tunnel_dev);
1392 err_alloc_dev:
1393         return err;
1394 }
1395
1396 static void __net_exit ipgre_exit_net(struct net *net)
1397 {
1398         struct ipgre_net *ign;
1399         LIST_HEAD(list);
1400
1401         ign = net_generic(net, ipgre_net_id);
1402         rtnl_lock();
1403         ipgre_destroy_tunnels(ign, &list);
1404         unregister_netdevice_many(&list);
1405         rtnl_unlock();
1406 }
1407
1408 static struct pernet_operations ipgre_net_ops = {
1409         .init = ipgre_init_net,
1410         .exit = ipgre_exit_net,
1411         .id   = &ipgre_net_id,
1412         .size = sizeof(struct ipgre_net),
1413 };
1414
1415 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1416 {
1417         __be16 flags;
1418
1419         if (!data)
1420                 return 0;
1421
1422         flags = 0;
1423         if (data[IFLA_GRE_IFLAGS])
1424                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1425         if (data[IFLA_GRE_OFLAGS])
1426                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1427         if (flags & (GRE_VERSION|GRE_ROUTING))
1428                 return -EINVAL;
1429
1430         return 0;
1431 }
1432
1433 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1434 {
1435         __be32 daddr;
1436
1437         if (tb[IFLA_ADDRESS]) {
1438                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1439                         return -EINVAL;
1440                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1441                         return -EADDRNOTAVAIL;
1442         }
1443
1444         if (!data)
1445                 goto out;
1446
1447         if (data[IFLA_GRE_REMOTE]) {
1448                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1449                 if (!daddr)
1450                         return -EINVAL;
1451         }
1452
1453 out:
1454         return ipgre_tunnel_validate(tb, data);
1455 }
1456
1457 static void ipgre_netlink_parms(struct nlattr *data[],
1458                                 struct ip_tunnel_parm *parms)
1459 {
1460         memset(parms, 0, sizeof(*parms));
1461
1462         parms->iph.protocol = IPPROTO_GRE;
1463
1464         if (!data)
1465                 return;
1466
1467         if (data[IFLA_GRE_LINK])
1468                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1469
1470         if (data[IFLA_GRE_IFLAGS])
1471                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1472
1473         if (data[IFLA_GRE_OFLAGS])
1474                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1475
1476         if (data[IFLA_GRE_IKEY])
1477                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1478
1479         if (data[IFLA_GRE_OKEY])
1480                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1481
1482         if (data[IFLA_GRE_LOCAL])
1483                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1484
1485         if (data[IFLA_GRE_REMOTE])
1486                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1487
1488         if (data[IFLA_GRE_TTL])
1489                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1490
1491         if (data[IFLA_GRE_TOS])
1492                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1493
1494         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1495                 parms->iph.frag_off = htons(IP_DF);
1496 }
1497
1498 static int ipgre_tap_init(struct net_device *dev)
1499 {
1500         struct ip_tunnel *tunnel;
1501
1502         tunnel = netdev_priv(dev);
1503
1504         tunnel->dev = dev;
1505         strcpy(tunnel->parms.name, dev->name);
1506
1507         ipgre_tunnel_bind_dev(dev);
1508
1509         dev->tstats = alloc_percpu(struct pcpu_tstats);
1510         if (!dev->tstats)
1511                 return -ENOMEM;
1512
1513         return 0;
1514 }
1515
1516 static const struct net_device_ops ipgre_tap_netdev_ops = {
1517         .ndo_init               = ipgre_tap_init,
1518         .ndo_uninit             = ipgre_tunnel_uninit,
1519         .ndo_start_xmit         = ipgre_tunnel_xmit,
1520         .ndo_set_mac_address    = eth_mac_addr,
1521         .ndo_validate_addr      = eth_validate_addr,
1522         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1523         .ndo_get_stats          = ipgre_get_stats,
1524 };
1525
1526 static void ipgre_tap_setup(struct net_device *dev)
1527 {
1528
1529         ether_setup(dev);
1530
1531         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1532         dev->destructor         = ipgre_dev_free;
1533
1534         dev->iflink             = 0;
1535         dev->features           |= NETIF_F_NETNS_LOCAL;
1536 }
1537
1538 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1539                          struct nlattr *data[])
1540 {
1541         struct ip_tunnel *nt;
1542         struct net *net = dev_net(dev);
1543         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1544         int mtu;
1545         int err;
1546
1547         nt = netdev_priv(dev);
1548         ipgre_netlink_parms(data, &nt->parms);
1549
1550         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1551                 return -EEXIST;
1552
1553         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1554                 random_ether_addr(dev->dev_addr);
1555
1556         mtu = ipgre_tunnel_bind_dev(dev);
1557         if (!tb[IFLA_MTU])
1558                 dev->mtu = mtu;
1559
1560         /* Can use a lockless transmit, unless we generate output sequences */
1561         if (!(nt->parms.o_flags & GRE_SEQ))
1562                 dev->features |= NETIF_F_LLTX;
1563
1564         err = register_netdevice(dev);
1565         if (err)
1566                 goto out;
1567
1568         dev_hold(dev);
1569         ipgre_tunnel_link(ign, nt);
1570
1571 out:
1572         return err;
1573 }
1574
1575 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1576                             struct nlattr *data[])
1577 {
1578         struct ip_tunnel *t, *nt;
1579         struct net *net = dev_net(dev);
1580         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1581         struct ip_tunnel_parm p;
1582         int mtu;
1583
1584         if (dev == ign->fb_tunnel_dev)
1585                 return -EINVAL;
1586
1587         nt = netdev_priv(dev);
1588         ipgre_netlink_parms(data, &p);
1589
1590         t = ipgre_tunnel_locate(net, &p, 0);
1591
1592         if (t) {
1593                 if (t->dev != dev)
1594                         return -EEXIST;
1595         } else {
1596                 t = nt;
1597
1598                 if (dev->type != ARPHRD_ETHER) {
1599                         unsigned int nflags = 0;
1600
1601                         if (ipv4_is_multicast(p.iph.daddr))
1602                                 nflags = IFF_BROADCAST;
1603                         else if (p.iph.daddr)
1604                                 nflags = IFF_POINTOPOINT;
1605
1606                         if ((dev->flags ^ nflags) &
1607                             (IFF_POINTOPOINT | IFF_BROADCAST))
1608                                 return -EINVAL;
1609                 }
1610
1611                 ipgre_tunnel_unlink(ign, t);
1612                 t->parms.iph.saddr = p.iph.saddr;
1613                 t->parms.iph.daddr = p.iph.daddr;
1614                 t->parms.i_key = p.i_key;
1615                 if (dev->type != ARPHRD_ETHER) {
1616                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1617                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1618                 }
1619                 ipgre_tunnel_link(ign, t);
1620                 netdev_state_change(dev);
1621         }
1622
1623         t->parms.o_key = p.o_key;
1624         t->parms.iph.ttl = p.iph.ttl;
1625         t->parms.iph.tos = p.iph.tos;
1626         t->parms.iph.frag_off = p.iph.frag_off;
1627
1628         if (t->parms.link != p.link) {
1629                 t->parms.link = p.link;
1630                 mtu = ipgre_tunnel_bind_dev(dev);
1631                 if (!tb[IFLA_MTU])
1632                         dev->mtu = mtu;
1633                 netdev_state_change(dev);
1634         }
1635
1636         return 0;
1637 }
1638
1639 static size_t ipgre_get_size(const struct net_device *dev)
1640 {
1641         return
1642                 /* IFLA_GRE_LINK */
1643                 nla_total_size(4) +
1644                 /* IFLA_GRE_IFLAGS */
1645                 nla_total_size(2) +
1646                 /* IFLA_GRE_OFLAGS */
1647                 nla_total_size(2) +
1648                 /* IFLA_GRE_IKEY */
1649                 nla_total_size(4) +
1650                 /* IFLA_GRE_OKEY */
1651                 nla_total_size(4) +
1652                 /* IFLA_GRE_LOCAL */
1653                 nla_total_size(4) +
1654                 /* IFLA_GRE_REMOTE */
1655                 nla_total_size(4) +
1656                 /* IFLA_GRE_TTL */
1657                 nla_total_size(1) +
1658                 /* IFLA_GRE_TOS */
1659                 nla_total_size(1) +
1660                 /* IFLA_GRE_PMTUDISC */
1661                 nla_total_size(1) +
1662                 0;
1663 }
1664
1665 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1666 {
1667         struct ip_tunnel *t = netdev_priv(dev);
1668         struct ip_tunnel_parm *p = &t->parms;
1669
1670         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1671         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1672         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1673         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1674         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1675         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1676         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1677         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1678         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1679         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1680
1681         return 0;
1682
1683 nla_put_failure:
1684         return -EMSGSIZE;
1685 }
1686
1687 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1688         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1689         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1690         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1691         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1692         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1693         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1694         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1695         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1696         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1697         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1698 };
1699
1700 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1701         .kind           = "gre",
1702         .maxtype        = IFLA_GRE_MAX,
1703         .policy         = ipgre_policy,
1704         .priv_size      = sizeof(struct ip_tunnel),
1705         .setup          = ipgre_tunnel_setup,
1706         .validate       = ipgre_tunnel_validate,
1707         .newlink        = ipgre_newlink,
1708         .changelink     = ipgre_changelink,
1709         .get_size       = ipgre_get_size,
1710         .fill_info      = ipgre_fill_info,
1711 };
1712
1713 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1714         .kind           = "gretap",
1715         .maxtype        = IFLA_GRE_MAX,
1716         .policy         = ipgre_policy,
1717         .priv_size      = sizeof(struct ip_tunnel),
1718         .setup          = ipgre_tap_setup,
1719         .validate       = ipgre_tap_validate,
1720         .newlink        = ipgre_newlink,
1721         .changelink     = ipgre_changelink,
1722         .get_size       = ipgre_get_size,
1723         .fill_info      = ipgre_fill_info,
1724 };
1725
/*
 *      And now the module code and kernel interface.
 */
1729
1730 static int __init ipgre_init(void)
1731 {
1732         int err;
1733
1734         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1735
1736         err = register_pernet_device(&ipgre_net_ops);
1737         if (err < 0)
1738                 return err;
1739
1740         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1741         if (err < 0) {
1742                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1743                 goto add_proto_failed;
1744         }
1745
1746         err = rtnl_link_register(&ipgre_link_ops);
1747         if (err < 0)
1748                 goto rtnl_link_failed;
1749
1750         err = rtnl_link_register(&ipgre_tap_ops);
1751         if (err < 0)
1752                 goto tap_ops_failed;
1753
1754 out:
1755         return err;
1756
1757 tap_ops_failed:
1758         rtnl_link_unregister(&ipgre_link_ops);
1759 rtnl_link_failed:
1760         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1761 add_proto_failed:
1762         unregister_pernet_device(&ipgre_net_ops);
1763         goto out;
1764 }
1765
1766 static void __exit ipgre_fini(void)
1767 {
1768         rtnl_link_unregister(&ipgre_tap_ops);
1769         rtnl_link_unregister(&ipgre_link_ops);
1770         if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1771                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1772         unregister_pernet_device(&ipgre_net_ops);
1773 }
1774
1775 module_init(ipgre_init);
1776 module_exit(ipgre_fini);
1777 MODULE_LICENSE("GPL");
1778 MODULE_ALIAS_RTNL_LINK("gre");
1779 MODULE_ALIAS_RTNL_LINK("gretap");