3992e26a603987cf8bba458dd4f687af5c900a2f
[cascardo/linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Forward declarations of file-local helpers.  Most of these are dst_ops
 * callbacks or dst input/output handlers that the static tables and
 * templates below reference before their definitions appear.
 */
static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
                                    const struct in6_addr *dest);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

/* Packet handlers installed as .input/.output of the null route template. */
static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Route Information helpers; used by rt6_route_rcv() below. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex);
#endif
90
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
92 {
93         struct rt6_info *rt = (struct rt6_info *) dst;
94         struct inet_peer *peer;
95         u32 *p = NULL;
96
97         if (!(rt->dst.flags & DST_HOST))
98                 return NULL;
99
100         if (!rt->rt6i_peer)
101                 rt6_bind_peer(rt, 1);
102
103         peer = rt->rt6i_peer;
104         if (peer) {
105                 u32 *old_p = __DST_METRICS_PTR(old);
106                 unsigned long prev, new;
107
108                 p = peer->metrics;
109                 if (inet_metrics_new(peer))
110                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111
112                 new = (unsigned long) p;
113                 prev = cmpxchg(&dst->_metrics, old, new);
114
115                 if (prev != old) {
116                         p = __DST_METRICS_PTR(prev);
117                         if (prev & DST_METRICS_READ_ONLY)
118                                 p = NULL;
119                 }
120         }
121         return p;
122 }
123
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 {
126         struct in6_addr *p = &rt->rt6i_gateway;
127
128         if (!ipv6_addr_any(p))
129                 return (const void *) p;
130         return daddr;
131 }
132
133 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 {
135         struct rt6_info *rt = (struct rt6_info *) dst;
136         struct neighbour *n;
137
138         daddr = choose_neigh_daddr(rt, daddr);
139         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
140         if (n)
141                 return n;
142         return neigh_create(&nd_tbl, daddr, dst->dev);
143 }
144
145 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 {
147         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148         if (!n) {
149                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
150                 if (IS_ERR(n))
151                         return PTR_ERR(n);
152         }
153         dst_set_neighbour(&rt->dst, n);
154
155         return 0;
156 }
157
/*
 * dst_ops for regular IPv6 routes: wires the generic dst callbacks to the
 * ip6_* implementations declared above and sets the GC threshold to 1024.
 * NOTE(review): the "_template" suffix suggests this is copied into a
 * per-namespace instance (cf. net->ipv6.ip6_dst_ops) - confirm at the
 * copy site, which is outside this section.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .default_advmss         =       ip6_default_advmss,
        .mtu                    =       ip6_mtu,
        .cow_metrics            =       ipv6_cow_metrics,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .local_out              =       __ip6_local_out,
        .neigh_lookup           =       ip6_neigh_lookup,
};
175
176 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 {
178         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179
180         return mtu ? : dst->dev->mtu;
181 }
182
/*
 * dst_ops.update_pmtu callback for blackhole routes: intentionally a no-op,
 * so path-MTU updates are ignored for these entries.
 */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
186
/*
 * dst_ops.cow_metrics callback for blackhole routes: always returns NULL,
 * i.e. never hands out a writable metrics array.
 */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                         unsigned long old)
{
        return NULL;
}
192
/*
 * dst_ops used for blackhole dst entries (see ip6_blackhole_route()).
 * Shares destroy/check/default_advmss/neigh_lookup with the regular IPv6
 * ops but uses the blackhole variants above for mtu, update_pmtu and
 * cow_metrics; no gc, ifdown or link_failure handlers are set.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .mtu                    =       ip6_blackhole_mtu,
        .default_advmss         =       ip6_default_advmss,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
        .neigh_lookup           =       ip6_neigh_lookup,
};
204
/*
 * Read-only metrics array with only the hop limit preset (255); all other
 * entries are zero.  Metrics are indexed by RTAX_* - 1.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 255,
};
208
/*
 * Template for the "null" route entry: a reject route whose dst reports
 * -ENETUNREACH and whose input/output handlers are the ip6_pkt_discard
 * variants.  It starts with one dst reference and one rt6i_ref, and uses
 * the largest possible metric (~0).
 */
static struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
223
224 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
225
226 static int ip6_pkt_prohibit(struct sk_buff *skb);
227 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
228
/*
 * Template for the "prohibit" route entry (CONFIG_IPV6_MULTIPLE_TABLES only):
 * same shape as the null entry template, but the dst reports -EACCES and
 * packets go through the ip6_pkt_prohibit handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EACCES,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
243
/*
 * Template for the "blackhole" route entry (CONFIG_IPV6_MULTIPLE_TABLES
 * only): same shape as the null entry template, but the dst reports -EINVAL
 * and both input and output are handled by dst_discard.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EINVAL,
                .input          = dst_discard,
                .output         = dst_discard,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
258
259 #endif
260
261 /* allocate dst with ip6_dst_ops */
262 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
263                                              struct net_device *dev,
264                                              int flags)
265 {
266         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
267
268         if (rt)
269                 memset(&rt->rt6i_table, 0,
270                        sizeof(*rt) - sizeof(struct dst_entry));
271
272         return rt;
273 }
274
275 static void ip6_dst_destroy(struct dst_entry *dst)
276 {
277         struct rt6_info *rt = (struct rt6_info *)dst;
278         struct inet6_dev *idev = rt->rt6i_idev;
279         struct inet_peer *peer = rt->rt6i_peer;
280
281         if (!(rt->dst.flags & DST_HOST))
282                 dst_destroy_metrics_generic(dst);
283
284         if (idev) {
285                 rt->rt6i_idev = NULL;
286                 in6_dev_put(idev);
287         }
288         if (peer) {
289                 rt->rt6i_peer = NULL;
290                 inet_putpeer(peer);
291         }
292 }
293
294 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
295
296 static u32 rt6_peer_genid(void)
297 {
298         return atomic_read(&__rt6_peer_genid);
299 }
300
301 void rt6_bind_peer(struct rt6_info *rt, int create)
302 {
303         struct inet_peer *peer;
304
305         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
306         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
307                 inet_putpeer(peer);
308         else
309                 rt->rt6i_peer_genid = rt6_peer_genid();
310 }
311
312 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
313                            int how)
314 {
315         struct rt6_info *rt = (struct rt6_info *)dst;
316         struct inet6_dev *idev = rt->rt6i_idev;
317         struct net_device *loopback_dev =
318                 dev_net(dev)->loopback_dev;
319
320         if (dev != loopback_dev && idev && idev->dev == dev) {
321                 struct inet6_dev *loopback_idev =
322                         in6_dev_get(loopback_dev);
323                 if (loopback_idev) {
324                         rt->rt6i_idev = loopback_idev;
325                         in6_dev_put(idev);
326                 }
327         }
328 }
329
330 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
331 {
332         return (rt->rt6i_flags & RTF_EXPIRES) &&
333                 time_after(jiffies, rt->dst.expires);
334 }
335
336 static inline int rt6_need_strict(const struct in6_addr *daddr)
337 {
338         return ipv6_addr_type(daddr) &
339                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
340 }
341
342 /*
343  *      Route lookup. Any table->tb6_lock is implied.
344  */
345
346 static inline struct rt6_info *rt6_device_match(struct net *net,
347                                                     struct rt6_info *rt,
348                                                     const struct in6_addr *saddr,
349                                                     int oif,
350                                                     int flags)
351 {
352         struct rt6_info *local = NULL;
353         struct rt6_info *sprt;
354
355         if (!oif && ipv6_addr_any(saddr))
356                 goto out;
357
358         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
359                 struct net_device *dev = sprt->dst.dev;
360
361                 if (oif) {
362                         if (dev->ifindex == oif)
363                                 return sprt;
364                         if (dev->flags & IFF_LOOPBACK) {
365                                 if (!sprt->rt6i_idev ||
366                                     sprt->rt6i_idev->dev->ifindex != oif) {
367                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
368                                                 continue;
369                                         if (local && (!oif ||
370                                                       local->rt6i_idev->dev->ifindex == oif))
371                                                 continue;
372                                 }
373                                 local = sprt;
374                         }
375                 } else {
376                         if (ipv6_chk_addr(net, saddr, dev,
377                                           flags & RT6_LOOKUP_F_IFACE))
378                                 return sprt;
379                 }
380         }
381
382         if (oif) {
383                 if (local)
384                         return local;
385
386                 if (flags & RT6_LOOKUP_F_IFACE)
387                         return net->ipv6.ip6_null_entry;
388         }
389 out:
390         return rt;
391 }
392
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation towards the
 * route's neighbour when that neighbour is not in a NUD_VALID state.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; here a probe is only sent when more than
 * rt6i_idev->cnf.rtr_probe_interval jiffies have passed since
 * neigh->updated, and neigh->updated is then reset to jiffies.
 *
 * neigh->lock is taken for writing because neigh->updated is modified
 * under it: with only a read lock, two CPUs could both pass the interval
 * check and both send a probe.  The lock is dropped before the
 * solicitation is sent.
 *
 * @rt may be NULL, in which case nothing is done.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;

        rcu_read_lock();
        neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
        /* Lockless fast path: nothing to probe if already valid. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                goto out;
        write_lock_bh(&neigh->lock);
        /* Re-check under the lock; the state may have changed meanwhile. */
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                neigh->updated = jiffies;
                write_unlock_bh(&neigh->lock);

                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
        } else {
                write_unlock_bh(&neigh->lock);
        }
out:
        rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no reachability probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
432
433 /*
434  * Default Router Selection (RFC 2461 6.3.6)
435  */
436 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
437 {
438         struct net_device *dev = rt->dst.dev;
439         if (!oif || dev->ifindex == oif)
440                 return 2;
441         if ((dev->flags & IFF_LOOPBACK) &&
442             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
443                 return 1;
444         return 0;
445 }
446
447 static inline int rt6_check_neigh(struct rt6_info *rt)
448 {
449         struct neighbour *neigh;
450         int m;
451
452         rcu_read_lock();
453         neigh = dst_get_neighbour_noref(&rt->dst);
454         if (rt->rt6i_flags & RTF_NONEXTHOP ||
455             !(rt->rt6i_flags & RTF_GATEWAY))
456                 m = 1;
457         else if (neigh) {
458                 read_lock_bh(&neigh->lock);
459                 if (neigh->nud_state & NUD_VALID)
460                         m = 2;
461 #ifdef CONFIG_IPV6_ROUTER_PREF
462                 else if (neigh->nud_state & NUD_FAILED)
463                         m = 0;
464 #endif
465                 else
466                         m = 1;
467                 read_unlock_bh(&neigh->lock);
468         } else
469                 m = 0;
470         rcu_read_unlock();
471         return m;
472 }
473
474 static int rt6_score_route(struct rt6_info *rt, int oif,
475                            int strict)
476 {
477         int m, n;
478
479         m = rt6_check_dev(rt, oif);
480         if (!m && (strict & RT6_LOOKUP_F_IFACE))
481                 return -1;
482 #ifdef CONFIG_IPV6_ROUTER_PREF
483         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
484 #endif
485         n = rt6_check_neigh(rt);
486         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
487                 return -1;
488         return m;
489 }
490
491 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
492                                    int *mpri, struct rt6_info *match)
493 {
494         int m;
495
496         if (rt6_check_expired(rt))
497                 goto out;
498
499         m = rt6_score_route(rt, oif, strict);
500         if (m < 0)
501                 goto out;
502
503         if (m > *mpri) {
504                 if (strict & RT6_LOOKUP_F_REACHABLE)
505                         rt6_probe(match);
506                 *mpri = m;
507                 match = rt;
508         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
509                 rt6_probe(rt);
510         }
511
512 out:
513         return match;
514 }
515
516 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
517                                      struct rt6_info *rr_head,
518                                      u32 metric, int oif, int strict)
519 {
520         struct rt6_info *rt, *match;
521         int mpri = -1;
522
523         match = NULL;
524         for (rt = rr_head; rt && rt->rt6i_metric == metric;
525              rt = rt->dst.rt6_next)
526                 match = find_match(rt, oif, strict, &mpri, match);
527         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
528              rt = rt->dst.rt6_next)
529                 match = find_match(rt, oif, strict, &mpri, match);
530
531         return match;
532 }
533
534 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
535 {
536         struct rt6_info *match, *rt0;
537         struct net *net;
538
539         rt0 = fn->rr_ptr;
540         if (!rt0)
541                 fn->rr_ptr = rt0 = fn->leaf;
542
543         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
544
545         if (!match &&
546             (strict & RT6_LOOKUP_F_REACHABLE)) {
547                 struct rt6_info *next = rt0->dst.rt6_next;
548
549                 /* no entries matched; do round-robin */
550                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
551                         next = fn->leaf;
552
553                 if (next != rt0)
554                         fn->rr_ptr = next;
555         }
556
557         net = dev_net(rt0->dst.dev);
558         return match ? match : net->ipv6.ip6_null_entry;
559 }
560
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from router @gwaddr.
 *
 * @opt/@len describe the raw option.  The option is rejected with -EINVAL
 * when it is shorter than struct route_info, when its length field exceeds
 * 3, when the prefix length exceeds 128 or is not covered by the length
 * field, or when the preference is ICMPV6_ROUTER_PREF_INVALID.
 *
 * Otherwise the matching route-info route is looked up and then:
 *   - deleted when it exists and the lifetime is zero;
 *   - updated (RTF_ROUTEINFO and preference bits) when it exists and the
 *     lifetime is non-zero;
 *   - created when it does not exist and the lifetime is non-zero.
 * A surviving route gets its expiry set from the lifetime (or RTF_EXPIRES
 * cleared for a non-finite lifetime) and its reference is released.
 *
 * Returns 0 on success.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3) {
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        } else if (rt) {
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
        } else if (lifetime) {
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                        dev->ifindex, pref);
        }

        if (!rt)
                return 0;

        if (addrconf_finite_timeout(lifetime)) {
                rt->dst.expires = jiffies + HZ * lifetime;
                rt->rt6i_flags |= RTF_EXPIRES;
        } else {
                rt->rt6i_flags &= ~RTF_EXPIRES;
        }
        dst_release(&rt->dst);
        return 0;
}
#endif
634
/*
 * BACKTRACK(__net, saddr) - retry a failed lookup higher up in the tree.
 *
 * If the caller's current result `rt` is __net's ip6_null_entry, walk from
 * the caller's node `fn` towards the root until a node carrying route info
 * (RTN_RTINFO) is found, then jump to the caller's `restart` label with
 * `fn` pointing at that node.  If the top-level root (RTN_TL_ROOT) is hit
 * first, jump to the caller's `out` label instead.
 *
 * At each step, when the parent has a source-address subtree
 * (FIB6_SUBTREE) other than the node we came from, that subtree is
 * searched with fib6_lookup() using saddr; otherwise the walk simply moves
 * to the parent.
 *
 * The macro is not self-contained: it relies on local variables named `rt`
 * and `fn` and on labels named `restart` and `out` in the calling function.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
652
653 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
654                                              struct fib6_table *table,
655                                              struct flowi6 *fl6, int flags)
656 {
657         struct fib6_node *fn;
658         struct rt6_info *rt;
659
660         read_lock_bh(&table->tb6_lock);
661         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
662 restart:
663         rt = fn->leaf;
664         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
665         BACKTRACK(net, &fl6->saddr);
666 out:
667         dst_use(&rt->dst, jiffies);
668         read_unlock_bh(&table->tb6_lock);
669         return rt;
670
671 }
672
673 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
674                                     int flags)
675 {
676         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
677 }
678 EXPORT_SYMBOL_GPL(ip6_route_lookup);
679
680 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
681                             const struct in6_addr *saddr, int oif, int strict)
682 {
683         struct flowi6 fl6 = {
684                 .flowi6_oif = oif,
685                 .daddr = *daddr,
686         };
687         struct dst_entry *dst;
688         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
689
690         if (saddr) {
691                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
692                 flags |= RT6_LOOKUP_F_HAS_SADDR;
693         }
694
695         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
696         if (dst->error == 0)
697                 return (struct rt6_info *) dst;
698
699         dst_release(dst);
700
701         return NULL;
702 }
703
704 EXPORT_SYMBOL(rt6_lookup);
705
706 /* ip6_ins_rt is called with FREE table->tb6_lock.
707    It takes new route entry, the addition fails by any reason the
708    route is freed. In any case, if caller does not hold it, it may
709    be destroyed.
710  */
711
712 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
713 {
714         int err;
715         struct fib6_table *table;
716
717         table = rt->rt6i_table;
718         write_lock_bh(&table->tb6_lock);
719         err = fib6_add(&table->tb6_root, rt, info);
720         write_unlock_bh(&table->tb6_lock);
721
722         return err;
723 }
724
725 int ip6_ins_rt(struct rt6_info *rt)
726 {
727         struct nl_info info = {
728                 .nl_net = dev_net(rt->dst.dev),
729         };
730         return __ip6_ins_rt(rt, &info);
731 }
732
733 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
734                                       const struct in6_addr *daddr,
735                                       const struct in6_addr *saddr)
736 {
737         struct rt6_info *rt;
738
739         /*
740          *      Clone the route.
741          */
742
743         rt = ip6_rt_copy(ort, daddr);
744
745         if (rt) {
746                 int attempts = !in_softirq();
747
748                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
749                         if (ort->rt6i_dst.plen != 128 &&
750                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
751                                 rt->rt6i_flags |= RTF_ANYCAST;
752                         rt->rt6i_gateway = *daddr;
753                 }
754
755                 rt->rt6i_flags |= RTF_CACHE;
756
757 #ifdef CONFIG_IPV6_SUBTREES
758                 if (rt->rt6i_src.plen && saddr) {
759                         rt->rt6i_src.addr = *saddr;
760                         rt->rt6i_src.plen = 128;
761                 }
762 #endif
763
764         retry:
765                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
766                         struct net *net = dev_net(rt->dst.dev);
767                         int saved_rt_min_interval =
768                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
769                         int saved_rt_elasticity =
770                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
771
772                         if (attempts-- > 0) {
773                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
774                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
775
776                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
777
778                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
779                                         saved_rt_elasticity;
780                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
781                                         saved_rt_min_interval;
782                                 goto retry;
783                         }
784
785                         if (net_ratelimit())
786                                 printk(KERN_WARNING
787                                        "ipv6: Neighbour table overflow.\n");
788                         dst_free(&rt->dst);
789                         return NULL;
790                 }
791         }
792
793         return rt;
794 }
795
796 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
797                                         const struct in6_addr *daddr)
798 {
799         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
800
801         if (rt) {
802                 rt->rt6i_flags |= RTF_CACHE;
803                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
804         }
805         return rt;
806 }
807
/*
 * Core per-table route resolution shared by the input and output paths.
 *
 * @oif is the interface index to match (callers pass the flow's iif or
 * oif).  Only RT6_LOOKUP_F_IFACE is taken from @flags.  Unless forwarding
 * is enabled in devconf_all, the first selection pass additionally demands
 * RT6_LOOKUP_F_REACHABLE; if that pass ends at `out`, the lookup is redone
 * once without it.
 *
 * A selected route that is neither the null entry nor already RTF_CACHE is
 * turned into a cached entry: via rt6_alloc_cow() when it has no neighbour
 * and is not RTF_NONEXTHOP, via rt6_alloc_clone() when it is not a host
 * route, and otherwise returned as is.  The new entry is inserted with
 * ip6_ins_rt(); on insertion failure the whole lookup is retried, up to
 * three attempts in total.
 *
 * The returned route has had dst_hold() applied, and its lastuse/__use
 * fields are updated.  The locals `rt`/`fn` and the labels `restart`/`out`
 * are required by BACKTRACK().
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        int attempts = 3;
        int err;
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

restart:
        rt = rt6_select(fn, oif, strict | reachable);

        /* May jump back to restart with a new fn, or forward to out. */
        BACKTRACK(net, &fl6->saddr);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* Pin the route before dropping the table lock. */
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
        else if (!(rt->dst.flags & DST_HOST))
                nrt = rt6_alloc_clone(rt, &fl6->daddr);
        else
                goto out2;

        /* Swap our reference from the original route to the copy
         * (or to the null entry if the copy could not be made).
         */
        dst_release(&rt->dst);
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        /* First pass required reachability: retry once without it. */
        if (reachable) {
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}
877
878 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
879                                             struct flowi6 *fl6, int flags)
880 {
881         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
882 }
883
/*
 * Resolve the route for a flow received on @dev.
 *
 * RT6_LOOKUP_F_IFACE is added to @flags when rt6_need_strict() is true
 * for the destination address and the device type is not ARPHRD_PIMREG.
 * The lookup itself goes through fib6_rule_lookup() with
 * ip6_pol_route_input as the per-table callback.
 */
884 static struct dst_entry *ip6_route_input_lookup(struct net *net,
885                                                 struct net_device *dev,
886                                                 struct flowi6 *fl6, int flags)
887 {
888         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
889                 flags |= RT6_LOOKUP_F_IFACE;
890
891         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
892 }
893
/*
 * Route an incoming packet: build a flow key from the packet's IPv6
 * header and skb metadata, look the route up and attach the resulting
 * dst to the skb with skb_dst_set().
 */
894 void ip6_route_input(struct sk_buff *skb)
895 {
896         const struct ipv6hdr *iph = ipv6_hdr(skb);
897         struct net *net = dev_net(skb->dev);
898         int flags = RT6_LOOKUP_F_HAS_SADDR;
899         struct flowi6 fl6 = {
900                 .flowi6_iif = skb->dev->ifindex,
901                 .daddr = iph->daddr,
902                 .saddr = iph->saddr,
            /* first 32-bit word of the header, masked with IPV6_FLOWINFO_MASK */
903                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
904                 .flowi6_mark = skb->mark,
905                 .flowi6_proto = iph->nexthdr,
906         };
907
908         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
909 }
910
/*
 * Per-table lookup callback for the output path: forwards to
 * ip6_pol_route(), passing the flow's outgoing interface index
 * (fl6->flowi6_oif) as the interface argument.
 */
911 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
912                                              struct flowi6 *fl6, int flags)
913 {
914         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
915 }
916
/*
 * Resolve the output route for flow @fl6 in namespace @net.
 *
 * @sk may be NULL.  Lookup flags are derived as follows:
 *  - RT6_LOOKUP_F_IFACE when the socket is bound to a device or
 *    rt6_need_strict() is true for the destination;
 *  - RT6_LOOKUP_F_HAS_SADDR when the flow already carries a source
 *    address; otherwise, if a socket is given, the flags produced by
 *    rt6_srcprefs2flags() from the socket's srcprefs.
 *
 * Returns whatever fib6_rule_lookup() returns with ip6_pol_route_output
 * as the per-table callback.
 */
917 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
918                                     struct flowi6 *fl6)
919 {
920         int flags = 0;
921
922         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
923                 flags |= RT6_LOOKUP_F_IFACE;
924
925         if (!ipv6_addr_any(&fl6->saddr))
926                 flags |= RT6_LOOKUP_F_HAS_SADDR;
927         else if (sk)
928                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
929
930         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
931 }
932
933 EXPORT_SYMBOL(ip6_route_output);
934
/*
 * Build a "blackhole" copy of @dst_orig: a dst allocated from
 * ip6_dst_blackhole_ops whose input and output handlers are dst_discard,
 * carrying over the original's metrics, idev, gateway, flags (without
 * RTF_EXPIRES) and destination/source keys.
 *
 * The reference on @dst_orig is always released.  Returns the new dst,
 * or ERR_PTR(-ENOMEM) if allocation failed.
 */
935 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
936 {
937         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
938         struct dst_entry *new = NULL;
939
940         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
941         if (rt) {
            /*
             * Zero everything from rt6i_table onwards.
             * NOTE(review): the length assumes rt6i_table is the first
             * member after the embedded dst_entry - confirm against
             * struct rt6_info.
             */
942                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
943
944                 new = &rt->dst;
945
946                 new->__use = 1;
947                 new->input = dst_discard;
948                 new->output = dst_discard;
949
            /* share read-only metrics, otherwise take a private copy */
950                 if (dst_metrics_read_only(&ort->dst))
951                         new->_metrics = ort->dst._metrics;
952                 else
953                         dst_copy_metrics(new, &ort->dst);
954                 rt->rt6i_idev = ort->rt6i_idev;
955                 if (rt->rt6i_idev)
956                         in6_dev_hold(rt->rt6i_idev);
957                 rt->dst.expires = 0;
958
959                 rt->rt6i_gateway = ort->rt6i_gateway;
960                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
961                 rt->rt6i_metric = 0;
962
963                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
964 #ifdef CONFIG_IPV6_SUBTREES
965                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
966 #endif
967
            /*
             * NOTE(review): dst_free() is called on the entry that is
             * returned below; presumably it only defers destruction
             * while the reference requested from dst_alloc() is held -
             * confirm against the dst core.
             */
968                 dst_free(new);
969         }
970
971         dst_release(dst_orig);
972         return new ? new : ERR_PTR(-ENOMEM);
973 }
974
975 /*
976  *      Destination cache support functions
977  */
978
/*
 * Validate a cached route against @cookie.
 *
 * The route is accepted only if it is attached to a fib6 node whose
 * fn_sernum equals @cookie.  On acceptance, a stale peer generation id
 * is refreshed (binding a peer first if none is attached).
 *
 * Returns @dst when still valid, NULL otherwise.
 */
979 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
980 {
981         struct rt6_info *rt;
982
983         rt = (struct rt6_info *) dst;
984
985         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
986                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
987                         if (!rt->rt6i_peer)
988                                 rt6_bind_peer(rt, 0);
989                         rt->rt6i_peer_genid = rt6_peer_genid();
990                 }
991                 return dst;
992         }
993         return NULL;
994 }
995
/*
 * React to negative advice about @dst.
 *
 *  - RTF_CACHE route that has expired: delete it and return NULL.
 *  - RTF_CACHE route that has not expired: return @dst unchanged.
 *  - Any other route: drop the caller's reference and return NULL.
 *  - NULL @dst: returned as is.
 */
996 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
997 {
998         struct rt6_info *rt = (struct rt6_info *) dst;
999
1000         if (rt) {
1001                 if (rt->rt6i_flags & RTF_CACHE) {
1002                         if (rt6_check_expired(rt)) {
1003                                 ip6_del_rt(rt);
1004                                 dst = NULL;
1005                         }
1006                 } else {
1007                         dst_release(dst);
1008                         dst = NULL;
1009                 }
1010         }
1011         return dst;
1012 }
1013
/*
 * Handle a link failure for @skb: send an ICMPv6 destination-unreachable
 * (address unreachable) error, then invalidate the route attached to
 * the skb, if any.
 */
1014 static void ip6_link_failure(struct sk_buff *skb)
1015 {
1016         struct rt6_info *rt;
1017
1018         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1019
1020         rt = (struct rt6_info *) skb_dst(skb);
1021         if (rt) {
1022                 if (rt->rt6i_flags & RTF_CACHE) {
                     /* cached entry: make it expire immediately */
1023                         dst_set_expires(&rt->dst, 0);
1024                         rt->rt6i_flags |= RTF_EXPIRES;
1025                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
                     /*
                      * Default route: overwrite the node's serial number.
                      * Presumably this makes cookie comparisons such as
                      * the one in ip6_dst_check() fail - confirm.
                      */
1026                         rt->rt6i_node->fn_sernum = -1;
1027         }
1028 }
1029
/*
 * Lower the path MTU stored on @dst to @mtu.
 *
 * Only acts when @mtu is smaller than the current dst_mtu() and the
 * route's destination prefix length is 128.  An @mtu below IPV6_MIN_MTU
 * is clamped to IPV6_MIN_MTU and RTAX_FEATURE_ALLFRAG is set in the
 * route's RTAX_FEATURES metric.  The route is flagged RTF_MODIFIED.
 */
1030 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1031 {
1032         struct rt6_info *rt6 = (struct rt6_info*)dst;
1033
1034         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1035                 rt6->rt6i_flags |= RTF_MODIFIED;
1036                 if (mtu < IPV6_MIN_MTU) {
1037                         u32 features = dst_metric(dst, RTAX_FEATURES);
1038                         mtu = IPV6_MIN_MTU;
1039                         features |= RTAX_FEATURE_ALLFRAG;
1040                         dst_metric_set(dst, RTAX_FEATURES, features);
1041                 }
1042                 dst_metric_set(dst, RTAX_MTU, mtu);
1043         }
1044 }
1045
/*
 * Compute the default advertised MSS for @dst: the dst MTU minus the
 * IPv6 and TCP header sizes, raised to the per-namespace
 * ip6_rt_min_advmss sysctl when smaller, and replaced by IPV6_MAXPLEN
 * when it exceeds IPV6_MAXPLEN - sizeof(struct tcphdr).
 */
1046 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1047 {
1048         struct net_device *dev = dst->dev;
1049         unsigned int mtu = dst_mtu(dst);
1050         struct net *net = dev_net(dev);
1051
             /*
              * NOTE(review): unsigned arithmetic - if dst_mtu() were ever
              * smaller than the combined header size this would wrap to a
              * large value and end up as IPV6_MAXPLEN below.  Confirm that
              * such MTUs cannot reach this point.
              */
1052         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1053
1054         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1055                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1056
1057         /*
1058          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1059          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1060          * IPV6_MAXPLEN is also valid and means: "any MSS,
1061          * rely only on pmtu discovery"
1062          */
1063         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1064                 mtu = IPV6_MAXPLEN;
1065         return mtu;
1066 }
1067
/*
 * Return the MTU for @dst.
 *
 * A non-zero RTAX_MTU metric wins.  Otherwise the value is the device's
 * inet6 configuration (cnf.mtu6), read under rcu_read_lock(), falling
 * back to IPV6_MIN_MTU when the device has no inet6_dev.
 */
1068 static unsigned int ip6_mtu(const struct dst_entry *dst)
1069 {
1070         struct inet6_dev *idev;
1071         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1072
1073         if (mtu)
1074                 return mtu;
1075
1076         mtu = IPV6_MIN_MTU;
1077
1078         rcu_read_lock();
1079         idev = __in6_dev_get(dst->dev);
1080         if (idev)
1081                 mtu = idev->cnf.mtu6;
1082         rcu_read_unlock();
1083
1084         return mtu;
1085 }
1086
/*
 * Singly linked list (chained through dst->next) of the dst entries
 * created by icmp6_dst_alloc(); it is walked by icmp6_dst_gc() and
 * icmp6_clean_all().  All accesses take icmp6_dst_lock.
 */
1087 static struct dst_entry *icmp6_dst_gc_list;
1088 static DEFINE_SPINLOCK(icmp6_dst_lock);
1089
/*
 * Allocate a host dst (prefix length 128, hop limit 255) towards
 * fl6->daddr on @dev, outside the routing tables.
 *
 * @neigh may be NULL, in which case the neighbour is looked up with
 * ip6_neigh_lookup(); otherwise an extra reference is taken on it.
 * The new entry is linked into icmp6_dst_gc_list, fib6 gc is kicked,
 * and the entry is passed through xfrm_lookup().
 *
 * Returns the result of xfrm_lookup(), or an ERR_PTR: -ENODEV when @dev
 * has no inet6_dev, -ENOMEM when allocation fails, or the error from
 * the neighbour lookup.
 */
1090 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1091                                   struct neighbour *neigh,
1092                                   struct flowi6 *fl6)
1093 {
1094         struct dst_entry *dst;
1095         struct rt6_info *rt;
1096         struct inet6_dev *idev = in6_dev_get(dev);
1097         struct net *net = dev_net(dev);
1098
1099         if (unlikely(!idev))
1100                 return ERR_PTR(-ENODEV);
1101
1102         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1103         if (unlikely(!rt)) {
1104                 in6_dev_put(idev);
1105                 dst = ERR_PTR(-ENOMEM);
1106                 goto out;
1107         }
1108
1109         if (neigh)
1110                 neigh_hold(neigh);
1111         else {
1112                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1113                 if (IS_ERR(neigh)) {
                     /* undo the idev reference and the allocation */
1114                         in6_dev_put(idev);
1115                         dst_free(&rt->dst);
1116                         return ERR_CAST(neigh);
1117                 }
1118         }
1119
1120         rt->dst.flags |= DST_HOST;
1121         rt->dst.output  = ip6_output;
1122         dst_set_neighbour(&rt->dst, neigh);
1123         atomic_set(&rt->dst.__refcnt, 1);
1124         rt->rt6i_dst.addr = fl6->daddr;
1125         rt->rt6i_dst.plen = 128;
             /* the idev reference taken above is handed over to the route */
1126         rt->rt6i_idev     = idev;
1127         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1128
             /* push onto the head of the gc list */
1129         spin_lock_bh(&icmp6_dst_lock);
1130         rt->dst.next = icmp6_dst_gc_list;
1131         icmp6_dst_gc_list = &rt->dst;
1132         spin_unlock_bh(&icmp6_dst_lock);
1133
1134         fib6_force_start_gc(net);
1135
1136         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1137
1138 out:
1139         return dst;
1140 }
1141
/*
 * Walk icmp6_dst_gc_list under icmp6_dst_lock, unlinking and freeing
 * every entry whose reference count is zero.
 *
 * Returns the number of entries left on the list (still referenced).
 */
1142 int icmp6_dst_gc(void)
1143 {
1144         struct dst_entry *dst, **pprev;
1145         int more = 0;
1146
1147         spin_lock_bh(&icmp6_dst_lock);
1148         pprev = &icmp6_dst_gc_list;
1149
1150         while ((dst = *pprev) != NULL) {
1151                 if (!atomic_read(&dst->__refcnt)) {
                     /* unlink; pprev stays put and now sees the successor */
1152                         *pprev = dst->next;
1153                         dst_free(dst);
1154                 } else {
1155                         pprev = &dst->next;
1156                         ++more;
1157                 }
1158         }
1159
1160         spin_unlock_bh(&icmp6_dst_lock);
1161
1162         return more;
1163 }
1164
/*
 * Call @func(rt, @arg) for every entry on icmp6_dst_gc_list, under
 * icmp6_dst_lock.  Entries for which @func returns non-zero are
 * unlinked from the list and freed with dst_free().
 */
1165 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1166                             void *arg)
1167 {
1168         struct dst_entry *dst, **pprev;
1169
1170         spin_lock_bh(&icmp6_dst_lock);
1171         pprev = &icmp6_dst_gc_list;
1172         while ((dst = *pprev) != NULL) {
1173                 struct rt6_info *rt = (struct rt6_info *) dst;
1174                 if (func(rt, arg)) {
1175                         *pprev = dst->next;
1176                         dst_free(dst);
1177                 } else {
1178                         pprev = &dst->next;
1179                 }
1180         }
1181         spin_unlock_bh(&icmp6_dst_lock);
1182 }
1183
/*
 * Garbage-collect IPv6 dst entries for the namespace that embeds @ops.
 *
 * The fib6 gc run is skipped when the previous run is more recent than
 * ip6_rt_gc_min_interval and the (fast) entry count does not exceed
 * ip6_rt_max_size.  Otherwise ip6_rt_gc_expire is incremented and
 * passed to fib6_run_gc(); if the (slow) entry count afterwards is
 * below ops->gc_thresh, ip6_rt_gc_expire is reset to half of
 * ip6_rt_gc_timeout.  In all cases ip6_rt_gc_expire is finally reduced
 * by itself shifted right by ip6_rt_gc_elasticity.
 *
 * Returns non-zero when the entry count still exceeds ip6_rt_max_size.
 */
1184 static int ip6_dst_gc(struct dst_ops *ops)
1185 {
1186         unsigned long now = jiffies;
1187         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1188         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1189         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1190         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1191         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1192         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1193         int entries;
1194
1195         entries = dst_entries_get_fast(ops);
1196         if (time_after(rt_last_gc + rt_min_interval, now) &&
1197             entries <= rt_max_size)
1198                 goto out;
1199
1200         net->ipv6.ip6_rt_gc_expire++;
1201         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1202         net->ipv6.ip6_rt_last_gc = now;
1203         entries = dst_entries_get_slow(ops);
1204         if (entries < ops->gc_thresh)
1205                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1206 out:
1207         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1208         return entries > rt_max_size;
1209 }
1210
1211 /* Clean host part of a prefix. Not necessary in radix tree,
1212    but results in cleaner routing tables.
1213
1214    Remove it only when all the things will work!
1215  */
1216
/*
 * Return the hop limit to use for @dst.
 *
 * A non-zero RTAX_HOPLIMIT metric is used as is.  When the metric is 0
 * the value comes from the device's inet6 configuration
 * (cnf.hop_limit) or, if the device has no inet6_dev, from the
 * namespace-wide devconf_all->hop_limit.  The device lookup is done
 * under rcu_read_lock().
 */
1217 int ip6_dst_hoplimit(struct dst_entry *dst)
1218 {
1219         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1220         if (hoplimit == 0) {
1221                 struct net_device *dev = dst->dev;
1222                 struct inet6_dev *idev;
1223
1224                 rcu_read_lock();
1225                 idev = __in6_dev_get(dev);
1226                 if (idev)
1227                         hoplimit = idev->cnf.hop_limit;
1228                 else
1229                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1230                 rcu_read_unlock();
1231         }
1232         return hoplimit;
1233 }
1234 EXPORT_SYMBOL(ip6_dst_hoplimit);
1235
1236 /*
1237  *
1238  */
1239
1240 int ip6_route_add(struct fib6_config *cfg)
1241 {
1242         int err;
1243         struct net *net = cfg->fc_nlinfo.nl_net;
1244         struct rt6_info *rt = NULL;
1245         struct net_device *dev = NULL;
1246         struct inet6_dev *idev = NULL;
1247         struct fib6_table *table;
1248         int addr_type;
1249
1250         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1251                 return -EINVAL;
1252 #ifndef CONFIG_IPV6_SUBTREES
1253         if (cfg->fc_src_len)
1254                 return -EINVAL;
1255 #endif
1256         if (cfg->fc_ifindex) {
1257                 err = -ENODEV;
1258                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1259                 if (!dev)
1260                         goto out;
1261                 idev = in6_dev_get(dev);
1262                 if (!idev)
1263                         goto out;
1264         }
1265
1266         if (cfg->fc_metric == 0)
1267                 cfg->fc_metric = IP6_RT_PRIO_USER;
1268
1269         err = -ENOBUFS;
1270         if (cfg->fc_nlinfo.nlh &&
1271             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1272                 table = fib6_get_table(net, cfg->fc_table);
1273                 if (!table) {
1274                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1275                         table = fib6_new_table(net, cfg->fc_table);
1276                 }
1277         } else {
1278                 table = fib6_new_table(net, cfg->fc_table);
1279         }
1280
1281         if (!table)
1282                 goto out;
1283
1284         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1285
1286         if (!rt) {
1287                 err = -ENOMEM;
1288                 goto out;
1289         }
1290
1291         rt->dst.obsolete = -1;
1292         rt->dst.expires = (cfg->fc_flags & RTF_EXPIRES) ?
1293                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1294                                 0;
1295
1296         if (cfg->fc_protocol == RTPROT_UNSPEC)
1297                 cfg->fc_protocol = RTPROT_BOOT;
1298         rt->rt6i_protocol = cfg->fc_protocol;
1299
1300         addr_type = ipv6_addr_type(&cfg->fc_dst);
1301
1302         if (addr_type & IPV6_ADDR_MULTICAST)
1303                 rt->dst.input = ip6_mc_input;
1304         else if (cfg->fc_flags & RTF_LOCAL)
1305                 rt->dst.input = ip6_input;
1306         else
1307                 rt->dst.input = ip6_forward;
1308
1309         rt->dst.output = ip6_output;
1310
1311         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1312         rt->rt6i_dst.plen = cfg->fc_dst_len;
1313         if (rt->rt6i_dst.plen == 128)
1314                rt->dst.flags |= DST_HOST;
1315
1316         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1317                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1318                 if (!metrics) {
1319                         err = -ENOMEM;
1320                         goto out;
1321                 }
1322                 dst_init_metrics(&rt->dst, metrics, 0);
1323         }
1324 #ifdef CONFIG_IPV6_SUBTREES
1325         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1326         rt->rt6i_src.plen = cfg->fc_src_len;
1327 #endif
1328
1329         rt->rt6i_metric = cfg->fc_metric;
1330
1331         /* We cannot add true routes via loopback here,
1332            they would result in kernel looping; promote them to reject routes
1333          */
1334         if ((cfg->fc_flags & RTF_REJECT) ||
1335             (dev && (dev->flags & IFF_LOOPBACK) &&
1336              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1337              !(cfg->fc_flags & RTF_LOCAL))) {
1338                 /* hold loopback dev/idev if we haven't done so. */
1339                 if (dev != net->loopback_dev) {
1340                         if (dev) {
1341                                 dev_put(dev);
1342                                 in6_dev_put(idev);
1343                         }
1344                         dev = net->loopback_dev;
1345                         dev_hold(dev);
1346                         idev = in6_dev_get(dev);
1347                         if (!idev) {
1348                                 err = -ENODEV;
1349                                 goto out;
1350                         }
1351                 }
1352                 rt->dst.output = ip6_pkt_discard_out;
1353                 rt->dst.input = ip6_pkt_discard;
1354                 rt->dst.error = -ENETUNREACH;
1355                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1356                 goto install_route;
1357         }
1358
1359         if (cfg->fc_flags & RTF_GATEWAY) {
1360                 const struct in6_addr *gw_addr;
1361                 int gwa_type;
1362
1363                 gw_addr = &cfg->fc_gateway;
1364                 rt->rt6i_gateway = *gw_addr;
1365                 gwa_type = ipv6_addr_type(gw_addr);
1366
1367                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1368                         struct rt6_info *grt;
1369
1370                         /* IPv6 strictly inhibits using not link-local
1371                            addresses as nexthop address.
1372                            Otherwise, router will not able to send redirects.
1373                            It is very good, but in some (rare!) circumstances
1374                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1375                            some exceptions. --ANK
1376                          */
1377                         err = -EINVAL;
1378                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1379                                 goto out;
1380
1381                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1382
1383                         err = -EHOSTUNREACH;
1384                         if (!grt)
1385                                 goto out;
1386                         if (dev) {
1387                                 if (dev != grt->dst.dev) {
1388                                         dst_release(&grt->dst);
1389                                         goto out;
1390                                 }
1391                         } else {
1392                                 dev = grt->dst.dev;
1393                                 idev = grt->rt6i_idev;
1394                                 dev_hold(dev);
1395                                 in6_dev_hold(grt->rt6i_idev);
1396                         }
1397                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1398                                 err = 0;
1399                         dst_release(&grt->dst);
1400
1401                         if (err)
1402                                 goto out;
1403                 }
1404                 err = -EINVAL;
1405                 if (!dev || (dev->flags & IFF_LOOPBACK))
1406                         goto out;
1407         }
1408
1409         err = -ENODEV;
1410         if (!dev)
1411                 goto out;
1412
1413         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1414                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1415                         err = -EINVAL;
1416                         goto out;
1417                 }
1418                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1419                 rt->rt6i_prefsrc.plen = 128;
1420         } else
1421                 rt->rt6i_prefsrc.plen = 0;
1422
1423         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1424                 err = rt6_bind_neighbour(rt, dev);
1425                 if (err)
1426                         goto out;
1427         }
1428
1429         rt->rt6i_flags = cfg->fc_flags;
1430
1431 install_route:
1432         if (cfg->fc_mx) {
1433                 struct nlattr *nla;
1434                 int remaining;
1435
1436                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1437                         int type = nla_type(nla);
1438
1439                         if (type) {
1440                                 if (type > RTAX_MAX) {
1441                                         err = -EINVAL;
1442                                         goto out;
1443                                 }
1444
1445                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1446                         }
1447                 }
1448         }
1449
1450         rt->dst.dev = dev;
1451         rt->rt6i_idev = idev;
1452         rt->rt6i_table = table;
1453
1454         cfg->fc_nlinfo.nl_net = dev_net(dev);
1455
1456         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1457
1458 out:
1459         if (dev)
1460                 dev_put(dev);
1461         if (idev)
1462                 in6_dev_put(idev);
1463         if (rt)
1464                 dst_free(&rt->dst);
1465         return err;
1466 }
1467
/*
 * Remove @rt from its FIB table and drop one reference on it.
 *
 * The namespace's null entry is never deleted (-ENOENT, and in that
 * case no reference is dropped).  Otherwise fib6_del() and the
 * dst_release() run under the table's write lock.
 *
 * Returns the result of fib6_del(), or -ENOENT for the null entry.
 */
1468 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1469 {
1470         int err;
1471         struct fib6_table *table;
1472         struct net *net = dev_net(rt->dst.dev);
1473
1474         if (rt == net->ipv6.ip6_null_entry)
1475                 return -ENOENT;
1476
1477         table = rt->rt6i_table;
1478         write_lock_bh(&table->tb6_lock);
1479
1480         err = fib6_del(rt, info);
1481         dst_release(&rt->dst);
1482
1483         write_unlock_bh(&table->tb6_lock);
1484
1485         return err;
1486 }
1487
/*
 * Delete @rt using a minimal nl_info that carries only the namespace
 * of the route's device.  See __ip6_del_rt() for the return value.
 */
1488 int ip6_del_rt(struct rt6_info *rt)
1489 {
1490         struct nl_info info = {
1491                 .nl_net = dev_net(rt->dst.dev),
1492         };
1493         return __ip6_del_rt(rt, &info);
1494 }
1495
/*
 * Delete the first route matching @cfg.
 *
 * The node is located by destination/source prefix in the table named
 * by cfg->fc_table.  Within that node a route matches when:
 *  - cfg->fc_ifindex is 0 or equals the route's device ifindex,
 *  - RTF_GATEWAY is not requested or the gateways are equal,
 *  - cfg->fc_metric is 0 or equals the route's metric.
 *
 * Returns -ESRCH when the table or a matching route does not exist,
 * otherwise the result of __ip6_del_rt().
 */
1496 static int ip6_route_del(struct fib6_config *cfg)
1497 {
1498         struct fib6_table *table;
1499         struct fib6_node *fn;
1500         struct rt6_info *rt;
1501         int err = -ESRCH;
1502
1503         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1504         if (!table)
1505                 return err;
1506
1507         read_lock_bh(&table->tb6_lock);
1508
1509         fn = fib6_locate(&table->tb6_root,
1510                          &cfg->fc_dst, cfg->fc_dst_len,
1511                          &cfg->fc_src, cfg->fc_src_len);
1512
1513         if (fn) {
1514                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1515                         if (cfg->fc_ifindex &&
1516                             (!rt->dst.dev ||
1517                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1518                                 continue;
1519                         if (cfg->fc_flags & RTF_GATEWAY &&
1520                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1521                                 continue;
1522                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1523                                 continue;
                     /*
                      * Take a reference before dropping the read lock;
                      * __ip6_del_rt() releases it.
                      */
1524                         dst_hold(&rt->dst);
1525                         read_unlock_bh(&table->tb6_lock);
1526
1527                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1528                 }
1529         }
1530         read_unlock_bh(&table->tb6_lock);
1531
1532         return err;
1533 }
1534
1535 /*
1536  *      Handle redirects
1537  */
/*
 * Flow key used for redirect lookups: a flowi6 followed by the address
 * of the gateway to match.  fl6 must stay the first member, because
 * __ip6_route_redirect() casts the flowi6 pointer it receives back to
 * this structure.
 */
1538 struct ip6rd_flowi {
1539         struct flowi6 fl6;
1540         struct in6_addr gateway;
1541 };
1542
/*
 * Per-table lookup callback used when validating a redirect.
 *
 * @fl6 must be embedded in a struct ip6rd_flowi; the gateway stored
 * there is the address to match.  Under the table's read lock, the
 * routes of the node found for the flow are scanned for an unexpired
 * RTF_GATEWAY route on the interface fl6->flowi6_oif whose gateway
 * equals that address.
 *
 * Returns the matching route, or the namespace's null entry, with a
 * reference held in either case.
 */
1543 static struct rt6_info *__ip6_route_redirect(struct net *net,
1544                                              struct fib6_table *table,
1545                                              struct flowi6 *fl6,
1546                                              int flags)
1547 {
1548         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1549         struct rt6_info *rt;
1550         struct fib6_node *fn;
1551
1552         /*
1553          * Get the "current" route for this destination and
1554          * check if the redirect has come from appropriate router.
1555          *
1556          * RFC 2461 specifies that redirects should only be
1557          * accepted if they come from the nexthop to the target.
1558          * Due to the way the routes are chosen, this notion
1559          * is a bit fuzzy and one might need to check all possible
1560          * routes.
1561          */
1562
1563         read_lock_bh(&table->tb6_lock);
1564         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1565 restart:
1566         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1567                 /*
1568                  * Current route is on-link; redirect is always invalid.
1569                  *
1570                  * Seems, previous statement is not true. It could
1571                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1572                  * But then router serving it might decide, that we should
1573                  * know truth 8)8) --ANK (980726).
1574                  */
1575                 if (rt6_check_expired(rt))
1576                         continue;
1577                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1578                         continue;
1579                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1580                         continue;
1581                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1582                         continue;
1583                 break;
1584         }
1585
1586         if (!rt)
1587                 rt = net->ipv6.ip6_null_entry;
             /*
              * BACKTRACK is a macro defined elsewhere in this file; the
              * otherwise unreferenced "restart" and "out" labels are
              * presumably its jump targets - confirm against the macro.
              */
1588         BACKTRACK(net, &fl6->saddr);
1589 out:
1590         dst_hold(&rt->dst);
1591
1592         read_unlock_bh(&table->tb6_lock);
1593
1594         return rt;
1595 }
1596
/*
 * Find the route that a redirect for @dest (sent from @gateway and
 * received on @dev) would replace.
 *
 * Builds an ip6rd_flowi with @dev's ifindex as the output interface,
 * @dest/@src as the addresses and @gateway as the gateway to match,
 * then runs fib6_rule_lookup() with __ip6_route_redirect as the
 * per-table callback.  RT6_LOOKUP_F_IFACE is added when
 * rt6_need_strict() is true for @dest.
 */
1597 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1598                                            const struct in6_addr *src,
1599                                            const struct in6_addr *gateway,
1600                                            struct net_device *dev)
1601 {
1602         int flags = RT6_LOOKUP_F_HAS_SADDR;
1603         struct net *net = dev_net(dev);
1604         struct ip6rd_flowi rdfl = {
1605                 .fl6 = {
1606                         .flowi6_oif = dev->ifindex,
1607                         .daddr = *dest,
1608                         .saddr = *src,
1609                 },
1610         };
1611
1612         rdfl.gateway = *gateway;
1613
1614         if (rt6_need_strict(dest))
1615                 flags |= RT6_LOOKUP_F_IFACE;
1616
1617         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1618                                                    flags, __ip6_route_redirect);
1619 }
1620
/*
 * Process a redirect for @dest.
 *
 * @saddr is passed to ip6_route_redirect() as the gateway that the
 * current route must use; if no such route exists the redirect is
 * dropped with a rate-limited debug message.  Otherwise @neigh (the
 * new next hop) is updated with @lladdr, the current route is
 * confirmed, and - unless the current route already uses @neigh - a
 * cached (RTF_CACHE) copy of the route towards @dest via @neigh is
 * inserted and a NETEVENT_REDIRECT notification is raised.  If the old
 * route was itself a cached entry it is deleted.
 *
 * @on_link non-zero means the target is on-link: the new route is
 * created without RTF_GATEWAY and the neighbour is not marked as a
 * router.
 */
1621 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1622                   const struct in6_addr *saddr,
1623                   struct neighbour *neigh, u8 *lladdr, int on_link)
1624 {
1625         struct rt6_info *rt, *nrt = NULL;
1626         struct netevent_redirect netevent;
1627         struct net *net = dev_net(neigh->dev);
1628
1629         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1630
1631         if (rt == net->ipv6.ip6_null_entry) {
1632                 if (net_ratelimit())
1633                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1634                                "for redirect target\n");
1635                 goto out;
1636         }
1637
1638         /*
1639          *      We have finally decided to accept it.
1640          */
1641
1642         neigh_update(neigh, lladdr, NUD_STALE,
1643                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1644                      NEIGH_UPDATE_F_OVERRIDE|
1645                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1646                                      NEIGH_UPDATE_F_ISROUTER))
1647                      );
1648
1649         /*
1650          * Redirect received -> path was valid.
1651          * Look, redirects are sent only in response to data packets,
1652          * so that this nexthop apparently is reachable. --ANK
1653          */
1654         dst_confirm(&rt->dst);
1655
1656         /* Duplicate redirect: silently ignore. */
1657         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1658                 goto out;
1659
1660         nrt = ip6_rt_copy(rt, dest);
1661         if (!nrt)
1662                 goto out;
1663
1664         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1665         if (on_link)
1666                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1667
1668         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1669         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1670
1671         if (ip6_ins_rt(nrt))
1672                 goto out;
1673
1674         netevent.old = &rt->dst;
1675         netevent.new = &nrt->dst;
1676         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1677
1678         if (rt->rt6i_flags & RTF_CACHE) {
                 /*
                  * No dst_release() on this path: the delete path
                  * (__ip6_del_rt()) drops a reference itself.
                  */
1679                 ip6_del_rt(rt);
1680                 return;
1681         }
1682
1683 out:
1684         dst_release(&rt->dst);
1685 }
1686
1687 /*
1688  *      Handle ICMP "packet too big" messages
1689  *      i.e. Path MTU discovery
1690  */
1691
/*
 * Apply a reported path MTU @pmtu to the route from @saddr to @daddr
 * looked up with interface index @ifindex in @net.
 *
 * Expired routes found by the lookup are deleted and the lookup is
 * retried.  Nothing is done when @pmtu is not smaller than the current
 * route MTU.  A @pmtu below IPV6_MIN_MTU is raised to IPV6_MIN_MTU and
 * RTAX_FEATURE_ALLFRAG is set.  A cached (RTF_CACHE) route is updated
 * in place; for any other route a host copy carrying the new MTU and
 * an expiry of ip6_rt_mtu_expires is created and inserted.
 */
1692 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1693                              struct net *net, u32 pmtu, int ifindex)
1694 {
1695         struct rt6_info *rt, *nrt;
1696         int allfrag = 0;
1697 again:
1698         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1699         if (!rt)
1700                 return;
1701
1702         if (rt6_check_expired(rt)) {
1703                 ip6_del_rt(rt);
1704                 goto again;
1705         }
1706
1707         if (pmtu >= dst_mtu(&rt->dst))
1708                 goto out;
1709
1710         if (pmtu < IPV6_MIN_MTU) {
1711                 /*
1712                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1713                  * MTU (1280) and a fragment header should always be included
1714                  * after a node receiving Too Big message reporting PMTU is
1715                  * less than the IPv6 Minimum Link MTU.
1716                  */
1717                 pmtu = IPV6_MIN_MTU;
1718                 allfrag = 1;
1719         }
1720
1721         /* New mtu received -> path was valid.
1722            They are sent only in response to data packets,
1723            so that this nexthop apparently is reachable. --ANK
1724          */
1725         dst_confirm(&rt->dst);
1726
1727         /* Host route. If it is static, it would be better
1728            not to override it, but add new one, so that
1729            when cache entry will expire old pmtu
1730            would return automatically.
1731          */
1732         if (rt->rt6i_flags & RTF_CACHE) {
1733                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1734                 if (allfrag) {
1735                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1736                         features |= RTAX_FEATURE_ALLFRAG;
1737                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1738                 }
1739                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1740                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1741                 goto out;
1742         }
1743
1744         /* Network route.
1745            Two cases are possible:
1746            1. It is connected route. Action: COW
1747            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1748          */
1749         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1750                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1751         else
1752                 nrt = rt6_alloc_clone(rt, daddr);
1753
1754         if (nrt) {
1755                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1756                 if (allfrag) {
1757                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1758                         features |= RTAX_FEATURE_ALLFRAG;
1759                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1760                 }
1761
1762                 /* According to RFC 1981, an attempt to detect a PMTU increase
1763                  * must not happen within 5 mins; the recommended timer is 10 mins.
1764                  * Here this route expiration time is set to ip6_rt_mtu_expires
1765                  * which is 10 mins. After 10 mins the decreased pmtu expires
1766                  * and detection of a PMTU increase happens automatically.
1767                  */
1768                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1769                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1770
                 /* the insertion result is deliberately not checked here */
1771                 ip6_ins_rt(nrt);
1772         }
1773 out:
1774         dst_release(&rt->dst);
1775 }
1776
1777 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1778                         struct net_device *dev, u32 pmtu)
1779 {
1780         struct net *net = dev_net(dev);
1781
1782         /*
1783          * RFC 1981 states that a node "MUST reduce the size of the packets it
1784          * is sending along the path" that caused the Packet Too Big message.
1785          * Since it's not possible in the general case to determine which
1786          * interface was used to send the original packet, we update the MTU
1787          * on the interface that will be used to send future packets. We also
1788          * update the MTU on the interface that received the Packet Too Big in
1789          * case the original packet was forced out that interface with
1790          * SO_BINDTODEVICE or similar. This is the next best thing to the
1791          * correct behaviour, which would be to update the MTU on all
1792          * interfaces.
1793          */
1794         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1795         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1796 }
1797
1798 /*
1799  *      Misc support functions
1800  */
1801
1802 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1803                                     const struct in6_addr *dest)
1804 {
1805         struct net *net = dev_net(ort->dst.dev);
1806         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1807                                             ort->dst.dev, 0);
1808
1809         if (rt) {
1810                 rt->dst.input = ort->dst.input;
1811                 rt->dst.output = ort->dst.output;
1812                 rt->dst.flags |= DST_HOST;
1813
1814                 rt->rt6i_dst.addr = *dest;
1815                 rt->rt6i_dst.plen = 128;
1816                 dst_copy_metrics(&rt->dst, &ort->dst);
1817                 rt->dst.error = ort->dst.error;
1818                 rt->rt6i_idev = ort->rt6i_idev;
1819                 if (rt->rt6i_idev)
1820                         in6_dev_hold(rt->rt6i_idev);
1821                 rt->dst.lastuse = jiffies;
1822                 rt->dst.expires = 0;
1823
1824                 rt->rt6i_gateway = ort->rt6i_gateway;
1825                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1826                 rt->rt6i_metric = 0;
1827
1828 #ifdef CONFIG_IPV6_SUBTREES
1829                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1830 #endif
1831                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1832                 rt->rt6i_table = ort->rt6i_table;
1833         }
1834         return rt;
1835 }
1836
1837 #ifdef CONFIG_IPV6_ROUTE_INFO
1838 static struct rt6_info *rt6_get_route_info(struct net *net,
1839                                            const struct in6_addr *prefix, int prefixlen,
1840                                            const struct in6_addr *gwaddr, int ifindex)
1841 {
1842         struct fib6_node *fn;
1843         struct rt6_info *rt = NULL;
1844         struct fib6_table *table;
1845
1846         table = fib6_get_table(net, RT6_TABLE_INFO);
1847         if (!table)
1848                 return NULL;
1849
1850         write_lock_bh(&table->tb6_lock);
1851         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1852         if (!fn)
1853                 goto out;
1854
1855         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1856                 if (rt->dst.dev->ifindex != ifindex)
1857                         continue;
1858                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1859                         continue;
1860                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1861                         continue;
1862                 dst_hold(&rt->dst);
1863                 break;
1864         }
1865 out:
1866         write_unlock_bh(&table->tb6_lock);
1867         return rt;
1868 }
1869
1870 static struct rt6_info *rt6_add_route_info(struct net *net,
1871                                            const struct in6_addr *prefix, int prefixlen,
1872                                            const struct in6_addr *gwaddr, int ifindex,
1873                                            unsigned pref)
1874 {
1875         struct fib6_config cfg = {
1876                 .fc_table       = RT6_TABLE_INFO,
1877                 .fc_metric      = IP6_RT_PRIO_USER,
1878                 .fc_ifindex     = ifindex,
1879                 .fc_dst_len     = prefixlen,
1880                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1881                                   RTF_UP | RTF_PREF(pref),
1882                 .fc_nlinfo.pid = 0,
1883                 .fc_nlinfo.nlh = NULL,
1884                 .fc_nlinfo.nl_net = net,
1885         };
1886
1887         cfg.fc_dst = *prefix;
1888         cfg.fc_gateway = *gwaddr;
1889
1890         /* We should treat it as a default route if prefix length is 0. */
1891         if (!prefixlen)
1892                 cfg.fc_flags |= RTF_DEFAULT;
1893
1894         ip6_route_add(&cfg);
1895
1896         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1897 }
1898 #endif
1899
1900 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1901 {
1902         struct rt6_info *rt;
1903         struct fib6_table *table;
1904
1905         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1906         if (!table)
1907                 return NULL;
1908
1909         write_lock_bh(&table->tb6_lock);
1910         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1911                 if (dev == rt->dst.dev &&
1912                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1913                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1914                         break;
1915         }
1916         if (rt)
1917                 dst_hold(&rt->dst);
1918         write_unlock_bh(&table->tb6_lock);
1919         return rt;
1920 }
1921
1922 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1923                                      struct net_device *dev,
1924                                      unsigned int pref)
1925 {
1926         struct fib6_config cfg = {
1927                 .fc_table       = RT6_TABLE_DFLT,
1928                 .fc_metric      = IP6_RT_PRIO_USER,
1929                 .fc_ifindex     = dev->ifindex,
1930                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1931                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1932                 .fc_nlinfo.pid = 0,
1933                 .fc_nlinfo.nlh = NULL,
1934                 .fc_nlinfo.nl_net = dev_net(dev),
1935         };
1936
1937         cfg.fc_gateway = *gwaddr;
1938
1939         ip6_route_add(&cfg);
1940
1941         return rt6_get_dflt_router(gwaddr, dev);
1942 }
1943
1944 void rt6_purge_dflt_routers(struct net *net)
1945 {
1946         struct rt6_info *rt;
1947         struct fib6_table *table;
1948
1949         /* NOTE: Keep consistent with rt6_get_dflt_router */
1950         table = fib6_get_table(net, RT6_TABLE_DFLT);
1951         if (!table)
1952                 return;
1953
1954 restart:
1955         read_lock_bh(&table->tb6_lock);
1956         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1957                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1958                         dst_hold(&rt->dst);
1959                         read_unlock_bh(&table->tb6_lock);
1960                         ip6_del_rt(rt);
1961                         goto restart;
1962                 }
1963         }
1964         read_unlock_bh(&table->tb6_lock);
1965 }
1966
1967 static void rtmsg_to_fib6_config(struct net *net,
1968                                  struct in6_rtmsg *rtmsg,
1969                                  struct fib6_config *cfg)
1970 {
1971         memset(cfg, 0, sizeof(*cfg));
1972
1973         cfg->fc_table = RT6_TABLE_MAIN;
1974         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1975         cfg->fc_metric = rtmsg->rtmsg_metric;
1976         cfg->fc_expires = rtmsg->rtmsg_info;
1977         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1978         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1979         cfg->fc_flags = rtmsg->rtmsg_flags;
1980
1981         cfg->fc_nlinfo.nl_net = net;
1982
1983         cfg->fc_dst = rtmsg->rtmsg_dst;
1984         cfg->fc_src = rtmsg->rtmsg_src;
1985         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1986 }
1987
1988 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1989 {
1990         struct fib6_config cfg;
1991         struct in6_rtmsg rtmsg;
1992         int err;
1993
1994         switch(cmd) {
1995         case SIOCADDRT:         /* Add a route */
1996         case SIOCDELRT:         /* Delete a route */
1997                 if (!capable(CAP_NET_ADMIN))
1998                         return -EPERM;
1999                 err = copy_from_user(&rtmsg, arg,
2000                                      sizeof(struct in6_rtmsg));
2001                 if (err)
2002                         return -EFAULT;
2003
2004                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2005
2006                 rtnl_lock();
2007                 switch (cmd) {
2008                 case SIOCADDRT:
2009                         err = ip6_route_add(&cfg);
2010                         break;
2011                 case SIOCDELRT:
2012                         err = ip6_route_del(&cfg);
2013                         break;
2014                 default:
2015                         err = -EINVAL;
2016                 }
2017                 rtnl_unlock();
2018
2019                 return err;
2020         }
2021
2022         return -EINVAL;
2023 }
2024
2025 /*
2026  *      Drop the packet on the floor
2027  */
2028
2029 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2030 {
2031         int type;
2032         struct dst_entry *dst = skb_dst(skb);
2033         switch (ipstats_mib_noroutes) {
2034         case IPSTATS_MIB_INNOROUTES:
2035                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2036                 if (type == IPV6_ADDR_ANY) {
2037                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2038                                       IPSTATS_MIB_INADDRERRORS);
2039                         break;
2040                 }
2041                 /* FALLTHROUGH */
2042         case IPSTATS_MIB_OUTNOROUTES:
2043                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2044                               ipstats_mib_noroutes);
2045                 break;
2046         }
2047         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2048         kfree_skb(skb);
2049         return 0;
2050 }
2051
/*
 * Drop @skb with ICMPv6 code ICMPV6_NOROUTE, accounting it as
 * IPSTATS_MIB_INNOROUTES (see ip6_pkt_drop()).
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
2056
/*
 * Drop @skb with ICMPv6 code ICMPV6_NOROUTE, accounting it as
 * IPSTATS_MIB_OUTNOROUTES.
 */
static int ip6_pkt_discard_out(struct sk_buff *skb)
{
        /* point skb->dev at the dst's device before handing it to ip6_pkt_drop() */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
2062
2063 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2064
/*
 * Drop @skb with ICMPv6 code ICMPV6_ADM_PROHIBITED, accounting it as
 * IPSTATS_MIB_INNOROUTES (see ip6_pkt_drop()).
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
2069
/*
 * Drop @skb with ICMPv6 code ICMPV6_ADM_PROHIBITED, accounting it as
 * IPSTATS_MIB_OUTNOROUTES.
 */
static int ip6_pkt_prohibit_out(struct sk_buff *skb)
{
        /* point skb->dev at the dst's device before handing it to ip6_pkt_drop() */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
2075
2076 #endif
2077
2078 /*
2079  *      Allocate a dst for local (unicast / anycast) address.
2080  */
2081
2082 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2083                                     const struct in6_addr *addr,
2084                                     bool anycast)
2085 {
2086         struct net *net = dev_net(idev->dev);
2087         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2088                                             net->loopback_dev, 0);
2089         int err;
2090
2091         if (!rt) {
2092                 if (net_ratelimit())
2093                         pr_warning("IPv6:  Maximum number of routes reached,"
2094                                    " consider increasing route/max_size.\n");
2095                 return ERR_PTR(-ENOMEM);
2096         }
2097
2098         in6_dev_hold(idev);
2099
2100         rt->dst.flags |= DST_HOST;
2101         rt->dst.input = ip6_input;
2102         rt->dst.output = ip6_output;
2103         rt->rt6i_idev = idev;
2104         rt->dst.obsolete = -1;
2105
2106         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2107         if (anycast)
2108                 rt->rt6i_flags |= RTF_ANYCAST;
2109         else
2110                 rt->rt6i_flags |= RTF_LOCAL;
2111         err = rt6_bind_neighbour(rt, rt->dst.dev);
2112         if (err) {
2113                 dst_free(&rt->dst);
2114                 return ERR_PTR(err);
2115         }
2116
2117         rt->rt6i_dst.addr = *addr;
2118         rt->rt6i_dst.plen = 128;
2119         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2120
2121         atomic_set(&rt->dst.__refcnt, 1);
2122
2123         return rt;
2124 }
2125
2126 int ip6_route_get_saddr(struct net *net,
2127                         struct rt6_info *rt,
2128                         const struct in6_addr *daddr,
2129                         unsigned int prefs,
2130                         struct in6_addr *saddr)
2131 {
2132         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2133         int err = 0;
2134         if (rt->rt6i_prefsrc.plen)
2135                 *saddr = rt->rt6i_prefsrc.addr;
2136         else
2137                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2138                                          daddr, prefs, saddr);
2139         return err;
2140 }
2141
/*
 * Remove a deleted IP address from the prefsrc entries of routes.
 *
 * Argument bundle passed through fib6_clean_all() to
 * fib6_remove_prefsrc().
 */
struct arg_dev_net_ip {
        struct net_device *dev;         /* restrict to routes on this device; NULL matches any */
        struct net *net;                /* namespace whose ip6_null_entry is skipped */
        struct in6_addr *addr;          /* the address being removed */
};
2148
2149 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2150 {
2151         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2152         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2153         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2154
2155         if (((void *)rt->dst.dev == dev || !dev) &&
2156             rt != net->ipv6.ip6_null_entry &&
2157             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2158                 /* remove prefsrc entry */
2159                 rt->rt6i_prefsrc.plen = 0;
2160         }
2161         return 0;
2162 }
2163
2164 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2165 {
2166         struct net *net = dev_net(ifp->idev->dev);
2167         struct arg_dev_net_ip adni = {
2168                 .dev = ifp->idev->dev,
2169                 .net = net,
2170                 .addr = &ifp->addr,
2171         };
2172         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2173 }
2174
/* Argument bundle passed by rt6_ifdown() to the fib6_ifdown() callback. */
struct arg_dev_net {
        struct net_device *dev;         /* device to match; NULL matches any */
        struct net *net;                /* namespace whose ip6_null_entry is skipped */
};
2179
2180 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2181 {
2182         const struct arg_dev_net *adn = arg;
2183         const struct net_device *dev = adn->dev;
2184
2185         if ((rt->dst.dev == dev || !dev) &&
2186             rt != adn->net->ipv6.ip6_null_entry)
2187                 return -1;
2188
2189         return 0;
2190 }
2191
2192 void rt6_ifdown(struct net *net, struct net_device *dev)
2193 {
2194         struct arg_dev_net adn = {
2195                 .dev = dev,
2196                 .net = net,
2197         };
2198
2199         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2200         icmp6_clean_all(fib6_ifdown, &adn);
2201 }
2202
/* Argument bundle passed by rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
        struct net_device *dev;         /* device whose MTU changed */
        unsigned mtu;                   /* the new MTU */
};
2208
2209 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2210 {
2211         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2212         struct inet6_dev *idev;
2213
2214         /* In IPv6 pmtu discovery is not optional,
2215            so that RTAX_MTU lock cannot disable it.
2216            We still use this lock to block changes
2217            caused by addrconf/ndisc.
2218         */
2219
2220         idev = __in6_dev_get(arg->dev);
2221         if (!idev)
2222                 return 0;
2223
2224         /* For administrative MTU increase, there is no way to discover
2225            IPv6 PMTU increase, so PMTU increase should be updated here.
2226            Since RFC 1981 doesn't include administrative MTU increase
2227            update PMTU increase is a MUST. (i.e. jumbo frame)
2228          */
2229         /*
2230            If new MTU is less than route PMTU, this new MTU will be the
2231            lowest MTU in the path, update the route PMTU to reflect PMTU
2232            decreases; if new MTU is greater than route PMTU, and the
2233            old MTU is the lowest MTU in the path, update the route PMTU
2234            to reflect the increase. In this case if the other nodes' MTU
2235            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2236            PMTU discouvery.
2237          */
2238         if (rt->dst.dev == arg->dev &&
2239             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2240             (dst_mtu(&rt->dst) >= arg->mtu ||
2241              (dst_mtu(&rt->dst) < arg->mtu &&
2242               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2243                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2244         }
2245         return 0;
2246 }
2247
2248 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2249 {
2250         struct rt6_mtu_change_arg arg = {
2251                 .dev = dev,
2252                 .mtu = mtu,
2253         };
2254
2255         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2256 }
2257
2258 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2259         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2260         [RTA_OIF]               = { .type = NLA_U32 },
2261         [RTA_IIF]               = { .type = NLA_U32 },
2262         [RTA_PRIORITY]          = { .type = NLA_U32 },
2263         [RTA_METRICS]           = { .type = NLA_NESTED },
2264 };
2265
2266 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2267                               struct fib6_config *cfg)
2268 {
2269         struct rtmsg *rtm;
2270         struct nlattr *tb[RTA_MAX+1];
2271         int err;
2272
2273         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2274         if (err < 0)
2275                 goto errout;
2276
2277         err = -EINVAL;
2278         rtm = nlmsg_data(nlh);
2279         memset(cfg, 0, sizeof(*cfg));
2280
2281         cfg->fc_table = rtm->rtm_table;
2282         cfg->fc_dst_len = rtm->rtm_dst_len;
2283         cfg->fc_src_len = rtm->rtm_src_len;
2284         cfg->fc_flags = RTF_UP;
2285         cfg->fc_protocol = rtm->rtm_protocol;
2286
2287         if (rtm->rtm_type == RTN_UNREACHABLE)
2288                 cfg->fc_flags |= RTF_REJECT;
2289
2290         if (rtm->rtm_type == RTN_LOCAL)
2291                 cfg->fc_flags |= RTF_LOCAL;
2292
2293         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2294         cfg->fc_nlinfo.nlh = nlh;
2295         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2296
2297         if (tb[RTA_GATEWAY]) {
2298                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2299                 cfg->fc_flags |= RTF_GATEWAY;
2300         }
2301
2302         if (tb[RTA_DST]) {
2303                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2304
2305                 if (nla_len(tb[RTA_DST]) < plen)
2306                         goto errout;
2307
2308                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2309         }
2310
2311         if (tb[RTA_SRC]) {
2312                 int plen = (rtm->rtm_src_len + 7) >> 3;
2313
2314                 if (nla_len(tb[RTA_SRC]) < plen)
2315                         goto errout;
2316
2317                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2318         }
2319
2320         if (tb[RTA_PREFSRC])
2321                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2322
2323         if (tb[RTA_OIF])
2324                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2325
2326         if (tb[RTA_PRIORITY])
2327                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2328
2329         if (tb[RTA_METRICS]) {
2330                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2331                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2332         }
2333
2334         if (tb[RTA_TABLE])
2335                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2336
2337         err = 0;
2338 errout:
2339         return err;
2340 }
2341
2342 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2343 {
2344         struct fib6_config cfg;
2345         int err;
2346
2347         err = rtm_to_fib6_config(skb, nlh, &cfg);
2348         if (err < 0)
2349                 return err;
2350
2351         return ip6_route_del(&cfg);
2352 }
2353
2354 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2355 {
2356         struct fib6_config cfg;
2357         int err;
2358
2359         err = rtm_to_fib6_config(skb, nlh, &cfg);
2360         if (err < 0)
2361                 return err;
2362
2363         return ip6_route_add(&cfg);
2364 }
2365
2366 static inline size_t rt6_nlmsg_size(void)
2367 {
2368         return NLMSG_ALIGN(sizeof(struct rtmsg))
2369                + nla_total_size(16) /* RTA_SRC */
2370                + nla_total_size(16) /* RTA_DST */
2371                + nla_total_size(16) /* RTA_GATEWAY */
2372                + nla_total_size(16) /* RTA_PREFSRC */
2373                + nla_total_size(4) /* RTA_TABLE */
2374                + nla_total_size(4) /* RTA_IIF */
2375                + nla_total_size(4) /* RTA_OIF */
2376                + nla_total_size(4) /* RTA_PRIORITY */
2377                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2378                + nla_total_size(sizeof(struct rta_cacheinfo));
2379 }
2380
/*
 * Serialise route @rt into @skb as one rtnetlink route message.
 *
 * @net:    namespace, passed on to ip6mr_get_route() and
 *          ip6_route_get_saddr()
 * @skb:    message buffer to append to
 * @rt:     the route to describe
 * @dst:    if non-NULL, reported as a /128 RTA_DST instead of the
 *          route's own destination prefix
 * @src:    same for RTA_SRC (only with CONFIG_IPV6_SUBTREES)
 * @iif:    input interface index; 0 means "no input interface"
 * @type:   netlink message type
 * @pid:    netlink pid for the message header
 * @seq:    netlink sequence number for the message header
 * @prefix: non-zero to emit only routes flagged RTF_PREFIX_RT
 * @nowait: passed through to ip6mr_get_route()
 * @flags:  netlink message flags
 *
 * Returns 1 without touching @skb when @prefix is set and @rt is not a
 * prefix route, 0 when ip6mr_get_route() returns 0 with @nowait unset,
 * the result of nlmsg_end() on success, and -EMSGSIZE when the message
 * header or an attribute cannot be added (the partial message is
 * cancelled in the latter case).
 *
 * NOTE(review): the NLA_PUT*() macros are expected to branch to the
 * nla_put_failure label when an attribute does not fit - confirm
 * against net/netlink.h.
 */
static int rt6_fill_node(struct net *net,
                         struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 pid, u32 seq,
                         int prefix, int nowait, unsigned int flags)
{
        const struct inet_peer *peer;
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
        u32 table;
        struct neighbour *n;
        u32 ts, tsage;

        if (prefix) {   /* user wants prefix routes only */
                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
                        /* success since this is not a prefix route */
                        return 1;
                }
        }

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        /* Fixed rtmsg header. */
        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* The table id goes into both the header field and RTA_TABLE. */
        rtm->rtm_table = table;
        NLA_PUT_U32(skb, RTA_TABLE, table);
        /* Route type: reject and local flags win, then loopback devices. */
        if (rt->rt6i_flags & RTF_REJECT)
                rtm->rtm_type = RTN_UNREACHABLE;
        else if (rt->rt6i_flags & RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        /* The stored protocol is overridden for dynamic/addrconf/default routes. */
        rtm->rtm_protocol = rt->rt6i_protocol;
        if (rt->rt6i_flags & RTF_DYNAMIC)
                rtm->rtm_protocol = RTPROT_REDIRECT;
        else if (rt->rt6i_flags & RTF_ADDRCONF)
                rtm->rtm_protocol = RTPROT_KERNEL;
        else if (rt->rt6i_flags & RTF_DEFAULT)
                rtm->rtm_protocol = RTPROT_RA;

        if (rt->rt6i_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* Destination: the caller's address as /128, else the route prefix. */
        if (dst) {
                NLA_PUT(skb, RTA_DST, 16, dst);
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                NLA_PUT(skb, RTA_SRC, 16, src);
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len)
                NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                /* Multicast destinations are described by ip6mr_get_route(). */
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                }
                        }
                } else
#endif
                        NLA_PUT_U32(skb, RTA_IIF, iif);
        } else if (dst) {
                /* No input interface: report the source address that would be selected. */
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
                        NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
        }

        /* The route's own preferred source is reported as well when it has one. */
        if (rt->rt6i_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->rt6i_prefsrc.addr;
                NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
        }

        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
                goto nla_put_failure;

        /* The gateway is taken from the neighbour entry bound to the dst, if any. */
        rcu_read_lock();
        n = dst_get_neighbour_noref(&rt->dst);
        if (n) {
                if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
                        rcu_read_unlock();
                        goto nla_put_failure;
                }
        }
        rcu_read_unlock();

        if (rt->dst.dev)
                NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);

        NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);

        /* Remaining lifetime in jiffies: 0 without RTF_EXPIRES, capped at INT_MAX. */
        if (!(rt->rt6i_flags & RTF_EXPIRES))
                expires = 0;
        else if (rt->dst.expires - jiffies < INT_MAX)
                expires = rt->dst.expires - jiffies;
        else
                expires = INT_MAX;

        /* TCP timestamp and its age, when the peer entry has a stamp. */
        peer = rt->rt6i_peer;
        ts = tsage = 0;
        if (peer && peer->tcp_ts_stamp) {
                ts = peer->tcp_ts;
                tsage = get_seconds() - peer->tcp_ts_stamp;
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
                               expires, rt->dst.error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        /* Undo the partially built message. */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2521
2522 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2523 {
2524         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2525         int prefix;
2526
2527         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2528                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2529                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2530         } else
2531                 prefix = 0;
2532
2533         return rt6_fill_node(arg->net,
2534                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2535                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2536                      prefix, 0, NLM_F_MULTI);
2537 }
2538
2539 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2540 {
2541         struct net *net = sock_net(in_skb->sk);
2542         struct nlattr *tb[RTA_MAX+1];
2543         struct rt6_info *rt;
2544         struct sk_buff *skb;
2545         struct rtmsg *rtm;
2546         struct flowi6 fl6;
2547         int err, iif = 0, oif = 0;
2548
2549         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2550         if (err < 0)
2551                 goto errout;
2552
2553         err = -EINVAL;
2554         memset(&fl6, 0, sizeof(fl6));
2555
2556         if (tb[RTA_SRC]) {
2557                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2558                         goto errout;
2559
2560                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2561         }
2562
2563         if (tb[RTA_DST]) {
2564                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2565                         goto errout;
2566
2567                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2568         }
2569
2570         if (tb[RTA_IIF])
2571                 iif = nla_get_u32(tb[RTA_IIF]);
2572
2573         if (tb[RTA_OIF])
2574                 oif = nla_get_u32(tb[RTA_OIF]);
2575
2576         if (iif) {
2577                 struct net_device *dev;
2578                 int flags = 0;
2579
2580                 dev = __dev_get_by_index(net, iif);
2581                 if (!dev) {
2582                         err = -ENODEV;
2583                         goto errout;
2584                 }
2585
2586                 fl6.flowi6_iif = iif;
2587
2588                 if (!ipv6_addr_any(&fl6.saddr))
2589                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2590
2591                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2592                                                                flags);
2593         } else {
2594                 fl6.flowi6_oif = oif;
2595
2596                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2597         }
2598
2599         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2600         if (!skb) {
2601                 err = -ENOBUFS;
2602                 goto errout;
2603         }
2604
2605         /* Reserve room for dummy headers, this skb can pass
2606            through good chunk of routing engine.
2607          */
2608         skb_reset_mac_header(skb);
2609         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2610
2611         skb_dst_set(skb, &rt->dst);
2612
2613         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2614                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2615                             nlh->nlmsg_seq, 0, 0, 0);
2616         if (err < 0) {
2617                 kfree_skb(skb);
2618                 goto errout;
2619         }
2620
2621         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2622 errout:
2623         return err;
2624 }
2625
2626 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2627 {
2628         struct sk_buff *skb;
2629         struct net *net = info->nl_net;
2630         u32 seq;
2631         int err;
2632
2633         err = -ENOBUFS;
2634         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2635
2636         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2637         if (!skb)
2638                 goto errout;
2639
2640         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2641                                 event, info->pid, seq, 0, 0, 0);
2642         if (err < 0) {
2643                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2644                 WARN_ON(err == -EMSGSIZE);
2645                 kfree_skb(skb);
2646                 goto errout;
2647         }
2648         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2649                     info->nlh, gfp_any());
2650         return;
2651 errout:
2652         if (err < 0)
2653                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2654 }
2655
2656 static int ip6_route_dev_notify(struct notifier_block *this,
2657                                 unsigned long event, void *data)
2658 {
2659         struct net_device *dev = (struct net_device *)data;
2660         struct net *net = dev_net(dev);
2661
2662         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2663                 net->ipv6.ip6_null_entry->dst.dev = dev;
2664                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2665 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2666                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2667                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2668                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2669                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2670 #endif
2671         }
2672
2673         return NOTIFY_OK;
2674 }
2675
2676 /*
2677  *      /proc
2678  */
2679
2680 #ifdef CONFIG_PROC_FS
2681
/*
 * Buffer/offset bookkeeping for /proc output.
 * NOTE(review): nothing in the procfs handlers in this part of the file
 * references this structure (they use seq_file instead); field meanings
 * are not established here -- confirm before relying on them.
 */
struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2690
2691 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2692 {
2693         struct seq_file *m = p_arg;
2694         struct neighbour *n;
2695
2696         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2697
2698 #ifdef CONFIG_IPV6_SUBTREES
2699         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2700 #else
2701         seq_puts(m, "00000000000000000000000000000000 00 ");
2702 #endif
2703         rcu_read_lock();
2704         n = dst_get_neighbour_noref(&rt->dst);
2705         if (n) {
2706                 seq_printf(m, "%pi6", n->primary_key);
2707         } else {
2708                 seq_puts(m, "00000000000000000000000000000000");
2709         }
2710         rcu_read_unlock();
2711         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2712                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2713                    rt->dst.__use, rt->rt6i_flags,
2714                    rt->dst.dev ? rt->dst.dev->name : "");
2715         return 0;
2716 }
2717
2718 static int ipv6_route_show(struct seq_file *m, void *v)
2719 {
2720         struct net *net = (struct net *)m->private;
2721         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2722         return 0;
2723 }
2724
2725 static int ipv6_route_open(struct inode *inode, struct file *file)
2726 {
2727         return single_open_net(inode, file, ipv6_route_show);
2728 }
2729
2730 static const struct file_operations ipv6_route_proc_fops = {
2731         .owner          = THIS_MODULE,
2732         .open           = ipv6_route_open,
2733         .read           = seq_read,
2734         .llseek         = seq_lseek,
2735         .release        = single_release_net,
2736 };
2737
2738 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2739 {
2740         struct net *net = (struct net *)seq->private;
2741         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2742                    net->ipv6.rt6_stats->fib_nodes,
2743                    net->ipv6.rt6_stats->fib_route_nodes,
2744                    net->ipv6.rt6_stats->fib_rt_alloc,
2745                    net->ipv6.rt6_stats->fib_rt_entries,
2746                    net->ipv6.rt6_stats->fib_rt_cache,
2747                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2748                    net->ipv6.rt6_stats->fib_discarded_routes);
2749
2750         return 0;
2751 }
2752
2753 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2754 {
2755         return single_open_net(inode, file, rt6_stats_seq_show);
2756 }
2757
2758 static const struct file_operations rt6_stats_seq_fops = {
2759         .owner   = THIS_MODULE,
2760         .open    = rt6_stats_seq_open,
2761         .read    = seq_read,
2762         .llseek  = seq_lseek,
2763         .release = single_release_net,
2764 };
2765 #endif  /* CONFIG_PROC_FS */
2766
2767 #ifdef CONFIG_SYSCTL
2768
2769 static
2770 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2771                               void __user *buffer, size_t *lenp, loff_t *ppos)
2772 {
2773         struct net *net;
2774         int delay;
2775         if (!write)
2776                 return -EINVAL;
2777
2778         net = (struct net *)ctl->extra1;
2779         delay = net->ipv6.sysctl.flush_delay;
2780         proc_dointvec(ctl, write, buffer, lenp, ppos);
2781         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2782         return 0;
2783 }
2784
2785 ctl_table ipv6_route_table_template[] = {
2786         {
2787                 .procname       =       "flush",
2788                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2789                 .maxlen         =       sizeof(int),
2790                 .mode           =       0200,
2791                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2792         },
2793         {
2794                 .procname       =       "gc_thresh",
2795                 .data           =       &ip6_dst_ops_template.gc_thresh,
2796                 .maxlen         =       sizeof(int),
2797                 .mode           =       0644,
2798                 .proc_handler   =       proc_dointvec,
2799         },
2800         {
2801                 .procname       =       "max_size",
2802                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2803                 .maxlen         =       sizeof(int),
2804                 .mode           =       0644,
2805                 .proc_handler   =       proc_dointvec,
2806         },
2807         {
2808                 .procname       =       "gc_min_interval",
2809                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2810                 .maxlen         =       sizeof(int),
2811                 .mode           =       0644,
2812                 .proc_handler   =       proc_dointvec_jiffies,
2813         },
2814         {
2815                 .procname       =       "gc_timeout",
2816                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2817                 .maxlen         =       sizeof(int),
2818                 .mode           =       0644,
2819                 .proc_handler   =       proc_dointvec_jiffies,
2820         },
2821         {
2822                 .procname       =       "gc_interval",
2823                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2824                 .maxlen         =       sizeof(int),
2825                 .mode           =       0644,
2826                 .proc_handler   =       proc_dointvec_jiffies,
2827         },
2828         {
2829                 .procname       =       "gc_elasticity",
2830                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2831                 .maxlen         =       sizeof(int),
2832                 .mode           =       0644,
2833                 .proc_handler   =       proc_dointvec,
2834         },
2835         {
2836                 .procname       =       "mtu_expires",
2837                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2838                 .maxlen         =       sizeof(int),
2839                 .mode           =       0644,
2840                 .proc_handler   =       proc_dointvec_jiffies,
2841         },
2842         {
2843                 .procname       =       "min_adv_mss",
2844                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2845                 .maxlen         =       sizeof(int),
2846                 .mode           =       0644,
2847                 .proc_handler   =       proc_dointvec,
2848         },
2849         {
2850                 .procname       =       "gc_min_interval_ms",
2851                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2852                 .maxlen         =       sizeof(int),
2853                 .mode           =       0644,
2854                 .proc_handler   =       proc_dointvec_ms_jiffies,
2855         },
2856         { }
2857 };
2858
2859 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2860 {
2861         struct ctl_table *table;
2862
2863         table = kmemdup(ipv6_route_table_template,
2864                         sizeof(ipv6_route_table_template),
2865                         GFP_KERNEL);
2866
2867         if (table) {
2868                 table[0].data = &net->ipv6.sysctl.flush_delay;
2869                 table[0].extra1 = net;
2870                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2871                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2872                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2873                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2874                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2875                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2876                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2877                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2878                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2879         }
2880
2881         return table;
2882 }
2883 #endif
2884
2885 static int __net_init ip6_route_net_init(struct net *net)
2886 {
2887         int ret = -ENOMEM;
2888
2889         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2890                sizeof(net->ipv6.ip6_dst_ops));
2891
2892         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2893                 goto out_ip6_dst_ops;
2894
2895         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2896                                            sizeof(*net->ipv6.ip6_null_entry),
2897                                            GFP_KERNEL);
2898         if (!net->ipv6.ip6_null_entry)
2899                 goto out_ip6_dst_entries;
2900         net->ipv6.ip6_null_entry->dst.path =
2901                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2902         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2903         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2904                          ip6_template_metrics, true);
2905
2906 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2907         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2908                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2909                                                GFP_KERNEL);
2910         if (!net->ipv6.ip6_prohibit_entry)
2911                 goto out_ip6_null_entry;
2912         net->ipv6.ip6_prohibit_entry->dst.path =
2913                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2914         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2915         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2916                          ip6_template_metrics, true);
2917
2918         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2919                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2920                                                GFP_KERNEL);
2921         if (!net->ipv6.ip6_blk_hole_entry)
2922                 goto out_ip6_prohibit_entry;
2923         net->ipv6.ip6_blk_hole_entry->dst.path =
2924                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2925         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2926         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2927                          ip6_template_metrics, true);
2928 #endif
2929
2930         net->ipv6.sysctl.flush_delay = 0;
2931         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2932         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2933         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2934         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2935         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2936         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2937         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2938
2939 #ifdef CONFIG_PROC_FS
2940         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2941         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2942 #endif
2943         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2944
2945         ret = 0;
2946 out:
2947         return ret;
2948
2949 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2950 out_ip6_prohibit_entry:
2951         kfree(net->ipv6.ip6_prohibit_entry);
2952 out_ip6_null_entry:
2953         kfree(net->ipv6.ip6_null_entry);
2954 #endif
2955 out_ip6_dst_entries:
2956         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2957 out_ip6_dst_ops:
2958         goto out;
2959 }
2960
2961 static void __net_exit ip6_route_net_exit(struct net *net)
2962 {
2963 #ifdef CONFIG_PROC_FS
2964         proc_net_remove(net, "ipv6_route");
2965         proc_net_remove(net, "rt6_stats");
2966 #endif
2967         kfree(net->ipv6.ip6_null_entry);
2968 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2969         kfree(net->ipv6.ip6_prohibit_entry);
2970         kfree(net->ipv6.ip6_blk_hole_entry);
2971 #endif
2972         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2973 }
2974
2975 static struct pernet_operations ip6_route_net_ops = {
2976         .init = ip6_route_net_init,
2977         .exit = ip6_route_net_exit,
2978 };
2979
2980 static struct notifier_block ip6_route_dev_notifier = {
2981         .notifier_call = ip6_route_dev_notify,
2982         .priority = 0,
2983 };
2984
2985 int __init ip6_route_init(void)
2986 {
2987         int ret;
2988
2989         ret = -ENOMEM;
2990         ip6_dst_ops_template.kmem_cachep =
2991                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2992                                   SLAB_HWCACHE_ALIGN, NULL);
2993         if (!ip6_dst_ops_template.kmem_cachep)
2994                 goto out;
2995
2996         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2997         if (ret)
2998                 goto out_kmem_cache;
2999
3000         ret = register_pernet_subsys(&ip6_route_net_ops);
3001         if (ret)
3002                 goto out_dst_entries;
3003
3004         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3005
3006         /* Registering of the loopback is done before this portion of code,
3007          * the loopback reference in rt6_info will not be taken, do it
3008          * manually for init_net */
3009         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3010         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3011   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3012         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3013         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3014         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3015         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3016   #endif
3017         ret = fib6_init();
3018         if (ret)
3019                 goto out_register_subsys;
3020
3021         ret = xfrm6_init();
3022         if (ret)
3023                 goto out_fib6_init;
3024
3025         ret = fib6_rules_init();
3026         if (ret)
3027                 goto xfrm6_init;
3028
3029         ret = -ENOBUFS;
3030         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3031             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3032             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3033                 goto fib6_rules_init;
3034
3035         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3036         if (ret)
3037                 goto fib6_rules_init;
3038
3039 out:
3040         return ret;
3041
3042 fib6_rules_init:
3043         fib6_rules_cleanup();
3044 xfrm6_init:
3045         xfrm6_fini();
3046 out_fib6_init:
3047         fib6_gc_cleanup();
3048 out_register_subsys:
3049         unregister_pernet_subsys(&ip6_route_net_ops);
3050 out_dst_entries:
3051         dst_entries_destroy(&ip6_dst_blackhole_ops);
3052 out_kmem_cache:
3053         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3054         goto out;
3055 }
3056
3057 void ip6_route_cleanup(void)
3058 {
3059         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3060         fib6_rules_cleanup();
3061         xfrm6_fini();
3062         fib6_gc_cleanup();
3063         unregister_pernet_subsys(&ip6_route_net_ops);
3064         dst_entries_destroy(&ip6_dst_blackhole_ops);
3065         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3066 }