07361dfa80852cbbe4db66027f8da5ef13ade4c1
[cascardo/linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
65 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
66                                     const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int      ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void             ip6_dst_destroy(struct dst_entry *);
72 static void             ip6_dst_ifdown(struct dst_entry *,
73                                        struct net_device *dev, int how);
74 static int               ip6_dst_gc(struct dst_ops *ops);
75
76 static int              ip6_pkt_discard(struct sk_buff *skb);
77 static int              ip6_pkt_discard_out(struct sk_buff *skb);
78 static void             ip6_link_failure(struct sk_buff *skb);
79 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
80
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83                                            const struct in6_addr *prefix, int prefixlen,
84                                            const struct in6_addr *gwaddr, int ifindex,
85                                            unsigned pref);
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87                                            const struct in6_addr *prefix, int prefixlen,
88                                            const struct in6_addr *gwaddr, int ifindex);
89 #endif
90
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
92 {
93         struct rt6_info *rt = (struct rt6_info *) dst;
94         struct inet_peer *peer;
95         u32 *p = NULL;
96
97         if (!(rt->dst.flags & DST_HOST))
98                 return NULL;
99
100         if (!rt->rt6i_peer)
101                 rt6_bind_peer(rt, 1);
102
103         peer = rt->rt6i_peer;
104         if (peer) {
105                 u32 *old_p = __DST_METRICS_PTR(old);
106                 unsigned long prev, new;
107
108                 p = peer->metrics;
109                 if (inet_metrics_new(peer))
110                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111
112                 new = (unsigned long) p;
113                 prev = cmpxchg(&dst->_metrics, old, new);
114
115                 if (prev != old) {
116                         p = __DST_METRICS_PTR(prev);
117                         if (prev & DST_METRICS_READ_ONLY)
118                                 p = NULL;
119                 }
120         }
121         return p;
122 }
123
124 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
125 {
126         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
127         if (n)
128                 return n;
129         return neigh_create(&nd_tbl, daddr, dst->dev);
130 }
131
132 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
133 {
134         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
135         if (!n) {
136                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
137                 if (IS_ERR(n))
138                         return PTR_ERR(n);
139         }
140         dst_set_neighbour(&rt->dst, n);
141
142         return 0;
143 }
144
145 static struct dst_ops ip6_dst_ops_template = {
146         .family                 =       AF_INET6,
147         .protocol               =       cpu_to_be16(ETH_P_IPV6),
148         .gc                     =       ip6_dst_gc,
149         .gc_thresh              =       1024,
150         .check                  =       ip6_dst_check,
151         .default_advmss         =       ip6_default_advmss,
152         .mtu                    =       ip6_mtu,
153         .cow_metrics            =       ipv6_cow_metrics,
154         .destroy                =       ip6_dst_destroy,
155         .ifdown                 =       ip6_dst_ifdown,
156         .negative_advice        =       ip6_negative_advice,
157         .link_failure           =       ip6_link_failure,
158         .update_pmtu            =       ip6_rt_update_pmtu,
159         .local_out              =       __ip6_local_out,
160         .neigh_lookup           =       ip6_neigh_lookup,
161 };
162
163 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
164 {
165         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
166
167         return mtu ? : dst->dev->mtu;
168 }
169
170 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
171 {
172 }
173
174 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
175                                          unsigned long old)
176 {
177         return NULL;
178 }
179
180 static struct dst_ops ip6_dst_blackhole_ops = {
181         .family                 =       AF_INET6,
182         .protocol               =       cpu_to_be16(ETH_P_IPV6),
183         .destroy                =       ip6_dst_destroy,
184         .check                  =       ip6_dst_check,
185         .mtu                    =       ip6_blackhole_mtu,
186         .default_advmss         =       ip6_default_advmss,
187         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
188         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
189         .neigh_lookup           =       ip6_neigh_lookup,
190 };
191
192 static const u32 ip6_template_metrics[RTAX_MAX] = {
193         [RTAX_HOPLIMIT - 1] = 255,
194 };
195
196 static struct rt6_info ip6_null_entry_template = {
197         .dst = {
198                 .__refcnt       = ATOMIC_INIT(1),
199                 .__use          = 1,
200                 .obsolete       = -1,
201                 .error          = -ENETUNREACH,
202                 .input          = ip6_pkt_discard,
203                 .output         = ip6_pkt_discard_out,
204         },
205         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
206         .rt6i_protocol  = RTPROT_KERNEL,
207         .rt6i_metric    = ~(u32) 0,
208         .rt6i_ref       = ATOMIC_INIT(1),
209 };
210
211 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
212
213 static int ip6_pkt_prohibit(struct sk_buff *skb);
214 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
215
216 static struct rt6_info ip6_prohibit_entry_template = {
217         .dst = {
218                 .__refcnt       = ATOMIC_INIT(1),
219                 .__use          = 1,
220                 .obsolete       = -1,
221                 .error          = -EACCES,
222                 .input          = ip6_pkt_prohibit,
223                 .output         = ip6_pkt_prohibit_out,
224         },
225         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
226         .rt6i_protocol  = RTPROT_KERNEL,
227         .rt6i_metric    = ~(u32) 0,
228         .rt6i_ref       = ATOMIC_INIT(1),
229 };
230
231 static struct rt6_info ip6_blk_hole_entry_template = {
232         .dst = {
233                 .__refcnt       = ATOMIC_INIT(1),
234                 .__use          = 1,
235                 .obsolete       = -1,
236                 .error          = -EINVAL,
237                 .input          = dst_discard,
238                 .output         = dst_discard,
239         },
240         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
241         .rt6i_protocol  = RTPROT_KERNEL,
242         .rt6i_metric    = ~(u32) 0,
243         .rt6i_ref       = ATOMIC_INIT(1),
244 };
245
246 #endif
247
248 /* allocate dst with ip6_dst_ops */
249 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
250                                              struct net_device *dev,
251                                              int flags)
252 {
253         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
254
255         if (rt)
256                 memset(&rt->rt6i_table, 0,
257                        sizeof(*rt) - sizeof(struct dst_entry));
258
259         return rt;
260 }
261
262 static void ip6_dst_destroy(struct dst_entry *dst)
263 {
264         struct rt6_info *rt = (struct rt6_info *)dst;
265         struct inet6_dev *idev = rt->rt6i_idev;
266         struct inet_peer *peer = rt->rt6i_peer;
267
268         if (!(rt->dst.flags & DST_HOST))
269                 dst_destroy_metrics_generic(dst);
270
271         if (idev) {
272                 rt->rt6i_idev = NULL;
273                 in6_dev_put(idev);
274         }
275         if (peer) {
276                 rt->rt6i_peer = NULL;
277                 inet_putpeer(peer);
278         }
279 }
280
281 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
282
283 static u32 rt6_peer_genid(void)
284 {
285         return atomic_read(&__rt6_peer_genid);
286 }
287
288 void rt6_bind_peer(struct rt6_info *rt, int create)
289 {
290         struct inet_peer *peer;
291
292         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
293         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
294                 inet_putpeer(peer);
295         else
296                 rt->rt6i_peer_genid = rt6_peer_genid();
297 }
298
299 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
300                            int how)
301 {
302         struct rt6_info *rt = (struct rt6_info *)dst;
303         struct inet6_dev *idev = rt->rt6i_idev;
304         struct net_device *loopback_dev =
305                 dev_net(dev)->loopback_dev;
306
307         if (dev != loopback_dev && idev && idev->dev == dev) {
308                 struct inet6_dev *loopback_idev =
309                         in6_dev_get(loopback_dev);
310                 if (loopback_idev) {
311                         rt->rt6i_idev = loopback_idev;
312                         in6_dev_put(idev);
313                 }
314         }
315 }
316
317 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
318 {
319         return (rt->rt6i_flags & RTF_EXPIRES) &&
320                 time_after(jiffies, rt->dst.expires);
321 }
322
323 static inline int rt6_need_strict(const struct in6_addr *daddr)
324 {
325         return ipv6_addr_type(daddr) &
326                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
327 }
328
329 /*
330  *      Route lookup. Any table->tb6_lock is implied.
331  */
332
333 static inline struct rt6_info *rt6_device_match(struct net *net,
334                                                     struct rt6_info *rt,
335                                                     const struct in6_addr *saddr,
336                                                     int oif,
337                                                     int flags)
338 {
339         struct rt6_info *local = NULL;
340         struct rt6_info *sprt;
341
342         if (!oif && ipv6_addr_any(saddr))
343                 goto out;
344
345         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
346                 struct net_device *dev = sprt->dst.dev;
347
348                 if (oif) {
349                         if (dev->ifindex == oif)
350                                 return sprt;
351                         if (dev->flags & IFF_LOOPBACK) {
352                                 if (!sprt->rt6i_idev ||
353                                     sprt->rt6i_idev->dev->ifindex != oif) {
354                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
355                                                 continue;
356                                         if (local && (!oif ||
357                                                       local->rt6i_idev->dev->ifindex == oif))
358                                                 continue;
359                                 }
360                                 local = sprt;
361                         }
362                 } else {
363                         if (ipv6_chk_addr(net, saddr, dev,
364                                           flags & RT6_LOOKUP_F_IFACE))
365                                 return sprt;
366                 }
367         }
368
369         if (oif) {
370                 if (local)
371                         return local;
372
373                 if (flags & RT6_LOOKUP_F_IFACE)
374                         return net->ipv6.ip6_null_entry;
375         }
376 out:
377         return rt;
378 }
379
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards the
 * neighbour bound to @rt when that neighbour is not in a NUD_VALID state.
 *
 * The probe is rate-limited: it is only sent when more than
 * rt->rt6i_idev->cnf.rtr_probe_interval jiffies have passed since
 * neigh->updated, and neigh->updated is refreshed before sending.
 *
 * @rt may be NULL, in which case nothing happens.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
	/* Unlocked fast path: nothing to do for a missing or valid neighbour. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/*
	 * neigh->updated is modified below, so take the lock for writing;
	 * holding it only for reading would let concurrent probers race on
	 * the timestamp.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		/* Solicit the neighbour's address via its solicited-node group. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
419
420 /*
421  * Default Router Selection (RFC 2461 6.3.6)
422  */
423 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
424 {
425         struct net_device *dev = rt->dst.dev;
426         if (!oif || dev->ifindex == oif)
427                 return 2;
428         if ((dev->flags & IFF_LOOPBACK) &&
429             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
430                 return 1;
431         return 0;
432 }
433
434 static inline int rt6_check_neigh(struct rt6_info *rt)
435 {
436         struct neighbour *neigh;
437         int m;
438
439         rcu_read_lock();
440         neigh = dst_get_neighbour_noref(&rt->dst);
441         if (rt->rt6i_flags & RTF_NONEXTHOP ||
442             !(rt->rt6i_flags & RTF_GATEWAY))
443                 m = 1;
444         else if (neigh) {
445                 read_lock_bh(&neigh->lock);
446                 if (neigh->nud_state & NUD_VALID)
447                         m = 2;
448 #ifdef CONFIG_IPV6_ROUTER_PREF
449                 else if (neigh->nud_state & NUD_FAILED)
450                         m = 0;
451 #endif
452                 else
453                         m = 1;
454                 read_unlock_bh(&neigh->lock);
455         } else
456                 m = 0;
457         rcu_read_unlock();
458         return m;
459 }
460
461 static int rt6_score_route(struct rt6_info *rt, int oif,
462                            int strict)
463 {
464         int m, n;
465
466         m = rt6_check_dev(rt, oif);
467         if (!m && (strict & RT6_LOOKUP_F_IFACE))
468                 return -1;
469 #ifdef CONFIG_IPV6_ROUTER_PREF
470         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
471 #endif
472         n = rt6_check_neigh(rt);
473         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
474                 return -1;
475         return m;
476 }
477
478 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
479                                    int *mpri, struct rt6_info *match)
480 {
481         int m;
482
483         if (rt6_check_expired(rt))
484                 goto out;
485
486         m = rt6_score_route(rt, oif, strict);
487         if (m < 0)
488                 goto out;
489
490         if (m > *mpri) {
491                 if (strict & RT6_LOOKUP_F_REACHABLE)
492                         rt6_probe(match);
493                 *mpri = m;
494                 match = rt;
495         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
496                 rt6_probe(rt);
497         }
498
499 out:
500         return match;
501 }
502
503 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
504                                      struct rt6_info *rr_head,
505                                      u32 metric, int oif, int strict)
506 {
507         struct rt6_info *rt, *match;
508         int mpri = -1;
509
510         match = NULL;
511         for (rt = rr_head; rt && rt->rt6i_metric == metric;
512              rt = rt->dst.rt6_next)
513                 match = find_match(rt, oif, strict, &mpri, match);
514         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
515              rt = rt->dst.rt6_next)
516                 match = find_match(rt, oif, strict, &mpri, match);
517
518         return match;
519 }
520
521 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
522 {
523         struct rt6_info *match, *rt0;
524         struct net *net;
525
526         rt0 = fn->rr_ptr;
527         if (!rt0)
528                 fn->rr_ptr = rt0 = fn->leaf;
529
530         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
531
532         if (!match &&
533             (strict & RT6_LOOKUP_F_REACHABLE)) {
534                 struct rt6_info *next = rt0->dst.rt6_next;
535
536                 /* no entries matched; do round-robin */
537                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
538                         next = fn->leaf;
539
540                 if (next != rt0)
541                         fn->rr_ptr = next;
542         }
543
544         net = dev_net(rt0->dst.dev);
545         return match ? match : net->ipv6.ip6_null_entry;
546 }
547
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev.
 *
 * @dev:    receiving device
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * A zero lifetime deletes an existing matching route; a non-zero lifetime
 * adds the route if it is missing, or refreshes the preference bits of an
 * existing one.  The expiry is then set from the lifetime, or cleared when
 * addrconf_finite_timeout() reports the lifetime as not finite.
 *
 * Returns 0 on success and -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/*
	 * Sanity check for prefix_len and length.
	 *
	 * Length counts 8-octet units including the 8-octet option header,
	 * so the option carries (length - 1) * 8 prefix octets.  RFC 4191
	 * section 2.3 requires Length 3 for a Prefix Length above 64 and
	 * Length 2 or 3 for any non-zero Prefix Length.  Enforcing exactly
	 * that keeps the prefix copy below inside the option.
	 */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 3) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/*
		 * Shorter option: build the prefix in a local buffer.  Only
		 * prefix_len bits are taken from the option, which the
		 * checks above bound by the octets actually present.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Lifetime 0 withdraws the route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime)) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->dst.expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* presumably pairs with a reference taken by the get/add helper */
		dst_release(&rt->dst);
	}
	return 0;
}
#endif
621
/*
 * BACKTRACK(net, saddr) - climb the fib6 tree after a failed match.
 *
 * This macro is not self-contained.  The expansion site must provide:
 *   - a variable "rt" (struct rt6_info *) holding the lookup result,
 *   - a variable "fn" (struct fib6_node *) holding the current node,
 *   - a label "restart" that redoes the match on the new "fn",
 *   - a label "out" that finishes the lookup.
 *
 * When "rt" is the namespace's ip6_null_entry, "fn" is moved to its
 * parent, or looked up by @saddr in the parent's subtree when the parent
 * has a subtree other than "fn".  This repeats until a node flagged
 * RTN_RTINFO is found (goto restart) or the current node is flagged
 * RTN_TL_ROOT (goto out).  Otherwise the macro does nothing.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
639
640 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
641                                              struct fib6_table *table,
642                                              struct flowi6 *fl6, int flags)
643 {
644         struct fib6_node *fn;
645         struct rt6_info *rt;
646
647         read_lock_bh(&table->tb6_lock);
648         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
649 restart:
650         rt = fn->leaf;
651         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
652         BACKTRACK(net, &fl6->saddr);
653 out:
654         dst_use(&rt->dst, jiffies);
655         read_unlock_bh(&table->tb6_lock);
656         return rt;
657
658 }
659
660 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
661                                     int flags)
662 {
663         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
664 }
665 EXPORT_SYMBOL_GPL(ip6_route_lookup);
666
667 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
668                             const struct in6_addr *saddr, int oif, int strict)
669 {
670         struct flowi6 fl6 = {
671                 .flowi6_oif = oif,
672                 .daddr = *daddr,
673         };
674         struct dst_entry *dst;
675         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
676
677         if (saddr) {
678                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
679                 flags |= RT6_LOOKUP_F_HAS_SADDR;
680         }
681
682         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
683         if (dst->error == 0)
684                 return (struct rt6_info *) dst;
685
686         dst_release(dst);
687
688         return NULL;
689 }
690
691 EXPORT_SYMBOL(rt6_lookup);
692
693 /* ip6_ins_rt is called with FREE table->tb6_lock.
694    It takes new route entry, the addition fails by any reason the
695    route is freed. In any case, if caller does not hold it, it may
696    be destroyed.
697  */
698
699 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
700 {
701         int err;
702         struct fib6_table *table;
703
704         table = rt->rt6i_table;
705         write_lock_bh(&table->tb6_lock);
706         err = fib6_add(&table->tb6_root, rt, info);
707         write_unlock_bh(&table->tb6_lock);
708
709         return err;
710 }
711
712 int ip6_ins_rt(struct rt6_info *rt)
713 {
714         struct nl_info info = {
715                 .nl_net = dev_net(rt->dst.dev),
716         };
717         return __ip6_ins_rt(rt, &info);
718 }
719
720 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
721                                       const struct in6_addr *daddr,
722                                       const struct in6_addr *saddr)
723 {
724         struct rt6_info *rt;
725
726         /*
727          *      Clone the route.
728          */
729
730         rt = ip6_rt_copy(ort, daddr);
731
732         if (rt) {
733                 int attempts = !in_softirq();
734
735                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
736                         if (ort->rt6i_dst.plen != 128 &&
737                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
738                                 rt->rt6i_flags |= RTF_ANYCAST;
739                         rt->rt6i_gateway = *daddr;
740                 }
741
742                 rt->rt6i_flags |= RTF_CACHE;
743
744 #ifdef CONFIG_IPV6_SUBTREES
745                 if (rt->rt6i_src.plen && saddr) {
746                         rt->rt6i_src.addr = *saddr;
747                         rt->rt6i_src.plen = 128;
748                 }
749 #endif
750
751         retry:
752                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
753                         struct net *net = dev_net(rt->dst.dev);
754                         int saved_rt_min_interval =
755                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
756                         int saved_rt_elasticity =
757                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
758
759                         if (attempts-- > 0) {
760                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
761                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
762
763                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
764
765                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
766                                         saved_rt_elasticity;
767                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
768                                         saved_rt_min_interval;
769                                 goto retry;
770                         }
771
772                         if (net_ratelimit())
773                                 printk(KERN_WARNING
774                                        "ipv6: Neighbour table overflow.\n");
775                         dst_free(&rt->dst);
776                         return NULL;
777                 }
778         }
779
780         return rt;
781 }
782
783 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
784                                         const struct in6_addr *daddr)
785 {
786         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
787
788         if (rt) {
789                 rt->rt6i_flags |= RTF_CACHE;
790                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
791         }
792         return rt;
793 }
794
795 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
796                                       struct flowi6 *fl6, int flags)
797 {
798         struct fib6_node *fn;
799         struct rt6_info *rt, *nrt;
800         int strict = 0;
801         int attempts = 3;
802         int err;
803         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
804
805         strict |= flags & RT6_LOOKUP_F_IFACE;
806
807 relookup:
808         read_lock_bh(&table->tb6_lock);
809
810 restart_2:
811         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
812
813 restart:
814         rt = rt6_select(fn, oif, strict | reachable);
815
816         BACKTRACK(net, &fl6->saddr);
817         if (rt == net->ipv6.ip6_null_entry ||
818             rt->rt6i_flags & RTF_CACHE)
819                 goto out;
820
821         dst_hold(&rt->dst);
822         read_unlock_bh(&table->tb6_lock);
823
824         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
825                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
826         else if (!(rt->dst.flags & DST_HOST))
827                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
828         else
829                 goto out2;
830
831         dst_release(&rt->dst);
832         rt = nrt ? : net->ipv6.ip6_null_entry;
833
834         dst_hold(&rt->dst);
835         if (nrt) {
836                 err = ip6_ins_rt(nrt);
837                 if (!err)
838                         goto out2;
839         }
840
841         if (--attempts <= 0)
842                 goto out2;
843
844         /*
845          * Race condition! In the gap, when table->tb6_lock was
846          * released someone could insert this route.  Relookup.
847          */
848         dst_release(&rt->dst);
849         goto relookup;
850
851 out:
852         if (reachable) {
853                 reachable = 0;
854                 goto restart_2;
855         }
856         dst_hold(&rt->dst);
857         read_unlock_bh(&table->tb6_lock);
858 out2:
859         rt->dst.lastuse = jiffies;
860         rt->dst.__use++;
861
862         return rt;
863 }
864
865 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
866                                             struct flowi6 *fl6, int flags)
867 {
868         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
869 }
870
871 void ip6_route_input(struct sk_buff *skb)
872 {
873         const struct ipv6hdr *iph = ipv6_hdr(skb);
874         struct net *net = dev_net(skb->dev);
875         int flags = RT6_LOOKUP_F_HAS_SADDR;
876         struct flowi6 fl6 = {
877                 .flowi6_iif = skb->dev->ifindex,
878                 .daddr = iph->daddr,
879                 .saddr = iph->saddr,
880                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
881                 .flowi6_mark = skb->mark,
882                 .flowi6_proto = iph->nexthdr,
883         };
884
885         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
886                 flags |= RT6_LOOKUP_F_IFACE;
887
888         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
889 }
890
891 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
892                                              struct flowi6 *fl6, int flags)
893 {
894         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
895 }
896
897 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
898                                     struct flowi6 *fl6)
899 {
900         int flags = 0;
901
902         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
903                 flags |= RT6_LOOKUP_F_IFACE;
904
905         if (!ipv6_addr_any(&fl6->saddr))
906                 flags |= RT6_LOOKUP_F_HAS_SADDR;
907         else if (sk)
908                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
909
910         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
911 }
912
913 EXPORT_SYMBOL(ip6_route_output);
914
915 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
916 {
917         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
918         struct dst_entry *new = NULL;
919
920         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
921         if (rt) {
922                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
923
924                 new = &rt->dst;
925
926                 new->__use = 1;
927                 new->input = dst_discard;
928                 new->output = dst_discard;
929
930                 if (dst_metrics_read_only(&ort->dst))
931                         new->_metrics = ort->dst._metrics;
932                 else
933                         dst_copy_metrics(new, &ort->dst);
934                 rt->rt6i_idev = ort->rt6i_idev;
935                 if (rt->rt6i_idev)
936                         in6_dev_hold(rt->rt6i_idev);
937                 rt->dst.expires = 0;
938
939                 rt->rt6i_gateway = ort->rt6i_gateway;
940                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
941                 rt->rt6i_metric = 0;
942
943                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
944 #ifdef CONFIG_IPV6_SUBTREES
945                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
946 #endif
947
948                 dst_free(new);
949         }
950
951         dst_release(dst_orig);
952         return new ? new : ERR_PTR(-ENOMEM);
953 }
954
955 /*
956  *      Destination cache support functions
957  */
958
959 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
960 {
961         struct rt6_info *rt;
962
963         rt = (struct rt6_info *) dst;
964
965         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
966                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
967                         if (!rt->rt6i_peer)
968                                 rt6_bind_peer(rt, 0);
969                         rt->rt6i_peer_genid = rt6_peer_genid();
970                 }
971                 return dst;
972         }
973         return NULL;
974 }
975
976 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
977 {
978         struct rt6_info *rt = (struct rt6_info *) dst;
979
980         if (rt) {
981                 if (rt->rt6i_flags & RTF_CACHE) {
982                         if (rt6_check_expired(rt)) {
983                                 ip6_del_rt(rt);
984                                 dst = NULL;
985                         }
986                 } else {
987                         dst_release(dst);
988                         dst = NULL;
989                 }
990         }
991         return dst;
992 }
993
994 static void ip6_link_failure(struct sk_buff *skb)
995 {
996         struct rt6_info *rt;
997
998         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
999
1000         rt = (struct rt6_info *) skb_dst(skb);
1001         if (rt) {
1002                 if (rt->rt6i_flags & RTF_CACHE) {
1003                         dst_set_expires(&rt->dst, 0);
1004                         rt->rt6i_flags |= RTF_EXPIRES;
1005                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1006                         rt->rt6i_node->fn_sernum = -1;
1007         }
1008 }
1009
1010 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1011 {
1012         struct rt6_info *rt6 = (struct rt6_info*)dst;
1013
1014         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1015                 rt6->rt6i_flags |= RTF_MODIFIED;
1016                 if (mtu < IPV6_MIN_MTU) {
1017                         u32 features = dst_metric(dst, RTAX_FEATURES);
1018                         mtu = IPV6_MIN_MTU;
1019                         features |= RTAX_FEATURE_ALLFRAG;
1020                         dst_metric_set(dst, RTAX_FEATURES, features);
1021                 }
1022                 dst_metric_set(dst, RTAX_MTU, mtu);
1023         }
1024 }
1025
1026 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1027 {
1028         struct net_device *dev = dst->dev;
1029         unsigned int mtu = dst_mtu(dst);
1030         struct net *net = dev_net(dev);
1031
1032         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1033
1034         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1035                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1036
1037         /*
1038          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1039          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1040          * IPV6_MAXPLEN is also valid and means: "any MSS,
1041          * rely only on pmtu discovery"
1042          */
1043         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1044                 mtu = IPV6_MAXPLEN;
1045         return mtu;
1046 }
1047
1048 static unsigned int ip6_mtu(const struct dst_entry *dst)
1049 {
1050         struct inet6_dev *idev;
1051         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1052
1053         if (mtu)
1054                 return mtu;
1055
1056         mtu = IPV6_MIN_MTU;
1057
1058         rcu_read_lock();
1059         idev = __in6_dev_get(dst->dev);
1060         if (idev)
1061                 mtu = idev->cnf.mtu6;
1062         rcu_read_unlock();
1063
1064         return mtu;
1065 }
1066
/*
 * Singly linked list (chained through dst->next) of the dst entries
 * created by icmp6_dst_alloc().  icmp6_dst_lock is held, with bottom
 * halves disabled, around every insertion, walk and removal.
 */
1067 static struct dst_entry *icmp6_dst_gc_list;
1068 static DEFINE_SPINLOCK(icmp6_dst_lock);
1069
1070 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1071                                   struct neighbour *neigh,
1072                                   struct flowi6 *fl6)
1073 {
1074         struct dst_entry *dst;
1075         struct rt6_info *rt;
1076         struct inet6_dev *idev = in6_dev_get(dev);
1077         struct net *net = dev_net(dev);
1078
1079         if (unlikely(!idev))
1080                 return NULL;
1081
1082         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1083         if (unlikely(!rt)) {
1084                 in6_dev_put(idev);
1085                 dst = ERR_PTR(-ENOMEM);
1086                 goto out;
1087         }
1088
1089         if (neigh)
1090                 neigh_hold(neigh);
1091         else {
1092                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1093                 if (IS_ERR(neigh)) {
1094                         dst_free(&rt->dst);
1095                         return ERR_CAST(neigh);
1096                 }
1097         }
1098
1099         rt->dst.flags |= DST_HOST;
1100         rt->dst.output  = ip6_output;
1101         dst_set_neighbour(&rt->dst, neigh);
1102         atomic_set(&rt->dst.__refcnt, 1);
1103         rt->rt6i_dst.addr = fl6->daddr;
1104         rt->rt6i_dst.plen = 128;
1105         rt->rt6i_idev     = idev;
1106         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1107
1108         spin_lock_bh(&icmp6_dst_lock);
1109         rt->dst.next = icmp6_dst_gc_list;
1110         icmp6_dst_gc_list = &rt->dst;
1111         spin_unlock_bh(&icmp6_dst_lock);
1112
1113         fib6_force_start_gc(net);
1114
1115         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1116
1117 out:
1118         return dst;
1119 }
1120
1121 int icmp6_dst_gc(void)
1122 {
1123         struct dst_entry *dst, **pprev;
1124         int more = 0;
1125
1126         spin_lock_bh(&icmp6_dst_lock);
1127         pprev = &icmp6_dst_gc_list;
1128
1129         while ((dst = *pprev) != NULL) {
1130                 if (!atomic_read(&dst->__refcnt)) {
1131                         *pprev = dst->next;
1132                         dst_free(dst);
1133                 } else {
1134                         pprev = &dst->next;
1135                         ++more;
1136                 }
1137         }
1138
1139         spin_unlock_bh(&icmp6_dst_lock);
1140
1141         return more;
1142 }
1143
1144 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1145                             void *arg)
1146 {
1147         struct dst_entry *dst, **pprev;
1148
1149         spin_lock_bh(&icmp6_dst_lock);
1150         pprev = &icmp6_dst_gc_list;
1151         while ((dst = *pprev) != NULL) {
1152                 struct rt6_info *rt = (struct rt6_info *) dst;
1153                 if (func(rt, arg)) {
1154                         *pprev = dst->next;
1155                         dst_free(dst);
1156                 } else {
1157                         pprev = &dst->next;
1158                 }
1159         }
1160         spin_unlock_bh(&icmp6_dst_lock);
1161 }
1162
1163 static int ip6_dst_gc(struct dst_ops *ops)
1164 {
1165         unsigned long now = jiffies;
1166         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1167         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1168         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1169         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1170         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1171         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1172         int entries;
1173
1174         entries = dst_entries_get_fast(ops);
1175         if (time_after(rt_last_gc + rt_min_interval, now) &&
1176             entries <= rt_max_size)
1177                 goto out;
1178
1179         net->ipv6.ip6_rt_gc_expire++;
1180         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1181         net->ipv6.ip6_rt_last_gc = now;
1182         entries = dst_entries_get_slow(ops);
1183         if (entries < ops->gc_thresh)
1184                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1185 out:
1186         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1187         return entries > rt_max_size;
1188 }
1189
1190 /* Clean host part of a prefix. Not necessary in radix tree,
1191    but results in cleaner routing tables.
1192
1193    Remove it only when all the things will work!
1194  */
1195
1196 int ip6_dst_hoplimit(struct dst_entry *dst)
1197 {
1198         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1199         if (hoplimit == 0) {
1200                 struct net_device *dev = dst->dev;
1201                 struct inet6_dev *idev;
1202
1203                 rcu_read_lock();
1204                 idev = __in6_dev_get(dev);
1205                 if (idev)
1206                         hoplimit = idev->cnf.hop_limit;
1207                 else
1208                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1209                 rcu_read_unlock();
1210         }
1211         return hoplimit;
1212 }
1213 EXPORT_SYMBOL(ip6_dst_hoplimit);
1214
1215 /*
1216  *
1217  */
1218
/*
 * Build a route from the configuration in @cfg and insert it into the FIB
 * table selected by cfg->fc_table.
 *
 * @cfg is modified in place: a zero fc_metric becomes IP6_RT_PRIO_USER, an
 * RTPROT_UNSPEC fc_protocol becomes RTPROT_BOOT, and fc_nlinfo.nl_net is
 * set to the namespace of the device finally chosen.
 *
 * Returns the result of __ip6_ins_rt() once the route has been fully set
 * up, or a negative errno if validation, lookup or allocation fails
 * earlier.  On those early failures the "out" label drops the device and
 * inet6_dev references taken here and frees the partially built route.
 */
1219 int ip6_route_add(struct fib6_config *cfg)
1220 {
1221         int err;
1222         struct net *net = cfg->fc_nlinfo.nl_net;
1223         struct rt6_info *rt = NULL;
1224         struct net_device *dev = NULL;
1225         struct inet6_dev *idev = NULL;
1226         struct fib6_table *table;
1227         int addr_type;
1228
1229         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1230                 return -EINVAL;
1231 #ifndef CONFIG_IPV6_SUBTREES
             /* Source-specific routes are rejected without subtree support. */
1232         if (cfg->fc_src_len)
1233                 return -EINVAL;
1234 #endif
             /*
              * An explicit interface index takes a reference on both the
              * device and its inet6_dev; either one missing is -ENODEV.
              */
1235         if (cfg->fc_ifindex) {
1236                 err = -ENODEV;
1237                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1238                 if (!dev)
1239                         goto out;
1240                 idev = in6_dev_get(dev);
1241                 if (!idev)
1242                         goto out;
1243         }
1244
1245         if (cfg->fc_metric == 0)
1246                 cfg->fc_metric = IP6_RT_PRIO_USER;
1247
             /*
              * A netlink request without NLM_F_CREATE first looks for an
              * existing table; if there is none the table is created anyway,
              * after logging a warning.  All other requests go straight to
              * fib6_new_table().  No table at all ends in -ENOBUFS.
              */
1248         err = -ENOBUFS;
1249         if (cfg->fc_nlinfo.nlh &&
1250             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1251                 table = fib6_get_table(net, cfg->fc_table);
1252                 if (!table) {
1253                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1254                         table = fib6_new_table(net, cfg->fc_table);
1255                 }
1256         } else {
1257                 table = fib6_new_table(net, cfg->fc_table);
1258         }
1259
1260         if (!table)
1261                 goto out;
1262
1263         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1264
1265         if (!rt) {
1266                 err = -ENOMEM;
1267                 goto out;
1268         }
1269
1270         rt->dst.obsolete = -1;
             /* fc_expires is in clock_t units; 0 is stored when RTF_EXPIRES is not set. */
1271         rt->dst.expires = (cfg->fc_flags & RTF_EXPIRES) ?
1272                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1273                                 0;
1274
1275         if (cfg->fc_protocol == RTPROT_UNSPEC)
1276                 cfg->fc_protocol = RTPROT_BOOT;
1277         rt->rt6i_protocol = cfg->fc_protocol;
1278
1279         addr_type = ipv6_addr_type(&cfg->fc_dst);
1280
             /* Input handler: multicast, local delivery, or forwarding. */
1281         if (addr_type & IPV6_ADDR_MULTICAST)
1282                 rt->dst.input = ip6_mc_input;
1283         else if (cfg->fc_flags & RTF_LOCAL)
1284                 rt->dst.input = ip6_input;
1285         else
1286                 rt->dst.input = ip6_forward;
1287
1288         rt->dst.output = ip6_output;
1289
             /* Store only the prefix bits of the destination. */
1290         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1291         rt->rt6i_dst.plen = cfg->fc_dst_len;
1292         if (rt->rt6i_dst.plen == 128)
1293                rt->dst.flags |= DST_HOST;
1294
             /*
              * A non-host route that carries metric attributes gets its own
              * zeroed metrics array installed before the attributes are
              * applied at install_route.
              */
1295         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1296                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1297                 if (!metrics) {
1298                         err = -ENOMEM;
1299                         goto out;
1300                 }
1301                 dst_init_metrics(&rt->dst, metrics, 0);
1302         }
1303 #ifdef CONFIG_IPV6_SUBTREES
1304         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1305         rt->rt6i_src.plen = cfg->fc_src_len;
1306 #endif
1307
1308         rt->rt6i_metric = cfg->fc_metric;
1309
1310         /* We cannot add true routes via loopback here,
1311            they would result in kernel looping; promote them to reject routes
1312          */
1313         if ((cfg->fc_flags & RTF_REJECT) ||
1314             (dev && (dev->flags & IFF_LOOPBACK) &&
1315              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1316              !(cfg->fc_flags & RTF_LOCAL))) {
1317                 /* hold loopback dev/idev if we haven't done so. */
1318                 if (dev != net->loopback_dev) {
                             /* swap any references held so far for ones on the loopback device */
1319                         if (dev) {
1320                                 dev_put(dev);
1321                                 in6_dev_put(idev);
1322                         }
1323                         dev = net->loopback_dev;
1324                         dev_hold(dev);
1325                         idev = in6_dev_get(dev);
1326                         if (!idev) {
1327                                 err = -ENODEV;
1328                                 goto out;
1329                         }
1330                 }
                     /* Reject route: both directions discard, error is -ENETUNREACH. */
1331                 rt->dst.output = ip6_pkt_discard_out;
1332                 rt->dst.input = ip6_pkt_discard;
1333                 rt->dst.error = -ENETUNREACH;
1334                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1335                 goto install_route;
1336         }
1337
1338         if (cfg->fc_flags & RTF_GATEWAY) {
1339                 const struct in6_addr *gw_addr;
1340                 int gwa_type;
1341
1342                 gw_addr = &cfg->fc_gateway;
1343                 rt->rt6i_gateway = *gw_addr;
1344                 gwa_type = ipv6_addr_type(gw_addr);
1345
1346                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1347                         struct rt6_info *grt;
1348
1349                         /* IPv6 strictly inhibits using not link-local
1350                            addresses as nexthop address.
1351                            Otherwise, router will not able to send redirects.
1352                            It is very good, but in some (rare!) circumstances
1353                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1354                            some exceptions. --ANK
1355                          */
1356                         err = -EINVAL;
1357                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1358                                 goto out;
1359
                             /* Find the route currently used to reach the gateway itself. */
1360                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1361
1362                         err = -EHOSTUNREACH;
1363                         if (!grt)
1364                                 goto out;
1365                         if (dev) {
                                     /* a requested device must match the gateway route's device */
1366                                 if (dev != grt->dst.dev) {
1367                                         dst_release(&grt->dst);
1368                                         goto out;
1369                                 }
1370                         } else {
                                     /* no device requested: adopt the gateway route's, with our own references */
1371                                 dev = grt->dst.dev;
1372                                 idev = grt->rt6i_idev;
1373                                 dev_hold(dev);
1374                                 in6_dev_hold(grt->rt6i_idev);
1375                         }
                             /*
                              * Accepted only if the gateway is reached through a
                              * route that is not itself a gateway route; otherwise
                              * err stays -EHOSTUNREACH.
                              */
1376                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1377                                 err = 0;
1378                         dst_release(&grt->dst);
1379
1380                         if (err)
1381                                 goto out;
1382                 }
1383                 err = -EINVAL;
1384                 if (!dev || (dev->flags & IFF_LOOPBACK))
1385                         goto out;
1386         }
1387
1388         err = -ENODEV;
1389         if (!dev)
1390                 goto out;
1391
             /* A preferred source address must pass ipv6_chk_addr() for the chosen device. */
1392         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1393                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1394                         err = -EINVAL;
1395                         goto out;
1396                 }
1397                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1398                 rt->rt6i_prefsrc.plen = 128;
1399         } else
1400                 rt->rt6i_prefsrc.plen = 0;
1401
1402         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1403                 err = rt6_bind_neighbour(rt, dev);
1404                 if (err)
1405                         goto out;
1406         }
1407
1408         rt->rt6i_flags = cfg->fc_flags;
1409
1410 install_route:
             /* Apply the metric attributes; attribute type 0 is skipped, types above RTAX_MAX are invalid. */
1411         if (cfg->fc_mx) {
1412                 struct nlattr *nla;
1413                 int remaining;
1414
1415                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1416                         int type = nla_type(nla);
1417
1418                         if (type) {
1419                                 if (type > RTAX_MAX) {
1420                                         err = -EINVAL;
1421                                         goto out;
1422                                 }
1423
1424                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1425                         }
1426                 }
1427         }
1428
             /*
              * The device/idev pointers are stored in the route here and are
              * not put on this path.
              * NOTE(review): presumably the route now owns those references
              * and drops them when it is destroyed -- confirm.
              */
1429         rt->dst.dev = dev;
1430         rt->rt6i_idev = idev;
1431         rt->rt6i_table = table;
1432
1433         cfg->fc_nlinfo.nl_net = dev_net(dev);
1434
1435         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1436
1437 out:
             /* Failure before insertion: undo whatever was acquired above. */
1438         if (dev)
1439                 dev_put(dev);
1440         if (idev)
1441                 in6_dev_put(idev);
1442         if (rt)
1443                 dst_free(&rt->dst);
1444         return err;
1445 }
1446
1447 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1448 {
1449         int err;
1450         struct fib6_table *table;
1451         struct net *net = dev_net(rt->dst.dev);
1452
1453         if (rt == net->ipv6.ip6_null_entry)
1454                 return -ENOENT;
1455
1456         table = rt->rt6i_table;
1457         write_lock_bh(&table->tb6_lock);
1458
1459         err = fib6_del(rt, info);
1460         dst_release(&rt->dst);
1461
1462         write_unlock_bh(&table->tb6_lock);
1463
1464         return err;
1465 }
1466
1467 int ip6_del_rt(struct rt6_info *rt)
1468 {
1469         struct nl_info info = {
1470                 .nl_net = dev_net(rt->dst.dev),
1471         };
1472         return __ip6_del_rt(rt, &info);
1473 }
1474
1475 static int ip6_route_del(struct fib6_config *cfg)
1476 {
1477         struct fib6_table *table;
1478         struct fib6_node *fn;
1479         struct rt6_info *rt;
1480         int err = -ESRCH;
1481
1482         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1483         if (!table)
1484                 return err;
1485
1486         read_lock_bh(&table->tb6_lock);
1487
1488         fn = fib6_locate(&table->tb6_root,
1489                          &cfg->fc_dst, cfg->fc_dst_len,
1490                          &cfg->fc_src, cfg->fc_src_len);
1491
1492         if (fn) {
1493                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1494                         if (cfg->fc_ifindex &&
1495                             (!rt->dst.dev ||
1496                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1497                                 continue;
1498                         if (cfg->fc_flags & RTF_GATEWAY &&
1499                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1500                                 continue;
1501                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1502                                 continue;
1503                         dst_hold(&rt->dst);
1504                         read_unlock_bh(&table->tb6_lock);
1505
1506                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1507                 }
1508         }
1509         read_unlock_bh(&table->tb6_lock);
1510
1511         return err;
1512 }
1513
1514 /*
1515  *      Handle redirects
1516  */
/*
 * Lookup key used while validating a redirect: the ordinary flow plus the
 * address of the router the redirect is checked against.
 *
 * fl6 must remain the first member: ip6_route_redirect() passes &rdfl.fl6
 * to the lookup and __ip6_route_redirect() casts that pointer back to a
 * struct ip6rd_flowi to reach the gateway field.
 */
1517 struct ip6rd_flowi {
1518         struct flowi6 fl6;
1519         struct in6_addr gateway;
1520 };
1521
/*
 * Per-table lookup callback used by ip6_route_redirect().
 *
 * @fl6 must point at the fl6 member of a struct ip6rd_flowi; the
 * enclosing structure supplies the gateway address to match.
 *
 * Searches @table, under its read lock, for an unexpired RTF_GATEWAY
 * route to fl6->daddr whose device index equals fl6->flowi6_oif and whose
 * gateway equals rdfl->gateway.
 *
 * Returns the matching route, or the namespace's ip6_null_entry when none
 * matches, with a reference taken via dst_hold() in both cases.
 */
1522 static struct rt6_info *__ip6_route_redirect(struct net *net,
1523                                              struct fib6_table *table,
1524                                              struct flowi6 *fl6,
1525                                              int flags)
1526 {
1527         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1528         struct rt6_info *rt;
1529         struct fib6_node *fn;
1530
1531         /*
1532          * Get the "current" route for this destination and
1533          * check if the redirect has come from appropriate router.
1534          *
1535          * RFC 2461 specifies that redirects should only be
1536          * accepted if they come from the nexthop to the target.
1537          * Due to the way the routes are chosen, this notion
1538          * is a bit fuzzy and one might need to check all possible
1539          * routes.
1540          */
1541
1542         read_lock_bh(&table->tb6_lock);
1543         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1544 restart:
1545         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1546                 /*
1547                  * Current route is on-link; redirect is always invalid.
1548                  *
1549                  * Seems, previous statement is not true. It could
1550                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1551                  * But then router serving it might decide, that we should
1552                  * know truth 8)8) --ANK (980726).
1553                  */
1554                 if (rt6_check_expired(rt))
1555                         continue;
1556                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1557                         continue;
1558                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1559                         continue;
1560                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1561                         continue;
1562                 break;
1563         }
1564
1565         if (!rt)
1566                 rt = net->ipv6.ip6_null_entry;
             /*
              * NOTE(review): BACKTRACK() is a macro defined elsewhere in this
              * file.  It presumably relies on the local names fn and rt and
              * on the restart/out labels to retry from a parent node when
              * only the null entry was found -- confirm before renaming any
              * of them.
              */
1567         BACKTRACK(net, &fl6->saddr);
1568 out:
1569         dst_hold(&rt->dst);
1570
1571         read_unlock_bh(&table->tb6_lock);
1572
1573         return rt;
1574 };
1575
1576 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1577                                            const struct in6_addr *src,
1578                                            const struct in6_addr *gateway,
1579                                            struct net_device *dev)
1580 {
1581         int flags = RT6_LOOKUP_F_HAS_SADDR;
1582         struct net *net = dev_net(dev);
1583         struct ip6rd_flowi rdfl = {
1584                 .fl6 = {
1585                         .flowi6_oif = dev->ifindex,
1586                         .daddr = *dest,
1587                         .saddr = *src,
1588                 },
1589         };
1590
1591         rdfl.gateway = *gateway;
1592
1593         if (rt6_need_strict(dest))
1594                 flags |= RT6_LOOKUP_F_IFACE;
1595
1596         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1597                                                    flags, __ip6_route_redirect);
1598 }
1599
1600 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1601                   const struct in6_addr *saddr,
1602                   struct neighbour *neigh, u8 *lladdr, int on_link)
1603 {
1604         struct rt6_info *rt, *nrt = NULL;
1605         struct netevent_redirect netevent;
1606         struct net *net = dev_net(neigh->dev);
1607
1608         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1609
1610         if (rt == net->ipv6.ip6_null_entry) {
1611                 if (net_ratelimit())
1612                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1613                                "for redirect target\n");
1614                 goto out;
1615         }
1616
1617         /*
1618          *      We have finally decided to accept it.
1619          */
1620
1621         neigh_update(neigh, lladdr, NUD_STALE,
1622                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1623                      NEIGH_UPDATE_F_OVERRIDE|
1624                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1625                                      NEIGH_UPDATE_F_ISROUTER))
1626                      );
1627
1628         /*
1629          * Redirect received -> path was valid.
1630          * Look, redirects are sent only in response to data packets,
1631          * so that this nexthop apparently is reachable. --ANK
1632          */
1633         dst_confirm(&rt->dst);
1634
1635         /* Duplicate redirect: silently ignore. */
1636         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1637                 goto out;
1638
1639         nrt = ip6_rt_copy(rt, dest);
1640         if (!nrt)
1641                 goto out;
1642
1643         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1644         if (on_link)
1645                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1646
1647         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1648         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1649
1650         if (ip6_ins_rt(nrt))
1651                 goto out;
1652
1653         netevent.old = &rt->dst;
1654         netevent.new = &nrt->dst;
1655         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1656
1657         if (rt->rt6i_flags & RTF_CACHE) {
1658                 ip6_del_rt(rt);
1659                 return;
1660         }
1661
1662 out:
1663         dst_release(&rt->dst);
1664 }
1665
1666 /*
1667  *      Handle ICMP "packet too big" messages
1668  *      i.e. Path MTU discovery
1669  */
1670
1671 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1672                              struct net *net, u32 pmtu, int ifindex)
1673 {
1674         struct rt6_info *rt, *nrt;
1675         int allfrag = 0;
1676 again:
1677         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1678         if (!rt)
1679                 return;
1680
1681         if (rt6_check_expired(rt)) {
1682                 ip6_del_rt(rt);
1683                 goto again;
1684         }
1685
1686         if (pmtu >= dst_mtu(&rt->dst))
1687                 goto out;
1688
1689         if (pmtu < IPV6_MIN_MTU) {
1690                 /*
1691                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1692                  * MTU (1280) and a fragment header should always be included
1693                  * after a node receiving Too Big message reporting PMTU is
1694                  * less than the IPv6 Minimum Link MTU.
1695                  */
1696                 pmtu = IPV6_MIN_MTU;
1697                 allfrag = 1;
1698         }
1699
1700         /* New mtu received -> path was valid.
1701            They are sent only in response to data packets,
1702            so that this nexthop apparently is reachable. --ANK
1703          */
1704         dst_confirm(&rt->dst);
1705
1706         /* Host route. If it is static, it would be better
1707            not to override it, but add new one, so that
1708            when cache entry will expire old pmtu
1709            would return automatically.
1710          */
1711         if (rt->rt6i_flags & RTF_CACHE) {
1712                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1713                 if (allfrag) {
1714                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1715                         features |= RTAX_FEATURE_ALLFRAG;
1716                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1717                 }
1718                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1719                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1720                 goto out;
1721         }
1722
1723         /* Network route.
1724            Two cases are possible:
1725            1. It is connected route. Action: COW
1726            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1727          */
1728         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1729                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1730         else
1731                 nrt = rt6_alloc_clone(rt, daddr);
1732
1733         if (nrt) {
1734                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1735                 if (allfrag) {
1736                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1737                         features |= RTAX_FEATURE_ALLFRAG;
1738                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1739                 }
1740
1741                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1742                  * happened within 5 mins, the recommended timer is 10 mins.
1743                  * Here this route expiration time is set to ip6_rt_mtu_expires
1744                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1745                  * and detecting PMTU increase will be automatically happened.
1746                  */
1747                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1748                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1749
1750                 ip6_ins_rt(nrt);
1751         }
1752 out:
1753         dst_release(&rt->dst);
1754 }
1755
1756 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1757                         struct net_device *dev, u32 pmtu)
1758 {
1759         struct net *net = dev_net(dev);
1760
1761         /*
1762          * RFC 1981 states that a node "MUST reduce the size of the packets it
1763          * is sending along the path" that caused the Packet Too Big message.
1764          * Since it's not possible in the general case to determine which
1765          * interface was used to send the original packet, we update the MTU
1766          * on the interface that will be used to send future packets. We also
1767          * update the MTU on the interface that received the Packet Too Big in
1768          * case the original packet was forced out that interface with
1769          * SO_BINDTODEVICE or similar. This is the next best thing to the
1770          * correct behaviour, which would be to update the MTU on all
1771          * interfaces.
1772          */
1773         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1774         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1775 }
1776
1777 /*
1778  *      Misc support functions
1779  */
1780
1781 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1782                                     const struct in6_addr *dest)
1783 {
1784         struct net *net = dev_net(ort->dst.dev);
1785         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1786                                             ort->dst.dev, 0);
1787
1788         if (rt) {
1789                 rt->dst.input = ort->dst.input;
1790                 rt->dst.output = ort->dst.output;
1791                 rt->dst.flags |= DST_HOST;
1792
1793                 rt->rt6i_dst.addr = *dest;
1794                 rt->rt6i_dst.plen = 128;
1795                 dst_copy_metrics(&rt->dst, &ort->dst);
1796                 rt->dst.error = ort->dst.error;
1797                 rt->rt6i_idev = ort->rt6i_idev;
1798                 if (rt->rt6i_idev)
1799                         in6_dev_hold(rt->rt6i_idev);
1800                 rt->dst.lastuse = jiffies;
1801                 rt->dst.expires = 0;
1802
1803                 rt->rt6i_gateway = ort->rt6i_gateway;
1804                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1805                 rt->rt6i_metric = 0;
1806
1807 #ifdef CONFIG_IPV6_SUBTREES
1808                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1809 #endif
1810                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1811                 rt->rt6i_table = ort->rt6i_table;
1812         }
1813         return rt;
1814 }
1815
1816 #ifdef CONFIG_IPV6_ROUTE_INFO
1817 static struct rt6_info *rt6_get_route_info(struct net *net,
1818                                            const struct in6_addr *prefix, int prefixlen,
1819                                            const struct in6_addr *gwaddr, int ifindex)
1820 {
1821         struct fib6_node *fn;
1822         struct rt6_info *rt = NULL;
1823         struct fib6_table *table;
1824
1825         table = fib6_get_table(net, RT6_TABLE_INFO);
1826         if (!table)
1827                 return NULL;
1828
1829         write_lock_bh(&table->tb6_lock);
1830         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1831         if (!fn)
1832                 goto out;
1833
1834         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1835                 if (rt->dst.dev->ifindex != ifindex)
1836                         continue;
1837                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1838                         continue;
1839                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1840                         continue;
1841                 dst_hold(&rt->dst);
1842                 break;
1843         }
1844 out:
1845         write_unlock_bh(&table->tb6_lock);
1846         return rt;
1847 }
1848
1849 static struct rt6_info *rt6_add_route_info(struct net *net,
1850                                            const struct in6_addr *prefix, int prefixlen,
1851                                            const struct in6_addr *gwaddr, int ifindex,
1852                                            unsigned pref)
1853 {
1854         struct fib6_config cfg = {
1855                 .fc_table       = RT6_TABLE_INFO,
1856                 .fc_metric      = IP6_RT_PRIO_USER,
1857                 .fc_ifindex     = ifindex,
1858                 .fc_dst_len     = prefixlen,
1859                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1860                                   RTF_UP | RTF_PREF(pref),
1861                 .fc_nlinfo.pid = 0,
1862                 .fc_nlinfo.nlh = NULL,
1863                 .fc_nlinfo.nl_net = net,
1864         };
1865
1866         cfg.fc_dst = *prefix;
1867         cfg.fc_gateway = *gwaddr;
1868
1869         /* We should treat it as a default route if prefix length is 0. */
1870         if (!prefixlen)
1871                 cfg.fc_flags |= RTF_DEFAULT;
1872
1873         ip6_route_add(&cfg);
1874
1875         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1876 }
1877 #endif
1878
1879 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1880 {
1881         struct rt6_info *rt;
1882         struct fib6_table *table;
1883
1884         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1885         if (!table)
1886                 return NULL;
1887
1888         write_lock_bh(&table->tb6_lock);
1889         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1890                 if (dev == rt->dst.dev &&
1891                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1892                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1893                         break;
1894         }
1895         if (rt)
1896                 dst_hold(&rt->dst);
1897         write_unlock_bh(&table->tb6_lock);
1898         return rt;
1899 }
1900
1901 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1902                                      struct net_device *dev,
1903                                      unsigned int pref)
1904 {
1905         struct fib6_config cfg = {
1906                 .fc_table       = RT6_TABLE_DFLT,
1907                 .fc_metric      = IP6_RT_PRIO_USER,
1908                 .fc_ifindex     = dev->ifindex,
1909                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1910                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1911                 .fc_nlinfo.pid = 0,
1912                 .fc_nlinfo.nlh = NULL,
1913                 .fc_nlinfo.nl_net = dev_net(dev),
1914         };
1915
1916         cfg.fc_gateway = *gwaddr;
1917
1918         ip6_route_add(&cfg);
1919
1920         return rt6_get_dflt_router(gwaddr, dev);
1921 }
1922
1923 void rt6_purge_dflt_routers(struct net *net)
1924 {
1925         struct rt6_info *rt;
1926         struct fib6_table *table;
1927
1928         /* NOTE: Keep consistent with rt6_get_dflt_router */
1929         table = fib6_get_table(net, RT6_TABLE_DFLT);
1930         if (!table)
1931                 return;
1932
1933 restart:
1934         read_lock_bh(&table->tb6_lock);
1935         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1936                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1937                         dst_hold(&rt->dst);
1938                         read_unlock_bh(&table->tb6_lock);
1939                         ip6_del_rt(rt);
1940                         goto restart;
1941                 }
1942         }
1943         read_unlock_bh(&table->tb6_lock);
1944 }
1945
1946 static void rtmsg_to_fib6_config(struct net *net,
1947                                  struct in6_rtmsg *rtmsg,
1948                                  struct fib6_config *cfg)
1949 {
1950         memset(cfg, 0, sizeof(*cfg));
1951
1952         cfg->fc_table = RT6_TABLE_MAIN;
1953         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1954         cfg->fc_metric = rtmsg->rtmsg_metric;
1955         cfg->fc_expires = rtmsg->rtmsg_info;
1956         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1957         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1958         cfg->fc_flags = rtmsg->rtmsg_flags;
1959
1960         cfg->fc_nlinfo.nl_net = net;
1961
1962         cfg->fc_dst = rtmsg->rtmsg_dst;
1963         cfg->fc_src = rtmsg->rtmsg_src;
1964         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1965 }
1966
1967 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1968 {
1969         struct fib6_config cfg;
1970         struct in6_rtmsg rtmsg;
1971         int err;
1972
1973         switch(cmd) {
1974         case SIOCADDRT:         /* Add a route */
1975         case SIOCDELRT:         /* Delete a route */
1976                 if (!capable(CAP_NET_ADMIN))
1977                         return -EPERM;
1978                 err = copy_from_user(&rtmsg, arg,
1979                                      sizeof(struct in6_rtmsg));
1980                 if (err)
1981                         return -EFAULT;
1982
1983                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1984
1985                 rtnl_lock();
1986                 switch (cmd) {
1987                 case SIOCADDRT:
1988                         err = ip6_route_add(&cfg);
1989                         break;
1990                 case SIOCDELRT:
1991                         err = ip6_route_del(&cfg);
1992                         break;
1993                 default:
1994                         err = -EINVAL;
1995                 }
1996                 rtnl_unlock();
1997
1998                 return err;
1999         }
2000
2001         return -EINVAL;
2002 }
2003
2004 /*
2005  *      Drop the packet on the floor
2006  */
2007
2008 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2009 {
2010         int type;
2011         struct dst_entry *dst = skb_dst(skb);
2012         switch (ipstats_mib_noroutes) {
2013         case IPSTATS_MIB_INNOROUTES:
2014                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2015                 if (type == IPV6_ADDR_ANY) {
2016                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2017                                       IPSTATS_MIB_INADDRERRORS);
2018                         break;
2019                 }
2020                 /* FALLTHROUGH */
2021         case IPSTATS_MIB_OUTNOROUTES:
2022                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2023                               ipstats_mib_noroutes);
2024                 break;
2025         }
2026         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2027         kfree_skb(skb);
2028         return 0;
2029 }
2030
2031 static int ip6_pkt_discard(struct sk_buff *skb)
2032 {
2033         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2034 }
2035
2036 static int ip6_pkt_discard_out(struct sk_buff *skb)
2037 {
2038         skb->dev = skb_dst(skb)->dev;
2039         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2040 }
2041
2042 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2043
2044 static int ip6_pkt_prohibit(struct sk_buff *skb)
2045 {
2046         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2047 }
2048
2049 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2050 {
2051         skb->dev = skb_dst(skb)->dev;
2052         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2053 }
2054
2055 #endif
2056
2057 /*
2058  *      Allocate a dst for local (unicast / anycast) address.
2059  */
2060
2061 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2062                                     const struct in6_addr *addr,
2063                                     bool anycast)
2064 {
2065         struct net *net = dev_net(idev->dev);
2066         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2067                                             net->loopback_dev, 0);
2068         int err;
2069
2070         if (!rt) {
2071                 if (net_ratelimit())
2072                         pr_warning("IPv6:  Maximum number of routes reached,"
2073                                    " consider increasing route/max_size.\n");
2074                 return ERR_PTR(-ENOMEM);
2075         }
2076
2077         in6_dev_hold(idev);
2078
2079         rt->dst.flags |= DST_HOST;
2080         rt->dst.input = ip6_input;
2081         rt->dst.output = ip6_output;
2082         rt->rt6i_idev = idev;
2083         rt->dst.obsolete = -1;
2084
2085         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2086         if (anycast)
2087                 rt->rt6i_flags |= RTF_ANYCAST;
2088         else
2089                 rt->rt6i_flags |= RTF_LOCAL;
2090         err = rt6_bind_neighbour(rt, rt->dst.dev);
2091         if (err) {
2092                 dst_free(&rt->dst);
2093                 return ERR_PTR(err);
2094         }
2095
2096         rt->rt6i_dst.addr = *addr;
2097         rt->rt6i_dst.plen = 128;
2098         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2099
2100         atomic_set(&rt->dst.__refcnt, 1);
2101
2102         return rt;
2103 }
2104
2105 int ip6_route_get_saddr(struct net *net,
2106                         struct rt6_info *rt,
2107                         const struct in6_addr *daddr,
2108                         unsigned int prefs,
2109                         struct in6_addr *saddr)
2110 {
2111         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2112         int err = 0;
2113         if (rt->rt6i_prefsrc.plen)
2114                 *saddr = rt->rt6i_prefsrc.addr;
2115         else
2116                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2117                                          daddr, prefs, saddr);
2118         return err;
2119 }
2120
2121 /* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address compared against prefsrc */
};
2127
2128 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2129 {
2130         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2131         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2132         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2133
2134         if (((void *)rt->dst.dev == dev || !dev) &&
2135             rt != net->ipv6.ip6_null_entry &&
2136             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2137                 /* remove prefsrc entry */
2138                 rt->rt6i_prefsrc.plen = 0;
2139         }
2140         return 0;
2141 }
2142
2143 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2144 {
2145         struct net *net = dev_net(ifp->idev->dev);
2146         struct arg_dev_net_ip adni = {
2147                 .dev = ifp->idev->dev,
2148                 .net = net,
2149                 .addr = &ifp->addr,
2150         };
2151         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2152 }
2153
/* Argument bundle handed to fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
};
2158
2159 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2160 {
2161         const struct arg_dev_net *adn = arg;
2162         const struct net_device *dev = adn->dev;
2163
2164         if ((rt->dst.dev == dev || !dev) &&
2165             rt != adn->net->ipv6.ip6_null_entry)
2166                 return -1;
2167
2168         return 0;
2169 }
2170
2171 void rt6_ifdown(struct net *net, struct net_device *dev)
2172 {
2173         struct arg_dev_net adn = {
2174                 .dev = dev,
2175                 .net = net,
2176         };
2177
2178         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2179         icmp6_clean_all(fib6_ifdown, &adn);
2180 }
2181
/* Argument bundle handed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
2187
2188 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2189 {
2190         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2191         struct inet6_dev *idev;
2192
2193         /* In IPv6 pmtu discovery is not optional,
2194            so that RTAX_MTU lock cannot disable it.
2195            We still use this lock to block changes
2196            caused by addrconf/ndisc.
2197         */
2198
2199         idev = __in6_dev_get(arg->dev);
2200         if (!idev)
2201                 return 0;
2202
2203         /* For administrative MTU increase, there is no way to discover
2204            IPv6 PMTU increase, so PMTU increase should be updated here.
2205            Since RFC 1981 doesn't include administrative MTU increase
2206            update PMTU increase is a MUST. (i.e. jumbo frame)
2207          */
2208         /*
2209            If new MTU is less than route PMTU, this new MTU will be the
2210            lowest MTU in the path, update the route PMTU to reflect PMTU
2211            decreases; if new MTU is greater than route PMTU, and the
2212            old MTU is the lowest MTU in the path, update the route PMTU
2213            to reflect the increase. In this case if the other nodes' MTU
2214            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2215            PMTU discouvery.
2216          */
2217         if (rt->dst.dev == arg->dev &&
2218             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2219             (dst_mtu(&rt->dst) >= arg->mtu ||
2220              (dst_mtu(&rt->dst) < arg->mtu &&
2221               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2222                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2223         }
2224         return 0;
2225 }
2226
2227 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2228 {
2229         struct rt6_mtu_change_arg arg = {
2230                 .dev = dev,
2231                 .mtu = mtu,
2232         };
2233
2234         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2235 }
2236
2237 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2238         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2239         [RTA_OIF]               = { .type = NLA_U32 },
2240         [RTA_IIF]               = { .type = NLA_U32 },
2241         [RTA_PRIORITY]          = { .type = NLA_U32 },
2242         [RTA_METRICS]           = { .type = NLA_NESTED },
2243 };
2244
2245 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2246                               struct fib6_config *cfg)
2247 {
2248         struct rtmsg *rtm;
2249         struct nlattr *tb[RTA_MAX+1];
2250         int err;
2251
2252         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2253         if (err < 0)
2254                 goto errout;
2255
2256         err = -EINVAL;
2257         rtm = nlmsg_data(nlh);
2258         memset(cfg, 0, sizeof(*cfg));
2259
2260         cfg->fc_table = rtm->rtm_table;
2261         cfg->fc_dst_len = rtm->rtm_dst_len;
2262         cfg->fc_src_len = rtm->rtm_src_len;
2263         cfg->fc_flags = RTF_UP;
2264         cfg->fc_protocol = rtm->rtm_protocol;
2265
2266         if (rtm->rtm_type == RTN_UNREACHABLE)
2267                 cfg->fc_flags |= RTF_REJECT;
2268
2269         if (rtm->rtm_type == RTN_LOCAL)
2270                 cfg->fc_flags |= RTF_LOCAL;
2271
2272         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2273         cfg->fc_nlinfo.nlh = nlh;
2274         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2275
2276         if (tb[RTA_GATEWAY]) {
2277                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2278                 cfg->fc_flags |= RTF_GATEWAY;
2279         }
2280
2281         if (tb[RTA_DST]) {
2282                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2283
2284                 if (nla_len(tb[RTA_DST]) < plen)
2285                         goto errout;
2286
2287                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2288         }
2289
2290         if (tb[RTA_SRC]) {
2291                 int plen = (rtm->rtm_src_len + 7) >> 3;
2292
2293                 if (nla_len(tb[RTA_SRC]) < plen)
2294                         goto errout;
2295
2296                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2297         }
2298
2299         if (tb[RTA_PREFSRC])
2300                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2301
2302         if (tb[RTA_OIF])
2303                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2304
2305         if (tb[RTA_PRIORITY])
2306                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2307
2308         if (tb[RTA_METRICS]) {
2309                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2310                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2311         }
2312
2313         if (tb[RTA_TABLE])
2314                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2315
2316         err = 0;
2317 errout:
2318         return err;
2319 }
2320
2321 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2322 {
2323         struct fib6_config cfg;
2324         int err;
2325
2326         err = rtm_to_fib6_config(skb, nlh, &cfg);
2327         if (err < 0)
2328                 return err;
2329
2330         return ip6_route_del(&cfg);
2331 }
2332
2333 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2334 {
2335         struct fib6_config cfg;
2336         int err;
2337
2338         err = rtm_to_fib6_config(skb, nlh, &cfg);
2339         if (err < 0)
2340                 return err;
2341
2342         return ip6_route_add(&cfg);
2343 }
2344
2345 static inline size_t rt6_nlmsg_size(void)
2346 {
2347         return NLMSG_ALIGN(sizeof(struct rtmsg))
2348                + nla_total_size(16) /* RTA_SRC */
2349                + nla_total_size(16) /* RTA_DST */
2350                + nla_total_size(16) /* RTA_GATEWAY */
2351                + nla_total_size(16) /* RTA_PREFSRC */
2352                + nla_total_size(4) /* RTA_TABLE */
2353                + nla_total_size(4) /* RTA_IIF */
2354                + nla_total_size(4) /* RTA_OIF */
2355                + nla_total_size(4) /* RTA_PRIORITY */
2356                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2357                + nla_total_size(sizeof(struct rta_cacheinfo));
2358 }
2359
2360 static int rt6_fill_node(struct net *net,
2361                          struct sk_buff *skb, struct rt6_info *rt,
2362                          struct in6_addr *dst, struct in6_addr *src,
2363                          int iif, int type, u32 pid, u32 seq,
2364                          int prefix, int nowait, unsigned int flags)
2365 {
2366         const struct inet_peer *peer;
2367         struct rtmsg *rtm;
2368         struct nlmsghdr *nlh;
2369         long expires;
2370         u32 table;
2371         struct neighbour *n;
2372         u32 ts, tsage;
2373
2374         if (prefix) {   /* user wants prefix routes only */
2375                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2376                         /* success since this is not a prefix route */
2377                         return 1;
2378                 }
2379         }
2380
2381         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2382         if (!nlh)
2383                 return -EMSGSIZE;
2384
2385         rtm = nlmsg_data(nlh);
2386         rtm->rtm_family = AF_INET6;
2387         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2388         rtm->rtm_src_len = rt->rt6i_src.plen;
2389         rtm->rtm_tos = 0;
2390         if (rt->rt6i_table)
2391                 table = rt->rt6i_table->tb6_id;
2392         else
2393                 table = RT6_TABLE_UNSPEC;
2394         rtm->rtm_table = table;
2395         NLA_PUT_U32(skb, RTA_TABLE, table);
2396         if (rt->rt6i_flags & RTF_REJECT)
2397                 rtm->rtm_type = RTN_UNREACHABLE;
2398         else if (rt->rt6i_flags & RTF_LOCAL)
2399                 rtm->rtm_type = RTN_LOCAL;
2400         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2401                 rtm->rtm_type = RTN_LOCAL;
2402         else
2403                 rtm->rtm_type = RTN_UNICAST;
2404         rtm->rtm_flags = 0;
2405         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2406         rtm->rtm_protocol = rt->rt6i_protocol;
2407         if (rt->rt6i_flags & RTF_DYNAMIC)
2408                 rtm->rtm_protocol = RTPROT_REDIRECT;
2409         else if (rt->rt6i_flags & RTF_ADDRCONF)
2410                 rtm->rtm_protocol = RTPROT_KERNEL;
2411         else if (rt->rt6i_flags & RTF_DEFAULT)
2412                 rtm->rtm_protocol = RTPROT_RA;
2413
2414         if (rt->rt6i_flags & RTF_CACHE)
2415                 rtm->rtm_flags |= RTM_F_CLONED;
2416
2417         if (dst) {
2418                 NLA_PUT(skb, RTA_DST, 16, dst);
2419                 rtm->rtm_dst_len = 128;
2420         } else if (rtm->rtm_dst_len)
2421                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2422 #ifdef CONFIG_IPV6_SUBTREES
2423         if (src) {
2424                 NLA_PUT(skb, RTA_SRC, 16, src);
2425                 rtm->rtm_src_len = 128;
2426         } else if (rtm->rtm_src_len)
2427                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2428 #endif
2429         if (iif) {
2430 #ifdef CONFIG_IPV6_MROUTE
2431                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2432                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2433                         if (err <= 0) {
2434                                 if (!nowait) {
2435                                         if (err == 0)
2436                                                 return 0;
2437                                         goto nla_put_failure;
2438                                 } else {
2439                                         if (err == -EMSGSIZE)
2440                                                 goto nla_put_failure;
2441                                 }
2442                         }
2443                 } else
2444 #endif
2445                         NLA_PUT_U32(skb, RTA_IIF, iif);
2446         } else if (dst) {
2447                 struct in6_addr saddr_buf;
2448                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2449                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2450         }
2451
2452         if (rt->rt6i_prefsrc.plen) {
2453                 struct in6_addr saddr_buf;
2454                 saddr_buf = rt->rt6i_prefsrc.addr;
2455                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2456         }
2457
2458         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2459                 goto nla_put_failure;
2460
2461         rcu_read_lock();
2462         n = dst_get_neighbour_noref(&rt->dst);
2463         if (n)
2464                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2465         rcu_read_unlock();
2466
2467         if (rt->dst.dev)
2468                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2469
2470         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2471
2472         if (!(rt->rt6i_flags & RTF_EXPIRES))
2473                 expires = 0;
2474         else if (rt->dst.expires - jiffies < INT_MAX)
2475                 expires = rt->dst.expires - jiffies;
2476         else
2477                 expires = INT_MAX;
2478
2479         peer = rt->rt6i_peer;
2480         ts = tsage = 0;
2481         if (peer && peer->tcp_ts_stamp) {
2482                 ts = peer->tcp_ts;
2483                 tsage = get_seconds() - peer->tcp_ts_stamp;
2484         }
2485
2486         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2487                                expires, rt->dst.error) < 0)
2488                 goto nla_put_failure;
2489
2490         return nlmsg_end(skb, nlh);
2491
2492 nla_put_failure:
2493         nlmsg_cancel(skb, nlh);
2494         return -EMSGSIZE;
2495 }
2496
2497 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2498 {
2499         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2500         int prefix;
2501
2502         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2503                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2504                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2505         } else
2506                 prefix = 0;
2507
2508         return rt6_fill_node(arg->net,
2509                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2510                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2511                      prefix, 0, NLM_F_MULTI);
2512 }
2513
2514 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2515 {
2516         struct net *net = sock_net(in_skb->sk);
2517         struct nlattr *tb[RTA_MAX+1];
2518         struct rt6_info *rt;
2519         struct sk_buff *skb;
2520         struct rtmsg *rtm;
2521         struct flowi6 fl6;
2522         int err, iif = 0;
2523
2524         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2525         if (err < 0)
2526                 goto errout;
2527
2528         err = -EINVAL;
2529         memset(&fl6, 0, sizeof(fl6));
2530
2531         if (tb[RTA_SRC]) {
2532                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2533                         goto errout;
2534
2535                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2536         }
2537
2538         if (tb[RTA_DST]) {
2539                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2540                         goto errout;
2541
2542                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2543         }
2544
2545         if (tb[RTA_IIF])
2546                 iif = nla_get_u32(tb[RTA_IIF]);
2547
2548         if (tb[RTA_OIF])
2549                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2550
2551         if (iif) {
2552                 struct net_device *dev;
2553                 dev = __dev_get_by_index(net, iif);
2554                 if (!dev) {
2555                         err = -ENODEV;
2556                         goto errout;
2557                 }
2558         }
2559
2560         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2561         if (!skb) {
2562                 err = -ENOBUFS;
2563                 goto errout;
2564         }
2565
2566         /* Reserve room for dummy headers, this skb can pass
2567            through good chunk of routing engine.
2568          */
2569         skb_reset_mac_header(skb);
2570         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2571
2572         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2573         skb_dst_set(skb, &rt->dst);
2574
2575         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2576                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2577                             nlh->nlmsg_seq, 0, 0, 0);
2578         if (err < 0) {
2579                 kfree_skb(skb);
2580                 goto errout;
2581         }
2582
2583         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2584 errout:
2585         return err;
2586 }
2587
2588 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2589 {
2590         struct sk_buff *skb;
2591         struct net *net = info->nl_net;
2592         u32 seq;
2593         int err;
2594
2595         err = -ENOBUFS;
2596         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2597
2598         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2599         if (!skb)
2600                 goto errout;
2601
2602         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2603                                 event, info->pid, seq, 0, 0, 0);
2604         if (err < 0) {
2605                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2606                 WARN_ON(err == -EMSGSIZE);
2607                 kfree_skb(skb);
2608                 goto errout;
2609         }
2610         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2611                     info->nlh, gfp_any());
2612         return;
2613 errout:
2614         if (err < 0)
2615                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2616 }
2617
2618 static int ip6_route_dev_notify(struct notifier_block *this,
2619                                 unsigned long event, void *data)
2620 {
2621         struct net_device *dev = (struct net_device *)data;
2622         struct net *net = dev_net(dev);
2623
2624         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2625                 net->ipv6.ip6_null_entry->dst.dev = dev;
2626                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2627 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2628                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2629                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2630                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2631                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2632 #endif
2633         }
2634
2635         return NOTIFY_OK;
2636 }
2637
2638 /*
2639  *      /proc
2640  */
2641
2642 #ifdef CONFIG_PROC_FS
2643
/*
 * Bookkeeping record: one buffer pointer plus four integer counters.
 * NOTE(review): the /proc handlers that follow this definition use seq_file
 * and do not reference this struct; confirm whether it still has a user.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2652
2653 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2654 {
2655         struct seq_file *m = p_arg;
2656         struct neighbour *n;
2657
2658         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2659
2660 #ifdef CONFIG_IPV6_SUBTREES
2661         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2662 #else
2663         seq_puts(m, "00000000000000000000000000000000 00 ");
2664 #endif
2665         rcu_read_lock();
2666         n = dst_get_neighbour_noref(&rt->dst);
2667         if (n) {
2668                 seq_printf(m, "%pi6", n->primary_key);
2669         } else {
2670                 seq_puts(m, "00000000000000000000000000000000");
2671         }
2672         rcu_read_unlock();
2673         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2674                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2675                    rt->dst.__use, rt->rt6i_flags,
2676                    rt->dst.dev ? rt->dst.dev->name : "");
2677         return 0;
2678 }
2679
2680 static int ipv6_route_show(struct seq_file *m, void *v)
2681 {
2682         struct net *net = (struct net *)m->private;
2683         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2684         return 0;
2685 }
2686
2687 static int ipv6_route_open(struct inode *inode, struct file *file)
2688 {
2689         return single_open_net(inode, file, ipv6_route_show);
2690 }
2691
2692 static const struct file_operations ipv6_route_proc_fops = {
2693         .owner          = THIS_MODULE,
2694         .open           = ipv6_route_open,
2695         .read           = seq_read,
2696         .llseek         = seq_lseek,
2697         .release        = single_release_net,
2698 };
2699
2700 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2701 {
2702         struct net *net = (struct net *)seq->private;
2703         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2704                    net->ipv6.rt6_stats->fib_nodes,
2705                    net->ipv6.rt6_stats->fib_route_nodes,
2706                    net->ipv6.rt6_stats->fib_rt_alloc,
2707                    net->ipv6.rt6_stats->fib_rt_entries,
2708                    net->ipv6.rt6_stats->fib_rt_cache,
2709                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2710                    net->ipv6.rt6_stats->fib_discarded_routes);
2711
2712         return 0;
2713 }
2714
2715 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2716 {
2717         return single_open_net(inode, file, rt6_stats_seq_show);
2718 }
2719
2720 static const struct file_operations rt6_stats_seq_fops = {
2721         .owner   = THIS_MODULE,
2722         .open    = rt6_stats_seq_open,
2723         .read    = seq_read,
2724         .llseek  = seq_lseek,
2725         .release = single_release_net,
2726 };
2727 #endif  /* CONFIG_PROC_FS */
2728
2729 #ifdef CONFIG_SYSCTL
2730
2731 static
2732 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2733                               void __user *buffer, size_t *lenp, loff_t *ppos)
2734 {
2735         struct net *net;
2736         int delay;
2737         if (!write)
2738                 return -EINVAL;
2739
2740         net = (struct net *)ctl->extra1;
2741         delay = net->ipv6.sysctl.flush_delay;
2742         proc_dointvec(ctl, write, buffer, lenp, ppos);
2743         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2744         return 0;
2745 }
2746
2747 ctl_table ipv6_route_table_template[] = {
2748         {
2749                 .procname       =       "flush",
2750                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2751                 .maxlen         =       sizeof(int),
2752                 .mode           =       0200,
2753                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2754         },
2755         {
2756                 .procname       =       "gc_thresh",
2757                 .data           =       &ip6_dst_ops_template.gc_thresh,
2758                 .maxlen         =       sizeof(int),
2759                 .mode           =       0644,
2760                 .proc_handler   =       proc_dointvec,
2761         },
2762         {
2763                 .procname       =       "max_size",
2764                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2765                 .maxlen         =       sizeof(int),
2766                 .mode           =       0644,
2767                 .proc_handler   =       proc_dointvec,
2768         },
2769         {
2770                 .procname       =       "gc_min_interval",
2771                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2772                 .maxlen         =       sizeof(int),
2773                 .mode           =       0644,
2774                 .proc_handler   =       proc_dointvec_jiffies,
2775         },
2776         {
2777                 .procname       =       "gc_timeout",
2778                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2779                 .maxlen         =       sizeof(int),
2780                 .mode           =       0644,
2781                 .proc_handler   =       proc_dointvec_jiffies,
2782         },
2783         {
2784                 .procname       =       "gc_interval",
2785                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2786                 .maxlen         =       sizeof(int),
2787                 .mode           =       0644,
2788                 .proc_handler   =       proc_dointvec_jiffies,
2789         },
2790         {
2791                 .procname       =       "gc_elasticity",
2792                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2793                 .maxlen         =       sizeof(int),
2794                 .mode           =       0644,
2795                 .proc_handler   =       proc_dointvec,
2796         },
2797         {
2798                 .procname       =       "mtu_expires",
2799                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2800                 .maxlen         =       sizeof(int),
2801                 .mode           =       0644,
2802                 .proc_handler   =       proc_dointvec_jiffies,
2803         },
2804         {
2805                 .procname       =       "min_adv_mss",
2806                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2807                 .maxlen         =       sizeof(int),
2808                 .mode           =       0644,
2809                 .proc_handler   =       proc_dointvec,
2810         },
2811         {
2812                 .procname       =       "gc_min_interval_ms",
2813                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2814                 .maxlen         =       sizeof(int),
2815                 .mode           =       0644,
2816                 .proc_handler   =       proc_dointvec_ms_jiffies,
2817         },
2818         { }
2819 };
2820
2821 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2822 {
2823         struct ctl_table *table;
2824
2825         table = kmemdup(ipv6_route_table_template,
2826                         sizeof(ipv6_route_table_template),
2827                         GFP_KERNEL);
2828
2829         if (table) {
2830                 table[0].data = &net->ipv6.sysctl.flush_delay;
2831                 table[0].extra1 = net;
2832                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2833                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2834                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2835                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2836                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2837                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2838                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2839                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2840                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2841         }
2842
2843         return table;
2844 }
2845 #endif
2846
2847 static int __net_init ip6_route_net_init(struct net *net)
2848 {
2849         int ret = -ENOMEM;
2850
2851         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2852                sizeof(net->ipv6.ip6_dst_ops));
2853
2854         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2855                 goto out_ip6_dst_ops;
2856
2857         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2858                                            sizeof(*net->ipv6.ip6_null_entry),
2859                                            GFP_KERNEL);
2860         if (!net->ipv6.ip6_null_entry)
2861                 goto out_ip6_dst_entries;
2862         net->ipv6.ip6_null_entry->dst.path =
2863                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2864         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2865         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2866                          ip6_template_metrics, true);
2867
2868 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2869         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2870                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2871                                                GFP_KERNEL);
2872         if (!net->ipv6.ip6_prohibit_entry)
2873                 goto out_ip6_null_entry;
2874         net->ipv6.ip6_prohibit_entry->dst.path =
2875                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2876         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2877         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2878                          ip6_template_metrics, true);
2879
2880         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2881                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2882                                                GFP_KERNEL);
2883         if (!net->ipv6.ip6_blk_hole_entry)
2884                 goto out_ip6_prohibit_entry;
2885         net->ipv6.ip6_blk_hole_entry->dst.path =
2886                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2887         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2888         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2889                          ip6_template_metrics, true);
2890 #endif
2891
2892         net->ipv6.sysctl.flush_delay = 0;
2893         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2894         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2895         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2896         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2897         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2898         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2899         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2900
2901 #ifdef CONFIG_PROC_FS
2902         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2903         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2904 #endif
2905         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2906
2907         ret = 0;
2908 out:
2909         return ret;
2910
2911 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2912 out_ip6_prohibit_entry:
2913         kfree(net->ipv6.ip6_prohibit_entry);
2914 out_ip6_null_entry:
2915         kfree(net->ipv6.ip6_null_entry);
2916 #endif
2917 out_ip6_dst_entries:
2918         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2919 out_ip6_dst_ops:
2920         goto out;
2921 }
2922
2923 static void __net_exit ip6_route_net_exit(struct net *net)
2924 {
2925 #ifdef CONFIG_PROC_FS
2926         proc_net_remove(net, "ipv6_route");
2927         proc_net_remove(net, "rt6_stats");
2928 #endif
2929         kfree(net->ipv6.ip6_null_entry);
2930 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2931         kfree(net->ipv6.ip6_prohibit_entry);
2932         kfree(net->ipv6.ip6_blk_hole_entry);
2933 #endif
2934         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2935 }
2936
2937 static struct pernet_operations ip6_route_net_ops = {
2938         .init = ip6_route_net_init,
2939         .exit = ip6_route_net_exit,
2940 };
2941
2942 static struct notifier_block ip6_route_dev_notifier = {
2943         .notifier_call = ip6_route_dev_notify,
2944         .priority = 0,
2945 };
2946
2947 int __init ip6_route_init(void)
2948 {
2949         int ret;
2950
2951         ret = -ENOMEM;
2952         ip6_dst_ops_template.kmem_cachep =
2953                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2954                                   SLAB_HWCACHE_ALIGN, NULL);
2955         if (!ip6_dst_ops_template.kmem_cachep)
2956                 goto out;
2957
2958         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2959         if (ret)
2960                 goto out_kmem_cache;
2961
2962         ret = register_pernet_subsys(&ip6_route_net_ops);
2963         if (ret)
2964                 goto out_dst_entries;
2965
2966         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2967
2968         /* Registering of the loopback is done before this portion of code,
2969          * the loopback reference in rt6_info will not be taken, do it
2970          * manually for init_net */
2971         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2972         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2973   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2974         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2975         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2977         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2978   #endif
2979         ret = fib6_init();
2980         if (ret)
2981                 goto out_register_subsys;
2982
2983         ret = xfrm6_init();
2984         if (ret)
2985                 goto out_fib6_init;
2986
2987         ret = fib6_rules_init();
2988         if (ret)
2989                 goto xfrm6_init;
2990
2991         ret = -ENOBUFS;
2992         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2993             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2994             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2995                 goto fib6_rules_init;
2996
2997         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2998         if (ret)
2999                 goto fib6_rules_init;
3000
3001 out:
3002         return ret;
3003
3004 fib6_rules_init:
3005         fib6_rules_cleanup();
3006 xfrm6_init:
3007         xfrm6_fini();
3008 out_fib6_init:
3009         fib6_gc_cleanup();
3010 out_register_subsys:
3011         unregister_pernet_subsys(&ip6_route_net_ops);
3012 out_dst_entries:
3013         dst_entries_destroy(&ip6_dst_blackhole_ops);
3014 out_kmem_cache:
3015         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3016         goto out;
3017 }
3018
3019 void ip6_route_cleanup(void)
3020 {
3021         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3022         fib6_rules_cleanup();
3023         xfrm6_fini();
3024         fib6_gc_cleanup();
3025         unregister_pernet_subsys(&ip6_route_net_ops);
3026         dst_entries_destroy(&ip6_dst_blackhole_ops);
3027         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3028 }