net: lwtunnel: Handle fragmentation
net/ipv6/route.c [cascardo/linux.git]
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <asm/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104                                            const struct in6_addr *prefix, int prefixlen,
105                                            const struct in6_addr *gwaddr, int ifindex,
106                                            unsigned int pref);
107 static struct rt6_info *rt6_get_route_info(struct net *net,
108                                            const struct in6_addr *prefix, int prefixlen,
109                                            const struct in6_addr *gwaddr, int ifindex);
110 #endif
111
112 struct uncached_list {
113         spinlock_t              lock;
114         struct list_head        head;
115 };
116
117 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
118
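/*
 * Uncached (DST_NOCACHE) routes are not linked into the fib6 tree, so they
 * are tracked on per-cpu lists instead.  rt6_uncached_list_flush_dev() walks
 * these lists when a device goes away and re-points dst.dev and rt6i_idev at
 * the loopback device so the entries stay usable until they are released.
 */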
119 static void rt6_uncached_list_add(struct rt6_info *rt)
120 {
121         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
122
123         rt->dst.flags |= DST_NOCACHE;
124         rt->rt6i_uncached_list = ul;
125
126         spin_lock_bh(&ul->lock);
127         list_add_tail(&rt->rt6i_uncached, &ul->head);
128         spin_unlock_bh(&ul->lock);
129 }
130
131 static void rt6_uncached_list_del(struct rt6_info *rt)
132 {
133         if (!list_empty(&rt->rt6i_uncached)) {
134                 struct uncached_list *ul = rt->rt6i_uncached_list;
135
136                 spin_lock_bh(&ul->lock);
137                 list_del(&rt->rt6i_uncached);
138                 spin_unlock_bh(&ul->lock);
139         }
140 }
141
142 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
143 {
144         struct net_device *loopback_dev = net->loopback_dev;
145         int cpu;
146
147         if (dev == loopback_dev)
148                 return;
149
150         for_each_possible_cpu(cpu) {
151                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
152                 struct rt6_info *rt;
153
154                 spin_lock_bh(&ul->lock);
155                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
156                         struct inet6_dev *rt_idev = rt->rt6i_idev;
157                         struct net_device *rt_dev = rt->dst.dev;
158
159                         if (rt_idev->dev == dev) {
160                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
161                                 in6_dev_put(rt_idev);
162                         }
163
164                         if (rt_dev == dev) {
165                                 rt->dst.dev = loopback_dev;
166                                 dev_hold(rt->dst.dev);
167                                 dev_put(rt_dev);
168                         }
169                 }
170                 spin_unlock_bh(&ul->lock);
171         }
172 }
173
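/*
 * Copy-on-write of route metrics: RTF_PCPU clones write through to the
 * metrics of the route they were copied from (dst.from), RTF_CACHE clones
 * never get writable metrics here, and all other routes fall back to the
 * generic copy-on-write helper.
 */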
174 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
175 {
176         return dst_metrics_write_ptr(rt->dst.from);
177 }
178
179 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
180 {
181         struct rt6_info *rt = (struct rt6_info *)dst;
182
183         if (rt->rt6i_flags & RTF_PCPU)
184                 return rt6_pcpu_cow_metrics(rt);
185         else if (rt->rt6i_flags & RTF_CACHE)
186                 return NULL;
187         else
188                 return dst_cow_metrics_generic(dst, old);
189 }
190
191 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         struct in6_addr *p = &rt->rt6i_gateway;
196
197         if (!ipv6_addr_any(p))
198                 return (const void *) p;
199         else if (skb)
200                 return &ipv6_hdr(skb)->daddr;
201         return daddr;
202 }
203
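/*
 * Resolve the neighbour for a route: key on the gateway when one is set,
 * otherwise on the packet's (or the caller-supplied) destination address,
 * creating the ndisc neighbour entry if it does not exist yet.
 */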
204 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
205                                           struct sk_buff *skb,
206                                           const void *daddr)
207 {
208         struct rt6_info *rt = (struct rt6_info *) dst;
209         struct neighbour *n;
210
211         daddr = choose_neigh_daddr(rt, skb, daddr);
212         n = __ipv6_neigh_lookup(dst->dev, daddr);
213         if (n)
214                 return n;
215         return neigh_create(&nd_tbl, daddr, dst->dev);
216 }
217
218 static struct dst_ops ip6_dst_ops_template = {
219         .family                 =       AF_INET6,
220         .gc                     =       ip6_dst_gc,
221         .gc_thresh              =       1024,
222         .check                  =       ip6_dst_check,
223         .default_advmss         =       ip6_default_advmss,
224         .mtu                    =       ip6_mtu,
225         .cow_metrics            =       ipv6_cow_metrics,
226         .destroy                =       ip6_dst_destroy,
227         .ifdown                 =       ip6_dst_ifdown,
228         .negative_advice        =       ip6_negative_advice,
229         .link_failure           =       ip6_link_failure,
230         .update_pmtu            =       ip6_rt_update_pmtu,
231         .redirect               =       rt6_do_redirect,
232         .local_out              =       __ip6_local_out,
233         .neigh_lookup           =       ip6_neigh_lookup,
234 };
235
236 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
237 {
238         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
239
240         return mtu ? : dst->dev->mtu;
241 }
242
243 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
244                                          struct sk_buff *skb, u32 mtu)
245 {
246 }
247
248 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
249                                       struct sk_buff *skb)
250 {
251 }
252
253 static struct dst_ops ip6_dst_blackhole_ops = {
254         .family                 =       AF_INET6,
255         .destroy                =       ip6_dst_destroy,
256         .check                  =       ip6_dst_check,
257         .mtu                    =       ip6_blackhole_mtu,
258         .default_advmss         =       ip6_default_advmss,
259         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
260         .redirect               =       ip6_rt_blackhole_redirect,
261         .cow_metrics            =       dst_cow_metrics_generic,
262         .neigh_lookup           =       ip6_neigh_lookup,
263 };
264
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266         [RTAX_HOPLIMIT - 1] = 0,
267 };
268
269 static const struct rt6_info ip6_null_entry_template = {
270         .dst = {
271                 .__refcnt       = ATOMIC_INIT(1),
272                 .__use          = 1,
273                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
274                 .error          = -ENETUNREACH,
275                 .input          = ip6_pkt_discard,
276                 .output         = ip6_pkt_discard_out,
277         },
278         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
279         .rt6i_protocol  = RTPROT_KERNEL,
280         .rt6i_metric    = ~(u32) 0,
281         .rt6i_ref       = ATOMIC_INIT(1),
282 };
283
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285
286 static const struct rt6_info ip6_prohibit_entry_template = {
287         .dst = {
288                 .__refcnt       = ATOMIC_INIT(1),
289                 .__use          = 1,
290                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
291                 .error          = -EACCES,
292                 .input          = ip6_pkt_prohibit,
293                 .output         = ip6_pkt_prohibit_out,
294         },
295         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
296         .rt6i_protocol  = RTPROT_KERNEL,
297         .rt6i_metric    = ~(u32) 0,
298         .rt6i_ref       = ATOMIC_INIT(1),
299 };
300
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -EINVAL,
307                 .input          = dst_discard,
308                 .output         = dst_discard_out,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311         .rt6i_protocol  = RTPROT_KERNEL,
312         .rt6i_metric    = ~(u32) 0,
313         .rt6i_ref       = ATOMIC_INIT(1),
314 };
315
316 #endif
317
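/*
 * Zero everything behind the embedded dst_entry (the first member of
 * struct rt6_info) and initialise the sibling and uncached list heads.
 */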
318 static void rt6_info_init(struct rt6_info *rt)
319 {
320         struct dst_entry *dst = &rt->dst;
321
322         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
323         INIT_LIST_HEAD(&rt->rt6i_siblings);
324         INIT_LIST_HEAD(&rt->rt6i_uncached);
325 }
326
327 /* allocate dst with ip6_dst_ops */
328 static struct rt6_info *__ip6_dst_alloc(struct net *net,
329                                         struct net_device *dev,
330                                         int flags)
331 {
332         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
333                                         0, DST_OBSOLETE_FORCE_CHK, flags);
334
335         if (rt)
336                 rt6_info_init(rt);
337
338         return rt;
339 }
340
341 struct rt6_info *ip6_dst_alloc(struct net *net,
342                                struct net_device *dev,
343                                int flags)
344 {
345         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
346
347         if (rt) {
348                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
349                 if (rt->rt6i_pcpu) {
350                         int cpu;
351
352                         for_each_possible_cpu(cpu) {
353                                 struct rt6_info **p;
354
355                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
356                                 /* no one shares rt */
357                                 *p =  NULL;
358                         }
359                 } else {
360                         dst_destroy((struct dst_entry *)rt);
361                         return NULL;
362                 }
363         }
364
365         return rt;
366 }
367 EXPORT_SYMBOL(ip6_dst_alloc);
368
369 static void ip6_dst_destroy(struct dst_entry *dst)
370 {
371         struct rt6_info *rt = (struct rt6_info *)dst;
372         struct dst_entry *from = dst->from;
373         struct inet6_dev *idev;
374
375         dst_destroy_metrics_generic(dst);
376         free_percpu(rt->rt6i_pcpu);
377         rt6_uncached_list_del(rt);
378
379         idev = rt->rt6i_idev;
380         if (idev) {
381                 rt->rt6i_idev = NULL;
382                 in6_dev_put(idev);
383         }
384
385         dst->from = NULL;
386         dst_release(from);
387 }
388
389 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
390                            int how)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct inet6_dev *idev = rt->rt6i_idev;
394         struct net_device *loopback_dev =
395                 dev_net(dev)->loopback_dev;
396
397         if (dev != loopback_dev) {
398                 if (idev && idev->dev == dev) {
399                         struct inet6_dev *loopback_idev =
400                                 in6_dev_get(loopback_dev);
401                         if (loopback_idev) {
402                                 rt->rt6i_idev = loopback_idev;
403                                 in6_dev_put(idev);
404                         }
405                 }
406         }
407 }
408
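/*
 * __rt6_check_expired() only looks at this route's own RTF_EXPIRES timer;
 * rt6_check_expired() also follows dst.from, so a cached or per-cpu clone
 * is considered expired once the route it was copied from expires.
 */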
409 static bool __rt6_check_expired(const struct rt6_info *rt)
410 {
411         if (rt->rt6i_flags & RTF_EXPIRES)
412                 return time_after(jiffies, rt->dst.expires);
413         else
414                 return false;
415 }
416
417 static bool rt6_check_expired(const struct rt6_info *rt)
418 {
419         if (rt->rt6i_flags & RTF_EXPIRES) {
420                 if (time_after(jiffies, rt->dst.expires))
421                         return true;
422         } else if (rt->dst.from) {
423                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
424         }
425         return false;
426 }
427
428 /* Multipath route selection:
429  *   Hash based function using packet header and flowlabel.
430  * Adapted from fib_info_hashfn()
431  */
432 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
433                                const struct flowi6 *fl6)
434 {
435         return get_hash_from_flowi6(fl6) % candidate_count;
436 }
437
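/*
 * Walk the sibling list and, when the flow hash selects a sibling, switch
 * to it unless rt6_score_route() reports a failure, in which case the
 * original match is kept.  route_choosen == 0 also keeps the original
 * match, since the sibling list does not include it.
 */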
438 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
439                                              struct flowi6 *fl6, int oif,
440                                              int strict)
441 {
442         struct rt6_info *sibling, *next_sibling;
443         int route_choosen;
444
445         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
446         /* Don't change the route if route_choosen == 0
447          * (the sibling list does not include ourself)
448          */
449         if (route_choosen)
450                 list_for_each_entry_safe(sibling, next_sibling,
451                                 &match->rt6i_siblings, rt6i_siblings) {
452                         route_choosen--;
453                         if (route_choosen == 0) {
454                                 if (rt6_score_route(sibling, oif, strict) < 0)
455                                         break;
456                                 match = sibling;
457                                 break;
458                         }
459                 }
460         return match;
461 }
462
463 /*
464  *      Route lookup. Any table->tb6_lock is implied.
465  */
466
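/*
 * Pick the first route in the leaf chain whose device matches oif (or, for
 * a source-address lookup, whose device passes ipv6_chk_addr()).  Loopback
 * routes are remembered as a fallback; with RT6_LOOKUP_F_IFACE set and no
 * match at all, the null entry is returned.
 */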
467 static inline struct rt6_info *rt6_device_match(struct net *net,
468                                                     struct rt6_info *rt,
469                                                     const struct in6_addr *saddr,
470                                                     int oif,
471                                                     int flags)
472 {
473         struct rt6_info *local = NULL;
474         struct rt6_info *sprt;
475
476         if (!oif && ipv6_addr_any(saddr))
477                 goto out;
478
479         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
480                 struct net_device *dev = sprt->dst.dev;
481
482                 if (oif) {
483                         if (dev->ifindex == oif)
484                                 return sprt;
485                         if (dev->flags & IFF_LOOPBACK) {
486                                 if (!sprt->rt6i_idev ||
487                                     sprt->rt6i_idev->dev->ifindex != oif) {
488                                         if (flags & RT6_LOOKUP_F_IFACE)
489                                                 continue;
490                                         if (local &&
491                                             local->rt6i_idev->dev->ifindex == oif)
492                                                 continue;
493                                 }
494                                 local = sprt;
495                         }
496                 } else {
497                         if (ipv6_chk_addr(net, saddr, dev,
498                                           flags & RT6_LOOKUP_F_IFACE))
499                                 return sprt;
500                 }
501         }
502
503         if (oif) {
504                 if (local)
505                         return local;
506
507                 if (flags & RT6_LOOKUP_F_IFACE)
508                         return net->ipv6.ip6_null_entry;
509         }
510 out:
511         return rt;
512 }
513
514 #ifdef CONFIG_IPV6_ROUTER_PREF
515 struct __rt6_probe_work {
516         struct work_struct work;
517         struct in6_addr target;
518         struct net_device *dev;
519 };
520
521 static void rt6_probe_deferred(struct work_struct *w)
522 {
523         struct in6_addr mcaddr;
524         struct __rt6_probe_work *work =
525                 container_of(w, struct __rt6_probe_work, work);
526
527         addrconf_addr_solict_mult(&work->target, &mcaddr);
528         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
529         dev_put(work->dev);
530         kfree(work);
531 }
532
533 static void rt6_probe(struct rt6_info *rt)
534 {
535         struct __rt6_probe_work *work;
536         struct neighbour *neigh;
537         /*
538          * Okay, this does not seem to be appropriate for now;
539          * however, we need to check if it is really so,
540          * aka Router Reachability Probing.
541          *
542          * Router Reachability Probe MUST be rate-limited
543          * to no more than one per minute.
544          */
545         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
546                 return;
547         rcu_read_lock_bh();
548         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
549         if (neigh) {
550                 if (neigh->nud_state & NUD_VALID)
551                         goto out;
552
553                 work = NULL;
554                 write_lock(&neigh->lock);
555                 if (!(neigh->nud_state & NUD_VALID) &&
556                     time_after(jiffies,
557                                neigh->updated +
558                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
559                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
560                         if (work)
561                                 __neigh_set_probe_once(neigh);
562                 }
563                 write_unlock(&neigh->lock);
564         } else {
565                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
566         }
567
568         if (work) {
569                 INIT_WORK(&work->work, rt6_probe_deferred);
570                 work->target = rt->rt6i_gateway;
571                 dev_hold(rt->dst.dev);
572                 work->dev = rt->dst.dev;
573                 schedule_work(&work->work);
574         }
575
576 out:
577         rcu_read_unlock_bh();
578 }
579 #else
580 static inline void rt6_probe(struct rt6_info *rt)
581 {
582 }
583 #endif
584
585 /*
586  * Default Router Selection (RFC 2461 6.3.6)
587  */
588 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
589 {
590         struct net_device *dev = rt->dst.dev;
591         if (!oif || dev->ifindex == oif)
592                 return 2;
593         if ((dev->flags & IFF_LOOPBACK) &&
594             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
595                 return 1;
596         return 0;
597 }
598
599 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
600 {
601         struct neighbour *neigh;
602         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
603
604         if (rt->rt6i_flags & RTF_NONEXTHOP ||
605             !(rt->rt6i_flags & RTF_GATEWAY))
606                 return RT6_NUD_SUCCEED;
607
608         rcu_read_lock_bh();
609         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
610         if (neigh) {
611                 read_lock(&neigh->lock);
612                 if (neigh->nud_state & NUD_VALID)
613                         ret = RT6_NUD_SUCCEED;
614 #ifdef CONFIG_IPV6_ROUTER_PREF
615                 else if (!(neigh->nud_state & NUD_FAILED))
616                         ret = RT6_NUD_SUCCEED;
617                 else
618                         ret = RT6_NUD_FAIL_PROBE;
619 #endif
620                 read_unlock(&neigh->lock);
621         } else {
622                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
623                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
624         }
625         rcu_read_unlock_bh();
626
627         return ret;
628 }
629
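/*
 * Score a route for default router selection: 2 for an exact device match,
 * 1 for a loopback route whose idev matches oif, 0 otherwise, with the RA
 * router preference shifted in above the device bits.  When
 * RT6_LOOKUP_F_REACHABLE is set, a negative RT6_NUD_FAIL_* result from
 * rt6_check_neigh() is passed straight back to the caller.
 */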
630 static int rt6_score_route(struct rt6_info *rt, int oif,
631                            int strict)
632 {
633         int m;
634
635         m = rt6_check_dev(rt, oif);
636         if (!m && (strict & RT6_LOOKUP_F_IFACE))
637                 return RT6_NUD_FAIL_HARD;
638 #ifdef CONFIG_IPV6_ROUTER_PREF
639         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
640 #endif
641         if (strict & RT6_LOOKUP_F_REACHABLE) {
642                 int n = rt6_check_neigh(rt);
643                 if (n < 0)
644                         return n;
645         }
646         return m;
647 }
648
649 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
650                                    int *mpri, struct rt6_info *match,
651                                    bool *do_rr)
652 {
653         int m;
654         bool match_do_rr = false;
655         struct inet6_dev *idev = rt->rt6i_idev;
656         struct net_device *dev = rt->dst.dev;
657
658         if (dev && !netif_carrier_ok(dev) &&
659             idev->cnf.ignore_routes_with_linkdown)
660                 goto out;
661
662         if (rt6_check_expired(rt))
663                 goto out;
664
665         m = rt6_score_route(rt, oif, strict);
666         if (m == RT6_NUD_FAIL_DO_RR) {
667                 match_do_rr = true;
668                 m = 0; /* lowest valid score */
669         } else if (m == RT6_NUD_FAIL_HARD) {
670                 goto out;
671         }
672
673         if (strict & RT6_LOOKUP_F_REACHABLE)
674                 rt6_probe(rt);
675
676         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
677         if (m > *mpri) {
678                 *do_rr = match_do_rr;
679                 *mpri = m;
680                 match = rt;
681         }
682 out:
683         return match;
684 }
685
686 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
687                                      struct rt6_info *rr_head,
688                                      u32 metric, int oif, int strict,
689                                      bool *do_rr)
690 {
691         struct rt6_info *rt, *match, *cont;
692         int mpri = -1;
693
694         match = NULL;
695         cont = NULL;
696         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
697                 if (rt->rt6i_metric != metric) {
698                         cont = rt;
699                         break;
700                 }
701
702                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
703         }
704
705         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
706                 if (rt->rt6i_metric != metric) {
707                         cont = rt;
708                         break;
709                 }
710
711                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
712         }
713
714         if (match || !cont)
715                 return match;
716
717         for (rt = cont; rt; rt = rt->dst.rt6_next)
718                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
719
720         return match;
721 }
722
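/*
 * Round-robin default route selection: score every route that shares the
 * metric of fn->rr_ptr via find_rr_leaf(), and advance rr_ptr to the next
 * route at the same metric (wrapping to fn->leaf) whenever find_match()
 * requested round-robin via do_rr, so equally good routers are used in turn.
 */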
723 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
724 {
725         struct rt6_info *match, *rt0;
726         struct net *net;
727         bool do_rr = false;
728
729         rt0 = fn->rr_ptr;
730         if (!rt0)
731                 fn->rr_ptr = rt0 = fn->leaf;
732
733         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
734                              &do_rr);
735
736         if (do_rr) {
737                 struct rt6_info *next = rt0->dst.rt6_next;
738
739                 /* no entries matched; do round-robin */
740                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
741                         next = fn->leaf;
742
743                 if (next != rt0)
744                         fn->rr_ptr = next;
745         }
746
747         net = dev_net(rt0->dst.dev);
748         return match ? match : net->ipv6.ip6_null_entry;
749 }
750
751 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
752 {
753         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
754 }
755
756 #ifdef CONFIG_IPV6_ROUTE_INFO
757 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
758                   const struct in6_addr *gwaddr)
759 {
760         struct net *net = dev_net(dev);
761         struct route_info *rinfo = (struct route_info *) opt;
762         struct in6_addr prefix_buf, *prefix;
763         unsigned int pref;
764         unsigned long lifetime;
765         struct rt6_info *rt;
766
767         if (len < sizeof(struct route_info)) {
768                 return -EINVAL;
769         }
770
771         /* Sanity check for prefix_len and length */
772         if (rinfo->length > 3) {
773                 return -EINVAL;
774         } else if (rinfo->prefix_len > 128) {
775                 return -EINVAL;
776         } else if (rinfo->prefix_len > 64) {
777                 if (rinfo->length < 2) {
778                         return -EINVAL;
779                 }
780         } else if (rinfo->prefix_len > 0) {
781                 if (rinfo->length < 1) {
782                         return -EINVAL;
783                 }
784         }
785
786         pref = rinfo->route_pref;
787         if (pref == ICMPV6_ROUTER_PREF_INVALID)
788                 return -EINVAL;
789
790         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
791
792         if (rinfo->length == 3)
793                 prefix = (struct in6_addr *)rinfo->prefix;
794         else {
795                 /* this function is safe */
796                 ipv6_addr_prefix(&prefix_buf,
797                                  (struct in6_addr *)rinfo->prefix,
798                                  rinfo->prefix_len);
799                 prefix = &prefix_buf;
800         }
801
802         if (rinfo->prefix_len == 0)
803                 rt = rt6_get_dflt_router(gwaddr, dev);
804         else
805                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
806                                         gwaddr, dev->ifindex);
807
808         if (rt && !lifetime) {
809                 ip6_del_rt(rt);
810                 rt = NULL;
811         }
812
813         if (!rt && lifetime)
814                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
815                                         pref);
816         else if (rt)
817                 rt->rt6i_flags = RTF_ROUTEINFO |
818                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
819
820         if (rt) {
821                 if (!addrconf_finite_timeout(lifetime))
822                         rt6_clean_expires(rt);
823                 else
824                         rt6_set_expires(rt, jiffies + HZ * lifetime);
825
826                 ip6_rt_put(rt);
827         }
828         return 0;
829 }
830 #endif
831
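/*
 * Walk back towards the tree root after a failed lookup.  When source-based
 * subtrees are in use, the parent's subtree is re-searched with saddr.
 * Returns the next node that actually carries routes (RTN_RTINFO), or NULL
 * once the top-level root is reached.
 */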
832 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
833                                         struct in6_addr *saddr)
834 {
835         struct fib6_node *pn;
836         while (1) {
837                 if (fn->fn_flags & RTN_TL_ROOT)
838                         return NULL;
839                 pn = fn->parent;
840                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
841                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
842                 else
843                         fn = pn;
844                 if (fn->fn_flags & RTN_RTINFO)
845                         return fn;
846         }
847 }
848
849 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
850                                              struct fib6_table *table,
851                                              struct flowi6 *fl6, int flags)
852 {
853         struct fib6_node *fn;
854         struct rt6_info *rt;
855
856         read_lock_bh(&table->tb6_lock);
857         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
858 restart:
859         rt = fn->leaf;
860         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
861         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
862                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
863         if (rt == net->ipv6.ip6_null_entry) {
864                 fn = fib6_backtrack(fn, &fl6->saddr);
865                 if (fn)
866                         goto restart;
867         }
868         dst_use(&rt->dst, jiffies);
869         read_unlock_bh(&table->tb6_lock);
870
871         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
872
873         return rt;
874
875 }
876
877 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
878                                     int flags)
879 {
880         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
881 }
882 EXPORT_SYMBOL_GPL(ip6_route_lookup);
883
884 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
885                             const struct in6_addr *saddr, int oif, int strict)
886 {
887         struct flowi6 fl6 = {
888                 .flowi6_oif = oif,
889                 .daddr = *daddr,
890         };
891         struct dst_entry *dst;
892         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
893
894         if (saddr) {
895                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
896                 flags |= RT6_LOOKUP_F_HAS_SADDR;
897         }
898
899         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
900         if (dst->error == 0)
901                 return (struct rt6_info *) dst;
902
903         dst_release(dst);
904
905         return NULL;
906 }
907 EXPORT_SYMBOL(rt6_lookup);
908
909 /* ip6_ins_rt is called with FREE table->tb6_lock.
910    It takes a new route entry; if the addition fails for any reason,
911    the route is freed. In any case, if the caller does not hold a
912    reference, the route may be destroyed.
913  */
914
915 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
916                         struct mx6_config *mxc)
917 {
918         int err;
919         struct fib6_table *table;
920
921         table = rt->rt6i_table;
922         write_lock_bh(&table->tb6_lock);
923         err = fib6_add(&table->tb6_root, rt, info, mxc);
924         write_unlock_bh(&table->tb6_lock);
925
926         return err;
927 }
928
929 int ip6_ins_rt(struct rt6_info *rt)
930 {
931         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
932         struct mx6_config mxc = { .mx = NULL, };
933
934         return __ip6_ins_rt(rt, &info, &mxc);
935 }
936
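/*
 * Create a host (/128) RTF_CACHE clone of ort, following dst.from when ort
 * itself is already a cache or per-cpu clone.  Callers below use this for
 * PMTU exceptions and for FLOWI_FLAG_KNOWN_NH lookups where the packet's
 * daddr differs from the fl6 daddr used for the route lookup.
 */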
937 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
938                                            const struct in6_addr *daddr,
939                                            const struct in6_addr *saddr)
940 {
941         struct rt6_info *rt;
942
943         /*
944          *      Clone the route.
945          */
946
947         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
948                 ort = (struct rt6_info *)ort->dst.from;
949
950         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
951
952         if (!rt)
953                 return NULL;
954
955         ip6_rt_copy_init(rt, ort);
956         rt->rt6i_flags |= RTF_CACHE;
957         rt->rt6i_metric = 0;
958         rt->dst.flags |= DST_HOST;
959         rt->rt6i_dst.addr = *daddr;
960         rt->rt6i_dst.plen = 128;
961
962         if (!rt6_is_gw_or_nonexthop(ort)) {
963                 if (ort->rt6i_dst.plen != 128 &&
964                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
965                         rt->rt6i_flags |= RTF_ANYCAST;
966 #ifdef CONFIG_IPV6_SUBTREES
967                 if (rt->rt6i_src.plen && saddr) {
968                         rt->rt6i_src.addr = *saddr;
969                         rt->rt6i_src.plen = 128;
970                 }
971 #endif
972         }
973
974         return rt;
975 }
976
977 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
978 {
979         struct rt6_info *pcpu_rt;
980
981         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
982                                   rt->dst.dev, rt->dst.flags);
983
984         if (!pcpu_rt)
985                 return NULL;
986         ip6_rt_copy_init(pcpu_rt, rt);
987         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
988         pcpu_rt->rt6i_flags |= RTF_PCPU;
989         return pcpu_rt;
990 }
991
992 /* It should be called with read_lock_bh(&tb6_lock) acquired */
993 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
994 {
995         struct rt6_info *pcpu_rt, **p;
996
997         p = this_cpu_ptr(rt->rt6i_pcpu);
998         pcpu_rt = *p;
999
1000         if (pcpu_rt) {
1001                 dst_hold(&pcpu_rt->dst);
1002                 rt6_dst_from_metrics_check(pcpu_rt);
1003         }
1004         return pcpu_rt;
1005 }
1006
1007 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1008 {
1009         struct fib6_table *table = rt->rt6i_table;
1010         struct rt6_info *pcpu_rt, *prev, **p;
1011
1012         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1013         if (!pcpu_rt) {
1014                 struct net *net = dev_net(rt->dst.dev);
1015
1016                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1017                 return net->ipv6.ip6_null_entry;
1018         }
1019
1020         read_lock_bh(&table->tb6_lock);
1021         if (rt->rt6i_pcpu) {
1022                 p = this_cpu_ptr(rt->rt6i_pcpu);
1023                 prev = cmpxchg(p, NULL, pcpu_rt);
1024                 if (prev) {
1025                         /* If someone did it before us, return prev instead */
1026                         dst_destroy(&pcpu_rt->dst);
1027                         pcpu_rt = prev;
1028                 }
1029         } else {
1030                 /* rt has been removed from the fib6 tree
1031                  * before we have a chance to acquire the read_lock.
1032                  * In this case, don't bother to create a pcpu rt
1033                  * since rt is going away anyway.  The next
1034                  * dst_check() will trigger a re-lookup.
1035                  */
1036                 dst_destroy(&pcpu_rt->dst);
1037                 pcpu_rt = rt;
1038         }
1039         dst_hold(&pcpu_rt->dst);
1040         rt6_dst_from_metrics_check(pcpu_rt);
1041         read_unlock_bh(&table->tb6_lock);
1042         return pcpu_rt;
1043 }
1044
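/*
 * Core fib6 lookup used for both input and output routes.  Three outcomes:
 * the null entry and RTF_CACHE entries are returned as-is; a gatewayless
 * route looked up with FLOWI_FLAG_KNOWN_NH gets an uncached RTF_CACHE clone
 * tracked on the per-cpu uncached list; everything else is served from (or
 * installs) a per-cpu RTF_PCPU copy of the route.
 */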
1045 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1046                                int oif, struct flowi6 *fl6, int flags)
1047 {
1048         struct fib6_node *fn, *saved_fn;
1049         struct rt6_info *rt;
1050         int strict = 0;
1051
1052         strict |= flags & RT6_LOOKUP_F_IFACE;
1053         if (net->ipv6.devconf_all->forwarding == 0)
1054                 strict |= RT6_LOOKUP_F_REACHABLE;
1055
1056         read_lock_bh(&table->tb6_lock);
1057
1058         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1059         saved_fn = fn;
1060
1061         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1062                 oif = 0;
1063
1064 redo_rt6_select:
1065         rt = rt6_select(fn, oif, strict);
1066         if (rt->rt6i_nsiblings)
1067                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1068         if (rt == net->ipv6.ip6_null_entry) {
1069                 fn = fib6_backtrack(fn, &fl6->saddr);
1070                 if (fn)
1071                         goto redo_rt6_select;
1072                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1073                         /* also consider unreachable route */
1074                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1075                         fn = saved_fn;
1076                         goto redo_rt6_select;
1077                 }
1078         }
1079
1080
1081         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1082                 dst_use(&rt->dst, jiffies);
1083                 read_unlock_bh(&table->tb6_lock);
1084
1085                 rt6_dst_from_metrics_check(rt);
1086
1087                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1088                 return rt;
1089         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1090                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1091                 /* Create a RTF_CACHE clone which will not be
1092                  * owned by the fib6 tree.  It is for the special case where
1093                  * the daddr in the skb during the neighbor look-up is different
1094                  * from the fl6->daddr used to look up the route here.
1095                  */
1096
1097                 struct rt6_info *uncached_rt;
1098
1099                 dst_use(&rt->dst, jiffies);
1100                 read_unlock_bh(&table->tb6_lock);
1101
1102                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1103                 dst_release(&rt->dst);
1104
1105                 if (uncached_rt)
1106                         rt6_uncached_list_add(uncached_rt);
1107                 else
1108                         uncached_rt = net->ipv6.ip6_null_entry;
1109
1110                 dst_hold(&uncached_rt->dst);
1111
1112                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1113                 return uncached_rt;
1114
1115         } else {
1116                 /* Get a percpu copy */
1117
1118                 struct rt6_info *pcpu_rt;
1119
1120                 rt->dst.lastuse = jiffies;
1121                 rt->dst.__use++;
1122                 pcpu_rt = rt6_get_pcpu_route(rt);
1123
1124                 if (pcpu_rt) {
1125                         read_unlock_bh(&table->tb6_lock);
1126                 } else {
1127                         /* We have to do the read_unlock first
1128                          * because rt6_make_pcpu_route() may trigger
1129                          * ip6_dst_gc() which will take the write_lock.
1130                          */
1131                         dst_hold(&rt->dst);
1132                         read_unlock_bh(&table->tb6_lock);
1133                         pcpu_rt = rt6_make_pcpu_route(rt);
1134                         dst_release(&rt->dst);
1135                 }
1136
1137                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1138                 return pcpu_rt;
1139
1140         }
1141 }
1142 EXPORT_SYMBOL_GPL(ip6_pol_route);
1143
1144 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1145                                             struct flowi6 *fl6, int flags)
1146 {
1147         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1148 }
1149
1150 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1151                                                 struct net_device *dev,
1152                                                 struct flowi6 *fl6, int flags)
1153 {
1154         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1155                 flags |= RT6_LOOKUP_F_IFACE;
1156
1157         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1158 }
1159
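/*
 * Input route lookup: build a flowi6 from the IPv6 header (daddr, saddr,
 * flow label, next header) plus skb->mark and the l3mdev-aware incoming
 * interface, carry the tunnel id for metadata (collect_md) receive tunnels,
 * and attach the resulting dst to the skb.
 */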
1160 void ip6_route_input(struct sk_buff *skb)
1161 {
1162         const struct ipv6hdr *iph = ipv6_hdr(skb);
1163         struct net *net = dev_net(skb->dev);
1164         int flags = RT6_LOOKUP_F_HAS_SADDR;
1165         struct ip_tunnel_info *tun_info;
1166         struct flowi6 fl6 = {
1167                 .flowi6_iif = l3mdev_fib_oif(skb->dev),
1168                 .daddr = iph->daddr,
1169                 .saddr = iph->saddr,
1170                 .flowlabel = ip6_flowinfo(iph),
1171                 .flowi6_mark = skb->mark,
1172                 .flowi6_proto = iph->nexthdr,
1173         };
1174
1175         tun_info = skb_tunnel_info(skb);
1176         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1177                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1178         skb_dst_drop(skb);
1179         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1180 }
1181
1182 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1183                                              struct flowi6 *fl6, int flags)
1184 {
1185         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1186 }
1187
1188 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1189                                          struct flowi6 *fl6, int flags)
1190 {
1191         struct dst_entry *dst;
1192         bool any_src;
1193
1194         dst = l3mdev_get_rt6_dst(net, fl6);
1195         if (dst)
1196                 return dst;
1197
1198         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1199
1200         any_src = ipv6_addr_any(&fl6->saddr);
1201         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1202             (fl6->flowi6_oif && any_src))
1203                 flags |= RT6_LOOKUP_F_IFACE;
1204
1205         if (!any_src)
1206                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1207         else if (sk)
1208                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1209
1210         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1211 }
1212 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1213
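/*
 * Convert dst_orig into a standalone blackhole dst: metrics, idev, gateway
 * and flags (minus RTF_PCPU) are copied, but input/output simply discard
 * packets and the entry is never linked into the fib6 tree.  The reference
 * on the original dst is always released.
 */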
1214 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1215 {
1216         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1217         struct dst_entry *new = NULL;
1218
1219         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1220         if (rt) {
1221                 rt6_info_init(rt);
1222
1223                 new = &rt->dst;
1224                 new->__use = 1;
1225                 new->input = dst_discard;
1226                 new->output = dst_discard_out;
1227
1228                 dst_copy_metrics(new, &ort->dst);
1229                 rt->rt6i_idev = ort->rt6i_idev;
1230                 if (rt->rt6i_idev)
1231                         in6_dev_hold(rt->rt6i_idev);
1232
1233                 rt->rt6i_gateway = ort->rt6i_gateway;
1234                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1235                 rt->rt6i_metric = 0;
1236
1237                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1238 #ifdef CONFIG_IPV6_SUBTREES
1239                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1240 #endif
1241
1242                 dst_free(new);
1243         }
1244
1245         dst_release(dst_orig);
1246         return new ? new : ERR_PTR(-ENOMEM);
1247 }
1248
1249 /*
1250  *      Destination cache support functions
1251  */
1252
1253 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1254 {
1255         if (rt->dst.from &&
1256             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1257                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1258 }
1259
1260 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1261 {
1262         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1263                 return NULL;
1264
1265         if (rt6_check_expired(rt))
1266                 return NULL;
1267
1268         return &rt->dst;
1269 }
1270
1271 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1272 {
1273         if (!__rt6_check_expired(rt) &&
1274             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1275             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1276                 return &rt->dst;
1277         else
1278                 return NULL;
1279 }
1280
1281 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1282 {
1283         struct rt6_info *rt;
1284
1285         rt = (struct rt6_info *) dst;
1286
1287         /* All IPV6 dsts are created with ->obsolete set to the value
1288          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1289          * into this function always.
1290          */
1291
1292         rt6_dst_from_metrics_check(rt);
1293
1294         if (rt->rt6i_flags & RTF_PCPU ||
1295             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1296                 return rt6_dst_from_check(rt, cookie);
1297         else
1298                 return rt6_check(rt, cookie);
1299 }
1300
1301 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1302 {
1303         struct rt6_info *rt = (struct rt6_info *) dst;
1304
1305         if (rt) {
1306                 if (rt->rt6i_flags & RTF_CACHE) {
1307                         if (rt6_check_expired(rt)) {
1308                                 ip6_del_rt(rt);
1309                                 dst = NULL;
1310                         }
1311                 } else {
1312                         dst_release(dst);
1313                         dst = NULL;
1314                 }
1315         }
1316         return dst;
1317 }
1318
1319 static void ip6_link_failure(struct sk_buff *skb)
1320 {
1321         struct rt6_info *rt;
1322
1323         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1324
1325         rt = (struct rt6_info *) skb_dst(skb);
1326         if (rt) {
1327                 if (rt->rt6i_flags & RTF_CACHE) {
1328                         dst_hold(&rt->dst);
1329                         ip6_del_rt(rt);
1330                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1331                         rt->rt6i_node->fn_sernum = -1;
1332                 }
1333         }
1334 }
1335
1336 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1337 {
1338         struct net *net = dev_net(rt->dst.dev);
1339
1340         rt->rt6i_flags |= RTF_MODIFIED;
1341         rt->rt6i_pmtu = mtu;
1342         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1343 }
1344
1345 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1346 {
1347         return !(rt->rt6i_flags & RTF_CACHE) &&
1348                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1349 }
1350
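/*
 * Record a learned path MTU.  Routes still linked into the fib6 tree (or
 * per-cpu copies of them) are not modified directly; instead an RTF_CACHE
 * clone carrying the new MTU is created and inserted, which also bumps
 * fn_sernum and invalidates cached socket dsts.  Only RTF_CACHE entries and
 * routes outside the tree get rt6i_pmtu updated in place.
 */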
1351 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1352                                  const struct ipv6hdr *iph, u32 mtu)
1353 {
1354         struct rt6_info *rt6 = (struct rt6_info *)dst;
1355
1356         if (rt6->rt6i_flags & RTF_LOCAL)
1357                 return;
1358
1359         dst_confirm(dst);
1360         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1361         if (mtu >= dst_mtu(dst))
1362                 return;
1363
1364         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1365                 rt6_do_update_pmtu(rt6, mtu);
1366         } else {
1367                 const struct in6_addr *daddr, *saddr;
1368                 struct rt6_info *nrt6;
1369
1370                 if (iph) {
1371                         daddr = &iph->daddr;
1372                         saddr = &iph->saddr;
1373                 } else if (sk) {
1374                         daddr = &sk->sk_v6_daddr;
1375                         saddr = &inet6_sk(sk)->saddr;
1376                 } else {
1377                         return;
1378                 }
1379                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1380                 if (nrt6) {
1381                         rt6_do_update_pmtu(nrt6, mtu);
1382
1383                         /* ip6_ins_rt(nrt6) will bump the
1384                          * rt6->rt6i_node->fn_sernum
1385                          * which will fail the next rt6_check() and
1386                          * invalidate the sk->sk_dst_cache.
1387                          */
1388                         ip6_ins_rt(nrt6);
1389                 }
1390         }
1391 }
1392
1393 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1394                                struct sk_buff *skb, u32 mtu)
1395 {
1396         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1397 }
1398
1399 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1400                      int oif, u32 mark)
1401 {
1402         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1403         struct dst_entry *dst;
1404         struct flowi6 fl6;
1405
1406         memset(&fl6, 0, sizeof(fl6));
1407         fl6.flowi6_oif = oif;
1408         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1409         fl6.daddr = iph->daddr;
1410         fl6.saddr = iph->saddr;
1411         fl6.flowlabel = ip6_flowinfo(iph);
1412
1413         dst = ip6_route_output(net, NULL, &fl6);
1414         if (!dst->error)
1415                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1416         dst_release(dst);
1417 }
1418 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1419
1420 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1421 {
1422         struct dst_entry *dst;
1423
1424         ip6_update_pmtu(skb, sock_net(sk), mtu,
1425                         sk->sk_bound_dev_if, sk->sk_mark);
1426
1427         dst = __sk_dst_get(sk);
1428         if (!dst || !dst->obsolete ||
1429             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1430                 return;
1431
1432         bh_lock_sock(sk);
1433         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1434                 ip6_datagram_dst_update(sk, false);
1435         bh_unlock_sock(sk);
1436 }
1437 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1438
1439 /* Handle redirects */
1440 struct ip6rd_flowi {
1441         struct flowi6 fl6;
1442         struct in6_addr gateway;
1443 };
1444
1445 static struct rt6_info *__ip6_route_redirect(struct net *net,
1446                                              struct fib6_table *table,
1447                                              struct flowi6 *fl6,
1448                                              int flags)
1449 {
1450         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1451         struct rt6_info *rt;
1452         struct fib6_node *fn;
1453
1454         /* Get the "current" route for this destination and
1455  *      check if the redirect has come from the appropriate router.
1456          *
1457          * RFC 4861 specifies that redirects should only be
1458          * accepted if they come from the nexthop to the target.
1459          * Due to the way the routes are chosen, this notion
1460          * is a bit fuzzy and one might need to check all possible
1461          * routes.
1462          */
1463
1464         read_lock_bh(&table->tb6_lock);
1465         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1466 restart:
1467         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1468                 if (rt6_check_expired(rt))
1469                         continue;
1470                 if (rt->dst.error)
1471                         break;
1472                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1473                         continue;
1474                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1475                         continue;
1476                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1477                         continue;
1478                 break;
1479         }
1480
1481         if (!rt)
1482                 rt = net->ipv6.ip6_null_entry;
1483         else if (rt->dst.error) {
1484                 rt = net->ipv6.ip6_null_entry;
1485                 goto out;
1486         }
1487
1488         if (rt == net->ipv6.ip6_null_entry) {
1489                 fn = fib6_backtrack(fn, &fl6->saddr);
1490                 if (fn)
1491                         goto restart;
1492         }
1493
1494 out:
1495         dst_hold(&rt->dst);
1496
1497         read_unlock_bh(&table->tb6_lock);
1498
1499         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1500         return rt;
1501 };
1502
1503 static struct dst_entry *ip6_route_redirect(struct net *net,
1504                                         const struct flowi6 *fl6,
1505                                         const struct in6_addr *gateway)
1506 {
1507         int flags = RT6_LOOKUP_F_HAS_SADDR;
1508         struct ip6rd_flowi rdfl;
1509
1510         rdfl.fl6 = *fl6;
1511         rdfl.gateway = *gateway;
1512
1513         return fib6_rule_lookup(net, &rdfl.fl6,
1514                                 flags, __ip6_route_redirect);
1515 }
1516
1517 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1518 {
1519         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1520         struct dst_entry *dst;
1521         struct flowi6 fl6;
1522
1523         memset(&fl6, 0, sizeof(fl6));
1524         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1525         fl6.flowi6_oif = oif;
1526         fl6.flowi6_mark = mark;
1527         fl6.daddr = iph->daddr;
1528         fl6.saddr = iph->saddr;
1529         fl6.flowlabel = ip6_flowinfo(iph);
1530
1531         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1532         rt6_do_redirect(dst, NULL, skb);
1533         dst_release(dst);
1534 }
1535 EXPORT_SYMBOL_GPL(ip6_redirect);
1536
1537 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1538                             u32 mark)
1539 {
1540         const struct ipv6hdr *iph = ipv6_hdr(skb);
1541         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1542         struct dst_entry *dst;
1543         struct flowi6 fl6;
1544
1545         memset(&fl6, 0, sizeof(fl6));
1546         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1547         fl6.flowi6_oif = oif;
1548         fl6.flowi6_mark = mark;
1549         fl6.daddr = msg->dest;
1550         fl6.saddr = iph->daddr;
1551
1552         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1553         rt6_do_redirect(dst, NULL, skb);
1554         dst_release(dst);
1555 }
1556
1557 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1558 {
1559         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1560 }
1561 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1562
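/* Default advertised MSS for this dst: the path MTU minus the IPv6 and
 * TCP headers, clamped to at least ip6_rt_min_advmss (see the
 * IPV6_MAXPLEN note below for the upper bound).
 */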
1563 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1564 {
1565         struct net_device *dev = dst->dev;
1566         unsigned int mtu = dst_mtu(dst);
1567         struct net *net = dev_net(dev);
1568
1569         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1570
1571         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1572                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1573
1574         /*
1575          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1576          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1577          * IPV6_MAXPLEN is also valid and means: "any MSS,
1578          * rely only on pmtu discovery"
1579          */
1580         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1581                 mtu = IPV6_MAXPLEN;
1582         return mtu;
1583 }
1584
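/* Effective MTU for this dst: the cached path MTU if set, otherwise the
 * RTAX_MTU metric, otherwise the device's IPv6 MTU; the result is
 * capped at IP6_MAX_MTU and reduced by any lwtunnel encap headroom.
 */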
1585 static unsigned int ip6_mtu(const struct dst_entry *dst)
1586 {
1587         const struct rt6_info *rt = (const struct rt6_info *)dst;
1588         unsigned int mtu = rt->rt6i_pmtu;
1589         struct inet6_dev *idev;
1590
1591         if (mtu)
1592                 goto out;
1593
1594         mtu = dst_metric_raw(dst, RTAX_MTU);
1595         if (mtu)
1596                 goto out;
1597
1598         mtu = IPV6_MIN_MTU;
1599
1600         rcu_read_lock();
1601         idev = __in6_dev_get(dst->dev);
1602         if (idev)
1603                 mtu = idev->cnf.mtu6;
1604         rcu_read_unlock();
1605
1606 out:
1607         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1608
1609         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1610 }
1611
1612 static struct dst_entry *icmp6_dst_gc_list;
1613 static DEFINE_SPINLOCK(icmp6_dst_lock);
1614
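/* Allocate a standalone dst for sending an ICMPv6 packet.  The route is
 * never inserted into the FIB; instead it is chained on
 * icmp6_dst_gc_list so icmp6_dst_gc() can reclaim it once its refcount
 * drops to zero.
 */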
1615 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1616                                   struct flowi6 *fl6)
1617 {
1618         struct dst_entry *dst;
1619         struct rt6_info *rt;
1620         struct inet6_dev *idev = in6_dev_get(dev);
1621         struct net *net = dev_net(dev);
1622
1623         if (unlikely(!idev))
1624                 return ERR_PTR(-ENODEV);
1625
1626         rt = ip6_dst_alloc(net, dev, 0);
1627         if (unlikely(!rt)) {
1628                 in6_dev_put(idev);
1629                 dst = ERR_PTR(-ENOMEM);
1630                 goto out;
1631         }
1632
1633         rt->dst.flags |= DST_HOST;
1634         rt->dst.output  = ip6_output;
1635         atomic_set(&rt->dst.__refcnt, 1);
1636         rt->rt6i_gateway  = fl6->daddr;
1637         rt->rt6i_dst.addr = fl6->daddr;
1638         rt->rt6i_dst.plen = 128;
1639         rt->rt6i_idev     = idev;
1640         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1641
1642         spin_lock_bh(&icmp6_dst_lock);
1643         rt->dst.next = icmp6_dst_gc_list;
1644         icmp6_dst_gc_list = &rt->dst;
1645         spin_unlock_bh(&icmp6_dst_lock);
1646
1647         fib6_force_start_gc(net);
1648
1649         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1650
1651 out:
1652         return dst;
1653 }
1654
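/* Walk icmp6_dst_gc_list, freeing entries whose refcount has dropped to
 * zero; returns the number of entries still in use.
 */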
1655 int icmp6_dst_gc(void)
1656 {
1657         struct dst_entry *dst, **pprev;
1658         int more = 0;
1659
1660         spin_lock_bh(&icmp6_dst_lock);
1661         pprev = &icmp6_dst_gc_list;
1662
1663         while ((dst = *pprev) != NULL) {
1664                 if (!atomic_read(&dst->__refcnt)) {
1665                         *pprev = dst->next;
1666                         dst_free(dst);
1667                 } else {
1668                         pprev = &dst->next;
1669                         ++more;
1670                 }
1671         }
1672
1673         spin_unlock_bh(&icmp6_dst_lock);
1674
1675         return more;
1676 }
1677
1678 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1679                             void *arg)
1680 {
1681         struct dst_entry *dst, **pprev;
1682
1683         spin_lock_bh(&icmp6_dst_lock);
1684         pprev = &icmp6_dst_gc_list;
1685         while ((dst = *pprev) != NULL) {
1686                 struct rt6_info *rt = (struct rt6_info *) dst;
1687                 if (func(rt, arg)) {
1688                         *pprev = dst->next;
1689                         dst_free(dst);
1690                 } else {
1691                         pprev = &dst->next;
1692                 }
1693         }
1694         spin_unlock_bh(&icmp6_dst_lock);
1695 }
1696
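/* dst_ops garbage collector.  Skip the run if the previous one was
 * recent and the entry count is within ip6_rt_max_size; otherwise run
 * fib6_run_gc() with an expiry that grows on each pass and decays by
 * the configured elasticity.  Returns nonzero while still over the
 * limit.
 */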
1697 static int ip6_dst_gc(struct dst_ops *ops)
1698 {
1699         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1700         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1701         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1702         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1703         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1704         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1705         int entries;
1706
1707         entries = dst_entries_get_fast(ops);
1708         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1709             entries <= rt_max_size)
1710                 goto out;
1711
1712         net->ipv6.ip6_rt_gc_expire++;
1713         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1714         entries = dst_entries_get_slow(ops);
1715         if (entries < ops->gc_thresh)
1716                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1717 out:
1718         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1719         return entries > rt_max_size;
1720 }
1721
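/* Convert the RTA_METRICS attributes in @cfg into the RTAX_* array of
 * @mxc.  RTAX_CC_ALGO names are translated to congestion-control keys,
 * and an ECN-capable algorithm additionally sets DST_FEATURE_ECN_CA.
 */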
1722 static int ip6_convert_metrics(struct mx6_config *mxc,
1723                                const struct fib6_config *cfg)
1724 {
1725         bool ecn_ca = false;
1726         struct nlattr *nla;
1727         int remaining;
1728         u32 *mp;
1729
1730         if (!cfg->fc_mx)
1731                 return 0;
1732
1733         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1734         if (unlikely(!mp))
1735                 return -ENOMEM;
1736
1737         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1738                 int type = nla_type(nla);
1739                 u32 val;
1740
1741                 if (!type)
1742                         continue;
1743                 if (unlikely(type > RTAX_MAX))
1744                         goto err;
1745
1746                 if (type == RTAX_CC_ALGO) {
1747                         char tmp[TCP_CA_NAME_MAX];
1748
1749                         nla_strlcpy(tmp, nla, sizeof(tmp));
1750                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1751                         if (val == TCP_CA_UNSPEC)
1752                                 goto err;
1753                 } else {
1754                         val = nla_get_u32(nla);
1755                 }
1756                 if (type == RTAX_HOPLIMIT && val > 255)
1757                         val = 255;
1758                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1759                         goto err;
1760
1761                 mp[type - 1] = val;
1762                 __set_bit(type - 1, mxc->mx_valid);
1763         }
1764
1765         if (ecn_ca) {
1766                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1767                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1768         }
1769
1770         mxc->mx = mp;
1771         return 0;
1772  err:
1773         kfree(mp);
1774         return -EINVAL;
1775 }
1776
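/* Look up the nexthop gateway in the table specified by @cfg.  Returns
 * NULL when the table does not exist or the lookup only reaches the
 * null entry, so the caller can fall back to a full rt6_lookup().
 */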
1777 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1778                                             struct fib6_config *cfg,
1779                                             const struct in6_addr *gw_addr)
1780 {
1781         struct flowi6 fl6 = {
1782                 .flowi6_oif = cfg->fc_ifindex,
1783                 .daddr = *gw_addr,
1784                 .saddr = cfg->fc_prefsrc,
1785         };
1786         struct fib6_table *table;
1787         struct rt6_info *rt;
1788         int flags = RT6_LOOKUP_F_IFACE;
1789
1790         table = fib6_get_table(net, cfg->fc_table);
1791         if (!table)
1792                 return NULL;
1793
1794         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1795                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1796
1797         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1798
1799         /* if table lookup failed, fall back to full lookup */
1800         if (rt == net->ipv6.ip6_null_entry) {
1801                 ip6_rt_put(rt);
1802                 rt = NULL;
1803         }
1804
1805         return rt;
1806 }
1807
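/* Build (but do not insert) an rt6_info from the netlink/ioctl route
 * configuration in @cfg: resolve the output device, set up optional
 * lwtunnel state, handle reject/blackhole types, and validate the
 * gateway and preferred source address.  Returns the new route or an
 * ERR_PTR.
 */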
1808 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1809 {
1810         struct net *net = cfg->fc_nlinfo.nl_net;
1811         struct rt6_info *rt = NULL;
1812         struct net_device *dev = NULL;
1813         struct inet6_dev *idev = NULL;
1814         struct fib6_table *table;
1815         int addr_type;
1816         int err = -EINVAL;
1817
1818         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1819                 goto out;
1820 #ifndef CONFIG_IPV6_SUBTREES
1821         if (cfg->fc_src_len)
1822                 goto out;
1823 #endif
1824         if (cfg->fc_ifindex) {
1825                 err = -ENODEV;
1826                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1827                 if (!dev)
1828                         goto out;
1829                 idev = in6_dev_get(dev);
1830                 if (!idev)
1831                         goto out;
1832         }
1833
1834         if (cfg->fc_metric == 0)
1835                 cfg->fc_metric = IP6_RT_PRIO_USER;
1836
1837         err = -ENOBUFS;
1838         if (cfg->fc_nlinfo.nlh &&
1839             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1840                 table = fib6_get_table(net, cfg->fc_table);
1841                 if (!table) {
1842                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1843                         table = fib6_new_table(net, cfg->fc_table);
1844                 }
1845         } else {
1846                 table = fib6_new_table(net, cfg->fc_table);
1847         }
1848
1849         if (!table)
1850                 goto out;
1851
1852         rt = ip6_dst_alloc(net, NULL,
1853                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1854
1855         if (!rt) {
1856                 err = -ENOMEM;
1857                 goto out;
1858         }
1859
1860         if (cfg->fc_flags & RTF_EXPIRES)
1861                 rt6_set_expires(rt, jiffies +
1862                                 clock_t_to_jiffies(cfg->fc_expires));
1863         else
1864                 rt6_clean_expires(rt);
1865
1866         if (cfg->fc_protocol == RTPROT_UNSPEC)
1867                 cfg->fc_protocol = RTPROT_BOOT;
1868         rt->rt6i_protocol = cfg->fc_protocol;
1869
1870         addr_type = ipv6_addr_type(&cfg->fc_dst);
1871
1872         if (addr_type & IPV6_ADDR_MULTICAST)
1873                 rt->dst.input = ip6_mc_input;
1874         else if (cfg->fc_flags & RTF_LOCAL)
1875                 rt->dst.input = ip6_input;
1876         else
1877                 rt->dst.input = ip6_forward;
1878
1879         rt->dst.output = ip6_output;
1880
1881         if (cfg->fc_encap) {
1882                 struct lwtunnel_state *lwtstate;
1883
1884                 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1885                                            cfg->fc_encap, AF_INET6, cfg,
1886                                            &lwtstate);
1887                 if (err)
1888                         goto out;
1889                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1890                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1891                         rt->dst.lwtstate->orig_output = rt->dst.output;
1892                         rt->dst.output = lwtunnel_output;
1893                 }
1894                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1895                         rt->dst.lwtstate->orig_input = rt->dst.input;
1896                         rt->dst.input = lwtunnel_input;
1897                 }
1898         }
1899
1900         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1901         rt->rt6i_dst.plen = cfg->fc_dst_len;
1902         if (rt->rt6i_dst.plen == 128)
1903                 rt->dst.flags |= DST_HOST;
1904
1905 #ifdef CONFIG_IPV6_SUBTREES
1906         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1907         rt->rt6i_src.plen = cfg->fc_src_len;
1908 #endif
1909
1910         rt->rt6i_metric = cfg->fc_metric;
1911
1912         /* We cannot add true routes via loopback here;
1913            they would result in kernel looping. Promote them to reject routes.
1914          */
1915         if ((cfg->fc_flags & RTF_REJECT) ||
1916             (dev && (dev->flags & IFF_LOOPBACK) &&
1917              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1918              !(cfg->fc_flags & RTF_LOCAL))) {
1919                 /* hold loopback dev/idev if we haven't done so. */
1920                 if (dev != net->loopback_dev) {
1921                         if (dev) {
1922                                 dev_put(dev);
1923                                 in6_dev_put(idev);
1924                         }
1925                         dev = net->loopback_dev;
1926                         dev_hold(dev);
1927                         idev = in6_dev_get(dev);
1928                         if (!idev) {
1929                                 err = -ENODEV;
1930                                 goto out;
1931                         }
1932                 }
1933                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1934                 switch (cfg->fc_type) {
1935                 case RTN_BLACKHOLE:
1936                         rt->dst.error = -EINVAL;
1937                         rt->dst.output = dst_discard_out;
1938                         rt->dst.input = dst_discard;
1939                         break;
1940                 case RTN_PROHIBIT:
1941                         rt->dst.error = -EACCES;
1942                         rt->dst.output = ip6_pkt_prohibit_out;
1943                         rt->dst.input = ip6_pkt_prohibit;
1944                         break;
1945                 case RTN_THROW:
1946                 case RTN_UNREACHABLE:
1947                 default:
1948                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1949                                         : (cfg->fc_type == RTN_UNREACHABLE)
1950                                         ? -EHOSTUNREACH : -ENETUNREACH;
1951                         rt->dst.output = ip6_pkt_discard_out;
1952                         rt->dst.input = ip6_pkt_discard;
1953                         break;
1954                 }
1955                 goto install_route;
1956         }
1957
1958         if (cfg->fc_flags & RTF_GATEWAY) {
1959                 const struct in6_addr *gw_addr;
1960                 int gwa_type;
1961
1962                 gw_addr = &cfg->fc_gateway;
1963                 gwa_type = ipv6_addr_type(gw_addr);
1964
1965                 /* if gw_addr is local we will fail to detect this while the
1966                  * address is still TENTATIVE (DAD in progress): rt6_lookup()
1967                  * will return the already-added prefix route via the interface
1968                  * the prefix route was assigned to, which might be non-loopback.
1969                  */
1970                 err = -EINVAL;
1971                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1972                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1973                                             dev : NULL, 0, 0))
1974                         goto out;
1975
1976                 rt->rt6i_gateway = *gw_addr;
1977
1978                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1979                         struct rt6_info *grt = NULL;
1980
1981                         /* IPv6 strictly inhibits using non-link-local
1982                            addresses as a nexthop address.
1983                            Otherwise, the router would not be able to send redirects.
1984                            This is generally good, but in some (rare!) circumstances
1985                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1986                            some exceptions. --ANK
1987                          */
1988                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1989                                 goto out;
1990
1991                         if (cfg->fc_table)
1992                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
1993
1994                         if (!grt)
1995                                 grt = rt6_lookup(net, gw_addr, NULL,
1996                                                  cfg->fc_ifindex, 1);
1997
1998                         err = -EHOSTUNREACH;
1999                         if (!grt)
2000                                 goto out;
2001                         if (dev) {
2002                                 if (dev != grt->dst.dev) {
2003                                         ip6_rt_put(grt);
2004                                         goto out;
2005                                 }
2006                         } else {
2007                                 dev = grt->dst.dev;
2008                                 idev = grt->rt6i_idev;
2009                                 dev_hold(dev);
2010                                 in6_dev_hold(grt->rt6i_idev);
2011                         }
2012                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2013                                 err = 0;
2014                         ip6_rt_put(grt);
2015
2016                         if (err)
2017                                 goto out;
2018                 }
2019                 err = -EINVAL;
2020                 if (!dev || (dev->flags & IFF_LOOPBACK))
2021                         goto out;
2022         }
2023
2024         err = -ENODEV;
2025         if (!dev)
2026                 goto out;
2027
2028         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2029                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2030                         err = -EINVAL;
2031                         goto out;
2032                 }
2033                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2034                 rt->rt6i_prefsrc.plen = 128;
2035         } else
2036                 rt->rt6i_prefsrc.plen = 0;
2037
2038         rt->rt6i_flags = cfg->fc_flags;
2039
2040 install_route:
2041         rt->dst.dev = dev;
2042         rt->rt6i_idev = idev;
2043         rt->rt6i_table = table;
2044
2045         cfg->fc_nlinfo.nl_net = dev_net(dev);
2046
2047         return rt;
2048 out:
2049         if (dev)
2050                 dev_put(dev);
2051         if (idev)
2052                 in6_dev_put(idev);
2053         if (rt)
2054                 dst_free(&rt->dst);
2055
2056         return ERR_PTR(err);
2057 }
2058
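/* Create a route from @cfg, convert its metrics and insert it into the
 * FIB; the temporary metrics array is freed after __ip6_ins_rt().
 */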
2059 int ip6_route_add(struct fib6_config *cfg)
2060 {
2061         struct mx6_config mxc = { .mx = NULL, };
2062         struct rt6_info *rt;
2063         int err;
2064
2065         rt = ip6_route_info_create(cfg);
2066         if (IS_ERR(rt)) {
2067                 err = PTR_ERR(rt);
2068                 rt = NULL;
2069                 goto out;
2070         }
2071
2072         err = ip6_convert_metrics(&mxc, cfg);
2073         if (err)
2074                 goto out;
2075
2076         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2077
2078         kfree(mxc.mx);
2079
2080         return err;
2081 out:
2082         if (rt)
2083                 dst_free(&rt->dst);
2084
2085         return err;
2086 }
2087
2088 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2089 {
2090         int err;
2091         struct fib6_table *table;
2092         struct net *net = dev_net(rt->dst.dev);
2093
2094         if (rt == net->ipv6.ip6_null_entry ||
2095             rt->dst.flags & DST_NOCACHE) {
2096                 err = -ENOENT;
2097                 goto out;
2098         }
2099
2100         table = rt->rt6i_table;
2101         write_lock_bh(&table->tb6_lock);
2102         err = fib6_del(rt, info);
2103         write_unlock_bh(&table->tb6_lock);
2104
2105 out:
2106         ip6_rt_put(rt);
2107         return err;
2108 }
2109
2110 int ip6_del_rt(struct rt6_info *rt)
2111 {
2112         struct nl_info info = {
2113                 .nl_net = dev_net(rt->dst.dev),
2114         };
2115         return __ip6_del_rt(rt, &info);
2116 }
2117
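/* Delete the route described by @cfg: locate the matching fib6 node and
 * scan its leaf chain for a route matching the device, gateway, metric
 * and RTF_CACHE constraints, then remove it via __ip6_del_rt().
 */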
2118 static int ip6_route_del(struct fib6_config *cfg)
2119 {
2120         struct fib6_table *table;
2121         struct fib6_node *fn;
2122         struct rt6_info *rt;
2123         int err = -ESRCH;
2124
2125         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2126         if (!table)
2127                 return err;
2128
2129         read_lock_bh(&table->tb6_lock);
2130
2131         fn = fib6_locate(&table->tb6_root,
2132                          &cfg->fc_dst, cfg->fc_dst_len,
2133                          &cfg->fc_src, cfg->fc_src_len);
2134
2135         if (fn) {
2136                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2137                         if ((rt->rt6i_flags & RTF_CACHE) &&
2138                             !(cfg->fc_flags & RTF_CACHE))
2139                                 continue;
2140                         if (cfg->fc_ifindex &&
2141                             (!rt->dst.dev ||
2142                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2143                                 continue;
2144                         if (cfg->fc_flags & RTF_GATEWAY &&
2145                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2146                                 continue;
2147                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2148                                 continue;
2149                         dst_hold(&rt->dst);
2150                         read_unlock_bh(&table->tb6_lock);
2151
2152                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2153                 }
2154         }
2155         read_unlock_bh(&table->tb6_lock);
2156
2157         return err;
2158 }
2159
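/* Validate and apply a received ICMPv6 redirect: sanity-check the
 * message and its ND options, update the neighbour entry for the new
 * first hop, install a cloned RTF_CACHE route to the destination via
 * that neighbour and notify NETEVENT_REDIRECT listeners.
 */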
2160 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2161 {
2162         struct netevent_redirect netevent;
2163         struct rt6_info *rt, *nrt = NULL;
2164         struct ndisc_options ndopts;
2165         struct inet6_dev *in6_dev;
2166         struct neighbour *neigh;
2167         struct rd_msg *msg;
2168         int optlen, on_link;
2169         u8 *lladdr;
2170
2171         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2172         optlen -= sizeof(*msg);
2173
2174         if (optlen < 0) {
2175                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2176                 return;
2177         }
2178
2179         msg = (struct rd_msg *)icmp6_hdr(skb);
2180
2181         if (ipv6_addr_is_multicast(&msg->dest)) {
2182                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2183                 return;
2184         }
2185
2186         on_link = 0;
2187         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2188                 on_link = 1;
2189         } else if (ipv6_addr_type(&msg->target) !=
2190                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2191                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2192                 return;
2193         }
2194
2195         in6_dev = __in6_dev_get(skb->dev);
2196         if (!in6_dev)
2197                 return;
2198         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2199                 return;
2200
2201         /* RFC2461 8.1:
2202          *      The IP source address of the Redirect MUST be the same as the current
2203          *      first-hop router for the specified ICMP Destination Address.
2204          */
2205
2206         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2207                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2208                 return;
2209         }
2210
2211         lladdr = NULL;
2212         if (ndopts.nd_opts_tgt_lladdr) {
2213                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2214                                              skb->dev);
2215                 if (!lladdr) {
2216                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2217                         return;
2218                 }
2219         }
2220
2221         rt = (struct rt6_info *) dst;
2222         if (rt->rt6i_flags & RTF_REJECT) {
2223                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2224                 return;
2225         }
2226
2227         /* Redirect received -> path was valid.
2228          * Look, redirects are sent only in response to data packets,
2229          * so this nexthop is apparently reachable. --ANK
2230          */
2231         dst_confirm(&rt->dst);
2232
2233         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2234         if (!neigh)
2235                 return;
2236
2237         /*
2238          *      We have finally decided to accept it.
2239          */
2240
2241         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2242                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2243                      NEIGH_UPDATE_F_OVERRIDE|
2244                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2245                                      NEIGH_UPDATE_F_ISROUTER)),
2246                      NDISC_REDIRECT, &ndopts);
2247
2248         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2249         if (!nrt)
2250                 goto out;
2251
2252         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2253         if (on_link)
2254                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2255
2256         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2257
2258         if (ip6_ins_rt(nrt))
2259                 goto out;
2260
2261         netevent.old = &rt->dst;
2262         netevent.new = &nrt->dst;
2263         netevent.daddr = &msg->dest;
2264         netevent.neigh = neigh;
2265         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2266
2267         if (rt->rt6i_flags & RTF_CACHE) {
2268                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2269                 ip6_del_rt(rt);
2270         }
2271
2272 out:
2273         neigh_release(neigh);
2274 }
2275
2276 /*
2277  *      Misc support functions
2278  */
2279
2280 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2281 {
2282         BUG_ON(from->dst.from);
2283
2284         rt->rt6i_flags &= ~RTF_EXPIRES;
2285         dst_hold(&from->dst);
2286         rt->dst.from = &from->dst;
2287         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2288 }
2289
2290 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2291 {
2292         rt->dst.input = ort->dst.input;
2293         rt->dst.output = ort->dst.output;
2294         rt->rt6i_dst = ort->rt6i_dst;
2295         rt->dst.error = ort->dst.error;
2296         rt->rt6i_idev = ort->rt6i_idev;
2297         if (rt->rt6i_idev)
2298                 in6_dev_hold(rt->rt6i_idev);
2299         rt->dst.lastuse = jiffies;
2300         rt->rt6i_gateway = ort->rt6i_gateway;
2301         rt->rt6i_flags = ort->rt6i_flags;
2302         rt6_set_from(rt, ort);
2303         rt->rt6i_metric = ort->rt6i_metric;
2304 #ifdef CONFIG_IPV6_SUBTREES
2305         rt->rt6i_src = ort->rt6i_src;
2306 #endif
2307         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2308         rt->rt6i_table = ort->rt6i_table;
2309         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2310 }
2311
2312 #ifdef CONFIG_IPV6_ROUTE_INFO
2313 static struct rt6_info *rt6_get_route_info(struct net *net,
2314                                            const struct in6_addr *prefix, int prefixlen,
2315                                            const struct in6_addr *gwaddr, int ifindex)
2316 {
2317         struct fib6_node *fn;
2318         struct rt6_info *rt = NULL;
2319         struct fib6_table *table;
2320
2321         table = fib6_get_table(net, RT6_TABLE_INFO);
2322         if (!table)
2323                 return NULL;
2324
2325         read_lock_bh(&table->tb6_lock);
2326         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2327         if (!fn)
2328                 goto out;
2329
2330         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2331                 if (rt->dst.dev->ifindex != ifindex)
2332                         continue;
2333                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2334                         continue;
2335                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2336                         continue;
2337                 dst_hold(&rt->dst);
2338                 break;
2339         }
2340 out:
2341         read_unlock_bh(&table->tb6_lock);
2342         return rt;
2343 }
2344
2345 static struct rt6_info *rt6_add_route_info(struct net *net,
2346                                            const struct in6_addr *prefix, int prefixlen,
2347                                            const struct in6_addr *gwaddr, int ifindex,
2348                                            unsigned int pref)
2349 {
2350         struct fib6_config cfg = {
2351                 .fc_metric      = IP6_RT_PRIO_USER,
2352                 .fc_ifindex     = ifindex,
2353                 .fc_dst_len     = prefixlen,
2354                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2355                                   RTF_UP | RTF_PREF(pref),
2356                 .fc_nlinfo.portid = 0,
2357                 .fc_nlinfo.nlh = NULL,
2358                 .fc_nlinfo.nl_net = net,
2359         };
2360
2361         cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
2362         cfg.fc_dst = *prefix;
2363         cfg.fc_gateway = *gwaddr;
2364
2365         /* We should treat it as a default route if prefix length is 0. */
2366         if (!prefixlen)
2367                 cfg.fc_flags |= RTF_DEFAULT;
2368
2369         ip6_route_add(&cfg);
2370
2371         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2372 }
2373 #endif
2374
2375 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2376 {
2377         struct rt6_info *rt;
2378         struct fib6_table *table;
2379
2380         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2381         if (!table)
2382                 return NULL;
2383
2384         read_lock_bh(&table->tb6_lock);
2385         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2386                 if (dev == rt->dst.dev &&
2387                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2388                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2389                         break;
2390         }
2391         if (rt)
2392                 dst_hold(&rt->dst);
2393         read_unlock_bh(&table->tb6_lock);
2394         return rt;
2395 }
2396
2397 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2398                                      struct net_device *dev,
2399                                      unsigned int pref)
2400 {
2401         struct fib6_config cfg = {
2402                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2403                 .fc_metric      = IP6_RT_PRIO_USER,
2404                 .fc_ifindex     = dev->ifindex,
2405                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2406                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2407                 .fc_nlinfo.portid = 0,
2408                 .fc_nlinfo.nlh = NULL,
2409                 .fc_nlinfo.nl_net = dev_net(dev),
2410         };
2411
2412         cfg.fc_gateway = *gwaddr;
2413
2414         ip6_route_add(&cfg);
2415
2416         return rt6_get_dflt_router(gwaddr, dev);
2417 }
2418
2419 void rt6_purge_dflt_routers(struct net *net)
2420 {
2421         struct rt6_info *rt;
2422         struct fib6_table *table;
2423
2424         /* NOTE: Keep consistent with rt6_get_dflt_router */
2425         table = fib6_get_table(net, RT6_TABLE_DFLT);
2426         if (!table)
2427                 return;
2428
2429 restart:
2430         read_lock_bh(&table->tb6_lock);
2431         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2432                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2433                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2434                         dst_hold(&rt->dst);
2435                         read_unlock_bh(&table->tb6_lock);
2436                         ip6_del_rt(rt);
2437                         goto restart;
2438                 }
2439         }
2440         read_unlock_bh(&table->tb6_lock);
2441 }
2442
2443 static void rtmsg_to_fib6_config(struct net *net,
2444                                  struct in6_rtmsg *rtmsg,
2445                                  struct fib6_config *cfg)
2446 {
2447         memset(cfg, 0, sizeof(*cfg));
2448
2449         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2450                          : RT6_TABLE_MAIN;
2451         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2452         cfg->fc_metric = rtmsg->rtmsg_metric;
2453         cfg->fc_expires = rtmsg->rtmsg_info;
2454         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2455         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2456         cfg->fc_flags = rtmsg->rtmsg_flags;
2457
2458         cfg->fc_nlinfo.nl_net = net;
2459
2460         cfg->fc_dst = rtmsg->rtmsg_dst;
2461         cfg->fc_src = rtmsg->rtmsg_src;
2462         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2463 }
2464
2465 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2466 {
2467         struct fib6_config cfg;
2468         struct in6_rtmsg rtmsg;
2469         int err;
2470
2471         switch (cmd) {
2472         case SIOCADDRT:         /* Add a route */
2473         case SIOCDELRT:         /* Delete a route */
2474                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2475                         return -EPERM;
2476                 err = copy_from_user(&rtmsg, arg,
2477                                      sizeof(struct in6_rtmsg));
2478                 if (err)
2479                         return -EFAULT;
2480
2481                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2482
2483                 rtnl_lock();
2484                 switch (cmd) {
2485                 case SIOCADDRT:
2486                         err = ip6_route_add(&cfg);
2487                         break;
2488                 case SIOCDELRT:
2489                         err = ip6_route_del(&cfg);
2490                         break;
2491                 default:
2492                         err = -EINVAL;
2493                 }
2494                 rtnl_unlock();
2495
2496                 return err;
2497         }
2498
2499         return -EINVAL;
2500 }
2501
2502 /*
2503  *      Drop the packet on the floor
2504  */
2505
2506 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2507 {
2508         int type;
2509         struct dst_entry *dst = skb_dst(skb);
2510         switch (ipstats_mib_noroutes) {
2511         case IPSTATS_MIB_INNOROUTES:
2512                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2513                 if (type == IPV6_ADDR_ANY) {
2514                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2515                                       IPSTATS_MIB_INADDRERRORS);
2516                         break;
2517                 }
2518                 /* FALLTHROUGH */
2519         case IPSTATS_MIB_OUTNOROUTES:
2520                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2521                               ipstats_mib_noroutes);
2522                 break;
2523         }
2524         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2525         kfree_skb(skb);
2526         return 0;
2527 }
2528
2529 static int ip6_pkt_discard(struct sk_buff *skb)
2530 {
2531         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2532 }
2533
2534 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2535 {
2536         skb->dev = skb_dst(skb)->dev;
2537         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2538 }
2539
2540 static int ip6_pkt_prohibit(struct sk_buff *skb)
2541 {
2542         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2543 }
2544
2545 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2546 {
2547         skb->dev = skb_dst(skb)->dev;
2548         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2549 }
2550
2551 /*
2552  *      Allocate a dst for local (unicast / anycast) address.
2553  */
2554
2555 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2556                                     const struct in6_addr *addr,
2557                                     bool anycast)
2558 {
2559         u32 tb_id;
2560         struct net *net = dev_net(idev->dev);
2561         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2562                                             DST_NOCOUNT);
2563         if (!rt)
2564                 return ERR_PTR(-ENOMEM);
2565
2566         in6_dev_hold(idev);
2567
2568         rt->dst.flags |= DST_HOST;
2569         rt->dst.input = ip6_input;
2570         rt->dst.output = ip6_output;
2571         rt->rt6i_idev = idev;
2572
2573         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2574         if (anycast)
2575                 rt->rt6i_flags |= RTF_ANYCAST;
2576         else
2577                 rt->rt6i_flags |= RTF_LOCAL;
2578
2579         rt->rt6i_gateway  = *addr;
2580         rt->rt6i_dst.addr = *addr;
2581         rt->rt6i_dst.plen = 128;
2582         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2583         rt->rt6i_table = fib6_get_table(net, tb_id);
2584         rt->dst.flags |= DST_NOCACHE;
2585
2586         atomic_set(&rt->dst.__refcnt, 1);
2587
2588         return rt;
2589 }
2590
2591 /* Remove a deleted IP address from prefsrc entries */
2592 struct arg_dev_net_ip {
2593         struct net_device *dev;
2594         struct net *net;
2595         struct in6_addr *addr;
2596 };
2597
2598 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2599 {
2600         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2601         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2602         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2603
2604         if (((void *)rt->dst.dev == dev || !dev) &&
2605             rt != net->ipv6.ip6_null_entry &&
2606             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2607                 /* remove prefsrc entry */
2608                 rt->rt6i_prefsrc.plen = 0;
2609         }
2610         return 0;
2611 }
2612
2613 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2614 {
2615         struct net *net = dev_net(ifp->idev->dev);
2616         struct arg_dev_net_ip adni = {
2617                 .dev = ifp->idev->dev,
2618                 .net = net,
2619                 .addr = &ifp->addr,
2620         };
2621         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2622 }
2623
2624 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2625 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2626
2627 /* Remove routers and update dst entries when a gateway turns into a host. */
2628 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2629 {
2630         struct in6_addr *gateway = (struct in6_addr *)arg;
2631
2632         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2633              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2634              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2635                 return -1;
2636         }
2637         return 0;
2638 }
2639
2640 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2641 {
2642         fib6_clean_all(net, fib6_clean_tohost, gateway);
2643 }
2644
2645 struct arg_dev_net {
2646         struct net_device *dev;
2647         struct net *net;
2648 };
2649
2650 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2651 {
2652         const struct arg_dev_net *adn = arg;
2653         const struct net_device *dev = adn->dev;
2654
2655         if ((rt->dst.dev == dev || !dev) &&
2656             rt != adn->net->ipv6.ip6_null_entry)
2657                 return -1;
2658
2659         return 0;
2660 }
2661
2662 void rt6_ifdown(struct net *net, struct net_device *dev)
2663 {
2664         struct arg_dev_net adn = {
2665                 .dev = dev,
2666                 .net = net,
2667         };
2668
2669         fib6_clean_all(net, fib6_ifdown, &adn);
2670         icmp6_clean_all(fib6_ifdown, &adn);
2671         if (dev)
2672                 rt6_uncached_list_flush_dev(net, dev);
2673 }
2674
2675 struct rt6_mtu_change_arg {
2676         struct net_device *dev;
2677         unsigned int mtu;
2678 };
2679
2680 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2681 {
2682         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2683         struct inet6_dev *idev;
2684
2685         /* In IPv6, PMTU discovery is not optional,
2686            so the RTAX_MTU lock cannot disable it.
2687            We still use this lock to block changes
2688            caused by addrconf/ndisc.
2689         */
2690
2691         idev = __in6_dev_get(arg->dev);
2692         if (!idev)
2693                 return 0;
2694
2695         /* For an administrative MTU increase there is no way to discover
2696            an IPv6 PMTU increase, so the PMTU must be updated here.
2697            RFC 1981 does not cover administrative MTU increases, so updating
2698            the PMTU on an increase is a MUST (e.g. for jumbo frames).
2699          */
2700         /*
2701            If the new MTU is less than the route PMTU, the new MTU will be
2702            the lowest MTU in the path; update the route PMTU to reflect the
2703            decrease. If the new MTU is greater than the route PMTU, and the
2704            old MTU was the lowest MTU in the path, update the route PMTU to
2705            reflect the increase. In that case, if another node still has the
2706            lowest MTU in the path, a Packet Too Big message will trigger
2707            PMTU discovery again.
2708          */
2709         if (rt->dst.dev == arg->dev &&
2710             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2711                 if (rt->rt6i_flags & RTF_CACHE) {
2712                         /* For RTF_CACHE with rt6i_pmtu == 0
2713                          * (i.e. a redirected route),
2714                          * the metrics of its rt->dst.from have already
2715                          * been updated.
2716                          */
2717                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2718                                 rt->rt6i_pmtu = arg->mtu;
2719                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2720                            (dst_mtu(&rt->dst) < arg->mtu &&
2721                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2722                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2723                 }
2724         }
2725         return 0;
2726 }
2727
2728 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2729 {
2730         struct rt6_mtu_change_arg arg = {
2731                 .dev = dev,
2732                 .mtu = mtu,
2733         };
2734
2735         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2736 }
2737
2738 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2739         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2740         [RTA_OIF]               = { .type = NLA_U32 },
2741         [RTA_IIF]               = { .type = NLA_U32 },
2742         [RTA_PRIORITY]          = { .type = NLA_U32 },
2743         [RTA_METRICS]           = { .type = NLA_NESTED },
2744         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2745         [RTA_PREF]              = { .type = NLA_U8 },
2746         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2747         [RTA_ENCAP]             = { .type = NLA_NESTED },
2748         [RTA_EXPIRES]           = { .type = NLA_U32 },
2749 };
2750
2751 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2752                               struct fib6_config *cfg)
2753 {
2754         struct rtmsg *rtm;
2755         struct nlattr *tb[RTA_MAX+1];
2756         unsigned int pref;
2757         int err;
2758
2759         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2760         if (err < 0)
2761                 goto errout;
2762
2763         err = -EINVAL;
2764         rtm = nlmsg_data(nlh);
2765         memset(cfg, 0, sizeof(*cfg));
2766
2767         cfg->fc_table = rtm->rtm_table;
2768         cfg->fc_dst_len = rtm->rtm_dst_len;
2769         cfg->fc_src_len = rtm->rtm_src_len;
2770         cfg->fc_flags = RTF_UP;
2771         cfg->fc_protocol = rtm->rtm_protocol;
2772         cfg->fc_type = rtm->rtm_type;
2773
2774         if (rtm->rtm_type == RTN_UNREACHABLE ||
2775             rtm->rtm_type == RTN_BLACKHOLE ||
2776             rtm->rtm_type == RTN_PROHIBIT ||
2777             rtm->rtm_type == RTN_THROW)
2778                 cfg->fc_flags |= RTF_REJECT;
2779
2780         if (rtm->rtm_type == RTN_LOCAL)
2781                 cfg->fc_flags |= RTF_LOCAL;
2782
2783         if (rtm->rtm_flags & RTM_F_CLONED)
2784                 cfg->fc_flags |= RTF_CACHE;
2785
2786         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2787         cfg->fc_nlinfo.nlh = nlh;
2788         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2789
2790         if (tb[RTA_GATEWAY]) {
2791                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2792                 cfg->fc_flags |= RTF_GATEWAY;
2793         }
2794
2795         if (tb[RTA_DST]) {
2796                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2797
2798                 if (nla_len(tb[RTA_DST]) < plen)
2799                         goto errout;
2800
2801                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2802         }
2803
2804         if (tb[RTA_SRC]) {
2805                 int plen = (rtm->rtm_src_len + 7) >> 3;
2806
2807                 if (nla_len(tb[RTA_SRC]) < plen)
2808                         goto errout;
2809
2810                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2811         }
2812
2813         if (tb[RTA_PREFSRC])
2814                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2815
2816         if (tb[RTA_OIF])
2817                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2818
2819         if (tb[RTA_PRIORITY])
2820                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2821
2822         if (tb[RTA_METRICS]) {
2823                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2824                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2825         }
2826
2827         if (tb[RTA_TABLE])
2828                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2829
2830         if (tb[RTA_MULTIPATH]) {
2831                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2832                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2833         }
2834
2835         if (tb[RTA_PREF]) {
2836                 pref = nla_get_u8(tb[RTA_PREF]);
2837                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2838                     pref != ICMPV6_ROUTER_PREF_HIGH)
2839                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2840                 cfg->fc_flags |= RTF_PREF(pref);
2841         }
2842
2843         if (tb[RTA_ENCAP])
2844                 cfg->fc_encap = tb[RTA_ENCAP];
2845
2846         if (tb[RTA_ENCAP_TYPE])
2847                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2848
2849         if (tb[RTA_EXPIRES]) {
2850                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2851
2852                 if (addrconf_finite_timeout(timeout)) {
2853                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
2854                         cfg->fc_flags |= RTF_EXPIRES;
2855                 }
2856         }
2857
2858         err = 0;
2859 errout:
2860         return err;
2861 }
2862
2863 struct rt6_nh {
2864         struct rt6_info *rt6_info;
2865         struct fib6_config r_cfg;
2866         struct mx6_config mxc;
2867         struct list_head next;
2868 };
2869
2870 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2871 {
2872         struct rt6_nh *nh;
2873
2874         list_for_each_entry(nh, rt6_nh_list, next) {
2875                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2876                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2877                         nh->r_cfg.fc_ifindex);
2878         }
2879 }
2880
2881 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2882                                  struct rt6_info *rt, struct fib6_config *r_cfg)
2883 {
2884         struct rt6_nh *nh;
2885         struct rt6_info *rtnh;
2886         int err = -EEXIST;
2887
2888         list_for_each_entry(nh, rt6_nh_list, next) {
2889                 /* check if rt6_info already exists */
2890                 rtnh = nh->rt6_info;
2891
2892                 if (rtnh->dst.dev == rt->dst.dev &&
2893                     rtnh->rt6i_idev == rt->rt6i_idev &&
2894                     ipv6_addr_equal(&rtnh->rt6i_gateway,
2895                                     &rt->rt6i_gateway))
2896                         return err;
2897         }
2898
2899         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2900         if (!nh)
2901                 return -ENOMEM;
2902         nh->rt6_info = rt;
2903         err = ip6_convert_metrics(&nh->mxc, r_cfg);
2904         if (err) {
2905                 kfree(nh);
2906                 return err;
2907         }
2908         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2909         list_add_tail(&nh->next, rt6_nh_list);
2910
2911         return 0;
2912 }
2913
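/* Add an RTA_MULTIPATH route: build one rt6_info per nexthop, then
 * insert them one by one.  If an insert fails, any nexthops that were
 * already added are deleted again (and a warning is printed when a
 * replace was left partially applied).
 */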
2914 static int ip6_route_multipath_add(struct fib6_config *cfg)
2915 {
2916         struct fib6_config r_cfg;
2917         struct rtnexthop *rtnh;
2918         struct rt6_info *rt;
2919         struct rt6_nh *err_nh;
2920         struct rt6_nh *nh, *nh_safe;
2921         int remaining;
2922         int attrlen;
2923         int err = 1;
2924         int nhn = 0;
2925         int replace = (cfg->fc_nlinfo.nlh &&
2926                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2927         LIST_HEAD(rt6_nh_list);
2928
2929         remaining = cfg->fc_mp_len;
2930         rtnh = (struct rtnexthop *)cfg->fc_mp;
2931
2932         /* Parse a Multipath Entry and build a list (rt6_nh_list) with
2933          * one rt6_info struct per nexthop
2934          */
2935         while (rtnh_ok(rtnh, remaining)) {
2936                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2937                 if (rtnh->rtnh_ifindex)
2938                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2939
2940                 attrlen = rtnh_attrlen(rtnh);
2941                 if (attrlen > 0) {
2942                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2943
2944                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2945                         if (nla) {
2946                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2947                                 r_cfg.fc_flags |= RTF_GATEWAY;
2948                         }
2949                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2950                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2951                         if (nla)
2952                                 r_cfg.fc_encap_type = nla_get_u16(nla);
2953                 }
2954
2955                 rt = ip6_route_info_create(&r_cfg);
2956                 if (IS_ERR(rt)) {
2957                         err = PTR_ERR(rt);
2958                         rt = NULL;
2959                         goto cleanup;
2960                 }
2961
2962                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
2963                 if (err) {
2964                         dst_free(&rt->dst);
2965                         goto cleanup;
2966                 }
2967
2968                 rtnh = rtnh_next(rtnh, &remaining);
2969         }
2970
2971         err_nh = NULL;
2972         list_for_each_entry(nh, &rt6_nh_list, next) {
2973                 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
2974                 /* nh->rt6_info is used or freed at this point, reset to NULL */
2975                 nh->rt6_info = NULL;
2976                 if (err) {
2977                         if (replace && nhn)
2978                                 ip6_print_replace_route_err(&rt6_nh_list);
2979                         err_nh = nh;
2980                         goto add_errout;
2981                 }
2982
2983                 /* Because each route is added like a single route, we remove
2984                  * these flags after the first nexthop: if there is a collision,
2985                  * we have already failed to add the first nexthop, since
2986                  * fib6_add_rt2node() has rejected it; when replacing, the old
2987                  * nexthops have already been replaced by the first new one,
2988                  * and the remaining nexthops should simply be added to it.
2989                  */
2990                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2991                                                      NLM_F_REPLACE);
2992                 nhn++;
2993         }
2994
2995         goto cleanup;
2996
2997 add_errout:
2998         /* Delete routes that were already added */
2999         list_for_each_entry(nh, &rt6_nh_list, next) {
3000                 if (err_nh == nh)
3001                         break;
3002                 ip6_route_del(&nh->r_cfg);
3003         }
3004
3005 cleanup:
3006         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3007                 if (nh->rt6_info)
3008                         dst_free(&nh->rt6_info->dst);
3009                 kfree(nh->mxc.mx);
3010                 list_del(&nh->next);
3011                 kfree(nh);
3012         }
3013
3014         return err;
3015 }
3016
3017 static int ip6_route_multipath_del(struct fib6_config *cfg)
3018 {
3019         struct fib6_config r_cfg;
3020         struct rtnexthop *rtnh;
3021         int remaining;
3022         int attrlen;
3023         int err = 1, last_err = 0;
3024
3025         remaining = cfg->fc_mp_len;
3026         rtnh = (struct rtnexthop *)cfg->fc_mp;
3027
3028         /* Parse a Multipath Entry */
3029         while (rtnh_ok(rtnh, remaining)) {
3030                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3031                 if (rtnh->rtnh_ifindex)
3032                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3033
3034                 attrlen = rtnh_attrlen(rtnh);
3035                 if (attrlen > 0) {
3036                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3037
3038                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3039                         if (nla) {
3040                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3041                                 r_cfg.fc_flags |= RTF_GATEWAY;
3042                         }
3043                 }
3044                 err = ip6_route_del(&r_cfg);
3045                 if (err)
3046                         last_err = err;
3047
3048                 rtnh = rtnh_next(rtnh, &remaining);
3049         }
3050
3051         return last_err;
3052 }
3053
3054 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3055 {
3056         struct fib6_config cfg;
3057         int err;
3058
3059         err = rtm_to_fib6_config(skb, nlh, &cfg);
3060         if (err < 0)
3061                 return err;
3062
3063         if (cfg.fc_mp)
3064                 return ip6_route_multipath_del(&cfg);
3065         else
3066                 return ip6_route_del(&cfg);
3067 }
3068
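/* RTM_NEWROUTE handler: convert the netlink request to a fib6_config and
 * dispatch to the multipath or single-route add path.
 */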
3069 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3070 {
3071         struct fib6_config cfg;
3072         int err;
3073
3074         err = rtm_to_fib6_config(skb, nlh, &cfg);
3075         if (err < 0)
3076                 return err;
3077
3078         if (cfg.fc_mp)
3079                 return ip6_route_multipath_add(&cfg);
3080         else
3081                 return ip6_route_add(&cfg);
3082 }
3083
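/* Estimate the netlink message size needed to dump one route, including
 * any lwtunnel encapsulation state; used to size the notification skb in
 * inet6_rt_notify().
 */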
3084 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3085 {
3086         return NLMSG_ALIGN(sizeof(struct rtmsg))
3087                + nla_total_size(16) /* RTA_SRC */
3088                + nla_total_size(16) /* RTA_DST */
3089                + nla_total_size(16) /* RTA_GATEWAY */
3090                + nla_total_size(16) /* RTA_PREFSRC */
3091                + nla_total_size(4) /* RTA_TABLE */
3092                + nla_total_size(4) /* RTA_IIF */
3093                + nla_total_size(4) /* RTA_OIF */
3094                + nla_total_size(4) /* RTA_PRIORITY */
3095                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3096                + nla_total_size(sizeof(struct rta_cacheinfo))
3097                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3098                + nla_total_size(1) /* RTA_PREF */
3099                + lwtunnel_get_encap_size(rt->dst.lwtstate);
3100 }
3101
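/* Fill one route message for @rt.  Returns 0 on success, a positive value
 * when the route is intentionally skipped (e.g. a prefix-only dump), or
 * -EMSGSIZE when the attributes do not fit in @skb.
 */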
3102 static int rt6_fill_node(struct net *net,
3103                          struct sk_buff *skb, struct rt6_info *rt,
3104                          struct in6_addr *dst, struct in6_addr *src,
3105                          int iif, int type, u32 portid, u32 seq,
3106                          int prefix, int nowait, unsigned int flags)
3107 {
3108         u32 metrics[RTAX_MAX];
3109         struct rtmsg *rtm;
3110         struct nlmsghdr *nlh;
3111         long expires;
3112         u32 table;
3113
3114         if (prefix) {   /* user wants prefix routes only */
3115                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3116                         /* success since this is not a prefix route */
3117                         return 1;
3118                 }
3119         }
3120
3121         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3122         if (!nlh)
3123                 return -EMSGSIZE;
3124
3125         rtm = nlmsg_data(nlh);
3126         rtm->rtm_family = AF_INET6;
3127         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3128         rtm->rtm_src_len = rt->rt6i_src.plen;
3129         rtm->rtm_tos = 0;
3130         if (rt->rt6i_table)
3131                 table = rt->rt6i_table->tb6_id;
3132         else
3133                 table = RT6_TABLE_UNSPEC;
3134         rtm->rtm_table = table;
3135         if (nla_put_u32(skb, RTA_TABLE, table))
3136                 goto nla_put_failure;
3137         if (rt->rt6i_flags & RTF_REJECT) {
3138                 switch (rt->dst.error) {
3139                 case -EINVAL:
3140                         rtm->rtm_type = RTN_BLACKHOLE;
3141                         break;
3142                 case -EACCES:
3143                         rtm->rtm_type = RTN_PROHIBIT;
3144                         break;
3145                 case -EAGAIN:
3146                         rtm->rtm_type = RTN_THROW;
3147                         break;
3148                 default:
3149                         rtm->rtm_type = RTN_UNREACHABLE;
3150                         break;
3151                 }
3152         } else if (rt->rt6i_flags & RTF_LOCAL)
3154                 rtm->rtm_type = RTN_LOCAL;
3155         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3156                 rtm->rtm_type = RTN_LOCAL;
3157         else
3158                 rtm->rtm_type = RTN_UNICAST;
3159         rtm->rtm_flags = 0;
3160         if (!netif_carrier_ok(rt->dst.dev)) {
3161                 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3162                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3163                         rtm->rtm_flags |= RTNH_F_DEAD;
3164         }
3165         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3166         rtm->rtm_protocol = rt->rt6i_protocol;
3167         if (rt->rt6i_flags & RTF_DYNAMIC)
3168                 rtm->rtm_protocol = RTPROT_REDIRECT;
3169         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3170                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3171                         rtm->rtm_protocol = RTPROT_RA;
3172                 else
3173                         rtm->rtm_protocol = RTPROT_KERNEL;
3174         }
3175
3176         if (rt->rt6i_flags & RTF_CACHE)
3177                 rtm->rtm_flags |= RTM_F_CLONED;
3178
3179         if (dst) {
3180                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3181                         goto nla_put_failure;
3182                 rtm->rtm_dst_len = 128;
3183         } else if (rtm->rtm_dst_len)
3184                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3185                         goto nla_put_failure;
3186 #ifdef CONFIG_IPV6_SUBTREES
3187         if (src) {
3188                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3189                         goto nla_put_failure;
3190                 rtm->rtm_src_len = 128;
3191         } else if (rtm->rtm_src_len &&
3192                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3193                 goto nla_put_failure;
3194 #endif
3195         if (iif) {
3196 #ifdef CONFIG_IPV6_MROUTE
3197                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3198                         int err = ip6mr_get_route(net, skb, rtm, nowait);
3199                         if (err <= 0) {
3200                                 if (!nowait) {
3201                                         if (err == 0)
3202                                                 return 0;
3203                                         goto nla_put_failure;
3204                                 } else {
3205                                         if (err == -EMSGSIZE)
3206                                                 goto nla_put_failure;
3207                                 }
3208                         }
3209                 } else
3210 #endif
3211                         if (nla_put_u32(skb, RTA_IIF, iif))
3212                                 goto nla_put_failure;
3213         } else if (dst) {
3214                 struct in6_addr saddr_buf;
3215                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3216                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3217                         goto nla_put_failure;
3218         }
3219
3220         if (rt->rt6i_prefsrc.plen) {
3221                 struct in6_addr saddr_buf;
3222                 saddr_buf = rt->rt6i_prefsrc.addr;
3223                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3224                         goto nla_put_failure;
3225         }
3226
3227         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3228         if (rt->rt6i_pmtu)
3229                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3230         if (rtnetlink_put_metrics(skb, metrics) < 0)
3231                 goto nla_put_failure;
3232
3233         if (rt->rt6i_flags & RTF_GATEWAY) {
3234                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3235                         goto nla_put_failure;
3236         }
3237
3238         if (rt->dst.dev &&
3239             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3240                 goto nla_put_failure;
3241         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3242                 goto nla_put_failure;
3243
3244         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3245
3246         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3247                 goto nla_put_failure;
3248
3249         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3250                 goto nla_put_failure;
3251
3252         lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3253
3254         nlmsg_end(skb, nlh);
3255         return 0;
3256
3257 nla_put_failure:
3258         nlmsg_cancel(skb, nlh);
3259         return -EMSGSIZE;
3260 }
3261
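/* fib6 dump callback: emit one RTM_NEWROUTE message per route, applying
 * the RTM_F_PREFIX filter from the original dump request.
 */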
3262 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3263 {
3264         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3265         int prefix;
3266
3267         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3268                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3269                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3270         } else
3271                 prefix = 0;
3272
3273         return rt6_fill_node(arg->net,
3274                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3275                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3276                      prefix, 0, NLM_F_MULTI);
3277 }
3278
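/* RTM_GETROUTE handler: resolve the requested source/destination via an
 * input or output route lookup and unicast the result back to the caller
 * as an RTM_NEWROUTE message.
 */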
3279 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3280 {
3281         struct net *net = sock_net(in_skb->sk);
3282         struct nlattr *tb[RTA_MAX+1];
3283         struct rt6_info *rt;
3284         struct sk_buff *skb;
3285         struct rtmsg *rtm;
3286         struct flowi6 fl6;
3287         int err, iif = 0, oif = 0;
3288
3289         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3290         if (err < 0)
3291                 goto errout;
3292
3293         err = -EINVAL;
3294         memset(&fl6, 0, sizeof(fl6));
3295         rtm = nlmsg_data(nlh);
3296         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3297
3298         if (tb[RTA_SRC]) {
3299                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3300                         goto errout;
3301
3302                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3303         }
3304
3305         if (tb[RTA_DST]) {
3306                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3307                         goto errout;
3308
3309                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3310         }
3311
3312         if (tb[RTA_IIF])
3313                 iif = nla_get_u32(tb[RTA_IIF]);
3314
3315         if (tb[RTA_OIF])
3316                 oif = nla_get_u32(tb[RTA_OIF]);
3317
3318         if (tb[RTA_MARK])
3319                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3320
3321         if (iif) {
3322                 struct net_device *dev;
3323                 int flags = 0;
3324
3325                 dev = __dev_get_by_index(net, iif);
3326                 if (!dev) {
3327                         err = -ENODEV;
3328                         goto errout;
3329                 }
3330
3331                 fl6.flowi6_iif = iif;
3332
3333                 if (!ipv6_addr_any(&fl6.saddr))
3334                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3335
3336                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3337                                                                flags);
3338         } else {
3339                 fl6.flowi6_oif = oif;
3340
3341                 if (netif_index_is_l3_master(net, oif)) {
3342                         fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
3343                                            FLOWI_FLAG_SKIP_NH_OIF;
3344                 }
3345
3346                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3347         }
3348
3349         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3350         if (!skb) {
3351                 ip6_rt_put(rt);
3352                 err = -ENOBUFS;
3353                 goto errout;
3354         }
3355
3356         /* Reserve room for dummy headers; this skb can pass
3357          * through a good chunk of the routing engine.
3358          */
3359         skb_reset_mac_header(skb);
3360         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3361
3362         skb_dst_set(skb, &rt->dst);
3363
3364         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3365                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3366                             nlh->nlmsg_seq, 0, 0, 0);
3367         if (err < 0) {
3368                 kfree_skb(skb);
3369                 goto errout;
3370         }
3371
3372         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3373 errout:
3374         return err;
3375 }
3376
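/* Build a route change notification for @rt and send it to
 * RTNLGRP_IPV6_ROUTE listeners.
 */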
3377 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3378                      unsigned int nlm_flags)
3379 {
3380         struct sk_buff *skb;
3381         struct net *net = info->nl_net;
3382         u32 seq;
3383         int err;
3384
3385         err = -ENOBUFS;
3386         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3387
3388         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3389         if (!skb)
3390                 goto errout;
3391
3392         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3393                                 event, info->portid, seq, 0, 0, nlm_flags);
3394         if (err < 0) {
3395                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3396                 WARN_ON(err == -EMSGSIZE);
3397                 kfree_skb(skb);
3398                 goto errout;
3399         }
3400         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3401                     info->nlh, gfp_any());
3402         return;
3403 errout:
3404         if (err < 0)
3405                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3406 }
3407
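/* Once the loopback device registers in a namespace, bind the namespace's
 * null (and, with multiple tables, prohibit and blackhole) template routes
 * to it.
 */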
3408 static int ip6_route_dev_notify(struct notifier_block *this,
3409                                 unsigned long event, void *ptr)
3410 {
3411         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3412         struct net *net = dev_net(dev);
3413
3414         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3415                 net->ipv6.ip6_null_entry->dst.dev = dev;
3416                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3417 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3418                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3419                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3420                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3421                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3422 #endif
3423         }
3424
3425         return NOTIFY_OK;
3426 }
3427
3428 /*
3429  *      /proc
3430  */
3431
3432 #ifdef CONFIG_PROC_FS
3433
3434 static const struct file_operations ipv6_route_proc_fops = {
3435         .owner          = THIS_MODULE,
3436         .open           = ipv6_route_open,
3437         .read           = seq_read,
3438         .llseek         = seq_lseek,
3439         .release        = seq_release_net,
3440 };
3441
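/* /proc/net/rt6_stats: dump the fib6 node, route and cache counters in hex. */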
3442 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3443 {
3444         struct net *net = (struct net *)seq->private;
3445         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3446                    net->ipv6.rt6_stats->fib_nodes,
3447                    net->ipv6.rt6_stats->fib_route_nodes,
3448                    net->ipv6.rt6_stats->fib_rt_alloc,
3449                    net->ipv6.rt6_stats->fib_rt_entries,
3450                    net->ipv6.rt6_stats->fib_rt_cache,
3451                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3452                    net->ipv6.rt6_stats->fib_discarded_routes);
3453
3454         return 0;
3455 }
3456
3457 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3458 {
3459         return single_open_net(inode, file, rt6_stats_seq_show);
3460 }
3461
3462 static const struct file_operations rt6_stats_seq_fops = {
3463         .owner   = THIS_MODULE,
3464         .open    = rt6_stats_seq_open,
3465         .read    = seq_read,
3466         .llseek  = seq_lseek,
3467         .release = single_release_net,
3468 };
3469 #endif  /* CONFIG_PROC_FS */
3470
3471 #ifdef CONFIG_SYSCTL
3472
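/* Write-only handler for the route "flush" sysctl: any write triggers a
 * fib6 garbage collection run; reads are rejected with -EINVAL.
 */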
3473 static
3474 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3475                               void __user *buffer, size_t *lenp, loff_t *ppos)
3476 {
3477         struct net *net;
3478         int delay;
3479         if (!write)
3480                 return -EINVAL;
3481
3482         net = (struct net *)ctl->extra1;
3483         delay = net->ipv6.sysctl.flush_delay;
3484         proc_dointvec(ctl, write, buffer, lenp, ppos);
3485         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3486         return 0;
3487 }
3488
3489 struct ctl_table ipv6_route_table_template[] = {
3490         {
3491                 .procname       =       "flush",
3492                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3493                 .maxlen         =       sizeof(int),
3494                 .mode           =       0200,
3495                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3496         },
3497         {
3498                 .procname       =       "gc_thresh",
3499                 .data           =       &ip6_dst_ops_template.gc_thresh,
3500                 .maxlen         =       sizeof(int),
3501                 .mode           =       0644,
3502                 .proc_handler   =       proc_dointvec,
3503         },
3504         {
3505                 .procname       =       "max_size",
3506                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3507                 .maxlen         =       sizeof(int),
3508                 .mode           =       0644,
3509                 .proc_handler   =       proc_dointvec,
3510         },
3511         {
3512                 .procname       =       "gc_min_interval",
3513                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3514                 .maxlen         =       sizeof(int),
3515                 .mode           =       0644,
3516                 .proc_handler   =       proc_dointvec_jiffies,
3517         },
3518         {
3519                 .procname       =       "gc_timeout",
3520                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3521                 .maxlen         =       sizeof(int),
3522                 .mode           =       0644,
3523                 .proc_handler   =       proc_dointvec_jiffies,
3524         },
3525         {
3526                 .procname       =       "gc_interval",
3527                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3528                 .maxlen         =       sizeof(int),
3529                 .mode           =       0644,
3530                 .proc_handler   =       proc_dointvec_jiffies,
3531         },
3532         {
3533                 .procname       =       "gc_elasticity",
3534                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3535                 .maxlen         =       sizeof(int),
3536                 .mode           =       0644,
3537                 .proc_handler   =       proc_dointvec,
3538         },
3539         {
3540                 .procname       =       "mtu_expires",
3541                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3542                 .maxlen         =       sizeof(int),
3543                 .mode           =       0644,
3544                 .proc_handler   =       proc_dointvec_jiffies,
3545         },
3546         {
3547                 .procname       =       "min_adv_mss",
3548                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3549                 .maxlen         =       sizeof(int),
3550                 .mode           =       0644,
3551                 .proc_handler   =       proc_dointvec,
3552         },
3553         {
3554                 .procname       =       "gc_min_interval_ms",
3555                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3556                 .maxlen         =       sizeof(int),
3557                 .mode           =       0644,
3558                 .proc_handler   =       proc_dointvec_ms_jiffies,
3559         },
3560         { }
3561 };
3562
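/* Duplicate the sysctl template for a new namespace and rewire each entry
 * to the per-namespace data; the table is not exported to network
 * namespaces that are not owned by the initial user namespace.
 */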
3563 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3564 {
3565         struct ctl_table *table;
3566
3567         table = kmemdup(ipv6_route_table_template,
3568                         sizeof(ipv6_route_table_template),
3569                         GFP_KERNEL);
3570
3571         if (table) {
3572                 table[0].data = &net->ipv6.sysctl.flush_delay;
3573                 table[0].extra1 = net;
3574                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3575                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3576                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3577                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3578                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3579                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3580                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3581                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3582                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3583
3584                 /* Don't export sysctls to unprivileged users */
3585                 if (net->user_ns != &init_user_ns)
3586                         table[0].procname = NULL;
3587         }
3588
3589         return table;
3590 }
3591 #endif
3592
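/* Per-namespace setup: copy the dst_ops template, allocate the null (and,
 * with multiple tables, prohibit and blackhole) template routes, and set
 * the route GC and sysctl defaults.
 */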
3593 static int __net_init ip6_route_net_init(struct net *net)
3594 {
3595         int ret = -ENOMEM;
3596
3597         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3598                sizeof(net->ipv6.ip6_dst_ops));
3599
3600         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3601                 goto out_ip6_dst_ops;
3602
3603         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3604                                            sizeof(*net->ipv6.ip6_null_entry),
3605                                            GFP_KERNEL);
3606         if (!net->ipv6.ip6_null_entry)
3607                 goto out_ip6_dst_entries;
3608         net->ipv6.ip6_null_entry->dst.path =
3609                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3610         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3611         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3612                          ip6_template_metrics, true);
3613
3614 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3615         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3616                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3617                                                GFP_KERNEL);
3618         if (!net->ipv6.ip6_prohibit_entry)
3619                 goto out_ip6_null_entry;
3620         net->ipv6.ip6_prohibit_entry->dst.path =
3621                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3622         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3623         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3624                          ip6_template_metrics, true);
3625
3626         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3627                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3628                                                GFP_KERNEL);
3629         if (!net->ipv6.ip6_blk_hole_entry)
3630                 goto out_ip6_prohibit_entry;
3631         net->ipv6.ip6_blk_hole_entry->dst.path =
3632                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3633         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3634         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3635                          ip6_template_metrics, true);
3636 #endif
3637
3638         net->ipv6.sysctl.flush_delay = 0;
3639         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3640         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3641         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3642         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3643         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3644         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3645         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3646
3647         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3648
3649         ret = 0;
3650 out:
3651         return ret;
3652
3653 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3654 out_ip6_prohibit_entry:
3655         kfree(net->ipv6.ip6_prohibit_entry);
3656 out_ip6_null_entry:
3657         kfree(net->ipv6.ip6_null_entry);
3658 #endif
3659 out_ip6_dst_entries:
3660         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3661 out_ip6_dst_ops:
3662         goto out;
3663 }
3664
3665 static void __net_exit ip6_route_net_exit(struct net *net)
3666 {
3667         kfree(net->ipv6.ip6_null_entry);
3668 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3669         kfree(net->ipv6.ip6_prohibit_entry);
3670         kfree(net->ipv6.ip6_blk_hole_entry);
3671 #endif
3672         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3673 }
3674
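/* Create the /proc/net/ipv6_route and /proc/net/rt6_stats entries
 * (under CONFIG_PROC_FS).
 */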
3675 static int __net_init ip6_route_net_init_late(struct net *net)
3676 {
3677 #ifdef CONFIG_PROC_FS
3678         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3679         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3680 #endif
3681         return 0;
3682 }
3683
3684 static void __net_exit ip6_route_net_exit_late(struct net *net)
3685 {
3686 #ifdef CONFIG_PROC_FS
3687         remove_proc_entry("ipv6_route", net->proc_net);
3688         remove_proc_entry("rt6_stats", net->proc_net);
3689 #endif
3690 }
3691
3692 static struct pernet_operations ip6_route_net_ops = {
3693         .init = ip6_route_net_init,
3694         .exit = ip6_route_net_exit,
3695 };
3696
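/* Allocate and initialize the per-namespace inetpeer base used for IPv6. */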
3697 static int __net_init ipv6_inetpeer_init(struct net *net)
3698 {
3699         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3700
3701         if (!bp)
3702                 return -ENOMEM;
3703         inet_peer_base_init(bp);
3704         net->ipv6.peers = bp;
3705         return 0;
3706 }
3707
3708 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3709 {
3710         struct inet_peer_base *bp = net->ipv6.peers;
3711
3712         net->ipv6.peers = NULL;
3713         inetpeer_invalidate_tree(bp);
3714         kfree(bp);
3715 }
3716
3717 static struct pernet_operations ipv6_inetpeer_ops = {
3718         .init   =       ipv6_inetpeer_init,
3719         .exit   =       ipv6_inetpeer_exit,
3720 };
3721
3722 static struct pernet_operations ip6_route_net_late_ops = {
3723         .init = ip6_route_net_init_late,
3724         .exit = ip6_route_net_exit_late,
3725 };
3726
3727 static struct notifier_block ip6_route_dev_notifier = {
3728         .notifier_call = ip6_route_dev_notify,
3729         .priority = 0,
3730 };
3731
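/* Subsystem init: create the rt6_info dst cache, register per-netns ops,
 * fib6, xfrm6 and policy rules, the rtnetlink route handlers, the device
 * notifier and the per-cpu uncached route lists, unwinding on failure.
 */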
3732 int __init ip6_route_init(void)
3733 {
3734         int ret;
3735         int cpu;
3736
3737         ret = -ENOMEM;
3738         ip6_dst_ops_template.kmem_cachep =
3739                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3740                                   SLAB_HWCACHE_ALIGN, NULL);
3741         if (!ip6_dst_ops_template.kmem_cachep)
3742                 goto out;
3743
3744         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3745         if (ret)
3746                 goto out_kmem_cache;
3747
3748         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3749         if (ret)
3750                 goto out_dst_entries;
3751
3752         ret = register_pernet_subsys(&ip6_route_net_ops);
3753         if (ret)
3754                 goto out_register_inetpeer;
3755
3756         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3757
3758         /* The loopback device is registered before this code runs, so the
3759          * loopback reference in rt6_info is not taken automatically; take it
3760          * manually for init_net. */
3761         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3762         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3763 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3764         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3765         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3766         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3767         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3768 #endif
3769         ret = fib6_init();
3770         if (ret)
3771                 goto out_register_subsys;
3772
3773         ret = xfrm6_init();
3774         if (ret)
3775                 goto out_fib6_init;
3776
3777         ret = fib6_rules_init();
3778         if (ret)
3779                 goto xfrm6_init;
3780
3781         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3782         if (ret)
3783                 goto fib6_rules_init;
3784
3785         ret = -ENOBUFS;
3786         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3787             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3788             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3789                 goto out_register_late_subsys;
3790
3791         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3792         if (ret)
3793                 goto out_register_late_subsys;
3794
3795         for_each_possible_cpu(cpu) {
3796                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3797
3798                 INIT_LIST_HEAD(&ul->head);
3799                 spin_lock_init(&ul->lock);
3800         }
3801
3802 out:
3803         return ret;
3804
3805 out_register_late_subsys:
3806         unregister_pernet_subsys(&ip6_route_net_late_ops);
3807 fib6_rules_init:
3808         fib6_rules_cleanup();
3809 xfrm6_init:
3810         xfrm6_fini();
3811 out_fib6_init:
3812         fib6_gc_cleanup();
3813 out_register_subsys:
3814         unregister_pernet_subsys(&ip6_route_net_ops);
3815 out_register_inetpeer:
3816         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3817 out_dst_entries:
3818         dst_entries_destroy(&ip6_dst_blackhole_ops);
3819 out_kmem_cache:
3820         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3821         goto out;
3822 }
3823
3824 void ip6_route_cleanup(void)
3825 {
3826         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3827         unregister_pernet_subsys(&ip6_route_net_late_ops);
3828         fib6_rules_cleanup();
3829         xfrm6_fini();
3830         fib6_gc_cleanup();
3831         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3832         unregister_pernet_subsys(&ip6_route_net_ops);
3833         dst_entries_destroy(&ip6_dst_blackhole_ops);
3834         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3835 }