net: ipv6: Do not consider link state for nexthop validation
net/ipv6/route.c  [cascardo/linux.git]
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <asm/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
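/* These values rank nexthop reachability for route selection:
 * RT6_NUD_SUCCEED marks a usable nexthop, RT6_NUD_FAIL_DO_RR makes
 * rt6_select() advance its round-robin pointer, RT6_NUD_FAIL_PROBE
 * scores below every valid candidate (a probe may still be scheduled),
 * and RT6_NUD_FAIL_HARD disqualifies the route outright; see
 * rt6_check_neigh(), rt6_score_route() and find_match() below.
 */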
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104                                            const struct in6_addr *prefix, int prefixlen,
105                                            const struct in6_addr *gwaddr,
106                                            struct net_device *dev,
107                                            unsigned int pref);
108 static struct rt6_info *rt6_get_route_info(struct net *net,
109                                            const struct in6_addr *prefix, int prefixlen,
110                                            const struct in6_addr *gwaddr,
111                                            struct net_device *dev);
112 #endif
113
114 struct uncached_list {
115         spinlock_t              lock;
116         struct list_head        head;
117 };
118
119 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
120
121 static void rt6_uncached_list_add(struct rt6_info *rt)
122 {
123         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
124
125         rt->dst.flags |= DST_NOCACHE;
126         rt->rt6i_uncached_list = ul;
127
128         spin_lock_bh(&ul->lock);
129         list_add_tail(&rt->rt6i_uncached, &ul->head);
130         spin_unlock_bh(&ul->lock);
131 }
132
133 static void rt6_uncached_list_del(struct rt6_info *rt)
134 {
135         if (!list_empty(&rt->rt6i_uncached)) {
136                 struct uncached_list *ul = rt->rt6i_uncached_list;
137
138                 spin_lock_bh(&ul->lock);
139                 list_del(&rt->rt6i_uncached);
140                 spin_unlock_bh(&ul->lock);
141         }
142 }
143
144 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
145 {
146         struct net_device *loopback_dev = net->loopback_dev;
147         int cpu;
148
149         if (dev == loopback_dev)
150                 return;
151
152         for_each_possible_cpu(cpu) {
153                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
154                 struct rt6_info *rt;
155
156                 spin_lock_bh(&ul->lock);
157                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
158                         struct inet6_dev *rt_idev = rt->rt6i_idev;
159                         struct net_device *rt_dev = rt->dst.dev;
160
161                         if (rt_idev->dev == dev) {
162                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
163                                 in6_dev_put(rt_idev);
164                         }
165
166                         if (rt_dev == dev) {
167                                 rt->dst.dev = loopback_dev;
168                                 dev_hold(rt->dst.dev);
169                                 dev_put(rt_dev);
170                         }
171                 }
172                 spin_unlock_bh(&ul->lock);
173         }
174 }
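/* The per-cpu lists above track DST_NOCACHE routes that are not owned by
 * the fib6 tree.  When a device goes away, rt6_uncached_list_flush_dev()
 * walks every cpu's list and re-points the affected entries (their
 * inet6_dev and dst.dev) at the loopback device so the old device can be
 * released.
 */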
175
176 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
177 {
178         return dst_metrics_write_ptr(rt->dst.from);
179 }
180
181 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
182 {
183         struct rt6_info *rt = (struct rt6_info *)dst;
184
185         if (rt->rt6i_flags & RTF_PCPU)
186                 return rt6_pcpu_cow_metrics(rt);
187         else if (rt->rt6i_flags & RTF_CACHE)
188                 return NULL;
189         else
190                 return dst_cow_metrics_generic(dst, old);
191 }
192
193 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
194                                              struct sk_buff *skb,
195                                              const void *daddr)
196 {
197         struct in6_addr *p = &rt->rt6i_gateway;
198
199         if (!ipv6_addr_any(p))
200                 return (const void *) p;
201         else if (skb)
202                 return &ipv6_hdr(skb)->daddr;
203         return daddr;
204 }
205
206 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
207                                           struct sk_buff *skb,
208                                           const void *daddr)
209 {
210         struct rt6_info *rt = (struct rt6_info *) dst;
211         struct neighbour *n;
212
213         daddr = choose_neigh_daddr(rt, skb, daddr);
214         n = __ipv6_neigh_lookup(dst->dev, daddr);
215         if (n)
216                 return n;
217         return neigh_create(&nd_tbl, daddr, dst->dev);
218 }
219
220 static struct dst_ops ip6_dst_ops_template = {
221         .family                 =       AF_INET6,
222         .gc                     =       ip6_dst_gc,
223         .gc_thresh              =       1024,
224         .check                  =       ip6_dst_check,
225         .default_advmss         =       ip6_default_advmss,
226         .mtu                    =       ip6_mtu,
227         .cow_metrics            =       ipv6_cow_metrics,
228         .destroy                =       ip6_dst_destroy,
229         .ifdown                 =       ip6_dst_ifdown,
230         .negative_advice        =       ip6_negative_advice,
231         .link_failure           =       ip6_link_failure,
232         .update_pmtu            =       ip6_rt_update_pmtu,
233         .redirect               =       rt6_do_redirect,
234         .local_out              =       __ip6_local_out,
235         .neigh_lookup           =       ip6_neigh_lookup,
236 };
237
238 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
239 {
240         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
241
242         return mtu ? : dst->dev->mtu;
243 }
244
245 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
246                                          struct sk_buff *skb, u32 mtu)
247 {
248 }
249
250 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
251                                       struct sk_buff *skb)
252 {
253 }
254
255 static struct dst_ops ip6_dst_blackhole_ops = {
256         .family                 =       AF_INET6,
257         .destroy                =       ip6_dst_destroy,
258         .check                  =       ip6_dst_check,
259         .mtu                    =       ip6_blackhole_mtu,
260         .default_advmss         =       ip6_default_advmss,
261         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
262         .redirect               =       ip6_rt_blackhole_redirect,
263         .cow_metrics            =       dst_cow_metrics_generic,
264         .neigh_lookup           =       ip6_neigh_lookup,
265 };
266
267 static const u32 ip6_template_metrics[RTAX_MAX] = {
268         [RTAX_HOPLIMIT - 1] = 0,
269 };
270
271 static const struct rt6_info ip6_null_entry_template = {
272         .dst = {
273                 .__refcnt       = ATOMIC_INIT(1),
274                 .__use          = 1,
275                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
276                 .error          = -ENETUNREACH,
277                 .input          = ip6_pkt_discard,
278                 .output         = ip6_pkt_discard_out,
279         },
280         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
281         .rt6i_protocol  = RTPROT_KERNEL,
282         .rt6i_metric    = ~(u32) 0,
283         .rt6i_ref       = ATOMIC_INIT(1),
284 };
285
286 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
287
288 static const struct rt6_info ip6_prohibit_entry_template = {
289         .dst = {
290                 .__refcnt       = ATOMIC_INIT(1),
291                 .__use          = 1,
292                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
293                 .error          = -EACCES,
294                 .input          = ip6_pkt_prohibit,
295                 .output         = ip6_pkt_prohibit_out,
296         },
297         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
298         .rt6i_protocol  = RTPROT_KERNEL,
299         .rt6i_metric    = ~(u32) 0,
300         .rt6i_ref       = ATOMIC_INIT(1),
301 };
302
303 static const struct rt6_info ip6_blk_hole_entry_template = {
304         .dst = {
305                 .__refcnt       = ATOMIC_INIT(1),
306                 .__use          = 1,
307                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
308                 .error          = -EINVAL,
309                 .input          = dst_discard,
310                 .output         = dst_discard_out,
311         },
312         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
313         .rt6i_protocol  = RTPROT_KERNEL,
314         .rt6i_metric    = ~(u32) 0,
315         .rt6i_ref       = ATOMIC_INIT(1),
316 };
317
318 #endif
319
320 static void rt6_info_init(struct rt6_info *rt)
321 {
322         struct dst_entry *dst = &rt->dst;
323
324         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
325         INIT_LIST_HEAD(&rt->rt6i_siblings);
326         INIT_LIST_HEAD(&rt->rt6i_uncached);
327 }
328
329 /* allocate dst with ip6_dst_ops */
330 static struct rt6_info *__ip6_dst_alloc(struct net *net,
331                                         struct net_device *dev,
332                                         int flags)
333 {
334         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
335                                         0, DST_OBSOLETE_FORCE_CHK, flags);
336
337         if (rt)
338                 rt6_info_init(rt);
339
340         return rt;
341 }
342
343 struct rt6_info *ip6_dst_alloc(struct net *net,
344                                struct net_device *dev,
345                                int flags)
346 {
347         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
348
349         if (rt) {
350                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
351                 if (rt->rt6i_pcpu) {
352                         int cpu;
353
354                         for_each_possible_cpu(cpu) {
355                                 struct rt6_info **p;
356
357                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
358                                 /* no one shares rt */
359                                 *p =  NULL;
360                         }
361                 } else {
362                         dst_destroy((struct dst_entry *)rt);
363                         return NULL;
364                 }
365         }
366
367         return rt;
368 }
369 EXPORT_SYMBOL(ip6_dst_alloc);
370
371 static void ip6_dst_destroy(struct dst_entry *dst)
372 {
373         struct rt6_info *rt = (struct rt6_info *)dst;
374         struct dst_entry *from = dst->from;
375         struct inet6_dev *idev;
376
377         dst_destroy_metrics_generic(dst);
378         free_percpu(rt->rt6i_pcpu);
379         rt6_uncached_list_del(rt);
380
381         idev = rt->rt6i_idev;
382         if (idev) {
383                 rt->rt6i_idev = NULL;
384                 in6_dev_put(idev);
385         }
386
387         dst->from = NULL;
388         dst_release(from);
389 }
390
391 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
392                            int how)
393 {
394         struct rt6_info *rt = (struct rt6_info *)dst;
395         struct inet6_dev *idev = rt->rt6i_idev;
396         struct net_device *loopback_dev =
397                 dev_net(dev)->loopback_dev;
398
399         if (dev != loopback_dev) {
400                 if (idev && idev->dev == dev) {
401                         struct inet6_dev *loopback_idev =
402                                 in6_dev_get(loopback_dev);
403                         if (loopback_idev) {
404                                 rt->rt6i_idev = loopback_idev;
405                                 in6_dev_put(idev);
406                         }
407                 }
408         }
409 }
410
411 static bool __rt6_check_expired(const struct rt6_info *rt)
412 {
413         if (rt->rt6i_flags & RTF_EXPIRES)
414                 return time_after(jiffies, rt->dst.expires);
415         else
416                 return false;
417 }
418
419 static bool rt6_check_expired(const struct rt6_info *rt)
420 {
421         if (rt->rt6i_flags & RTF_EXPIRES) {
422                 if (time_after(jiffies, rt->dst.expires))
423                         return true;
424         } else if (rt->dst.from) {
425                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
426         }
427         return false;
428 }
429
430 /* Multipath route selection:
431  *   Hash-based function using the packet header and flow label.
432  * Adapted from fib_info_hashfn()
433  */
434 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
435                                const struct flowi6 *fl6)
436 {
437         return get_hash_from_flowi6(fl6) % candidate_count;
438 }
439
440 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
441                                              struct flowi6 *fl6, int oif,
442                                              int strict)
443 {
444         struct rt6_info *sibling, *next_sibling;
445         int route_choosen;
446
447         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
448         /* Don't change the route if route_choosen == 0
449          * (the siblings list does not include the route itself)
450          */
451         if (route_choosen)
452                 list_for_each_entry_safe(sibling, next_sibling,
453                                 &match->rt6i_siblings, rt6i_siblings) {
454                         route_choosen--;
455                         if (route_choosen == 0) {
456                                 if (rt6_score_route(sibling, oif, strict) < 0)
457                                         break;
458                                 match = sibling;
459                                 break;
460                         }
461                 }
462         return match;
463 }
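/* Example: for a multipath route with rt6i_nsiblings == 2 (three nexthops
 * in total), rt6_info_hash_nhsfn() returns get_hash_from_flowi6(fl6) % 3.
 * A result of 0 keeps the matched route itself; 1 or 2 walks that many
 * entries down the rt6i_siblings list, provided the chosen sibling still
 * scores as usable for the given oif and strictness.
 */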
464
465 /*
466  *      Route lookup. Any table->tb6_lock is implied.
467  */
468
469 static inline struct rt6_info *rt6_device_match(struct net *net,
470                                                     struct rt6_info *rt,
471                                                     const struct in6_addr *saddr,
472                                                     int oif,
473                                                     int flags)
474 {
475         struct rt6_info *local = NULL;
476         struct rt6_info *sprt;
477
478         if (!oif && ipv6_addr_any(saddr))
479                 goto out;
480
481         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
482                 struct net_device *dev = sprt->dst.dev;
483
484                 if (oif) {
485                         if (dev->ifindex == oif)
486                                 return sprt;
487                         if (dev->flags & IFF_LOOPBACK) {
488                                 if (!sprt->rt6i_idev ||
489                                     sprt->rt6i_idev->dev->ifindex != oif) {
490                                         if (flags & RT6_LOOKUP_F_IFACE)
491                                                 continue;
492                                         if (local &&
493                                             local->rt6i_idev->dev->ifindex == oif)
494                                                 continue;
495                                 }
496                                 local = sprt;
497                         }
498                 } else {
499                         if (ipv6_chk_addr(net, saddr, dev,
500                                           flags & RT6_LOOKUP_F_IFACE))
501                                 return sprt;
502                 }
503         }
504
505         if (oif) {
506                 if (local)
507                         return local;
508
509                 if (flags & RT6_LOOKUP_F_IFACE)
510                         return net->ipv6.ip6_null_entry;
511         }
512 out:
513         return rt;
514 }
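/* In short: with an oif, rt6_device_match() prefers an exact device
 * match, falls back to a loopback route bound to that interface, and
 * returns ip6_null_entry only when RT6_LOOKUP_F_IFACE makes the
 * interface mandatory.  Without an oif, a route is chosen when the
 * source address is assigned to its device; otherwise the head of the
 * list is returned unchanged.
 */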
515
516 #ifdef CONFIG_IPV6_ROUTER_PREF
517 struct __rt6_probe_work {
518         struct work_struct work;
519         struct in6_addr target;
520         struct net_device *dev;
521 };
522
523 static void rt6_probe_deferred(struct work_struct *w)
524 {
525         struct in6_addr mcaddr;
526         struct __rt6_probe_work *work =
527                 container_of(w, struct __rt6_probe_work, work);
528
529         addrconf_addr_solict_mult(&work->target, &mcaddr);
530         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
531         dev_put(work->dev);
532         kfree(work);
533 }
534
535 static void rt6_probe(struct rt6_info *rt)
536 {
537         struct __rt6_probe_work *work;
538         struct neighbour *neigh;
539         /*
540          * Okay, this does not seem appropriate for now;
541          * however, we need to check whether it really is,
542          * a.k.a. Router Reachability Probing.
543          *
544          * A Router Reachability Probe MUST be rate-limited
545          * to no more than one per minute.
546          */
547         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
548                 return;
549         rcu_read_lock_bh();
550         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
551         if (neigh) {
552                 if (neigh->nud_state & NUD_VALID)
553                         goto out;
554
555                 work = NULL;
556                 write_lock(&neigh->lock);
557                 if (!(neigh->nud_state & NUD_VALID) &&
558                     time_after(jiffies,
559                                neigh->updated +
560                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
561                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
562                         if (work)
563                                 __neigh_set_probe_once(neigh);
564                 }
565                 write_unlock(&neigh->lock);
566         } else {
567                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
568         }
569
570         if (work) {
571                 INIT_WORK(&work->work, rt6_probe_deferred);
572                 work->target = rt->rt6i_gateway;
573                 dev_hold(rt->dst.dev);
574                 work->dev = rt->dst.dev;
575                 schedule_work(&work->work);
576         }
577
578 out:
579         rcu_read_unlock_bh();
580 }
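/* rt6_probe() only arms a probe for gateway routes whose neighbour entry
 * is missing or not NUD_VALID and, when a neighbour entry exists, no more
 * often than rtr_probe_interval.  The actual neighbour solicitation is
 * sent from a workqueue (rt6_probe_deferred) to the gateway's
 * solicited-node multicast address.
 */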
581 #else
582 static inline void rt6_probe(struct rt6_info *rt)
583 {
584 }
585 #endif
586
587 /*
588  * Default Router Selection (RFC 2461 6.3.6)
589  */
590 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
591 {
592         struct net_device *dev = rt->dst.dev;
593         if (!oif || dev->ifindex == oif)
594                 return 2;
595         if ((dev->flags & IFF_LOOPBACK) &&
596             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
597                 return 1;
598         return 0;
599 }
600
601 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
602 {
603         struct neighbour *neigh;
604         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
605
606         if (rt->rt6i_flags & RTF_NONEXTHOP ||
607             !(rt->rt6i_flags & RTF_GATEWAY))
608                 return RT6_NUD_SUCCEED;
609
610         rcu_read_lock_bh();
611         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
612         if (neigh) {
613                 read_lock(&neigh->lock);
614                 if (neigh->nud_state & NUD_VALID)
615                         ret = RT6_NUD_SUCCEED;
616 #ifdef CONFIG_IPV6_ROUTER_PREF
617                 else if (!(neigh->nud_state & NUD_FAILED))
618                         ret = RT6_NUD_SUCCEED;
619                 else
620                         ret = RT6_NUD_FAIL_PROBE;
621 #endif
622                 read_unlock(&neigh->lock);
623         } else {
624                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
625                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
626         }
627         rcu_read_unlock_bh();
628
629         return ret;
630 }
631
632 static int rt6_score_route(struct rt6_info *rt, int oif,
633                            int strict)
634 {
635         int m;
636
637         m = rt6_check_dev(rt, oif);
638         if (!m && (strict & RT6_LOOKUP_F_IFACE))
639                 return RT6_NUD_FAIL_HARD;
640 #ifdef CONFIG_IPV6_ROUTER_PREF
641         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
642 #endif
643         if (strict & RT6_LOOKUP_F_REACHABLE) {
644                 int n = rt6_check_neigh(rt);
645                 if (n < 0)
646                         return n;
647         }
648         return m;
649 }
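/* The score combines an interface match in the low two bits (2 for an
 * exact oif match, 1 for a loopback route bound to the oif, 0 otherwise)
 * with the decoded router preference shifted left by two bits, so a
 * higher router preference outweighs a better interface match.  Negative
 * return values are the rt6_nud_state failures propagated from
 * rt6_check_neigh() when RT6_LOOKUP_F_REACHABLE is set.
 */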
650
651 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
652                                    int *mpri, struct rt6_info *match,
653                                    bool *do_rr)
654 {
655         int m;
656         bool match_do_rr = false;
657         struct inet6_dev *idev = rt->rt6i_idev;
658         struct net_device *dev = rt->dst.dev;
659
660         if (dev && !netif_carrier_ok(dev) &&
661             idev->cnf.ignore_routes_with_linkdown &&
662             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
663                 goto out;
664
665         if (rt6_check_expired(rt))
666                 goto out;
667
668         m = rt6_score_route(rt, oif, strict);
669         if (m == RT6_NUD_FAIL_DO_RR) {
670                 match_do_rr = true;
671                 m = 0; /* lowest valid score */
672         } else if (m == RT6_NUD_FAIL_HARD) {
673                 goto out;
674         }
675
676         if (strict & RT6_LOOKUP_F_REACHABLE)
677                 rt6_probe(rt);
678
679         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
680         if (m > *mpri) {
681                 *do_rr = match_do_rr;
682                 *mpri = m;
683                 match = rt;
684         }
685 out:
686         return match;
687 }
688
689 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
690                                      struct rt6_info *rr_head,
691                                      u32 metric, int oif, int strict,
692                                      bool *do_rr)
693 {
694         struct rt6_info *rt, *match, *cont;
695         int mpri = -1;
696
697         match = NULL;
698         cont = NULL;
699         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
700                 if (rt->rt6i_metric != metric) {
701                         cont = rt;
702                         break;
703                 }
704
705                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
706         }
707
708         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
709                 if (rt->rt6i_metric != metric) {
710                         cont = rt;
711                         break;
712                 }
713
714                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
715         }
716
717         if (match || !cont)
718                 return match;
719
720         for (rt = cont; rt; rt = rt->dst.rt6_next)
721                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
722
723         return match;
724 }
725
726 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
727 {
728         struct rt6_info *match, *rt0;
729         struct net *net;
730         bool do_rr = false;
731
732         rt0 = fn->rr_ptr;
733         if (!rt0)
734                 fn->rr_ptr = rt0 = fn->leaf;
735
736         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
737                              &do_rr);
738
739         if (do_rr) {
740                 struct rt6_info *next = rt0->dst.rt6_next;
741
742                 /* no entries matched; do round-robin */
743                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
744                         next = fn->leaf;
745
746                 if (next != rt0)
747                         fn->rr_ptr = next;
748         }
749
750         net = dev_net(rt0->dst.dev);
751         return match ? match : net->ipv6.ip6_null_entry;
752 }
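/* fn->rr_ptr remembers where the last round-robin pass stopped:
 * find_rr_leaf() scores the routes of the current metric starting from
 * rr_ptr and wraps around to fn->leaf, and when the best candidate asked
 * for round-robin (do_rr), rr_ptr is advanced to the next route with the
 * same metric so subsequent lookups start from a different entry.
 */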
753
754 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
755 {
756         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
757 }
758
759 #ifdef CONFIG_IPV6_ROUTE_INFO
760 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
761                   const struct in6_addr *gwaddr)
762 {
763         struct net *net = dev_net(dev);
764         struct route_info *rinfo = (struct route_info *) opt;
765         struct in6_addr prefix_buf, *prefix;
766         unsigned int pref;
767         unsigned long lifetime;
768         struct rt6_info *rt;
769
770         if (len < sizeof(struct route_info)) {
771                 return -EINVAL;
772         }
773
774         /* Sanity check for prefix_len and length */
775         if (rinfo->length > 3) {
776                 return -EINVAL;
777         } else if (rinfo->prefix_len > 128) {
778                 return -EINVAL;
779         } else if (rinfo->prefix_len > 64) {
780                 if (rinfo->length < 2) {
781                         return -EINVAL;
782                 }
783         } else if (rinfo->prefix_len > 0) {
784                 if (rinfo->length < 1) {
785                         return -EINVAL;
786                 }
787         }
788
789         pref = rinfo->route_pref;
790         if (pref == ICMPV6_ROUTER_PREF_INVALID)
791                 return -EINVAL;
792
793         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
794
795         if (rinfo->length == 3)
796                 prefix = (struct in6_addr *)rinfo->prefix;
797         else {
798                 /* safe: prefix_len was validated against rinfo->length above */
799                 ipv6_addr_prefix(&prefix_buf,
800                                  (struct in6_addr *)rinfo->prefix,
801                                  rinfo->prefix_len);
802                 prefix = &prefix_buf;
803         }
804
805         if (rinfo->prefix_len == 0)
806                 rt = rt6_get_dflt_router(gwaddr, dev);
807         else
808                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
809                                         gwaddr, dev);
810
811         if (rt && !lifetime) {
812                 ip6_del_rt(rt);
813                 rt = NULL;
814         }
815
816         if (!rt && lifetime)
817                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
818                                         dev, pref);
819         else if (rt)
820                 rt->rt6i_flags = RTF_ROUTEINFO |
821                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
822
823         if (rt) {
824                 if (!addrconf_finite_timeout(lifetime))
825                         rt6_clean_expires(rt);
826                 else
827                         rt6_set_expires(rt, jiffies + HZ * lifetime);
828
829                 ip6_rt_put(rt);
830         }
831         return 0;
832 }
833 #endif
834
835 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
836                                         struct in6_addr *saddr)
837 {
838         struct fib6_node *pn;
839         while (1) {
840                 if (fn->fn_flags & RTN_TL_ROOT)
841                         return NULL;
842                 pn = fn->parent;
843                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
844                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
845                 else
846                         fn = pn;
847                 if (fn->fn_flags & RTN_RTINFO)
848                         return fn;
849         }
850 }
851
852 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
853                                              struct fib6_table *table,
854                                              struct flowi6 *fl6, int flags)
855 {
856         struct fib6_node *fn;
857         struct rt6_info *rt;
858
859         read_lock_bh(&table->tb6_lock);
860         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
861 restart:
862         rt = fn->leaf;
863         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
864         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
865                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
866         if (rt == net->ipv6.ip6_null_entry) {
867                 fn = fib6_backtrack(fn, &fl6->saddr);
868                 if (fn)
869                         goto restart;
870         }
871         dst_use(&rt->dst, jiffies);
872         read_unlock_bh(&table->tb6_lock);
873
874         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
875
876         return rt;
877
878 }
879
880 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
881                                     int flags)
882 {
883         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
884 }
885 EXPORT_SYMBOL_GPL(ip6_route_lookup);
886
887 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
888                             const struct in6_addr *saddr, int oif, int strict)
889 {
890         struct flowi6 fl6 = {
891                 .flowi6_oif = oif,
892                 .daddr = *daddr,
893         };
894         struct dst_entry *dst;
895         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
896
897         if (saddr) {
898                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
899                 flags |= RT6_LOOKUP_F_HAS_SADDR;
900         }
901
902         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
903         if (dst->error == 0)
904                 return (struct rt6_info *) dst;
905
906         dst_release(dst);
907
908         return NULL;
909 }
910 EXPORT_SYMBOL(rt6_lookup);
911
912 /* ip6_ins_rt is called with table->tb6_lock NOT held.
913    It takes a new route entry; if the addition fails for any reason, the
914    route is freed. In any case, if the caller does not hold a reference,
915    it may be destroyed.
916  */
917
918 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
919                         struct mx6_config *mxc)
920 {
921         int err;
922         struct fib6_table *table;
923
924         table = rt->rt6i_table;
925         write_lock_bh(&table->tb6_lock);
926         err = fib6_add(&table->tb6_root, rt, info, mxc);
927         write_unlock_bh(&table->tb6_lock);
928
929         return err;
930 }
931
932 int ip6_ins_rt(struct rt6_info *rt)
933 {
934         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
935         struct mx6_config mxc = { .mx = NULL, };
936
937         return __ip6_ins_rt(rt, &info, &mxc);
938 }
939
940 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
941                                            const struct in6_addr *daddr,
942                                            const struct in6_addr *saddr)
943 {
944         struct rt6_info *rt;
945
946         /*
947          *      Clone the route.
948          */
949
950         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
951                 ort = (struct rt6_info *)ort->dst.from;
952
953         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
954
955         if (!rt)
956                 return NULL;
957
958         ip6_rt_copy_init(rt, ort);
959         rt->rt6i_flags |= RTF_CACHE;
960         rt->rt6i_metric = 0;
961         rt->dst.flags |= DST_HOST;
962         rt->rt6i_dst.addr = *daddr;
963         rt->rt6i_dst.plen = 128;
964
965         if (!rt6_is_gw_or_nonexthop(ort)) {
966                 if (ort->rt6i_dst.plen != 128 &&
967                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
968                         rt->rt6i_flags |= RTF_ANYCAST;
969 #ifdef CONFIG_IPV6_SUBTREES
970                 if (rt->rt6i_src.plen && saddr) {
971                         rt->rt6i_src.addr = *saddr;
972                         rt->rt6i_src.plen = 128;
973                 }
974 #endif
975         }
976
977         return rt;
978 }
979
980 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
981 {
982         struct rt6_info *pcpu_rt;
983
984         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
985                                   rt->dst.dev, rt->dst.flags);
986
987         if (!pcpu_rt)
988                 return NULL;
989         ip6_rt_copy_init(pcpu_rt, rt);
990         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
991         pcpu_rt->rt6i_flags |= RTF_PCPU;
992         return pcpu_rt;
993 }
994
995 /* It should be called with read_lock_bh(&tb6_lock) acquired */
996 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
997 {
998         struct rt6_info *pcpu_rt, **p;
999
1000         p = this_cpu_ptr(rt->rt6i_pcpu);
1001         pcpu_rt = *p;
1002
1003         if (pcpu_rt) {
1004                 dst_hold(&pcpu_rt->dst);
1005                 rt6_dst_from_metrics_check(pcpu_rt);
1006         }
1007         return pcpu_rt;
1008 }
1009
1010 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1011 {
1012         struct fib6_table *table = rt->rt6i_table;
1013         struct rt6_info *pcpu_rt, *prev, **p;
1014
1015         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1016         if (!pcpu_rt) {
1017                 struct net *net = dev_net(rt->dst.dev);
1018
1019                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1020                 return net->ipv6.ip6_null_entry;
1021         }
1022
1023         read_lock_bh(&table->tb6_lock);
1024         if (rt->rt6i_pcpu) {
1025                 p = this_cpu_ptr(rt->rt6i_pcpu);
1026                 prev = cmpxchg(p, NULL, pcpu_rt);
1027                 if (prev) {
1028                         /* If someone did it before us, return prev instead */
1029                         dst_destroy(&pcpu_rt->dst);
1030                         pcpu_rt = prev;
1031                 }
1032         } else {
1033                 /* rt has been removed from the fib6 tree
1034                  * before we have a chance to acquire the read_lock.
1035                  * In this case, don't bother to create a pcpu rt
1036                  * since rt is going away anyway.  The next
1037                  * dst_check() will trigger a re-lookup.
1038                  */
1039                 dst_destroy(&pcpu_rt->dst);
1040                 pcpu_rt = rt;
1041         }
1042         dst_hold(&pcpu_rt->dst);
1043         rt6_dst_from_metrics_check(pcpu_rt);
1044         read_unlock_bh(&table->tb6_lock);
1045         return pcpu_rt;
1046 }
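/* The cmpxchg() above makes per-cpu caching safe against a concurrent
 * lookup installing its own clone in the same per-cpu slot first:
 * whoever loses the race frees its freshly allocated copy and returns
 * the winner.  If rt was removed from the fib6 tree before the lock was
 * taken, the clone is dropped and the original route is returned so the
 * next dst_check() forces a re-lookup.
 */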
1047
1048 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1049                                int oif, struct flowi6 *fl6, int flags)
1050 {
1051         struct fib6_node *fn, *saved_fn;
1052         struct rt6_info *rt;
1053         int strict = 0;
1054
1055         strict |= flags & RT6_LOOKUP_F_IFACE;
1056         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1057         if (net->ipv6.devconf_all->forwarding == 0)
1058                 strict |= RT6_LOOKUP_F_REACHABLE;
1059
1060         read_lock_bh(&table->tb6_lock);
1061
1062         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1063         saved_fn = fn;
1064
1065         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1066                 oif = 0;
1067
1068 redo_rt6_select:
1069         rt = rt6_select(fn, oif, strict);
1070         if (rt->rt6i_nsiblings)
1071                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1072         if (rt == net->ipv6.ip6_null_entry) {
1073                 fn = fib6_backtrack(fn, &fl6->saddr);
1074                 if (fn)
1075                         goto redo_rt6_select;
1076                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1077                         /* also consider unreachable route */
1078                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1079                         fn = saved_fn;
1080                         goto redo_rt6_select;
1081                 }
1082         }
1083
1084
1085         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1086                 dst_use(&rt->dst, jiffies);
1087                 read_unlock_bh(&table->tb6_lock);
1088
1089                 rt6_dst_from_metrics_check(rt);
1090
1091                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1092                 return rt;
1093         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1094                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1095                 /* Create a RTF_CACHE clone which will not be
1096                  * owned by the fib6 tree.  It is for the special case where
1097                  * the daddr in the skb during the neighbor look-up is different
1098                  * from the fl6->daddr used to look-up route here.
1099                  */
1100
1101                 struct rt6_info *uncached_rt;
1102
1103                 dst_use(&rt->dst, jiffies);
1104                 read_unlock_bh(&table->tb6_lock);
1105
1106                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1107                 dst_release(&rt->dst);
1108
1109                 if (uncached_rt)
1110                         rt6_uncached_list_add(uncached_rt);
1111                 else
1112                         uncached_rt = net->ipv6.ip6_null_entry;
1113
1114                 dst_hold(&uncached_rt->dst);
1115
1116                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1117                 return uncached_rt;
1118
1119         } else {
1120                 /* Get a percpu copy */
1121
1122                 struct rt6_info *pcpu_rt;
1123
1124                 rt->dst.lastuse = jiffies;
1125                 rt->dst.__use++;
1126                 pcpu_rt = rt6_get_pcpu_route(rt);
1127
1128                 if (pcpu_rt) {
1129                         read_unlock_bh(&table->tb6_lock);
1130                 } else {
1131                         /* We have to do the read_unlock first
1132                          * because rt6_make_pcpu_route() may trigger
1133                          * ip6_dst_gc() which will take the write_lock.
1134                          */
1135                         dst_hold(&rt->dst);
1136                         read_unlock_bh(&table->tb6_lock);
1137                         pcpu_rt = rt6_make_pcpu_route(rt);
1138                         dst_release(&rt->dst);
1139                 }
1140
1141                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1142                 return pcpu_rt;
1143
1144         }
1145 }
1146 EXPORT_SYMBOL_GPL(ip6_pol_route);
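/* ip6_pol_route() therefore hands back one of three things: the fib6
 * entry itself (the null entry or an RTF_CACHE clone already in the
 * tree), a DST_NOCACHE clone placed on the uncached list for the
 * FLOWI_FLAG_KNOWN_NH case, or a per-cpu copy of the matched route for
 * the common path.
 */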
1147
1148 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1149                                             struct flowi6 *fl6, int flags)
1150 {
1151         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1152 }
1153
1154 struct dst_entry *ip6_route_input_lookup(struct net *net,
1155                                          struct net_device *dev,
1156                                          struct flowi6 *fl6, int flags)
1157 {
1158         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1159                 flags |= RT6_LOOKUP_F_IFACE;
1160
1161         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1162 }
1163 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1164
1165 void ip6_route_input(struct sk_buff *skb)
1166 {
1167         const struct ipv6hdr *iph = ipv6_hdr(skb);
1168         struct net *net = dev_net(skb->dev);
1169         int flags = RT6_LOOKUP_F_HAS_SADDR;
1170         struct ip_tunnel_info *tun_info;
1171         struct flowi6 fl6 = {
1172                 .flowi6_iif = skb->dev->ifindex,
1173                 .daddr = iph->daddr,
1174                 .saddr = iph->saddr,
1175                 .flowlabel = ip6_flowinfo(iph),
1176                 .flowi6_mark = skb->mark,
1177                 .flowi6_proto = iph->nexthdr,
1178         };
1179
1180         tun_info = skb_tunnel_info(skb);
1181         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1182                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1183         skb_dst_drop(skb);
1184         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1185 }
1186
1187 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1188                                              struct flowi6 *fl6, int flags)
1189 {
1190         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1191 }
1192
1193 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1194                                          struct flowi6 *fl6, int flags)
1195 {
1196         bool any_src;
1197
1198         if (rt6_need_strict(&fl6->daddr)) {
1199                 struct dst_entry *dst;
1200
1201                 dst = l3mdev_link_scope_lookup(net, fl6);
1202                 if (dst)
1203                         return dst;
1204         }
1205
1206         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1207
1208         any_src = ipv6_addr_any(&fl6->saddr);
1209         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1210             (fl6->flowi6_oif && any_src))
1211                 flags |= RT6_LOOKUP_F_IFACE;
1212
1213         if (!any_src)
1214                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1215         else if (sk)
1216                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1217
1218         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1221
1222 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1223 {
1224         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1225         struct dst_entry *new = NULL;
1226
1227         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1228         if (rt) {
1229                 rt6_info_init(rt);
1230
1231                 new = &rt->dst;
1232                 new->__use = 1;
1233                 new->input = dst_discard;
1234                 new->output = dst_discard_out;
1235
1236                 dst_copy_metrics(new, &ort->dst);
1237                 rt->rt6i_idev = ort->rt6i_idev;
1238                 if (rt->rt6i_idev)
1239                         in6_dev_hold(rt->rt6i_idev);
1240
1241                 rt->rt6i_gateway = ort->rt6i_gateway;
1242                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1243                 rt->rt6i_metric = 0;
1244
1245                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1246 #ifdef CONFIG_IPV6_SUBTREES
1247                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1248 #endif
1249
1250                 dst_free(new);
1251         }
1252
1253         dst_release(dst_orig);
1254         return new ? new : ERR_PTR(-ENOMEM);
1255 }
1256
1257 /*
1258  *      Destination cache support functions
1259  */
1260
1261 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1262 {
1263         if (rt->dst.from &&
1264             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1265                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1266 }
1267
1268 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1269 {
1270         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1271                 return NULL;
1272
1273         if (rt6_check_expired(rt))
1274                 return NULL;
1275
1276         return &rt->dst;
1277 }
1278
1279 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1280 {
1281         if (!__rt6_check_expired(rt) &&
1282             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1283             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1284                 return &rt->dst;
1285         else
1286                 return NULL;
1287 }
1288
1289 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1290 {
1291         struct rt6_info *rt;
1292
1293         rt = (struct rt6_info *) dst;
1294
1295         /* All IPv6 dsts are created with ->obsolete set to the value
1296          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1297          * down into this function.
1298          */
1299
1300         rt6_dst_from_metrics_check(rt);
1301
1302         if (rt->rt6i_flags & RTF_PCPU ||
1303             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1304                 return rt6_dst_from_check(rt, cookie);
1305         else
1306                 return rt6_check(rt, cookie);
1307 }
1308
1309 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1310 {
1311         struct rt6_info *rt = (struct rt6_info *) dst;
1312
1313         if (rt) {
1314                 if (rt->rt6i_flags & RTF_CACHE) {
1315                         if (rt6_check_expired(rt)) {
1316                                 ip6_del_rt(rt);
1317                                 dst = NULL;
1318                         }
1319                 } else {
1320                         dst_release(dst);
1321                         dst = NULL;
1322                 }
1323         }
1324         return dst;
1325 }
1326
1327 static void ip6_link_failure(struct sk_buff *skb)
1328 {
1329         struct rt6_info *rt;
1330
1331         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1332
1333         rt = (struct rt6_info *) skb_dst(skb);
1334         if (rt) {
1335                 if (rt->rt6i_flags & RTF_CACHE) {
1336                         dst_hold(&rt->dst);
1337                         ip6_del_rt(rt);
1338                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1339                         rt->rt6i_node->fn_sernum = -1;
1340                 }
1341         }
1342 }
1343
1344 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1345 {
1346         struct net *net = dev_net(rt->dst.dev);
1347
1348         rt->rt6i_flags |= RTF_MODIFIED;
1349         rt->rt6i_pmtu = mtu;
1350         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1351 }
1352
1353 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1354 {
1355         return !(rt->rt6i_flags & RTF_CACHE) &&
1356                 (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1357 }
1358
1359 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1360                                  const struct ipv6hdr *iph, u32 mtu)
1361 {
1362         struct rt6_info *rt6 = (struct rt6_info *)dst;
1363
1364         if (rt6->rt6i_flags & RTF_LOCAL)
1365                 return;
1366
1367         dst_confirm(dst);
1368         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1369         if (mtu >= dst_mtu(dst))
1370                 return;
1371
1372         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1373                 rt6_do_update_pmtu(rt6, mtu);
1374         } else {
1375                 const struct in6_addr *daddr, *saddr;
1376                 struct rt6_info *nrt6;
1377
1378                 if (iph) {
1379                         daddr = &iph->daddr;
1380                         saddr = &iph->saddr;
1381                 } else if (sk) {
1382                         daddr = &sk->sk_v6_daddr;
1383                         saddr = &inet6_sk(sk)->saddr;
1384                 } else {
1385                         return;
1386                 }
1387                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1388                 if (nrt6) {
1389                         rt6_do_update_pmtu(nrt6, mtu);
1390
1391                         /* ip6_ins_rt(nrt6) will bump the
1392                          * rt6->rt6i_node->fn_sernum
1393                          * which will fail the next rt6_check() and
1394                          * invalidate the sk->sk_dst_cache.
1395                          */
1396                         ip6_ins_rt(nrt6);
1397                 }
1398         }
1399 }
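/* A PMTU report on a shared (non RTF_CACHE) route is not applied to the
 * route itself; instead an RTF_CACHE host clone is created for the
 * destination and the reduced MTU is stored there with an expiry of
 * ip6_rt_mtu_expires, leaving the original route untouched.
 */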
1400
1401 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1402                                struct sk_buff *skb, u32 mtu)
1403 {
1404         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1405 }
1406
1407 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1408                      int oif, u32 mark)
1409 {
1410         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1411         struct dst_entry *dst;
1412         struct flowi6 fl6;
1413
1414         memset(&fl6, 0, sizeof(fl6));
1415         fl6.flowi6_oif = oif;
1416         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1417         fl6.daddr = iph->daddr;
1418         fl6.saddr = iph->saddr;
1419         fl6.flowlabel = ip6_flowinfo(iph);
1420
1421         dst = ip6_route_output(net, NULL, &fl6);
1422         if (!dst->error)
1423                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1424         dst_release(dst);
1425 }
1426 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1427
1428 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1429 {
1430         struct dst_entry *dst;
1431
1432         ip6_update_pmtu(skb, sock_net(sk), mtu,
1433                         sk->sk_bound_dev_if, sk->sk_mark);
1434
1435         dst = __sk_dst_get(sk);
1436         if (!dst || !dst->obsolete ||
1437             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1438                 return;
1439
1440         bh_lock_sock(sk);
1441         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1442                 ip6_datagram_dst_update(sk, false);
1443         bh_unlock_sock(sk);
1444 }
1445 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1446
1447 /* Handle redirects */
1448 struct ip6rd_flowi {
1449         struct flowi6 fl6;
1450         struct in6_addr gateway;
1451 };
1452
1453 static struct rt6_info *__ip6_route_redirect(struct net *net,
1454                                              struct fib6_table *table,
1455                                              struct flowi6 *fl6,
1456                                              int flags)
1457 {
1458         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1459         struct rt6_info *rt;
1460         struct fib6_node *fn;
1461
1462         /* Get the "current" route for this destination and
1463          * check if the redirect has come from the appropriate router.
1464          *
1465          * RFC 4861 specifies that redirects should only be
1466          * accepted if they come from the nexthop to the target.
1467          * Due to the way the routes are chosen, this notion
1468          * is a bit fuzzy and one might need to check all possible
1469          * routes.
1470          */
1471
1472         read_lock_bh(&table->tb6_lock);
1473         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1474 restart:
1475         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1476                 if (rt6_check_expired(rt))
1477                         continue;
1478                 if (rt->dst.error)
1479                         break;
1480                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1481                         continue;
1482                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1483                         continue;
1484                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1485                         continue;
1486                 break;
1487         }
1488
1489         if (!rt)
1490                 rt = net->ipv6.ip6_null_entry;
1491         else if (rt->dst.error) {
1492                 rt = net->ipv6.ip6_null_entry;
1493                 goto out;
1494         }
1495
1496         if (rt == net->ipv6.ip6_null_entry) {
1497                 fn = fib6_backtrack(fn, &fl6->saddr);
1498                 if (fn)
1499                         goto restart;
1500         }
1501
1502 out:
1503         dst_hold(&rt->dst);
1504
1505         read_unlock_bh(&table->tb6_lock);
1506
1507         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1508         return rt;
1509 };
1510
1511 static struct dst_entry *ip6_route_redirect(struct net *net,
1512                                         const struct flowi6 *fl6,
1513                                         const struct in6_addr *gateway)
1514 {
1515         int flags = RT6_LOOKUP_F_HAS_SADDR;
1516         struct ip6rd_flowi rdfl;
1517
1518         rdfl.fl6 = *fl6;
1519         rdfl.gateway = *gateway;
1520
1521         return fib6_rule_lookup(net, &rdfl.fl6,
1522                                 flags, __ip6_route_redirect);
1523 }
1524
1525 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1526 {
1527         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1528         struct dst_entry *dst;
1529         struct flowi6 fl6;
1530
1531         memset(&fl6, 0, sizeof(fl6));
1532         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1533         fl6.flowi6_oif = oif;
1534         fl6.flowi6_mark = mark;
1535         fl6.daddr = iph->daddr;
1536         fl6.saddr = iph->saddr;
1537         fl6.flowlabel = ip6_flowinfo(iph);
1538
1539         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1540         rt6_do_redirect(dst, NULL, skb);
1541         dst_release(dst);
1542 }
1543 EXPORT_SYMBOL_GPL(ip6_redirect);
1544
1545 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1546                             u32 mark)
1547 {
1548         const struct ipv6hdr *iph = ipv6_hdr(skb);
1549         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1550         struct dst_entry *dst;
1551         struct flowi6 fl6;
1552
1553         memset(&fl6, 0, sizeof(fl6));
1554         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1555         fl6.flowi6_oif = oif;
1556         fl6.flowi6_mark = mark;
1557         fl6.daddr = msg->dest;
1558         fl6.saddr = iph->daddr;
1559
1560         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1561         rt6_do_redirect(dst, NULL, skb);
1562         dst_release(dst);
1563 }
1564
1565 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1566 {
1567         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1568 }
1569 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1570
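/* Default advertised MSS for this route: the path MTU minus the IPv6 and
 * TCP headers, clamped below by the ip6_rt_min_advmss sysctl and above by
 * the largest non-jumbo payload (see the comment in the function body).
 */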
1571 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1572 {
1573         struct net_device *dev = dst->dev;
1574         unsigned int mtu = dst_mtu(dst);
1575         struct net *net = dev_net(dev);
1576
1577         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1578
1579         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1580                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1581
1582         /*
1583          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1584          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1585          * IPV6_MAXPLEN is also valid and means: "any MSS,
1586          * rely only on pmtu discovery"
1587          */
1588         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1589                 mtu = IPV6_MAXPLEN;
1590         return mtu;
1591 }
1592
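/* Effective MTU for this route: a learned PMTU (rt6i_pmtu) takes
 * precedence, then an explicit RTAX_MTU metric, then the device MTU.
 * The result is capped at IP6_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */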
1593 static unsigned int ip6_mtu(const struct dst_entry *dst)
1594 {
1595         const struct rt6_info *rt = (const struct rt6_info *)dst;
1596         unsigned int mtu = rt->rt6i_pmtu;
1597         struct inet6_dev *idev;
1598
1599         if (mtu)
1600                 goto out;
1601
1602         mtu = dst_metric_raw(dst, RTAX_MTU);
1603         if (mtu)
1604                 goto out;
1605
1606         mtu = IPV6_MIN_MTU;
1607
1608         rcu_read_lock();
1609         idev = __in6_dev_get(dst->dev);
1610         if (idev)
1611                 mtu = idev->cnf.mtu6;
1612         rcu_read_unlock();
1613
1614 out:
1615         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1616
1617         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1618 }
1619
1620 static struct dst_entry *icmp6_dst_gc_list;
1621 static DEFINE_SPINLOCK(icmp6_dst_lock);
1622
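/* Allocate a dst for an outgoing ICMPv6/ndisc packet.  It is not inserted
 * into the FIB; instead it is chained onto icmp6_dst_gc_list and reclaimed
 * by icmp6_dst_gc() once its refcount drops to zero.
 */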
1623 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1624                                   struct flowi6 *fl6)
1625 {
1626         struct dst_entry *dst;
1627         struct rt6_info *rt;
1628         struct inet6_dev *idev = in6_dev_get(dev);
1629         struct net *net = dev_net(dev);
1630
1631         if (unlikely(!idev))
1632                 return ERR_PTR(-ENODEV);
1633
1634         rt = ip6_dst_alloc(net, dev, 0);
1635         if (unlikely(!rt)) {
1636                 in6_dev_put(idev);
1637                 dst = ERR_PTR(-ENOMEM);
1638                 goto out;
1639         }
1640
1641         rt->dst.flags |= DST_HOST;
1642         rt->dst.output  = ip6_output;
1643         atomic_set(&rt->dst.__refcnt, 1);
1644         rt->rt6i_gateway  = fl6->daddr;
1645         rt->rt6i_dst.addr = fl6->daddr;
1646         rt->rt6i_dst.plen = 128;
1647         rt->rt6i_idev     = idev;
1648         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1649
1650         spin_lock_bh(&icmp6_dst_lock);
1651         rt->dst.next = icmp6_dst_gc_list;
1652         icmp6_dst_gc_list = &rt->dst;
1653         spin_unlock_bh(&icmp6_dst_lock);
1654
1655         fib6_force_start_gc(net);
1656
1657         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1658
1659 out:
1660         return dst;
1661 }
1662
1663 int icmp6_dst_gc(void)
1664 {
1665         struct dst_entry *dst, **pprev;
1666         int more = 0;
1667
1668         spin_lock_bh(&icmp6_dst_lock);
1669         pprev = &icmp6_dst_gc_list;
1670
1671         while ((dst = *pprev) != NULL) {
1672                 if (!atomic_read(&dst->__refcnt)) {
1673                         *pprev = dst->next;
1674                         dst_free(dst);
1675                 } else {
1676                         pprev = &dst->next;
1677                         ++more;
1678                 }
1679         }
1680
1681         spin_unlock_bh(&icmp6_dst_lock);
1682
1683         return more;
1684 }
1685
1686 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1687                             void *arg)
1688 {
1689         struct dst_entry *dst, **pprev;
1690
1691         spin_lock_bh(&icmp6_dst_lock);
1692         pprev = &icmp6_dst_gc_list;
1693         while ((dst = *pprev) != NULL) {
1694                 struct rt6_info *rt = (struct rt6_info *) dst;
1695                 if (func(rt, arg)) {
1696                         *pprev = dst->next;
1697                         dst_free(dst);
1698                 } else {
1699                         pprev = &dst->next;
1700                 }
1701         }
1702         spin_unlock_bh(&icmp6_dst_lock);
1703 }
1704
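/* dst garbage collection.  A run is skipped while the previous one is
 * recent and the entry count is still under ip6_rt_max_size; otherwise
 * fib6_run_gc() is invoked with an expiry (ip6_rt_gc_expire) that grows on
 * each pass and decays according to ip6_rt_gc_elasticity.
 */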
1705 static int ip6_dst_gc(struct dst_ops *ops)
1706 {
1707         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1708         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1709         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1710         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1711         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1712         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1713         int entries;
1714
1715         entries = dst_entries_get_fast(ops);
1716         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1717             entries <= rt_max_size)
1718                 goto out;
1719
1720         net->ipv6.ip6_rt_gc_expire++;
1721         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1722         entries = dst_entries_get_slow(ops);
1723         if (entries < ops->gc_thresh)
1724                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1725 out:
1726         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1727         return entries > rt_max_size;
1728 }
1729
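/* Convert the RTA_METRICS attributes in @cfg into an RTAX_MAX-sized array
 * for FIB insertion.  RTAX_CC_ALGO names are translated to congestion
 * control keys, RTAX_HOPLIMIT is clamped to 255, and out-of-range metric
 * types are rejected.
 */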
1730 static int ip6_convert_metrics(struct mx6_config *mxc,
1731                                const struct fib6_config *cfg)
1732 {
1733         bool ecn_ca = false;
1734         struct nlattr *nla;
1735         int remaining;
1736         u32 *mp;
1737
1738         if (!cfg->fc_mx)
1739                 return 0;
1740
1741         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1742         if (unlikely(!mp))
1743                 return -ENOMEM;
1744
1745         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1746                 int type = nla_type(nla);
1747                 u32 val;
1748
1749                 if (!type)
1750                         continue;
1751                 if (unlikely(type > RTAX_MAX))
1752                         goto err;
1753
1754                 if (type == RTAX_CC_ALGO) {
1755                         char tmp[TCP_CA_NAME_MAX];
1756
1757                         nla_strlcpy(tmp, nla, sizeof(tmp));
1758                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1759                         if (val == TCP_CA_UNSPEC)
1760                                 goto err;
1761                 } else {
1762                         val = nla_get_u32(nla);
1763                 }
1764                 if (type == RTAX_HOPLIMIT && val > 255)
1765                         val = 255;
1766                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1767                         goto err;
1768
1769                 mp[type - 1] = val;
1770                 __set_bit(type - 1, mxc->mx_valid);
1771         }
1772
1773         if (ecn_ca) {
1774                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1775                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1776         }
1777
1778         mxc->mx = mp;
1779         return 0;
1780  err:
1781         kfree(mp);
1782         return -EINVAL;
1783 }
1784
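/* Validate a nexthop gateway against the table given in @cfg only.  Link
 * state is deliberately ignored (RT6_LOOKUP_F_IGNORE_LINKSTATE) so that a
 * nexthop reachable through a currently down interface is still accepted;
 * a lookup that falls through to the null entry is treated as a miss.
 */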
1785 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1786                                             struct fib6_config *cfg,
1787                                             const struct in6_addr *gw_addr)
1788 {
1789         struct flowi6 fl6 = {
1790                 .flowi6_oif = cfg->fc_ifindex,
1791                 .daddr = *gw_addr,
1792                 .saddr = cfg->fc_prefsrc,
1793         };
1794         struct fib6_table *table;
1795         struct rt6_info *rt;
1796         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1797
1798         table = fib6_get_table(net, cfg->fc_table);
1799         if (!table)
1800                 return NULL;
1801
1802         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1803                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1804
1805         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1806
1807         /* if table lookup failed, fall back to full lookup */
1808         if (rt == net->ipv6.ip6_null_entry) {
1809                 ip6_rt_put(rt);
1810                 rt = NULL;
1811         }
1812
1813         return rt;
1814 }
1815
1816 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1817 {
1818         struct net *net = cfg->fc_nlinfo.nl_net;
1819         struct rt6_info *rt = NULL;
1820         struct net_device *dev = NULL;
1821         struct inet6_dev *idev = NULL;
1822         struct fib6_table *table;
1823         int addr_type;
1824         int err = -EINVAL;
1825
1826         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1827                 goto out;
1828 #ifndef CONFIG_IPV6_SUBTREES
1829         if (cfg->fc_src_len)
1830                 goto out;
1831 #endif
1832         if (cfg->fc_ifindex) {
1833                 err = -ENODEV;
1834                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1835                 if (!dev)
1836                         goto out;
1837                 idev = in6_dev_get(dev);
1838                 if (!idev)
1839                         goto out;
1840         }
1841
1842         if (cfg->fc_metric == 0)
1843                 cfg->fc_metric = IP6_RT_PRIO_USER;
1844
1845         err = -ENOBUFS;
1846         if (cfg->fc_nlinfo.nlh &&
1847             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1848                 table = fib6_get_table(net, cfg->fc_table);
1849                 if (!table) {
1850                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1851                         table = fib6_new_table(net, cfg->fc_table);
1852                 }
1853         } else {
1854                 table = fib6_new_table(net, cfg->fc_table);
1855         }
1856
1857         if (!table)
1858                 goto out;
1859
1860         rt = ip6_dst_alloc(net, NULL,
1861                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1862
1863         if (!rt) {
1864                 err = -ENOMEM;
1865                 goto out;
1866         }
1867
1868         if (cfg->fc_flags & RTF_EXPIRES)
1869                 rt6_set_expires(rt, jiffies +
1870                                 clock_t_to_jiffies(cfg->fc_expires));
1871         else
1872                 rt6_clean_expires(rt);
1873
1874         if (cfg->fc_protocol == RTPROT_UNSPEC)
1875                 cfg->fc_protocol = RTPROT_BOOT;
1876         rt->rt6i_protocol = cfg->fc_protocol;
1877
1878         addr_type = ipv6_addr_type(&cfg->fc_dst);
1879
1880         if (addr_type & IPV6_ADDR_MULTICAST)
1881                 rt->dst.input = ip6_mc_input;
1882         else if (cfg->fc_flags & RTF_LOCAL)
1883                 rt->dst.input = ip6_input;
1884         else
1885                 rt->dst.input = ip6_forward;
1886
1887         rt->dst.output = ip6_output;
1888
1889         if (cfg->fc_encap) {
1890                 struct lwtunnel_state *lwtstate;
1891
1892                 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1893                                            cfg->fc_encap, AF_INET6, cfg,
1894                                            &lwtstate);
1895                 if (err)
1896                         goto out;
1897                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1898                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1899                         rt->dst.lwtstate->orig_output = rt->dst.output;
1900                         rt->dst.output = lwtunnel_output;
1901                 }
1902                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1903                         rt->dst.lwtstate->orig_input = rt->dst.input;
1904                         rt->dst.input = lwtunnel_input;
1905                 }
1906         }
1907
1908         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1909         rt->rt6i_dst.plen = cfg->fc_dst_len;
1910         if (rt->rt6i_dst.plen == 128)
1911                 rt->dst.flags |= DST_HOST;
1912
1913 #ifdef CONFIG_IPV6_SUBTREES
1914         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1915         rt->rt6i_src.plen = cfg->fc_src_len;
1916 #endif
1917
1918         rt->rt6i_metric = cfg->fc_metric;
1919
1920         /* We cannot add true routes via loopback here;
1921            they would result in kernel looping, so promote them to reject routes
1922          */
1923         if ((cfg->fc_flags & RTF_REJECT) ||
1924             (dev && (dev->flags & IFF_LOOPBACK) &&
1925              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1926              !(cfg->fc_flags & RTF_LOCAL))) {
1927                 /* hold loopback dev/idev if we haven't done so. */
1928                 if (dev != net->loopback_dev) {
1929                         if (dev) {
1930                                 dev_put(dev);
1931                                 in6_dev_put(idev);
1932                         }
1933                         dev = net->loopback_dev;
1934                         dev_hold(dev);
1935                         idev = in6_dev_get(dev);
1936                         if (!idev) {
1937                                 err = -ENODEV;
1938                                 goto out;
1939                         }
1940                 }
1941                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1942                 switch (cfg->fc_type) {
1943                 case RTN_BLACKHOLE:
1944                         rt->dst.error = -EINVAL;
1945                         rt->dst.output = dst_discard_out;
1946                         rt->dst.input = dst_discard;
1947                         break;
1948                 case RTN_PROHIBIT:
1949                         rt->dst.error = -EACCES;
1950                         rt->dst.output = ip6_pkt_prohibit_out;
1951                         rt->dst.input = ip6_pkt_prohibit;
1952                         break;
1953                 case RTN_THROW:
1954                 case RTN_UNREACHABLE:
1955                 default:
1956                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1957                                         : (cfg->fc_type == RTN_UNREACHABLE)
1958                                         ? -EHOSTUNREACH : -ENETUNREACH;
1959                         rt->dst.output = ip6_pkt_discard_out;
1960                         rt->dst.input = ip6_pkt_discard;
1961                         break;
1962                 }
1963                 goto install_route;
1964         }
1965
1966         if (cfg->fc_flags & RTF_GATEWAY) {
1967                 const struct in6_addr *gw_addr;
1968                 int gwa_type;
1969
1970                 gw_addr = &cfg->fc_gateway;
1971                 gwa_type = ipv6_addr_type(gw_addr);
1972
1973                 /* if gw_addr is local we will fail to detect this in case
1974                  * the address is still TENTATIVE (DAD in progress). rt6_lookup()
1975                  * will return the already-added prefix route via the interface
1976                  * that the prefix route was assigned to, which might be non-loopback.
1977                  */
1978                 err = -EINVAL;
1979                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1980                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1981                                             dev : NULL, 0, 0))
1982                         goto out;
1983
1984                 rt->rt6i_gateway = *gw_addr;
1985
1986                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1987                         struct rt6_info *grt = NULL;
1988
1989                         /* IPv6 strictly inhibits using non-link-local
1990                            addresses as a nexthop address.
1991                            Otherwise, the router will not be able to send redirects.
1992                            It is very good, but in some (rare!) circumstances
1993                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1994                            some exceptions. --ANK
1995                          */
1996                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1997                                 goto out;
1998
1999                         if (cfg->fc_table) {
2000                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2001
2002                                 if (grt) {
2003                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2004                                             (dev && dev != grt->dst.dev)) {
2005                                                 ip6_rt_put(grt);
2006                                                 grt = NULL;
2007                                         }
2008                                 }
2009                         }
2010
2011                         if (!grt)
2012                                 grt = rt6_lookup(net, gw_addr, NULL,
2013                                                  cfg->fc_ifindex, 1);
2014
2015                         err = -EHOSTUNREACH;
2016                         if (!grt)
2017                                 goto out;
2018                         if (dev) {
2019                                 if (dev != grt->dst.dev) {
2020                                         ip6_rt_put(grt);
2021                                         goto out;
2022                                 }
2023                         } else {
2024                                 dev = grt->dst.dev;
2025                                 idev = grt->rt6i_idev;
2026                                 dev_hold(dev);
2027                                 in6_dev_hold(grt->rt6i_idev);
2028                         }
2029                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2030                                 err = 0;
2031                         ip6_rt_put(grt);
2032
2033                         if (err)
2034                                 goto out;
2035                 }
2036                 err = -EINVAL;
2037                 if (!dev || (dev->flags & IFF_LOOPBACK))
2038                         goto out;
2039         }
2040
2041         err = -ENODEV;
2042         if (!dev)
2043                 goto out;
2044
2045         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2046                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2047                         err = -EINVAL;
2048                         goto out;
2049                 }
2050                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2051                 rt->rt6i_prefsrc.plen = 128;
2052         } else
2053                 rt->rt6i_prefsrc.plen = 0;
2054
2055         rt->rt6i_flags = cfg->fc_flags;
2056
2057 install_route:
2058         rt->dst.dev = dev;
2059         rt->rt6i_idev = idev;
2060         rt->rt6i_table = table;
2061
2062         cfg->fc_nlinfo.nl_net = dev_net(dev);
2063
2064         return rt;
2065 out:
2066         if (dev)
2067                 dev_put(dev);
2068         if (idev)
2069                 in6_dev_put(idev);
2070         if (rt)
2071                 dst_free(&rt->dst);
2072
2073         return ERR_PTR(err);
2074 }
2075
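/* Create an rt6_info from @cfg, convert any netlink metrics, and insert the
 * route into the FIB.  If anything fails before insertion, the still
 * unlinked dst is freed here.
 */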
2076 int ip6_route_add(struct fib6_config *cfg)
2077 {
2078         struct mx6_config mxc = { .mx = NULL, };
2079         struct rt6_info *rt;
2080         int err;
2081
2082         rt = ip6_route_info_create(cfg);
2083         if (IS_ERR(rt)) {
2084                 err = PTR_ERR(rt);
2085                 rt = NULL;
2086                 goto out;
2087         }
2088
2089         err = ip6_convert_metrics(&mxc, cfg);
2090         if (err)
2091                 goto out;
2092
2093         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2094
2095         kfree(mxc.mx);
2096
2097         return err;
2098 out:
2099         if (rt)
2100                 dst_free(&rt->dst);
2101
2102         return err;
2103 }
2104
2105 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2106 {
2107         int err;
2108         struct fib6_table *table;
2109         struct net *net = dev_net(rt->dst.dev);
2110
2111         if (rt == net->ipv6.ip6_null_entry ||
2112             rt->dst.flags & DST_NOCACHE) {
2113                 err = -ENOENT;
2114                 goto out;
2115         }
2116
2117         table = rt->rt6i_table;
2118         write_lock_bh(&table->tb6_lock);
2119         err = fib6_del(rt, info);
2120         write_unlock_bh(&table->tb6_lock);
2121
2122 out:
2123         ip6_rt_put(rt);
2124         return err;
2125 }
2126
2127 int ip6_del_rt(struct rt6_info *rt)
2128 {
2129         struct nl_info info = {
2130                 .nl_net = dev_net(rt->dst.dev),
2131         };
2132         return __ip6_del_rt(rt, &info);
2133 }
2134
2135 static int ip6_route_del(struct fib6_config *cfg)
2136 {
2137         struct fib6_table *table;
2138         struct fib6_node *fn;
2139         struct rt6_info *rt;
2140         int err = -ESRCH;
2141
2142         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2143         if (!table)
2144                 return err;
2145
2146         read_lock_bh(&table->tb6_lock);
2147
2148         fn = fib6_locate(&table->tb6_root,
2149                          &cfg->fc_dst, cfg->fc_dst_len,
2150                          &cfg->fc_src, cfg->fc_src_len);
2151
2152         if (fn) {
2153                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2154                         if ((rt->rt6i_flags & RTF_CACHE) &&
2155                             !(cfg->fc_flags & RTF_CACHE))
2156                                 continue;
2157                         if (cfg->fc_ifindex &&
2158                             (!rt->dst.dev ||
2159                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2160                                 continue;
2161                         if (cfg->fc_flags & RTF_GATEWAY &&
2162                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2163                                 continue;
2164                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2165                                 continue;
2166                         dst_hold(&rt->dst);
2167                         read_unlock_bh(&table->tb6_lock);
2168
2169                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2170                 }
2171         }
2172         read_unlock_bh(&table->tb6_lock);
2173
2174         return err;
2175 }
2176
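/* Act on an accepted Redirect: validate the ND options and target address,
 * update the neighbour cache entry for the new first hop, install a cloned
 * RTF_CACHE route through it, and remove the old cached route if there was
 * one.
 */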
2177 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2178 {
2179         struct netevent_redirect netevent;
2180         struct rt6_info *rt, *nrt = NULL;
2181         struct ndisc_options ndopts;
2182         struct inet6_dev *in6_dev;
2183         struct neighbour *neigh;
2184         struct rd_msg *msg;
2185         int optlen, on_link;
2186         u8 *lladdr;
2187
2188         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2189         optlen -= sizeof(*msg);
2190
2191         if (optlen < 0) {
2192                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2193                 return;
2194         }
2195
2196         msg = (struct rd_msg *)icmp6_hdr(skb);
2197
2198         if (ipv6_addr_is_multicast(&msg->dest)) {
2199                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2200                 return;
2201         }
2202
2203         on_link = 0;
2204         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2205                 on_link = 1;
2206         } else if (ipv6_addr_type(&msg->target) !=
2207                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2208                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2209                 return;
2210         }
2211
2212         in6_dev = __in6_dev_get(skb->dev);
2213         if (!in6_dev)
2214                 return;
2215         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2216                 return;
2217
2218         /* RFC2461 8.1:
2219          *      The IP source address of the Redirect MUST be the same as the current
2220          *      first-hop router for the specified ICMP Destination Address.
2221          */
2222
2223         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2224                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2225                 return;
2226         }
2227
2228         lladdr = NULL;
2229         if (ndopts.nd_opts_tgt_lladdr) {
2230                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2231                                              skb->dev);
2232                 if (!lladdr) {
2233                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2234                         return;
2235                 }
2236         }
2237
2238         rt = (struct rt6_info *) dst;
2239         if (rt->rt6i_flags & RTF_REJECT) {
2240                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2241                 return;
2242         }
2243
2244         /* Redirect received -> path was valid.
2245          * Look, redirects are sent only in response to data packets,
2246          * so this nexthop apparently is reachable. --ANK
2247          */
2248         dst_confirm(&rt->dst);
2249
2250         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2251         if (!neigh)
2252                 return;
2253
2254         /*
2255          *      We have finally decided to accept it.
2256          */
2257
2258         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2259                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2260                      NEIGH_UPDATE_F_OVERRIDE|
2261                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2262                                      NEIGH_UPDATE_F_ISROUTER)),
2263                      NDISC_REDIRECT, &ndopts);
2264
2265         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2266         if (!nrt)
2267                 goto out;
2268
2269         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2270         if (on_link)
2271                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2272
2273         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2274
2275         if (ip6_ins_rt(nrt))
2276                 goto out;
2277
2278         netevent.old = &rt->dst;
2279         netevent.new = &nrt->dst;
2280         netevent.daddr = &msg->dest;
2281         netevent.neigh = neigh;
2282         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2283
2284         if (rt->rt6i_flags & RTF_CACHE) {
2285                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2286                 ip6_del_rt(rt);
2287         }
2288
2289 out:
2290         neigh_release(neigh);
2291 }
2292
2293 /*
2294  *      Misc support functions
2295  */
2296
2297 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2298 {
2299         BUG_ON(from->dst.from);
2300
2301         rt->rt6i_flags &= ~RTF_EXPIRES;
2302         dst_hold(&from->dst);
2303         rt->dst.from = &from->dst;
2304         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2305 }
2306
2307 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2308 {
2309         rt->dst.input = ort->dst.input;
2310         rt->dst.output = ort->dst.output;
2311         rt->rt6i_dst = ort->rt6i_dst;
2312         rt->dst.error = ort->dst.error;
2313         rt->rt6i_idev = ort->rt6i_idev;
2314         if (rt->rt6i_idev)
2315                 in6_dev_hold(rt->rt6i_idev);
2316         rt->dst.lastuse = jiffies;
2317         rt->rt6i_gateway = ort->rt6i_gateway;
2318         rt->rt6i_flags = ort->rt6i_flags;
2319         rt6_set_from(rt, ort);
2320         rt->rt6i_metric = ort->rt6i_metric;
2321 #ifdef CONFIG_IPV6_SUBTREES
2322         rt->rt6i_src = ort->rt6i_src;
2323 #endif
2324         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2325         rt->rt6i_table = ort->rt6i_table;
2326         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2327 }
2328
2329 #ifdef CONFIG_IPV6_ROUTE_INFO
2330 static struct rt6_info *rt6_get_route_info(struct net *net,
2331                                            const struct in6_addr *prefix, int prefixlen,
2332                                            const struct in6_addr *gwaddr,
2333                                            struct net_device *dev)
2334 {
2335         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2336         int ifindex = dev->ifindex;
2337         struct fib6_node *fn;
2338         struct rt6_info *rt = NULL;
2339         struct fib6_table *table;
2340
2341         table = fib6_get_table(net, tb_id);
2342         if (!table)
2343                 return NULL;
2344
2345         read_lock_bh(&table->tb6_lock);
2346         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2347         if (!fn)
2348                 goto out;
2349
2350         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2351                 if (rt->dst.dev->ifindex != ifindex)
2352                         continue;
2353                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2354                         continue;
2355                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2356                         continue;
2357                 dst_hold(&rt->dst);
2358                 break;
2359         }
2360 out:
2361         read_unlock_bh(&table->tb6_lock);
2362         return rt;
2363 }
2364
2365 static struct rt6_info *rt6_add_route_info(struct net *net,
2366                                            const struct in6_addr *prefix, int prefixlen,
2367                                            const struct in6_addr *gwaddr,
2368                                            struct net_device *dev,
2369                                            unsigned int pref)
2370 {
2371         struct fib6_config cfg = {
2372                 .fc_metric      = IP6_RT_PRIO_USER,
2373                 .fc_ifindex     = dev->ifindex,
2374                 .fc_dst_len     = prefixlen,
2375                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2376                                   RTF_UP | RTF_PREF(pref),
2377                 .fc_nlinfo.portid = 0,
2378                 .fc_nlinfo.nlh = NULL,
2379                 .fc_nlinfo.nl_net = net,
2380         };
2381
2382         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2383         cfg.fc_dst = *prefix;
2384         cfg.fc_gateway = *gwaddr;
2385
2386         /* We should treat it as a default route if prefix length is 0. */
2387         if (!prefixlen)
2388                 cfg.fc_flags |= RTF_DEFAULT;
2389
2390         ip6_route_add(&cfg);
2391
2392         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2393 }
2394 #endif
2395
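/* Look up an RA-learned default route (RTF_ADDRCONF | RTF_DEFAULT) through
 * @dev with gateway @addr in the device's default-route table.
 */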
2396 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2397 {
2398         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2399         struct rt6_info *rt;
2400         struct fib6_table *table;
2401
2402         table = fib6_get_table(dev_net(dev), tb_id);
2403         if (!table)
2404                 return NULL;
2405
2406         read_lock_bh(&table->tb6_lock);
2407         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2408                 if (dev == rt->dst.dev &&
2409                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2410                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2411                         break;
2412         }
2413         if (rt)
2414                 dst_hold(&rt->dst);
2415         read_unlock_bh(&table->tb6_lock);
2416         return rt;
2417 }
2418
2419 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2420                                      struct net_device *dev,
2421                                      unsigned int pref)
2422 {
2423         struct fib6_config cfg = {
2424                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2425                 .fc_metric      = IP6_RT_PRIO_USER,
2426                 .fc_ifindex     = dev->ifindex,
2427                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2428                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2429                 .fc_nlinfo.portid = 0,
2430                 .fc_nlinfo.nlh = NULL,
2431                 .fc_nlinfo.nl_net = dev_net(dev),
2432         };
2433
2434         cfg.fc_gateway = *gwaddr;
2435
2436         if (!ip6_route_add(&cfg)) {
2437                 struct fib6_table *table;
2438
2439                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2440                 if (table)
2441                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2442         }
2443
2444         return rt6_get_dflt_router(gwaddr, dev);
2445 }
2446
2447 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2448 {
2449         struct rt6_info *rt;
2450
2451 restart:
2452         read_lock_bh(&table->tb6_lock);
2453         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2454                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2455                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2456                         dst_hold(&rt->dst);
2457                         read_unlock_bh(&table->tb6_lock);
2458                         ip6_del_rt(rt);
2459                         goto restart;
2460                 }
2461         }
2462         read_unlock_bh(&table->tb6_lock);
2463
2464         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2465 }
2466
2467 void rt6_purge_dflt_routers(struct net *net)
2468 {
2469         struct fib6_table *table;
2470         struct hlist_head *head;
2471         unsigned int h;
2472
2473         rcu_read_lock();
2474
2475         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2476                 head = &net->ipv6.fib_table_hash[h];
2477                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2478                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2479                                 __rt6_purge_dflt_routers(table);
2480                 }
2481         }
2482
2483         rcu_read_unlock();
2484 }
2485
2486 static void rtmsg_to_fib6_config(struct net *net,
2487                                  struct in6_rtmsg *rtmsg,
2488                                  struct fib6_config *cfg)
2489 {
2490         memset(cfg, 0, sizeof(*cfg));
2491
2492         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2493                          : RT6_TABLE_MAIN;
2494         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2495         cfg->fc_metric = rtmsg->rtmsg_metric;
2496         cfg->fc_expires = rtmsg->rtmsg_info;
2497         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2498         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2499         cfg->fc_flags = rtmsg->rtmsg_flags;
2500
2501         cfg->fc_nlinfo.nl_net = net;
2502
2503         cfg->fc_dst = rtmsg->rtmsg_dst;
2504         cfg->fc_src = rtmsg->rtmsg_src;
2505         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2506 }
2507
2508 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2509 {
2510         struct fib6_config cfg;
2511         struct in6_rtmsg rtmsg;
2512         int err;
2513
2514         switch (cmd) {
2515         case SIOCADDRT:         /* Add a route */
2516         case SIOCDELRT:         /* Delete a route */
2517                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2518                         return -EPERM;
2519                 err = copy_from_user(&rtmsg, arg,
2520                                      sizeof(struct in6_rtmsg));
2521                 if (err)
2522                         return -EFAULT;
2523
2524                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2525
2526                 rtnl_lock();
2527                 switch (cmd) {
2528                 case SIOCADDRT:
2529                         err = ip6_route_add(&cfg);
2530                         break;
2531                 case SIOCDELRT:
2532                         err = ip6_route_del(&cfg);
2533                         break;
2534                 default:
2535                         err = -EINVAL;
2536                 }
2537                 rtnl_unlock();
2538
2539                 return err;
2540         }
2541
2542         return -EINVAL;
2543 }
2544
2545 /*
2546  *      Drop the packet on the floor
2547  */
2548
2549 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2550 {
2551         int type;
2552         struct dst_entry *dst = skb_dst(skb);
2553         switch (ipstats_mib_noroutes) {
2554         case IPSTATS_MIB_INNOROUTES:
2555                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2556                 if (type == IPV6_ADDR_ANY) {
2557                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2558                                       IPSTATS_MIB_INADDRERRORS);
2559                         break;
2560                 }
2561                 /* FALLTHROUGH */
2562         case IPSTATS_MIB_OUTNOROUTES:
2563                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2564                               ipstats_mib_noroutes);
2565                 break;
2566         }
2567         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2568         kfree_skb(skb);
2569         return 0;
2570 }
2571
2572 static int ip6_pkt_discard(struct sk_buff *skb)
2573 {
2574         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2575 }
2576
2577 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2578 {
2579         skb->dev = skb_dst(skb)->dev;
2580         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2581 }
2582
2583 static int ip6_pkt_prohibit(struct sk_buff *skb)
2584 {
2585         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2586 }
2587
2588 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2589 {
2590         skb->dev = skb_dst(skb)->dev;
2591         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2592 }
2593
2594 /*
2595  *      Allocate a dst for local (unicast / anycast) address.
2596  */
2597
2598 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2599                                     const struct in6_addr *addr,
2600                                     bool anycast)
2601 {
2602         u32 tb_id;
2603         struct net *net = dev_net(idev->dev);
2604         struct net_device *dev = net->loopback_dev;
2605         struct rt6_info *rt;
2606
2607         /* use the L3 master device as the loopback for host routes if the
2608          * device is enslaved and the address is not link-local or multicast
2609          */
2610         if (!rt6_need_strict(addr))
2611                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2612
2613         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2614         if (!rt)
2615                 return ERR_PTR(-ENOMEM);
2616
2617         in6_dev_hold(idev);
2618
2619         rt->dst.flags |= DST_HOST;
2620         rt->dst.input = ip6_input;
2621         rt->dst.output = ip6_output;
2622         rt->rt6i_idev = idev;
2623
2624         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2625         if (anycast)
2626                 rt->rt6i_flags |= RTF_ANYCAST;
2627         else
2628                 rt->rt6i_flags |= RTF_LOCAL;
2629
2630         rt->rt6i_gateway  = *addr;
2631         rt->rt6i_dst.addr = *addr;
2632         rt->rt6i_dst.plen = 128;
2633         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2634         rt->rt6i_table = fib6_get_table(net, tb_id);
2635         rt->dst.flags |= DST_NOCACHE;
2636
2637         atomic_set(&rt->dst.__refcnt, 1);
2638
2639         return rt;
2640 }
2641
2642 /* remove a deleted IP address from prefsrc entries */
2643 struct arg_dev_net_ip {
2644         struct net_device *dev;
2645         struct net *net;
2646         struct in6_addr *addr;
2647 };
2648
2649 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2650 {
2651         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2652         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2653         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2654
2655         if (((void *)rt->dst.dev == dev || !dev) &&
2656             rt != net->ipv6.ip6_null_entry &&
2657             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2658                 /* remove prefsrc entry */
2659                 rt->rt6i_prefsrc.plen = 0;
2660         }
2661         return 0;
2662 }
2663
2664 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2665 {
2666         struct net *net = dev_net(ifp->idev->dev);
2667         struct arg_dev_net_ip adni = {
2668                 .dev = ifp->idev->dev,
2669                 .net = net,
2670                 .addr = &ifp->addr,
2671         };
2672         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2673 }
2674
2675 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2676 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2677
2678 /* Remove routers and update dst entries when a gateway turns into a host. */
2679 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2680 {
2681         struct in6_addr *gateway = (struct in6_addr *)arg;
2682
2683         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2684              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2685              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2686                 return -1;
2687         }
2688         return 0;
2689 }
2690
2691 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2692 {
2693         fib6_clean_all(net, fib6_clean_tohost, gateway);
2694 }
2695
2696 struct arg_dev_net {
2697         struct net_device *dev;
2698         struct net *net;
2699 };
2700
2701 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2702 {
2703         const struct arg_dev_net *adn = arg;
2704         const struct net_device *dev = adn->dev;
2705
2706         if ((rt->dst.dev == dev || !dev) &&
2707             rt != adn->net->ipv6.ip6_null_entry)
2708                 return -1;
2709
2710         return 0;
2711 }
2712
2713 void rt6_ifdown(struct net *net, struct net_device *dev)
2714 {
2715         struct arg_dev_net adn = {
2716                 .dev = dev,
2717                 .net = net,
2718         };
2719
2720         fib6_clean_all(net, fib6_ifdown, &adn);
2721         icmp6_clean_all(fib6_ifdown, &adn);
2722         if (dev)
2723                 rt6_uncached_list_flush_dev(net, dev);
2724 }
2725
2726 struct rt6_mtu_change_arg {
2727         struct net_device *dev;
2728         unsigned int mtu;
2729 };
2730
2731 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2732 {
2733         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2734         struct inet6_dev *idev;
2735
2736         /* In IPv6 pmtu discovery is not optional,
2737            so the RTAX_MTU lock cannot disable it.
2738            We still use this lock to block changes
2739            caused by addrconf/ndisc.
2740         */
2741
2742         idev = __in6_dev_get(arg->dev);
2743         if (!idev)
2744                 return 0;
2745
2746         /* For an administrative MTU increase, there is no way to discover
2747            an IPv6 PMTU increase, so the PMTU increase should be updated here.
2748            Since RFC 1981 doesn't include administrative MTU increases,
2749            updating the PMTU on increase is a MUST. (i.e. jumbo frames)
2750          */
2751         /*
2752            If the new MTU is less than the route PMTU, this new MTU will be the
2753            lowest MTU in the path; update the route PMTU to reflect the PMTU
2754            decrease. If the new MTU is greater than the route PMTU, and the
2755            old MTU is the lowest MTU in the path, update the route PMTU
2756            to reflect the increase. In this case, if the other nodes' MTU
2757            is also the lowest MTU in the path, a TOO BIG MESSAGE will lead to
2758            PMTU discovery.
2759          */
2760         if (rt->dst.dev == arg->dev &&
2761             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2762                 if (rt->rt6i_flags & RTF_CACHE) {
2763                         /* For RTF_CACHE with rt6i_pmtu == 0
2764                          * (i.e. a redirected route),
2765                          * the metrics of its rt->dst.from have already
2766                          * been updated.
2767                          */
2768                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2769                                 rt->rt6i_pmtu = arg->mtu;
2770                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2771                            (dst_mtu(&rt->dst) < arg->mtu &&
2772                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2773                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2774                 }
2775         }
2776         return 0;
2777 }
2778
2779 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2780 {
2781         struct rt6_mtu_change_arg arg = {
2782                 .dev = dev,
2783                 .mtu = mtu,
2784         };
2785
2786         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2787 }
2788
2789 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2790         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2791         [RTA_OIF]               = { .type = NLA_U32 },
2792         [RTA_IIF]               = { .type = NLA_U32 },
2793         [RTA_PRIORITY]          = { .type = NLA_U32 },
2794         [RTA_METRICS]           = { .type = NLA_NESTED },
2795         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2796         [RTA_PREF]              = { .type = NLA_U8 },
2797         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2798         [RTA_ENCAP]             = { .type = NLA_NESTED },
2799         [RTA_EXPIRES]           = { .type = NLA_U32 },
2800 };
2801
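/* Translate an RTM_NEWROUTE/RTM_DELROUTE request into a fib6_config,
 * copying the destination/source prefixes, gateway, metrics, multipath,
 * preference, encap and expiry attributes.
 */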
2802 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2803                               struct fib6_config *cfg)
2804 {
2805         struct rtmsg *rtm;
2806         struct nlattr *tb[RTA_MAX+1];
2807         unsigned int pref;
2808         int err;
2809
2810         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2811         if (err < 0)
2812                 goto errout;
2813
2814         err = -EINVAL;
2815         rtm = nlmsg_data(nlh);
2816         memset(cfg, 0, sizeof(*cfg));
2817
2818         cfg->fc_table = rtm->rtm_table;
2819         cfg->fc_dst_len = rtm->rtm_dst_len;
2820         cfg->fc_src_len = rtm->rtm_src_len;
2821         cfg->fc_flags = RTF_UP;
2822         cfg->fc_protocol = rtm->rtm_protocol;
2823         cfg->fc_type = rtm->rtm_type;
2824
2825         if (rtm->rtm_type == RTN_UNREACHABLE ||
2826             rtm->rtm_type == RTN_BLACKHOLE ||
2827             rtm->rtm_type == RTN_PROHIBIT ||
2828             rtm->rtm_type == RTN_THROW)
2829                 cfg->fc_flags |= RTF_REJECT;
2830
2831         if (rtm->rtm_type == RTN_LOCAL)
2832                 cfg->fc_flags |= RTF_LOCAL;
2833
2834         if (rtm->rtm_flags & RTM_F_CLONED)
2835                 cfg->fc_flags |= RTF_CACHE;
2836
2837         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2838         cfg->fc_nlinfo.nlh = nlh;
2839         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2840
2841         if (tb[RTA_GATEWAY]) {
2842                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2843                 cfg->fc_flags |= RTF_GATEWAY;
2844         }
2845
2846         if (tb[RTA_DST]) {
2847                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2848
2849                 if (nla_len(tb[RTA_DST]) < plen)
2850                         goto errout;
2851
2852                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2853         }
2854
2855         if (tb[RTA_SRC]) {
2856                 int plen = (rtm->rtm_src_len + 7) >> 3;
2857
2858                 if (nla_len(tb[RTA_SRC]) < plen)
2859                         goto errout;
2860
2861                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2862         }
2863
2864         if (tb[RTA_PREFSRC])
2865                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2866
2867         if (tb[RTA_OIF])
2868                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2869
2870         if (tb[RTA_PRIORITY])
2871                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2872
2873         if (tb[RTA_METRICS]) {
2874                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2875                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2876         }
2877
2878         if (tb[RTA_TABLE])
2879                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2880
2881         if (tb[RTA_MULTIPATH]) {
2882                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2883                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2884         }
2885
2886         if (tb[RTA_PREF]) {
2887                 pref = nla_get_u8(tb[RTA_PREF]);
2888                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2889                     pref != ICMPV6_ROUTER_PREF_HIGH)
2890                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2891                 cfg->fc_flags |= RTF_PREF(pref);
2892         }
2893
2894         if (tb[RTA_ENCAP])
2895                 cfg->fc_encap = tb[RTA_ENCAP];
2896
2897         if (tb[RTA_ENCAP_TYPE])
2898                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2899
2900         if (tb[RTA_EXPIRES]) {
2901                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2902
2903                 if (addrconf_finite_timeout(timeout)) {
2904                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
2905                         cfg->fc_flags |= RTF_EXPIRES;
2906                 }
2907         }
2908
2909         err = 0;
2910 errout:
2911         return err;
2912 }
2913
2914 struct rt6_nh {
2915         struct rt6_info *rt6_info;
2916         struct fib6_config r_cfg;
2917         struct mx6_config mxc;
2918         struct list_head next;
2919 };
2920
2921 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2922 {
2923         struct rt6_nh *nh;
2924
2925         list_for_each_entry(nh, rt6_nh_list, next) {
2926                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2927                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2928                         nh->r_cfg.fc_ifindex);
2929         }
2930 }
2931
2932 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2933                                  struct rt6_info *rt, struct fib6_config *r_cfg)
2934 {
2935         struct rt6_nh *nh;
2936         struct rt6_info *rtnh;
2937         int err = -EEXIST;
2938
2939         list_for_each_entry(nh, rt6_nh_list, next) {
2940                 /* check if rt6_info already exists */
2941                 rtnh = nh->rt6_info;
2942
2943                 if (rtnh->dst.dev == rt->dst.dev &&
2944                     rtnh->rt6i_idev == rt->rt6i_idev &&
2945                     ipv6_addr_equal(&rtnh->rt6i_gateway,
2946                                     &rt->rt6i_gateway))
2947                         return err;
2948         }
2949
2950         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2951         if (!nh)
2952                 return -ENOMEM;
2953         nh->rt6_info = rt;
2954         err = ip6_convert_metrics(&nh->mxc, r_cfg);
2955         if (err) {
2956                 kfree(nh);
2957                 return err;
2958         }
2959         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2960         list_add_tail(&nh->next, rt6_nh_list);
2961
2962         return 0;
2963 }
2964
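/* Add a multipath route: build one rt6_info per RTA_MULTIPATH nexthop, then
 * insert them one by one.  If an insertion fails, the nexthops already
 * added by this request are deleted again before returning the error.
 */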
2965 static int ip6_route_multipath_add(struct fib6_config *cfg)
2966 {
2967         struct fib6_config r_cfg;
2968         struct rtnexthop *rtnh;
2969         struct rt6_info *rt;
2970         struct rt6_nh *err_nh;
2971         struct rt6_nh *nh, *nh_safe;
2972         int remaining;
2973         int attrlen;
2974         int err = 1;
2975         int nhn = 0;
2976         int replace = (cfg->fc_nlinfo.nlh &&
2977                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2978         LIST_HEAD(rt6_nh_list);
2979
2980         remaining = cfg->fc_mp_len;
2981         rtnh = (struct rtnexthop *)cfg->fc_mp;
2982
2983         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
2984          * rt6_info structs per nexthop
2985          */
2986         while (rtnh_ok(rtnh, remaining)) {
2987                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2988                 if (rtnh->rtnh_ifindex)
2989                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2990
2991                 attrlen = rtnh_attrlen(rtnh);
2992                 if (attrlen > 0) {
2993                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2994
2995                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2996                         if (nla) {
2997                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2998                                 r_cfg.fc_flags |= RTF_GATEWAY;
2999                         }
3000                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3001                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3002                         if (nla)
3003                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3004                 }
3005
3006                 rt = ip6_route_info_create(&r_cfg);
3007                 if (IS_ERR(rt)) {
3008                         err = PTR_ERR(rt);
3009                         rt = NULL;
3010                         goto cleanup;
3011                 }
3012
3013                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3014                 if (err) {
3015                         dst_free(&rt->dst);
3016                         goto cleanup;
3017                 }
3018
3019                 rtnh = rtnh_next(rtnh, &remaining);
3020         }
3021
3022         err_nh = NULL;
3023         list_for_each_entry(nh, &rt6_nh_list, next) {
3024                 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
3025                 /* nh->rt6_info is used or freed at this point, reset to NULL */
3026                 nh->rt6_info = NULL;
3027                 if (err) {
3028                         if (replace && nhn)
3029                                 ip6_print_replace_route_err(&rt6_nh_list);
3030                         err_nh = nh;
3031                         goto add_errout;
3032                 }
3033
3034                 /* Because each route is added like a single route, we remove
3035                  * these flags after the first nexthop: if there is a collision,
3036                  * we have already failed to add the first nexthop, since
3037                  * fib6_add_rt2node() has rejected it; when replacing, the old
3038                  * nexthops have been replaced by the first new one, and the
3039                  * rest should be added to it.
3040                  */
3041                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3042                                                      NLM_F_REPLACE);
3043                 nhn++;
3044         }
3045
3046         goto cleanup;
3047
3048 add_errout:
3049         /* Delete routes that were already added */
3050         list_for_each_entry(nh, &rt6_nh_list, next) {
3051                 if (err_nh == nh)
3052                         break;
3053                 ip6_route_del(&nh->r_cfg);
3054         }
3055
3056 cleanup:
3057         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3058                 if (nh->rt6_info)
3059                         dst_free(&nh->rt6_info->dst);
3060                 kfree(nh->mxc.mx);
3061                 list_del(&nh->next);
3062                 kfree(nh);
3063         }
3064
3065         return err;
3066 }
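
/* The parsing loop in ip6_route_multipath_add() walks an RTA_MULTIPATH
 * payload, which is a sequence of struct rtnexthop headers, each optionally
 * followed by nested attributes such as RTA_GATEWAY or RTA_ENCAP.  A minimal
 * sketch of the same traversal, kept for illustration only (count_gateways()
 * is a hypothetical helper, not part of this file):
 *
 *	static int count_gateways(void *mp, int mp_len)
 *	{
 *		struct rtnexthop *rtnh = mp;
 *		int remaining = mp_len;
 *		int n = 0;
 *
 *		while (rtnh_ok(rtnh, remaining)) {
 *			int attrlen = rtnh_attrlen(rtnh);
 *
 *			if (attrlen > 0 &&
 *			    nla_find(rtnh_attrs(rtnh), attrlen, RTA_GATEWAY))
 *				n++;
 *			rtnh = rtnh_next(rtnh, &remaining);
 *		}
 *		return n;
 *	}
 */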
3067
3068 static int ip6_route_multipath_del(struct fib6_config *cfg)
3069 {
3070         struct fib6_config r_cfg;
3071         struct rtnexthop *rtnh;
3072         int remaining;
3073         int attrlen;
3074         int err = 1, last_err = 0;
3075
3076         remaining = cfg->fc_mp_len;
3077         rtnh = (struct rtnexthop *)cfg->fc_mp;
3078
3079         /* Parse a Multipath Entry */
3080         while (rtnh_ok(rtnh, remaining)) {
3081                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3082                 if (rtnh->rtnh_ifindex)
3083                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3084
3085                 attrlen = rtnh_attrlen(rtnh);
3086                 if (attrlen > 0) {
3087                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3088
3089                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3090                         if (nla) {
3091                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3092                                 r_cfg.fc_flags |= RTF_GATEWAY;
3093                         }
3094                 }
3095                 err = ip6_route_del(&r_cfg);
3096                 if (err)
3097                         last_err = err;
3098
3099                 rtnh = rtnh_next(rtnh, &remaining);
3100         }
3101
3102         return last_err;
3103 }
3104
3105 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3106 {
3107         struct fib6_config cfg;
3108         int err;
3109
3110         err = rtm_to_fib6_config(skb, nlh, &cfg);
3111         if (err < 0)
3112                 return err;
3113
3114         if (cfg.fc_mp)
3115                 return ip6_route_multipath_del(&cfg);
3116         else
3117                 return ip6_route_del(&cfg);
3118 }
3119
3120 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3121 {
3122         struct fib6_config cfg;
3123         int err;
3124
3125         err = rtm_to_fib6_config(skb, nlh, &cfg);
3126         if (err < 0)
3127                 return err;
3128
3129         if (cfg.fc_mp)
3130                 return ip6_route_multipath_add(&cfg);
3131         else
3132                 return ip6_route_add(&cfg);
3133 }
3134
3135 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3136 {
3137         return NLMSG_ALIGN(sizeof(struct rtmsg))
3138                + nla_total_size(16) /* RTA_SRC */
3139                + nla_total_size(16) /* RTA_DST */
3140                + nla_total_size(16) /* RTA_GATEWAY */
3141                + nla_total_size(16) /* RTA_PREFSRC */
3142                + nla_total_size(4) /* RTA_TABLE */
3143                + nla_total_size(4) /* RTA_IIF */
3144                + nla_total_size(4) /* RTA_OIF */
3145                + nla_total_size(4) /* RTA_PRIORITY */
3146                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3147                + nla_total_size(sizeof(struct rta_cacheinfo))
3148                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3149                + nla_total_size(1) /* RTA_PREF */
3150                + lwtunnel_get_encap_size(rt->dst.lwtstate);
3151 }
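
/* Each nla_total_size(n) above is the attribute header plus payload, rounded
 * up to a 4-byte boundary: nla_total_size(n) == NLA_ALIGN(NLA_HDRLEN + n),
 * with NLA_HDRLEN == 4.  A quick sanity check (worked out here, not taken
 * from this file):
 *
 *	nla_total_size(16) == 20	(an IPv6 address attribute)
 *	nla_total_size(4)  ==  8	(a u32 such as RTA_OIF or RTA_PRIORITY)
 *	nla_total_size(1)  ==  8	(RTA_PREF: 1 byte of payload, padded)
 *
 * so rt6_nlmsg_size() is an upper bound on what rt6_fill_node() can emit.
 */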
3152
3153 static int rt6_fill_node(struct net *net,
3154                          struct sk_buff *skb, struct rt6_info *rt,
3155                          struct in6_addr *dst, struct in6_addr *src,
3156                          int iif, int type, u32 portid, u32 seq,
3157                          int prefix, int nowait, unsigned int flags)
3158 {
3159         u32 metrics[RTAX_MAX];
3160         struct rtmsg *rtm;
3161         struct nlmsghdr *nlh;
3162         long expires;
3163         u32 table;
3164
3165         if (prefix) {   /* user wants prefix routes only */
3166                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3167                         /* success since this is not a prefix route */
3168                         return 1;
3169                 }
3170         }
3171
3172         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3173         if (!nlh)
3174                 return -EMSGSIZE;
3175
3176         rtm = nlmsg_data(nlh);
3177         rtm->rtm_family = AF_INET6;
3178         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3179         rtm->rtm_src_len = rt->rt6i_src.plen;
3180         rtm->rtm_tos = 0;
3181         if (rt->rt6i_table)
3182                 table = rt->rt6i_table->tb6_id;
3183         else
3184                 table = RT6_TABLE_UNSPEC;
3185         rtm->rtm_table = table;
3186         if (nla_put_u32(skb, RTA_TABLE, table))
3187                 goto nla_put_failure;
3188         if (rt->rt6i_flags & RTF_REJECT) {
3189                 switch (rt->dst.error) {
3190                 case -EINVAL:
3191                         rtm->rtm_type = RTN_BLACKHOLE;
3192                         break;
3193                 case -EACCES:
3194                         rtm->rtm_type = RTN_PROHIBIT;
3195                         break;
3196                 case -EAGAIN:
3197                         rtm->rtm_type = RTN_THROW;
3198                         break;
3199                 default:
3200                         rtm->rtm_type = RTN_UNREACHABLE;
3201                         break;
3202                 }
3203         }
3204         else if (rt->rt6i_flags & RTF_LOCAL)
3205                 rtm->rtm_type = RTN_LOCAL;
3206         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3207                 rtm->rtm_type = RTN_LOCAL;
3208         else
3209                 rtm->rtm_type = RTN_UNICAST;
3210         rtm->rtm_flags = 0;
3211         if (!netif_carrier_ok(rt->dst.dev)) {
3212                 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3213                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3214                         rtm->rtm_flags |= RTNH_F_DEAD;
3215         }
3216         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3217         rtm->rtm_protocol = rt->rt6i_protocol;
3218         if (rt->rt6i_flags & RTF_DYNAMIC)
3219                 rtm->rtm_protocol = RTPROT_REDIRECT;
3220         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3221                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3222                         rtm->rtm_protocol = RTPROT_RA;
3223                 else
3224                         rtm->rtm_protocol = RTPROT_KERNEL;
3225         }
3226
3227         if (rt->rt6i_flags & RTF_CACHE)
3228                 rtm->rtm_flags |= RTM_F_CLONED;
3229
3230         if (dst) {
3231                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3232                         goto nla_put_failure;
3233                 rtm->rtm_dst_len = 128;
3234         } else if (rtm->rtm_dst_len)
3235                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3236                         goto nla_put_failure;
3237 #ifdef CONFIG_IPV6_SUBTREES
3238         if (src) {
3239                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3240                         goto nla_put_failure;
3241                 rtm->rtm_src_len = 128;
3242         } else if (rtm->rtm_src_len &&
3243                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3244                 goto nla_put_failure;
3245 #endif
3246         if (iif) {
3247 #ifdef CONFIG_IPV6_MROUTE
3248                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3249                         int err = ip6mr_get_route(net, skb, rtm, nowait,
3250                                                   portid);
3251
3252                         if (err <= 0) {
3253                                 if (!nowait) {
3254                                         if (err == 0)
3255                                                 return 0;
3256                                         goto nla_put_failure;
3257                                 } else {
3258                                         if (err == -EMSGSIZE)
3259                                                 goto nla_put_failure;
3260                                 }
3261                         }
3262                 } else
3263 #endif
3264                         if (nla_put_u32(skb, RTA_IIF, iif))
3265                                 goto nla_put_failure;
3266         } else if (dst) {
3267                 struct in6_addr saddr_buf;
3268                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3269                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3270                         goto nla_put_failure;
3271         }
3272
3273         if (rt->rt6i_prefsrc.plen) {
3274                 struct in6_addr saddr_buf;
3275                 saddr_buf = rt->rt6i_prefsrc.addr;
3276                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3277                         goto nla_put_failure;
3278         }
3279
3280         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3281         if (rt->rt6i_pmtu)
3282                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3283         if (rtnetlink_put_metrics(skb, metrics) < 0)
3284                 goto nla_put_failure;
3285
3286         if (rt->rt6i_flags & RTF_GATEWAY) {
3287                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3288                         goto nla_put_failure;
3289         }
3290
3291         if (rt->dst.dev &&
3292             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3293                 goto nla_put_failure;
3294         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3295                 goto nla_put_failure;
3296
3297         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3298
3299         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3300                 goto nla_put_failure;
3301
3302         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3303                 goto nla_put_failure;
3304
3305         lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3306
3307         nlmsg_end(skb, nlh);
3308         return 0;
3309
3310 nla_put_failure:
3311         nlmsg_cancel(skb, nlh);
3312         return -EMSGSIZE;
3313 }
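
/* The message assembled by rt6_fill_node() is what userspace decodes into a
 * single line of "ip -6 route show" output.  A rough mapping of the main
 * attributes (illustrative values, not an actual dump):
 *
 *	rtm_dst_len + RTA_DST	->	"2001:db8::/64"
 *	RTA_GATEWAY		->	"via fe80::1"
 *	RTA_OIF			->	"dev eth0"
 *	RTA_PRIORITY		->	"metric 1024"
 *	RTA_PREF		->	"pref medium"
 *	rta_cacheinfo expires	->	"expires 1795sec"
 */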
3314
3315 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3316 {
3317         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3318         int prefix;
3319
3320         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3321                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3322                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3323         } else
3324                 prefix = 0;
3325
3326         return rt6_fill_node(arg->net,
3327                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3328                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3329                      prefix, 0, NLM_F_MULTI);
3330 }
3331
3332 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3333 {
3334         struct net *net = sock_net(in_skb->sk);
3335         struct nlattr *tb[RTA_MAX+1];
3336         struct rt6_info *rt;
3337         struct sk_buff *skb;
3338         struct rtmsg *rtm;
3339         struct flowi6 fl6;
3340         int err, iif = 0, oif = 0;
3341
3342         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3343         if (err < 0)
3344                 goto errout;
3345
3346         err = -EINVAL;
3347         memset(&fl6, 0, sizeof(fl6));
3348         rtm = nlmsg_data(nlh);
3349         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3350
3351         if (tb[RTA_SRC]) {
3352                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3353                         goto errout;
3354
3355                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3356         }
3357
3358         if (tb[RTA_DST]) {
3359                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3360                         goto errout;
3361
3362                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3363         }
3364
3365         if (tb[RTA_IIF])
3366                 iif = nla_get_u32(tb[RTA_IIF]);
3367
3368         if (tb[RTA_OIF])
3369                 oif = nla_get_u32(tb[RTA_OIF]);
3370
3371         if (tb[RTA_MARK])
3372                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3373
3374         if (iif) {
3375                 struct net_device *dev;
3376                 int flags = 0;
3377
3378                 dev = __dev_get_by_index(net, iif);
3379                 if (!dev) {
3380                         err = -ENODEV;
3381                         goto errout;
3382                 }
3383
3384                 fl6.flowi6_iif = iif;
3385
3386                 if (!ipv6_addr_any(&fl6.saddr))
3387                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3388
3389                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3390                                                                flags);
3391         } else {
3392                 fl6.flowi6_oif = oif;
3393
3394                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3395         }
3396
3397         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3398         if (!skb) {
3399                 ip6_rt_put(rt);
3400                 err = -ENOBUFS;
3401                 goto errout;
3402         }
3403
3404         /* Reserve room for dummy headers; this skb can pass
3405          * through a good chunk of the routing engine.
3406          */
3407         skb_reset_mac_header(skb);
3408         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3409
3410         skb_dst_set(skb, &rt->dst);
3411
3412         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3413                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3414                             nlh->nlmsg_seq, 0, 0, 0);
3415         if (err < 0) {
3416                 kfree_skb(skb);
3417                 goto errout;
3418         }
3419
3420         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3421 errout:
3422         return err;
3423 }
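
/* Userspace reaches inet6_rtm_getroute() with an RTM_GETROUTE request, e.g.
 * "ip -6 route get 2001:db8::1".  A minimal request is just an rtmsg followed
 * by an RTA_DST attribute; a rough sketch of the buffer layout (illustrative
 * only, address fill-in and error handling omitted):
 *
 *	struct {
 *		struct nlmsghdr	nlh;
 *		struct rtmsg	rtm;
 *		struct rtattr	rta;
 *		struct in6_addr	dst;
 *	} req = {
 *		.nlh.nlmsg_len	 = sizeof(req),
 *		.nlh.nlmsg_type	 = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family	 = AF_INET6,
 *		.rta.rta_type	 = RTA_DST,
 *		.rta.rta_len	 = RTA_LENGTH(sizeof(struct in6_addr)),
 *	};
 */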
3424
3425 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3426                      unsigned int nlm_flags)
3427 {
3428         struct sk_buff *skb;
3429         struct net *net = info->nl_net;
3430         u32 seq;
3431         int err;
3432
3433         err = -ENOBUFS;
3434         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3435
3436         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3437         if (!skb)
3438                 goto errout;
3439
3440         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3441                                 event, info->portid, seq, 0, 0, nlm_flags);
3442         if (err < 0) {
3443                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3444                 WARN_ON(err == -EMSGSIZE);
3445                 kfree_skb(skb);
3446                 goto errout;
3447         }
3448         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3449                     info->nlh, gfp_any());
3450         return;
3451 errout:
3452         if (err < 0)
3453                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3454 }
3455
3456 static int ip6_route_dev_notify(struct notifier_block *this,
3457                                 unsigned long event, void *ptr)
3458 {
3459         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3460         struct net *net = dev_net(dev);
3461
3462         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3463                 net->ipv6.ip6_null_entry->dst.dev = dev;
3464                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3465 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3466                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3467                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3468                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3469                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3470 #endif
3471         }
3472
3473         return NOTIFY_OK;
3474 }
3475
3476 /*
3477  *      /proc
3478  */
3479
3480 #ifdef CONFIG_PROC_FS
3481
3482 static const struct file_operations ipv6_route_proc_fops = {
3483         .owner          = THIS_MODULE,
3484         .open           = ipv6_route_open,
3485         .read           = seq_read,
3486         .llseek         = seq_lseek,
3487         .release        = seq_release_net,
3488 };
3489
3490 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3491 {
3492         struct net *net = (struct net *)seq->private;
3493         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3494                    net->ipv6.rt6_stats->fib_nodes,
3495                    net->ipv6.rt6_stats->fib_route_nodes,
3496                    net->ipv6.rt6_stats->fib_rt_alloc,
3497                    net->ipv6.rt6_stats->fib_rt_entries,
3498                    net->ipv6.rt6_stats->fib_rt_cache,
3499                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3500                    net->ipv6.rt6_stats->fib_discarded_routes);
3501
3502         return 0;
3503 }
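
/* rt6_stats_seq_show() above emits a single line of seven hex fields in
 * /proc/net/rt6_stats, e.g. (made-up values):
 *
 *	0087 0418 0005 041c 0002 0023 01dc
 *
 * which correspond, left to right, to fib_nodes, fib_route_nodes,
 * fib_rt_alloc, fib_rt_entries, fib_rt_cache, the current dst entry count
 * (dst_entries_get_slow) and fib_discarded_routes.
 */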
3504
3505 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3506 {
3507         return single_open_net(inode, file, rt6_stats_seq_show);
3508 }
3509
3510 static const struct file_operations rt6_stats_seq_fops = {
3511         .owner   = THIS_MODULE,
3512         .open    = rt6_stats_seq_open,
3513         .read    = seq_read,
3514         .llseek  = seq_lseek,
3515         .release = single_release_net,
3516 };
3517 #endif  /* CONFIG_PROC_FS */
3518
3519 #ifdef CONFIG_SYSCTL
3520
3521 static
3522 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3523                               void __user *buffer, size_t *lenp, loff_t *ppos)
3524 {
3525         struct net *net;
3526         int delay;
3527         if (!write)
3528                 return -EINVAL;
3529
3530         net = (struct net *)ctl->extra1;
3531         delay = net->ipv6.sysctl.flush_delay;
3532         proc_dointvec(ctl, write, buffer, lenp, ppos);
3533         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3534         return 0;
3535 }
3536
3537 struct ctl_table ipv6_route_table_template[] = {
3538         {
3539                 .procname       =       "flush",
3540                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3541                 .maxlen         =       sizeof(int),
3542                 .mode           =       0200,
3543                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3544         },
3545         {
3546                 .procname       =       "gc_thresh",
3547                 .data           =       &ip6_dst_ops_template.gc_thresh,
3548                 .maxlen         =       sizeof(int),
3549                 .mode           =       0644,
3550                 .proc_handler   =       proc_dointvec,
3551         },
3552         {
3553                 .procname       =       "max_size",
3554                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3555                 .maxlen         =       sizeof(int),
3556                 .mode           =       0644,
3557                 .proc_handler   =       proc_dointvec,
3558         },
3559         {
3560                 .procname       =       "gc_min_interval",
3561                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3562                 .maxlen         =       sizeof(int),
3563                 .mode           =       0644,
3564                 .proc_handler   =       proc_dointvec_jiffies,
3565         },
3566         {
3567                 .procname       =       "gc_timeout",
3568                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3569                 .maxlen         =       sizeof(int),
3570                 .mode           =       0644,
3571                 .proc_handler   =       proc_dointvec_jiffies,
3572         },
3573         {
3574                 .procname       =       "gc_interval",
3575                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3576                 .maxlen         =       sizeof(int),
3577                 .mode           =       0644,
3578                 .proc_handler   =       proc_dointvec_jiffies,
3579         },
3580         {
3581                 .procname       =       "gc_elasticity",
3582                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3583                 .maxlen         =       sizeof(int),
3584                 .mode           =       0644,
3585                 .proc_handler   =       proc_dointvec,
3586         },
3587         {
3588                 .procname       =       "mtu_expires",
3589                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3590                 .maxlen         =       sizeof(int),
3591                 .mode           =       0644,
3592                 .proc_handler   =       proc_dointvec_jiffies,
3593         },
3594         {
3595                 .procname       =       "min_adv_mss",
3596                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3597                 .maxlen         =       sizeof(int),
3598                 .mode           =       0644,
3599                 .proc_handler   =       proc_dointvec,
3600         },
3601         {
3602                 .procname       =       "gc_min_interval_ms",
3603                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3604                 .maxlen         =       sizeof(int),
3605                 .mode           =       0644,
3606                 .proc_handler   =       proc_dointvec_ms_jiffies,
3607         },
3608         { }
3609 };
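
/* Once the per-netns copy built by ipv6_route_sysctl_init() below is
 * registered, these entries surface under /proc/sys/net/ipv6/route/, e.g.
 * (illustrative usage; the values written are arbitrary):
 *
 *	cat /proc/sys/net/ipv6/route/max_size
 *	echo 8192 > /proc/sys/net/ipv6/route/gc_thresh
 *	echo 1 > /proc/sys/net/ipv6/route/flush	(write-only, mode 0200)
 */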
3610
3611 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3612 {
3613         struct ctl_table *table;
3614
3615         table = kmemdup(ipv6_route_table_template,
3616                         sizeof(ipv6_route_table_template),
3617                         GFP_KERNEL);
3618
3619         if (table) {
3620                 table[0].data = &net->ipv6.sysctl.flush_delay;
3621                 table[0].extra1 = net;
3622                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3623                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3624                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3625                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3626                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3627                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3628                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3629                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3630                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3631
3632                 /* Don't export sysctls to unprivileged users */
3633                 if (net->user_ns != &init_user_ns)
3634                         table[0].procname = NULL;
3635         }
3636
3637         return table;
3638 }
3639 #endif
3640
3641 static int __net_init ip6_route_net_init(struct net *net)
3642 {
3643         int ret = -ENOMEM;
3644
3645         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3646                sizeof(net->ipv6.ip6_dst_ops));
3647
3648         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3649                 goto out_ip6_dst_ops;
3650
3651         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3652                                            sizeof(*net->ipv6.ip6_null_entry),
3653                                            GFP_KERNEL);
3654         if (!net->ipv6.ip6_null_entry)
3655                 goto out_ip6_dst_entries;
3656         net->ipv6.ip6_null_entry->dst.path =
3657                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3658         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3659         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3660                          ip6_template_metrics, true);
3661
3662 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3663         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3664                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3665                                                GFP_KERNEL);
3666         if (!net->ipv6.ip6_prohibit_entry)
3667                 goto out_ip6_null_entry;
3668         net->ipv6.ip6_prohibit_entry->dst.path =
3669                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3670         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3671         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3672                          ip6_template_metrics, true);
3673
3674         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3675                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3676                                                GFP_KERNEL);
3677         if (!net->ipv6.ip6_blk_hole_entry)
3678                 goto out_ip6_prohibit_entry;
3679         net->ipv6.ip6_blk_hole_entry->dst.path =
3680                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3681         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3682         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3683                          ip6_template_metrics, true);
3684 #endif
3685
3686         net->ipv6.sysctl.flush_delay = 0;
3687         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3688         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3689         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3690         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3691         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3692         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3693         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3694
3695         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3696
3697         ret = 0;
3698 out:
3699         return ret;
3700
3701 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3702 out_ip6_prohibit_entry:
3703         kfree(net->ipv6.ip6_prohibit_entry);
3704 out_ip6_null_entry:
3705         kfree(net->ipv6.ip6_null_entry);
3706 #endif
3707 out_ip6_dst_entries:
3708         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3709 out_ip6_dst_ops:
3710         goto out;
3711 }
3712
3713 static void __net_exit ip6_route_net_exit(struct net *net)
3714 {
3715         kfree(net->ipv6.ip6_null_entry);
3716 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3717         kfree(net->ipv6.ip6_prohibit_entry);
3718         kfree(net->ipv6.ip6_blk_hole_entry);
3719 #endif
3720         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3721 }
3722
3723 static int __net_init ip6_route_net_init_late(struct net *net)
3724 {
3725 #ifdef CONFIG_PROC_FS
3726         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3727         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3728 #endif
3729         return 0;
3730 }
3731
3732 static void __net_exit ip6_route_net_exit_late(struct net *net)
3733 {
3734 #ifdef CONFIG_PROC_FS
3735         remove_proc_entry("ipv6_route", net->proc_net);
3736         remove_proc_entry("rt6_stats", net->proc_net);
3737 #endif
3738 }
3739
3740 static struct pernet_operations ip6_route_net_ops = {
3741         .init = ip6_route_net_init,
3742         .exit = ip6_route_net_exit,
3743 };
3744
3745 static int __net_init ipv6_inetpeer_init(struct net *net)
3746 {
3747         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3748
3749         if (!bp)
3750                 return -ENOMEM;
3751         inet_peer_base_init(bp);
3752         net->ipv6.peers = bp;
3753         return 0;
3754 }
3755
3756 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3757 {
3758         struct inet_peer_base *bp = net->ipv6.peers;
3759
3760         net->ipv6.peers = NULL;
3761         inetpeer_invalidate_tree(bp);
3762         kfree(bp);
3763 }
3764
3765 static struct pernet_operations ipv6_inetpeer_ops = {
3766         .init   =       ipv6_inetpeer_init,
3767         .exit   =       ipv6_inetpeer_exit,
3768 };
3769
3770 static struct pernet_operations ip6_route_net_late_ops = {
3771         .init = ip6_route_net_init_late,
3772         .exit = ip6_route_net_exit_late,
3773 };
3774
3775 static struct notifier_block ip6_route_dev_notifier = {
3776         .notifier_call = ip6_route_dev_notify,
3777         .priority = 0,
3778 };
3779
3780 int __init ip6_route_init(void)
3781 {
3782         int ret;
3783         int cpu;
3784
3785         ret = -ENOMEM;
3786         ip6_dst_ops_template.kmem_cachep =
3787                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3788                                   SLAB_HWCACHE_ALIGN, NULL);
3789         if (!ip6_dst_ops_template.kmem_cachep)
3790                 goto out;
3791
3792         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3793         if (ret)
3794                 goto out_kmem_cache;
3795
3796         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3797         if (ret)
3798                 goto out_dst_entries;
3799
3800         ret = register_pernet_subsys(&ip6_route_net_ops);
3801         if (ret)
3802                 goto out_register_inetpeer;
3803
3804         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3805
3806         /* The loopback device is registered before this portion of code runs,
3807          * so the loopback reference in rt6_info is not taken automatically;
3808          * take it manually for init_net. */
3809         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3810         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3811 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3812         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3813         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3814         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3815         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3816 #endif
3817         ret = fib6_init();
3818         if (ret)
3819                 goto out_register_subsys;
3820
3821         ret = xfrm6_init();
3822         if (ret)
3823                 goto out_fib6_init;
3824
3825         ret = fib6_rules_init();
3826         if (ret)
3827                 goto xfrm6_init;
3828
3829         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3830         if (ret)
3831                 goto fib6_rules_init;
3832
3833         ret = -ENOBUFS;
3834         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3835             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3836             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3837                 goto out_register_late_subsys;
3838
3839         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3840         if (ret)
3841                 goto out_register_late_subsys;
3842
3843         for_each_possible_cpu(cpu) {
3844                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3845
3846                 INIT_LIST_HEAD(&ul->head);
3847                 spin_lock_init(&ul->lock);
3848         }
3849
3850 out:
3851         return ret;
3852
3853 out_register_late_subsys:
3854         unregister_pernet_subsys(&ip6_route_net_late_ops);
3855 fib6_rules_init:
3856         fib6_rules_cleanup();
3857 xfrm6_init:
3858         xfrm6_fini();
3859 out_fib6_init:
3860         fib6_gc_cleanup();
3861 out_register_subsys:
3862         unregister_pernet_subsys(&ip6_route_net_ops);
3863 out_register_inetpeer:
3864         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3865 out_dst_entries:
3866         dst_entries_destroy(&ip6_dst_blackhole_ops);
3867 out_kmem_cache:
3868         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3869         goto out;
3870 }
3871
3872 void ip6_route_cleanup(void)
3873 {
3874         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3875         unregister_pernet_subsys(&ip6_route_net_late_ops);
3876         fib6_rules_cleanup();
3877         xfrm6_fini();
3878         fib6_gc_cleanup();
3879         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3880         unregister_pernet_subsys(&ip6_route_net_ops);
3881         dst_entries_destroy(&ip6_dst_blackhole_ops);
3882         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3883 }