1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 #include <net/lwtunnel.h>
62
63 #include <asm/uaccess.h>
64
65 #ifdef CONFIG_SYSCTL
66 #include <linux/sysctl.h>
67 #endif
68
69 enum rt6_nud_state {
70         RT6_NUD_FAIL_HARD = -3,
71         RT6_NUD_FAIL_PROBE = -2,
72         RT6_NUD_FAIL_DO_RR = -1,
73         RT6_NUD_SUCCEED = 1
74 };
75
76 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int      ip6_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void             ip6_dst_destroy(struct dst_entry *);
82 static void             ip6_dst_ifdown(struct dst_entry *,
83                                        struct net_device *dev, int how);
84 static int               ip6_dst_gc(struct dst_ops *ops);
85
86 static int              ip6_pkt_discard(struct sk_buff *skb);
87 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
88 static int              ip6_pkt_prohibit(struct sk_buff *skb);
89 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
90 static void             ip6_link_failure(struct sk_buff *skb);
91 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
92                                            struct sk_buff *skb, u32 mtu);
93 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
94                                         struct sk_buff *skb);
95 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
96 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
97
98 #ifdef CONFIG_IPV6_ROUTE_INFO
99 static struct rt6_info *rt6_add_route_info(struct net *net,
100                                            const struct in6_addr *prefix, int prefixlen,
101                                            const struct in6_addr *gwaddr, int ifindex,
102                                            unsigned int pref);
103 static struct rt6_info *rt6_get_route_info(struct net *net,
104                                            const struct in6_addr *prefix, int prefixlen,
105                                            const struct in6_addr *gwaddr, int ifindex);
106 #endif
107
108 struct uncached_list {
109         spinlock_t              lock;
110         struct list_head        head;
111 };
112
113 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
114
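/* Routes created outside the fib6 tree (DST_NOCACHE) are tracked on a
 * per-cpu list so that rt6_uncached_list_flush_dev() can re-point them at
 * the loopback device when the device they reference goes away.
 */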
115 static void rt6_uncached_list_add(struct rt6_info *rt)
116 {
117         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
118
119         rt->dst.flags |= DST_NOCACHE;
120         rt->rt6i_uncached_list = ul;
121
122         spin_lock_bh(&ul->lock);
123         list_add_tail(&rt->rt6i_uncached, &ul->head);
124         spin_unlock_bh(&ul->lock);
125 }
126
127 static void rt6_uncached_list_del(struct rt6_info *rt)
128 {
129         if (!list_empty(&rt->rt6i_uncached)) {
130                 struct uncached_list *ul = rt->rt6i_uncached_list;
131
132                 spin_lock_bh(&ul->lock);
133                 list_del(&rt->rt6i_uncached);
134                 spin_unlock_bh(&ul->lock);
135         }
136 }
137
138 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
139 {
140         struct net_device *loopback_dev = net->loopback_dev;
141         int cpu;
142
143         for_each_possible_cpu(cpu) {
144                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
145                 struct rt6_info *rt;
146
147                 spin_lock_bh(&ul->lock);
148                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
149                         struct inet6_dev *rt_idev = rt->rt6i_idev;
150                         struct net_device *rt_dev = rt->dst.dev;
151
152                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
153                             rt_idev->dev != loopback_dev) {
154                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
155                                 in6_dev_put(rt_idev);
156                         }
157
158                         if (rt_dev && (rt_dev == dev || !dev) &&
159                             rt_dev != loopback_dev) {
160                                 rt->dst.dev = loopback_dev;
161                                 dev_hold(rt->dst.dev);
162                                 dev_put(rt_dev);
163                         }
164                 }
165                 spin_unlock_bh(&ul->lock);
166         }
167 }
168
169 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
170 {
171         return dst_metrics_write_ptr(rt->dst.from);
172 }
173
174 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
175 {
176         struct rt6_info *rt = (struct rt6_info *)dst;
177
178         if (rt->rt6i_flags & RTF_PCPU)
179                 return rt6_pcpu_cow_metrics(rt);
180         else if (rt->rt6i_flags & RTF_CACHE)
181                 return NULL;
182         else
183                 return dst_cow_metrics_generic(dst, old);
184 }
185
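/* Address used for the neighbour lookup: the route's gateway when one is
 * set, otherwise the destination from the packet, otherwise the daddr
 * supplied by the caller.
 */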
186 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
187                                              struct sk_buff *skb,
188                                              const void *daddr)
189 {
190         struct in6_addr *p = &rt->rt6i_gateway;
191
192         if (!ipv6_addr_any(p))
193                 return (const void *) p;
194         else if (skb)
195                 return &ipv6_hdr(skb)->daddr;
196         return daddr;
197 }
198
199 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
200                                           struct sk_buff *skb,
201                                           const void *daddr)
202 {
203         struct rt6_info *rt = (struct rt6_info *) dst;
204         struct neighbour *n;
205
206         daddr = choose_neigh_daddr(rt, skb, daddr);
207         n = __ipv6_neigh_lookup(dst->dev, daddr);
208         if (n)
209                 return n;
210         return neigh_create(&nd_tbl, daddr, dst->dev);
211 }
212
213 static struct dst_ops ip6_dst_ops_template = {
214         .family                 =       AF_INET6,
215         .gc                     =       ip6_dst_gc,
216         .gc_thresh              =       1024,
217         .check                  =       ip6_dst_check,
218         .default_advmss         =       ip6_default_advmss,
219         .mtu                    =       ip6_mtu,
220         .cow_metrics            =       ipv6_cow_metrics,
221         .destroy                =       ip6_dst_destroy,
222         .ifdown                 =       ip6_dst_ifdown,
223         .negative_advice        =       ip6_negative_advice,
224         .link_failure           =       ip6_link_failure,
225         .update_pmtu            =       ip6_rt_update_pmtu,
226         .redirect               =       rt6_do_redirect,
227         .local_out              =       __ip6_local_out,
228         .neigh_lookup           =       ip6_neigh_lookup,
229 };
230
231 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
232 {
233         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
234
235         return mtu ? : dst->dev->mtu;
236 }
237
238 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
239                                          struct sk_buff *skb, u32 mtu)
240 {
241 }
242
243 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
244                                       struct sk_buff *skb)
245 {
246 }
247
248 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
249                                          unsigned long old)
250 {
251         return NULL;
252 }
253
254 static struct dst_ops ip6_dst_blackhole_ops = {
255         .family                 =       AF_INET6,
256         .destroy                =       ip6_dst_destroy,
257         .check                  =       ip6_dst_check,
258         .mtu                    =       ip6_blackhole_mtu,
259         .default_advmss         =       ip6_default_advmss,
260         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
261         .redirect               =       ip6_rt_blackhole_redirect,
262         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
263         .neigh_lookup           =       ip6_neigh_lookup,
264 };
265
266 static const u32 ip6_template_metrics[RTAX_MAX] = {
267         [RTAX_HOPLIMIT - 1] = 0,
268 };
269
270 static const struct rt6_info ip6_null_entry_template = {
271         .dst = {
272                 .__refcnt       = ATOMIC_INIT(1),
273                 .__use          = 1,
274                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
275                 .error          = -ENETUNREACH,
276                 .input          = ip6_pkt_discard,
277                 .output         = ip6_pkt_discard_out,
278         },
279         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
280         .rt6i_protocol  = RTPROT_KERNEL,
281         .rt6i_metric    = ~(u32) 0,
282         .rt6i_ref       = ATOMIC_INIT(1),
283 };
284
285 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
286
287 static const struct rt6_info ip6_prohibit_entry_template = {
288         .dst = {
289                 .__refcnt       = ATOMIC_INIT(1),
290                 .__use          = 1,
291                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
292                 .error          = -EACCES,
293                 .input          = ip6_pkt_prohibit,
294                 .output         = ip6_pkt_prohibit_out,
295         },
296         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
297         .rt6i_protocol  = RTPROT_KERNEL,
298         .rt6i_metric    = ~(u32) 0,
299         .rt6i_ref       = ATOMIC_INIT(1),
300 };
301
302 static const struct rt6_info ip6_blk_hole_entry_template = {
303         .dst = {
304                 .__refcnt       = ATOMIC_INIT(1),
305                 .__use          = 1,
306                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
307                 .error          = -EINVAL,
308                 .input          = dst_discard,
309                 .output         = dst_discard_sk,
310         },
311         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
312         .rt6i_protocol  = RTPROT_KERNEL,
313         .rt6i_metric    = ~(u32) 0,
314         .rt6i_ref       = ATOMIC_INIT(1),
315 };
316
317 #endif
318
319 /* allocate dst with ip6_dst_ops */
320 static struct rt6_info *__ip6_dst_alloc(struct net *net,
321                                         struct net_device *dev,
322                                         int flags,
323                                         struct fib6_table *table)
324 {
325         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
326                                         0, DST_OBSOLETE_FORCE_CHK, flags);
327
328         if (rt) {
329                 struct dst_entry *dst = &rt->dst;
330
331                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
332                 INIT_LIST_HEAD(&rt->rt6i_siblings);
333                 INIT_LIST_HEAD(&rt->rt6i_uncached);
334         }
335         return rt;
336 }
337
338 static struct rt6_info *ip6_dst_alloc(struct net *net,
339                                       struct net_device *dev,
340                                       int flags,
341                                       struct fib6_table *table)
342 {
343         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
344
345         if (rt) {
346                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
347                 if (rt->rt6i_pcpu) {
348                         int cpu;
349
350                         for_each_possible_cpu(cpu) {
351                                 struct rt6_info **p;
352
353                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
354                                 /* no one shares rt */
355                                 *p =  NULL;
356                         }
357                 } else {
358                         dst_destroy((struct dst_entry *)rt);
359                         return NULL;
360                 }
361         }
362
363         return rt;
364 }
365
366 static void ip6_dst_destroy(struct dst_entry *dst)
367 {
368         struct rt6_info *rt = (struct rt6_info *)dst;
369         struct dst_entry *from = dst->from;
370         struct inet6_dev *idev;
371
372         dst_destroy_metrics_generic(dst);
373         free_percpu(rt->rt6i_pcpu);
374         rt6_uncached_list_del(rt);
375
376         idev = rt->rt6i_idev;
377         if (idev) {
378                 rt->rt6i_idev = NULL;
379                 in6_dev_put(idev);
380         }
381
382         dst->from = NULL;
383         dst_release(from);
384 }
385
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
387                            int how)
388 {
389         struct rt6_info *rt = (struct rt6_info *)dst;
390         struct inet6_dev *idev = rt->rt6i_idev;
391         struct net_device *loopback_dev =
392                 dev_net(dev)->loopback_dev;
393
394         if (dev != loopback_dev) {
395                 if (idev && idev->dev == dev) {
396                         struct inet6_dev *loopback_idev =
397                                 in6_dev_get(loopback_dev);
398                         if (loopback_idev) {
399                                 rt->rt6i_idev = loopback_idev;
400                                 in6_dev_put(idev);
401                         }
402                 }
403         }
404 }
405
406 static bool rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES) {
409                 if (time_after(jiffies, rt->dst.expires))
410                         return true;
411         } else if (rt->dst.from) {
412                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
413         }
414         return false;
415 }
416
417 /* Multipath route selection:
418  *   Hash-based function using the packet header and flow label.
419  * Adapted from fib_info_hashfn()
420  */
421 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
422                                const struct flowi6 *fl6)
423 {
424         unsigned int val = fl6->flowi6_proto;
425
426         val ^= ipv6_addr_hash(&fl6->daddr);
427         val ^= ipv6_addr_hash(&fl6->saddr);
428
429         /* Works only if this is not encapsulated */
430         switch (fl6->flowi6_proto) {
431         case IPPROTO_UDP:
432         case IPPROTO_TCP:
433         case IPPROTO_SCTP:
434                 val ^= (__force u16)fl6->fl6_sport;
435                 val ^= (__force u16)fl6->fl6_dport;
436                 break;
437
438         case IPPROTO_ICMPV6:
439                 val ^= (__force u16)fl6->fl6_icmp_type;
440                 val ^= (__force u16)fl6->fl6_icmp_code;
441                 break;
442         }
443         /* RFC 6438 recommends using the flow label */
444         val ^= (__force u32)fl6->flowlabel;
445
446         /* Perhaps we need to tune this function? */
447         val = val ^ (val >> 7) ^ (val >> 12);
448         return val % candidate_count;
449 }
450
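/* Pick one of @match's ECMP siblings based on the flow hash; if the chosen
 * sibling does not score as usable, keep @match.
 */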
451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452                                              struct flowi6 *fl6, int oif,
453                                              int strict)
454 {
455         struct rt6_info *sibling, *next_sibling;
456         int route_choosen;
457
458         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
459         /* Don't change the route if route_choosen == 0
460          * (the siblings list does not include ourselves)
461          */
462         if (route_choosen)
463                 list_for_each_entry_safe(sibling, next_sibling,
464                                 &match->rt6i_siblings, rt6i_siblings) {
465                         route_choosen--;
466                         if (route_choosen == 0) {
467                                 if (rt6_score_route(sibling, oif, strict) < 0)
468                                         break;
469                                 match = sibling;
470                                 break;
471                         }
472                 }
473         return match;
474 }
475
476 /*
477  *      Route lookup. Any table->tb6_lock is implied.
478  */
479
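/* Walk the leaf chain and pick the route usable for @oif / @saddr: with an
 * oif, prefer an exact device match and fall back to a loopback-bound route;
 * without one, match the device that owns @saddr.
 */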
480 static inline struct rt6_info *rt6_device_match(struct net *net,
481                                                     struct rt6_info *rt,
482                                                     const struct in6_addr *saddr,
483                                                     int oif,
484                                                     int flags)
485 {
486         struct rt6_info *local = NULL;
487         struct rt6_info *sprt;
488
489         if (!oif && ipv6_addr_any(saddr))
490                 goto out;
491
492         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
493                 struct net_device *dev = sprt->dst.dev;
494
495                 if (oif) {
496                         if (dev->ifindex == oif)
497                                 return sprt;
498                         if (dev->flags & IFF_LOOPBACK) {
499                                 if (!sprt->rt6i_idev ||
500                                     sprt->rt6i_idev->dev->ifindex != oif) {
501                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
502                                                 continue;
503                                         if (local && (!oif ||
504                                                       local->rt6i_idev->dev->ifindex == oif))
505                                                 continue;
506                                 }
507                                 local = sprt;
508                         }
509                 } else {
510                         if (ipv6_chk_addr(net, saddr, dev,
511                                           flags & RT6_LOOKUP_F_IFACE))
512                                 return sprt;
513                 }
514         }
515
516         if (oif) {
517                 if (local)
518                         return local;
519
520                 if (flags & RT6_LOOKUP_F_IFACE)
521                         return net->ipv6.ip6_null_entry;
522         }
523 out:
524         return rt;
525 }
526
527 #ifdef CONFIG_IPV6_ROUTER_PREF
528 struct __rt6_probe_work {
529         struct work_struct work;
530         struct in6_addr target;
531         struct net_device *dev;
532 };
533
534 static void rt6_probe_deferred(struct work_struct *w)
535 {
536         struct in6_addr mcaddr;
537         struct __rt6_probe_work *work =
538                 container_of(w, struct __rt6_probe_work, work);
539
540         addrconf_addr_solict_mult(&work->target, &mcaddr);
541         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
542         dev_put(work->dev);
543         kfree(work);
544 }
545
546 static void rt6_probe(struct rt6_info *rt)
547 {
548         struct __rt6_probe_work *work;
549         struct neighbour *neigh;
550         /*
551          * Okay, this does not seem to be appropriate
552          * for now; however, we need to check whether it
553          * really is, aka Router Reachability Probing.
554          *
555          * A Router Reachability Probe MUST be rate-limited
556          * to no more than one per minute.
557          */
558         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
559                 return;
560         rcu_read_lock_bh();
561         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
562         if (neigh) {
563                 if (neigh->nud_state & NUD_VALID)
564                         goto out;
565
566                 work = NULL;
567                 write_lock(&neigh->lock);
568                 if (!(neigh->nud_state & NUD_VALID) &&
569                     time_after(jiffies,
570                                neigh->updated +
571                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
572                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
573                         if (work)
574                                 __neigh_set_probe_once(neigh);
575                 }
576                 write_unlock(&neigh->lock);
577         } else {
578                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
579         }
580
581         if (work) {
582                 INIT_WORK(&work->work, rt6_probe_deferred);
583                 work->target = rt->rt6i_gateway;
584                 dev_hold(rt->dst.dev);
585                 work->dev = rt->dst.dev;
586                 schedule_work(&work->work);
587         }
588
589 out:
590         rcu_read_unlock_bh();
591 }
592 #else
593 static inline void rt6_probe(struct rt6_info *rt)
594 {
595 }
596 #endif
597
598 /*
599  * Default Router Selection (RFC 2461 6.3.6)
600  */
601 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
602 {
603         struct net_device *dev = rt->dst.dev;
604         if (!oif || dev->ifindex == oif)
605                 return 2;
606         if ((dev->flags & IFF_LOOPBACK) &&
607             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
608                 return 1;
609         return 0;
610 }
611
612 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
613 {
614         struct neighbour *neigh;
615         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
616
617         if (rt->rt6i_flags & RTF_NONEXTHOP ||
618             !(rt->rt6i_flags & RTF_GATEWAY))
619                 return RT6_NUD_SUCCEED;
620
621         rcu_read_lock_bh();
622         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
623         if (neigh) {
624                 read_lock(&neigh->lock);
625                 if (neigh->nud_state & NUD_VALID)
626                         ret = RT6_NUD_SUCCEED;
627 #ifdef CONFIG_IPV6_ROUTER_PREF
628                 else if (!(neigh->nud_state & NUD_FAILED))
629                         ret = RT6_NUD_SUCCEED;
630                 else
631                         ret = RT6_NUD_FAIL_PROBE;
632 #endif
633                 read_unlock(&neigh->lock);
634         } else {
635                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
636                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
637         }
638         rcu_read_unlock_bh();
639
640         return ret;
641 }
642
643 static int rt6_score_route(struct rt6_info *rt, int oif,
644                            int strict)
645 {
646         int m;
647
648         m = rt6_check_dev(rt, oif);
649         if (!m && (strict & RT6_LOOKUP_F_IFACE))
650                 return RT6_NUD_FAIL_HARD;
651 #ifdef CONFIG_IPV6_ROUTER_PREF
652         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
653 #endif
654         if (strict & RT6_LOOKUP_F_REACHABLE) {
655                 int n = rt6_check_neigh(rt);
656                 if (n < 0)
657                         return n;
658         }
659         return m;
660 }
661
662 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
663                                    int *mpri, struct rt6_info *match,
664                                    bool *do_rr)
665 {
666         int m;
667         bool match_do_rr = false;
668         struct inet6_dev *idev = rt->rt6i_idev;
669         struct net_device *dev = rt->dst.dev;
670
671         if (dev && !netif_carrier_ok(dev) &&
672             idev->cnf.ignore_routes_with_linkdown)
673                 goto out;
674
675         if (rt6_check_expired(rt))
676                 goto out;
677
678         m = rt6_score_route(rt, oif, strict);
679         if (m == RT6_NUD_FAIL_DO_RR) {
680                 match_do_rr = true;
681                 m = 0; /* lowest valid score */
682         } else if (m == RT6_NUD_FAIL_HARD) {
683                 goto out;
684         }
685
686         if (strict & RT6_LOOKUP_F_REACHABLE)
687                 rt6_probe(rt);
688
689         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
690         if (m > *mpri) {
691                 *do_rr = match_do_rr;
692                 *mpri = m;
693                 match = rt;
694         }
695 out:
696         return match;
697 }
698
699 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
700                                      struct rt6_info *rr_head,
701                                      u32 metric, int oif, int strict,
702                                      bool *do_rr)
703 {
704         struct rt6_info *rt, *match, *cont;
705         int mpri = -1;
706
707         match = NULL;
708         cont = NULL;
709         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
710                 if (rt->rt6i_metric != metric) {
711                         cont = rt;
712                         break;
713                 }
714
715                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
716         }
717
718         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
719                 if (rt->rt6i_metric != metric) {
720                         cont = rt;
721                         break;
722                 }
723
724                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
725         }
726
727         if (match || !cont)
728                 return match;
729
730         for (rt = cont; rt; rt = rt->dst.rt6_next)
731                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
732
733         return match;
734 }
735
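/* Default router selection: score the routes sharing fn->rr_ptr's metric
 * and, when the winner requests round-robin, advance rr_ptr so the next
 * lookup starts at the following sibling.
 */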
736 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
737 {
738         struct rt6_info *match, *rt0;
739         struct net *net;
740         bool do_rr = false;
741
742         rt0 = fn->rr_ptr;
743         if (!rt0)
744                 fn->rr_ptr = rt0 = fn->leaf;
745
746         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
747                              &do_rr);
748
749         if (do_rr) {
750                 struct rt6_info *next = rt0->dst.rt6_next;
751
752                 /* no entries matched; do round-robin */
753                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
754                         next = fn->leaf;
755
756                 if (next != rt0)
757                         fn->rr_ptr = next;
758         }
759
760         net = dev_net(rt0->dst.dev);
761         return match ? match : net->ipv6.ip6_null_entry;
762 }
763
764 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
765 {
766         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
767 }
768
769 #ifdef CONFIG_IPV6_ROUTE_INFO
770 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
771                   const struct in6_addr *gwaddr)
772 {
773         struct net *net = dev_net(dev);
774         struct route_info *rinfo = (struct route_info *) opt;
775         struct in6_addr prefix_buf, *prefix;
776         unsigned int pref;
777         unsigned long lifetime;
778         struct rt6_info *rt;
779
780         if (len < sizeof(struct route_info)) {
781                 return -EINVAL;
782         }
783
784         /* Sanity check for prefix_len and length */
785         if (rinfo->length > 3) {
786                 return -EINVAL;
787         } else if (rinfo->prefix_len > 128) {
788                 return -EINVAL;
789         } else if (rinfo->prefix_len > 64) {
790                 if (rinfo->length < 2) {
791                         return -EINVAL;
792                 }
793         } else if (rinfo->prefix_len > 0) {
794                 if (rinfo->length < 1) {
795                         return -EINVAL;
796                 }
797         }
798
799         pref = rinfo->route_pref;
800         if (pref == ICMPV6_ROUTER_PREF_INVALID)
801                 return -EINVAL;
802
803         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
804
805         if (rinfo->length == 3)
806                 prefix = (struct in6_addr *)rinfo->prefix;
807         else {
808                 /* ipv6_addr_prefix() copies only prefix_len bits, so this is safe */
809                 ipv6_addr_prefix(&prefix_buf,
810                                  (struct in6_addr *)rinfo->prefix,
811                                  rinfo->prefix_len);
812                 prefix = &prefix_buf;
813         }
814
815         if (rinfo->prefix_len == 0)
816                 rt = rt6_get_dflt_router(gwaddr, dev);
817         else
818                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
819                                         gwaddr, dev->ifindex);
820
821         if (rt && !lifetime) {
822                 ip6_del_rt(rt);
823                 rt = NULL;
824         }
825
826         if (!rt && lifetime)
827                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
828                                         pref);
829         else if (rt)
830                 rt->rt6i_flags = RTF_ROUTEINFO |
831                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
832
833         if (rt) {
834                 if (!addrconf_finite_timeout(lifetime))
835                         rt6_clean_expires(rt);
836                 else
837                         rt6_set_expires(rt, jiffies + HZ * lifetime);
838
839                 ip6_rt_put(rt);
840         }
841         return 0;
842 }
843 #endif
844
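/* Walk back up the tree from @fn until a node carrying route info is found,
 * descending into a source-address subtree where one exists; returns NULL
 * once the tree root is reached.
 */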
845 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
846                                         struct in6_addr *saddr)
847 {
848         struct fib6_node *pn;
849         while (1) {
850                 if (fn->fn_flags & RTN_TL_ROOT)
851                         return NULL;
852                 pn = fn->parent;
853                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
854                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
855                 else
856                         fn = pn;
857                 if (fn->fn_flags & RTN_RTINFO)
858                         return fn;
859         }
860 }
861
862 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
863                                              struct fib6_table *table,
864                                              struct flowi6 *fl6, int flags)
865 {
866         struct fib6_node *fn;
867         struct rt6_info *rt;
868
869         read_lock_bh(&table->tb6_lock);
870         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
871 restart:
872         rt = fn->leaf;
873         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
874         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
875                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
876         if (rt == net->ipv6.ip6_null_entry) {
877                 fn = fib6_backtrack(fn, &fl6->saddr);
878                 if (fn)
879                         goto restart;
880         }
881         dst_use(&rt->dst, jiffies);
882         read_unlock_bh(&table->tb6_lock);
883         return rt;
884
885 }
886
887 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
888                                     int flags)
889 {
890         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
891 }
892 EXPORT_SYMBOL_GPL(ip6_route_lookup);
893
894 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
895                             const struct in6_addr *saddr, int oif, int strict)
896 {
897         struct flowi6 fl6 = {
898                 .flowi6_oif = oif,
899                 .daddr = *daddr,
900         };
901         struct dst_entry *dst;
902         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
903
904         if (saddr) {
905                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
906                 flags |= RT6_LOOKUP_F_HAS_SADDR;
907         }
908
909         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
910         if (dst->error == 0)
911                 return (struct rt6_info *) dst;
912
913         dst_release(dst);
914
915         return NULL;
916 }
917 EXPORT_SYMBOL(rt6_lookup);
918
919 /* ip6_ins_rt is called with table->tb6_lock NOT held.
920    It takes a new route entry; if the addition fails for any reason, the
921    route is freed. In any case, if the caller does not hold a reference,
922    it may be destroyed.
923  */
924
925 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
926                         struct mx6_config *mxc)
927 {
928         int err;
929         struct fib6_table *table;
930
931         table = rt->rt6i_table;
932         write_lock_bh(&table->tb6_lock);
933         err = fib6_add(&table->tb6_root, rt, info, mxc);
934         write_unlock_bh(&table->tb6_lock);
935
936         return err;
937 }
938
939 int ip6_ins_rt(struct rt6_info *rt)
940 {
941         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
942         struct mx6_config mxc = { .mx = NULL, };
943
944         return __ip6_ins_rt(rt, &info, &mxc);
945 }
946
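/* Clone @ort into a /128 RTF_CACHE route for @daddr.  If @ort is itself a
 * cache or per-cpu clone, the copy is taken from its dst.from parent instead.
 */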
947 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
948                                            const struct in6_addr *daddr,
949                                            const struct in6_addr *saddr)
950 {
951         struct rt6_info *rt;
952
953         /*
954          *      Clone the route.
955          */
956
957         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
958                 ort = (struct rt6_info *)ort->dst.from;
959
960         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
961                              0, ort->rt6i_table);
962
963         if (!rt)
964                 return NULL;
965
966         ip6_rt_copy_init(rt, ort);
967         rt->rt6i_flags |= RTF_CACHE;
968         rt->rt6i_metric = 0;
969         rt->dst.flags |= DST_HOST;
970         rt->rt6i_dst.addr = *daddr;
971         rt->rt6i_dst.plen = 128;
972
973         if (!rt6_is_gw_or_nonexthop(ort)) {
974                 if (ort->rt6i_dst.plen != 128 &&
975                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
976                         rt->rt6i_flags |= RTF_ANYCAST;
977 #ifdef CONFIG_IPV6_SUBTREES
978                 if (rt->rt6i_src.plen && saddr) {
979                         rt->rt6i_src.addr = *saddr;
980                         rt->rt6i_src.plen = 128;
981                 }
982 #endif
983         }
984
985         return rt;
986 }
987
988 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
989 {
990         struct rt6_info *pcpu_rt;
991
992         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
993                                   rt->dst.dev, rt->dst.flags,
994                                   rt->rt6i_table);
995
996         if (!pcpu_rt)
997                 return NULL;
998         ip6_rt_copy_init(pcpu_rt, rt);
999         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1000         pcpu_rt->rt6i_flags |= RTF_PCPU;
1001         return pcpu_rt;
1002 }
1003
1004 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1005 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1006 {
1007         struct rt6_info *pcpu_rt, *prev, **p;
1008
1009         p = this_cpu_ptr(rt->rt6i_pcpu);
1010         pcpu_rt = *p;
1011
1012         if (pcpu_rt)
1013                 goto done;
1014
1015         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1016         if (!pcpu_rt) {
1017                 struct net *net = dev_net(rt->dst.dev);
1018
1019                 pcpu_rt = net->ipv6.ip6_null_entry;
1020                 goto done;
1021         }
1022
1023         prev = cmpxchg(p, NULL, pcpu_rt);
1024         if (prev) {
1025                 /* If someone did it before us, return prev instead */
1026                 dst_destroy(&pcpu_rt->dst);
1027                 pcpu_rt = prev;
1028         }
1029
1030 done:
1031         dst_hold(&pcpu_rt->dst);
1032         rt6_dst_from_metrics_check(pcpu_rt);
1033         return pcpu_rt;
1034 }
1035
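/* Core FIB lookup used for both input and output routing.  Returns, with a
 * reference held, either the matched RTF_CACHE/null entry itself, an
 * uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH without a gateway), or a
 * per-cpu copy of the matched route.
 */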
1036 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1037                                       struct flowi6 *fl6, int flags)
1038 {
1039         struct fib6_node *fn, *saved_fn;
1040         struct rt6_info *rt;
1041         int strict = 0;
1042
1043         strict |= flags & RT6_LOOKUP_F_IFACE;
1044         if (net->ipv6.devconf_all->forwarding == 0)
1045                 strict |= RT6_LOOKUP_F_REACHABLE;
1046
1047         read_lock_bh(&table->tb6_lock);
1048
1049         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1050         saved_fn = fn;
1051
1052 redo_rt6_select:
1053         rt = rt6_select(fn, oif, strict);
1054         if (rt->rt6i_nsiblings)
1055                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1056         if (rt == net->ipv6.ip6_null_entry) {
1057                 fn = fib6_backtrack(fn, &fl6->saddr);
1058                 if (fn)
1059                         goto redo_rt6_select;
1060                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1061                         /* also consider unreachable route */
1062                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1063                         fn = saved_fn;
1064                         goto redo_rt6_select;
1065                 }
1066         }
1067
1068
1069         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1070                 dst_use(&rt->dst, jiffies);
1071                 read_unlock_bh(&table->tb6_lock);
1072
1073                 rt6_dst_from_metrics_check(rt);
1074                 return rt;
1075         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1076                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1077                 /* Create a RTF_CACHE clone which will not be
1078                  * owned by the fib6 tree.  It is for the special case where
1079                  * the daddr in the skb during the neighbor look-up is different
1080                  * from the fl6->daddr used to look-up route here.
1081                  * from the fl6->daddr used to look up the route here.
1082
1083                 struct rt6_info *uncached_rt;
1084
1085                 dst_use(&rt->dst, jiffies);
1086                 read_unlock_bh(&table->tb6_lock);
1087
1088                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1089                 dst_release(&rt->dst);
1090
1091                 if (uncached_rt)
1092                         rt6_uncached_list_add(uncached_rt);
1093                 else
1094                         uncached_rt = net->ipv6.ip6_null_entry;
1095
1096                 dst_hold(&uncached_rt->dst);
1097                 return uncached_rt;
1098
1099         } else {
1100                 /* Get a percpu copy */
1101
1102                 struct rt6_info *pcpu_rt;
1103
1104                 rt->dst.lastuse = jiffies;
1105                 rt->dst.__use++;
1106                 pcpu_rt = rt6_get_pcpu_route(rt);
1107                 read_unlock_bh(&table->tb6_lock);
1108
1109                 return pcpu_rt;
1110         }
1111 }
1112
1113 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1114                                             struct flowi6 *fl6, int flags)
1115 {
1116         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1117 }
1118
1119 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1120                                                 struct net_device *dev,
1121                                                 struct flowi6 *fl6, int flags)
1122 {
1123         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1124                 flags |= RT6_LOOKUP_F_IFACE;
1125
1126         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1127 }
1128
1129 void ip6_route_input(struct sk_buff *skb)
1130 {
1131         const struct ipv6hdr *iph = ipv6_hdr(skb);
1132         struct net *net = dev_net(skb->dev);
1133         int flags = RT6_LOOKUP_F_HAS_SADDR;
1134         struct flowi6 fl6 = {
1135                 .flowi6_iif = skb->dev->ifindex,
1136                 .daddr = iph->daddr,
1137                 .saddr = iph->saddr,
1138                 .flowlabel = ip6_flowinfo(iph),
1139                 .flowi6_mark = skb->mark,
1140                 .flowi6_proto = iph->nexthdr,
1141         };
1142
1143         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1144 }
1145
1146 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1147                                              struct flowi6 *fl6, int flags)
1148 {
1149         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1150 }
1151
1152 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1153                                     struct flowi6 *fl6)
1154 {
1155         int flags = 0;
1156
1157         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1158
1159         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1160                 flags |= RT6_LOOKUP_F_IFACE;
1161
1162         if (!ipv6_addr_any(&fl6->saddr))
1163                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1164         else if (sk)
1165                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1166
1167         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1168 }
1169 EXPORT_SYMBOL(ip6_route_output);
1170
1171 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1172 {
1173         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1174         struct dst_entry *new = NULL;
1175
1176         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1177         if (rt) {
1178                 new = &rt->dst;
1179
1180                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1181
1182                 new->__use = 1;
1183                 new->input = dst_discard;
1184                 new->output = dst_discard_sk;
1185
1186                 if (dst_metrics_read_only(&ort->dst))
1187                         new->_metrics = ort->dst._metrics;
1188                 else
1189                         dst_copy_metrics(new, &ort->dst);
1190                 rt->rt6i_idev = ort->rt6i_idev;
1191                 if (rt->rt6i_idev)
1192                         in6_dev_hold(rt->rt6i_idev);
1193
1194                 rt->rt6i_gateway = ort->rt6i_gateway;
1195                 rt->rt6i_flags = ort->rt6i_flags;
1196                 rt->rt6i_metric = 0;
1197
1198                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1199 #ifdef CONFIG_IPV6_SUBTREES
1200                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1201 #endif
1202
1203                 dst_free(new);
1204         }
1205
1206         dst_release(dst_orig);
1207         return new ? new : ERR_PTR(-ENOMEM);
1208 }
1209
1210 /*
1211  *      Destination cache support functions
1212  */
1213
1214 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1215 {
1216         if (rt->dst.from &&
1217             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1218                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1219 }
1220
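/* A cached dst remains valid only while its fib6 node still exists, the
 * node's sernum matches the cookie recorded at lookup time, and the route
 * has not expired.
 */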
1221 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1222 {
1223         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1224                 return NULL;
1225
1226         if (rt6_check_expired(rt))
1227                 return NULL;
1228
1229         return &rt->dst;
1230 }
1231
1232 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1233 {
1234         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1235             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1236                 return &rt->dst;
1237         else
1238                 return NULL;
1239 }
1240
1241 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1242 {
1243         struct rt6_info *rt;
1244
1245         rt = (struct rt6_info *) dst;
1246
1247         /* All IPV6 dsts are created with ->obsolete set to the value
1248          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1249          * into this function always.
1250          */
1251
1252         rt6_dst_from_metrics_check(rt);
1253
1254         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1255                 return rt6_dst_from_check(rt, cookie);
1256         else
1257                 return rt6_check(rt, cookie);
1258 }
1259
1260 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1261 {
1262         struct rt6_info *rt = (struct rt6_info *) dst;
1263
1264         if (rt) {
1265                 if (rt->rt6i_flags & RTF_CACHE) {
1266                         if (rt6_check_expired(rt)) {
1267                                 ip6_del_rt(rt);
1268                                 dst = NULL;
1269                         }
1270                 } else {
1271                         dst_release(dst);
1272                         dst = NULL;
1273                 }
1274         }
1275         return dst;
1276 }
1277
1278 static void ip6_link_failure(struct sk_buff *skb)
1279 {
1280         struct rt6_info *rt;
1281
1282         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1283
1284         rt = (struct rt6_info *) skb_dst(skb);
1285         if (rt) {
1286                 if (rt->rt6i_flags & RTF_CACHE) {
1287                         dst_hold(&rt->dst);
1288                         if (ip6_del_rt(rt))
1289                                 dst_free(&rt->dst);
1290                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1291                         rt->rt6i_node->fn_sernum = -1;
1292                 }
1293         }
1294 }
1295
1296 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1297 {
1298         struct net *net = dev_net(rt->dst.dev);
1299
1300         rt->rt6i_flags |= RTF_MODIFIED;
1301         rt->rt6i_pmtu = mtu;
1302         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1303 }
1304
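/* Record a reduced path MTU: RTF_CACHE routes are updated in place; for
 * other routes an RTF_CACHE clone is created for the destination and the
 * new MTU is stored there.
 */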
1305 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1306                                  const struct ipv6hdr *iph, u32 mtu)
1307 {
1308         struct rt6_info *rt6 = (struct rt6_info *)dst;
1309
1310         if (rt6->rt6i_flags & RTF_LOCAL)
1311                 return;
1312
1313         dst_confirm(dst);
1314         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1315         if (mtu >= dst_mtu(dst))
1316                 return;
1317
1318         if (rt6->rt6i_flags & RTF_CACHE) {
1319                 rt6_do_update_pmtu(rt6, mtu);
1320         } else {
1321                 const struct in6_addr *daddr, *saddr;
1322                 struct rt6_info *nrt6;
1323
1324                 if (iph) {
1325                         daddr = &iph->daddr;
1326                         saddr = &iph->saddr;
1327                 } else if (sk) {
1328                         daddr = &sk->sk_v6_daddr;
1329                         saddr = &inet6_sk(sk)->saddr;
1330                 } else {
1331                         return;
1332                 }
1333                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1334                 if (nrt6) {
1335                         rt6_do_update_pmtu(nrt6, mtu);
1336
1337                         /* ip6_ins_rt(nrt6) will bump the
1338                          * rt6->rt6i_node->fn_sernum
1339                          * which will make the next rt6_check() fail and
1340                          * invalidate the sk->sk_dst_cache.
1341                          */
1342                         ip6_ins_rt(nrt6);
1343                 }
1344         }
1345 }
1346
1347 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1348                                struct sk_buff *skb, u32 mtu)
1349 {
1350         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1351 }
1352
1353 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1354                      int oif, u32 mark)
1355 {
1356         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1357         struct dst_entry *dst;
1358         struct flowi6 fl6;
1359
1360         memset(&fl6, 0, sizeof(fl6));
1361         fl6.flowi6_oif = oif;
1362         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1363         fl6.daddr = iph->daddr;
1364         fl6.saddr = iph->saddr;
1365         fl6.flowlabel = ip6_flowinfo(iph);
1366
1367         dst = ip6_route_output(net, NULL, &fl6);
1368         if (!dst->error)
1369                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1370         dst_release(dst);
1371 }
1372 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1373
1374 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1375 {
1376         ip6_update_pmtu(skb, sock_net(sk), mtu,
1377                         sk->sk_bound_dev_if, sk->sk_mark);
1378 }
1379 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1380
1381 /* Handle redirects */
1382 struct ip6rd_flowi {
1383         struct flowi6 fl6;
1384         struct in6_addr gateway;
1385 };
1386
1387 static struct rt6_info *__ip6_route_redirect(struct net *net,
1388                                              struct fib6_table *table,
1389                                              struct flowi6 *fl6,
1390                                              int flags)
1391 {
1392         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1393         struct rt6_info *rt;
1394         struct fib6_node *fn;
1395
1396         /* Get the "current" route for this destination and
1397          * check if the redirect has come from an appropriate router.
1398          *
1399          * RFC 4861 specifies that redirects should only be
1400          * accepted if they come from the nexthop to the target.
1401          * Due to the way the routes are chosen, this notion
1402          * is a bit fuzzy and one might need to check all possible
1403          * routes.
1404          */
1405
1406         read_lock_bh(&table->tb6_lock);
1407         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1408 restart:
1409         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1410                 if (rt6_check_expired(rt))
1411                         continue;
1412                 if (rt->dst.error)
1413                         break;
1414                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1415                         continue;
1416                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1417                         continue;
1418                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1419                         continue;
1420                 break;
1421         }
1422
1423         if (!rt)
1424                 rt = net->ipv6.ip6_null_entry;
1425         else if (rt->dst.error) {
1426                 rt = net->ipv6.ip6_null_entry;
1427                 goto out;
1428         }
1429
1430         if (rt == net->ipv6.ip6_null_entry) {
1431                 fn = fib6_backtrack(fn, &fl6->saddr);
1432                 if (fn)
1433                         goto restart;
1434         }
1435
1436 out:
1437         dst_hold(&rt->dst);
1438
1439         read_unlock_bh(&table->tb6_lock);
1440
1441         return rt;
1442 };
1443
1444 static struct dst_entry *ip6_route_redirect(struct net *net,
1445                                         const struct flowi6 *fl6,
1446                                         const struct in6_addr *gateway)
1447 {
1448         int flags = RT6_LOOKUP_F_HAS_SADDR;
1449         struct ip6rd_flowi rdfl;
1450
1451         rdfl.fl6 = *fl6;
1452         rdfl.gateway = *gateway;
1453
1454         return fib6_rule_lookup(net, &rdfl.fl6,
1455                                 flags, __ip6_route_redirect);
1456 }
1457
1458 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1459 {
1460         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1461         struct dst_entry *dst;
1462         struct flowi6 fl6;
1463
1464         memset(&fl6, 0, sizeof(fl6));
1465         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1466         fl6.flowi6_oif = oif;
1467         fl6.flowi6_mark = mark;
1468         fl6.daddr = iph->daddr;
1469         fl6.saddr = iph->saddr;
1470         fl6.flowlabel = ip6_flowinfo(iph);
1471
1472         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1473         rt6_do_redirect(dst, NULL, skb);
1474         dst_release(dst);
1475 }
1476 EXPORT_SYMBOL_GPL(ip6_redirect);
1477
1478 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1479                             u32 mark)
1480 {
1481         const struct ipv6hdr *iph = ipv6_hdr(skb);
1482         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1483         struct dst_entry *dst;
1484         struct flowi6 fl6;
1485
1486         memset(&fl6, 0, sizeof(fl6));
1487         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1488         fl6.flowi6_oif = oif;
1489         fl6.flowi6_mark = mark;
1490         fl6.daddr = msg->dest;
1491         fl6.saddr = iph->daddr;
1492
1493         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1494         rt6_do_redirect(dst, NULL, skb);
1495         dst_release(dst);
1496 }
1497
1498 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1499 {
1500         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1501 }
1502 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1503
1504 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1505 {
1506         struct net_device *dev = dst->dev;
1507         unsigned int mtu = dst_mtu(dst);
1508         struct net *net = dev_net(dev);
1509
1510         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1511
1512         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1513                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1514
1515         /*
1516          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1517          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1518          * IPV6_MAXPLEN is also valid and means: "any MSS,
1519          * rely only on pmtu discovery"
1520          */
1521         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1522                 mtu = IPV6_MAXPLEN;
1523         return mtu;
1524 }
1525
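/* Effective MTU of the route: the per-route PMTU if set, else the RTAX_MTU
 * metric, else the device's IPv6 MTU, capped at IP6_MAX_MTU.
 */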
1526 static unsigned int ip6_mtu(const struct dst_entry *dst)
1527 {
1528         const struct rt6_info *rt = (const struct rt6_info *)dst;
1529         unsigned int mtu = rt->rt6i_pmtu;
1530         struct inet6_dev *idev;
1531
1532         if (mtu)
1533                 goto out;
1534
1535         mtu = dst_metric_raw(dst, RTAX_MTU);
1536         if (mtu)
1537                 goto out;
1538
1539         mtu = IPV6_MIN_MTU;
1540
1541         rcu_read_lock();
1542         idev = __in6_dev_get(dst->dev);
1543         if (idev)
1544                 mtu = idev->cnf.mtu6;
1545         rcu_read_unlock();
1546
1547 out:
1548         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1549 }
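
/*
 * Note (added for clarity): the fallback order above is the per-route
 * PMTU cached in rt6i_pmtu, then the RTAX_MTU metric, then the device's
 * inet6 MTU (idev->cnf.mtu6) with IPV6_MIN_MTU as the default, and the
 * result is always clamped to IP6_MAX_MTU.
 */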
1550
1551 static struct dst_entry *icmp6_dst_gc_list;
1552 static DEFINE_SPINLOCK(icmp6_dst_lock);
1553
1554 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1555                                   struct flowi6 *fl6)
1556 {
1557         struct dst_entry *dst;
1558         struct rt6_info *rt;
1559         struct inet6_dev *idev = in6_dev_get(dev);
1560         struct net *net = dev_net(dev);
1561
1562         if (unlikely(!idev))
1563                 return ERR_PTR(-ENODEV);
1564
1565         rt = ip6_dst_alloc(net, dev, 0, NULL);
1566         if (unlikely(!rt)) {
1567                 in6_dev_put(idev);
1568                 dst = ERR_PTR(-ENOMEM);
1569                 goto out;
1570         }
1571
1572         rt->dst.flags |= DST_HOST;
1573         rt->dst.output  = ip6_output;
1574         atomic_set(&rt->dst.__refcnt, 1);
1575         rt->rt6i_gateway  = fl6->daddr;
1576         rt->rt6i_dst.addr = fl6->daddr;
1577         rt->rt6i_dst.plen = 128;
1578         rt->rt6i_idev     = idev;
1579         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1580
1581         spin_lock_bh(&icmp6_dst_lock);
1582         rt->dst.next = icmp6_dst_gc_list;
1583         icmp6_dst_gc_list = &rt->dst;
1584         spin_unlock_bh(&icmp6_dst_lock);
1585
1586         fib6_force_start_gc(net);
1587
1588         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1589
1590 out:
1591         return dst;
1592 }
1593
1594 int icmp6_dst_gc(void)
1595 {
1596         struct dst_entry *dst, **pprev;
1597         int more = 0;
1598
1599         spin_lock_bh(&icmp6_dst_lock);
1600         pprev = &icmp6_dst_gc_list;
1601
1602         while ((dst = *pprev) != NULL) {
1603                 if (!atomic_read(&dst->__refcnt)) {
1604                         *pprev = dst->next;
1605                         dst_free(dst);
1606                 } else {
1607                         pprev = &dst->next;
1608                         ++more;
1609                 }
1610         }
1611
1612         spin_unlock_bh(&icmp6_dst_lock);
1613
1614         return more;
1615 }
1616
1617 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1618                             void *arg)
1619 {
1620         struct dst_entry *dst, **pprev;
1621
1622         spin_lock_bh(&icmp6_dst_lock);
1623         pprev = &icmp6_dst_gc_list;
1624         while ((dst = *pprev) != NULL) {
1625                 struct rt6_info *rt = (struct rt6_info *) dst;
1626                 if (func(rt, arg)) {
1627                         *pprev = dst->next;
1628                         dst_free(dst);
1629                 } else {
1630                         pprev = &dst->next;
1631                 }
1632         }
1633         spin_unlock_bh(&icmp6_dst_lock);
1634 }
1635
1636 static int ip6_dst_gc(struct dst_ops *ops)
1637 {
1638         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1639         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1640         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1641         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1642         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1643         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1644         int entries;
1645
1646         entries = dst_entries_get_fast(ops);
1647         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1648             entries <= rt_max_size)
1649                 goto out;
1650
1651         net->ipv6.ip6_rt_gc_expire++;
1652         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1653         entries = dst_entries_get_slow(ops);
1654         if (entries < ops->gc_thresh)
1655                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1656 out:
1657         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1658         return entries > rt_max_size;
1659 }
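
/*
 * Worked example (illustrative, with a hypothetical sysctl value): GC is
 * skipped entirely while the minimum interval since the last run has not
 * elapsed and no more than rt_max_size entries exist.  Otherwise each
 * call bumps ip6_rt_gc_expire by one before running fib6_run_gc(), and a
 * run that brings the entry count below gc_thresh resets it to half of
 * ip6_rt_gc_timeout.  The final statement then decays it by
 * expire >> elasticity; with ip6_rt_gc_elasticity set to 9, that is
 * roughly 1/512 of its value per call.
 */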
1660
1661 static int ip6_convert_metrics(struct mx6_config *mxc,
1662                                const struct fib6_config *cfg)
1663 {
1664         struct nlattr *nla;
1665         int remaining;
1666         u32 *mp;
1667
1668         if (!cfg->fc_mx)
1669                 return 0;
1670
1671         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1672         if (unlikely(!mp))
1673                 return -ENOMEM;
1674
1675         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1676                 int type = nla_type(nla);
1677
1678                 if (type) {
1679                         u32 val;
1680
1681                         if (unlikely(type > RTAX_MAX))
1682                                 goto err;
1683                         if (type == RTAX_CC_ALGO) {
1684                                 char tmp[TCP_CA_NAME_MAX];
1685
1686                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1687                                 val = tcp_ca_get_key_by_name(tmp);
1688                                 if (val == TCP_CA_UNSPEC)
1689                                         goto err;
1690                         } else {
1691                                 val = nla_get_u32(nla);
1692                         }
1693
1694                         mp[type - 1] = val;
1695                         __set_bit(type - 1, mxc->mx_valid);
1696                 }
1697         }
1698
1699         mxc->mx = mp;
1700
1701         return 0;
1702  err:
1703         kfree(mp);
1704         return -EINVAL;
1705 }
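
/*
 * Example (added for illustration): an RTA_METRICS nest carrying
 * RTAX_MTU = 1400 (hypothetical) ends up as mp[RTAX_MTU - 1] = 1400 with
 * the matching bit set in mxc->mx_valid.  An RTAX_CC_ALGO attribute is
 * carried as a congestion-control name string instead and is translated
 * to a key via tcp_ca_get_key_by_name() before being stored the same way.
 */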
1706
1707 int ip6_route_add(struct fib6_config *cfg)
1708 {
1709         int err;
1710         struct net *net = cfg->fc_nlinfo.nl_net;
1711         struct rt6_info *rt = NULL;
1712         struct net_device *dev = NULL;
1713         struct inet6_dev *idev = NULL;
1714         struct fib6_table *table;
1715         struct mx6_config mxc = { .mx = NULL, };
1716         int addr_type;
1717
1718         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1719                 return -EINVAL;
1720 #ifndef CONFIG_IPV6_SUBTREES
1721         if (cfg->fc_src_len)
1722                 return -EINVAL;
1723 #endif
1724         if (cfg->fc_ifindex) {
1725                 err = -ENODEV;
1726                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1727                 if (!dev)
1728                         goto out;
1729                 idev = in6_dev_get(dev);
1730                 if (!idev)
1731                         goto out;
1732         }
1733
1734         if (cfg->fc_metric == 0)
1735                 cfg->fc_metric = IP6_RT_PRIO_USER;
1736
1737         err = -ENOBUFS;
1738         if (cfg->fc_nlinfo.nlh &&
1739             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1740                 table = fib6_get_table(net, cfg->fc_table);
1741                 if (!table) {
1742                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1743                         table = fib6_new_table(net, cfg->fc_table);
1744                 }
1745         } else {
1746                 table = fib6_new_table(net, cfg->fc_table);
1747         }
1748
1749         if (!table)
1750                 goto out;
1751
1752         rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1753
1754         if (!rt) {
1755                 err = -ENOMEM;
1756                 goto out;
1757         }
1758
1759         if (cfg->fc_flags & RTF_EXPIRES)
1760                 rt6_set_expires(rt, jiffies +
1761                                 clock_t_to_jiffies(cfg->fc_expires));
1762         else
1763                 rt6_clean_expires(rt);
1764
1765         if (cfg->fc_protocol == RTPROT_UNSPEC)
1766                 cfg->fc_protocol = RTPROT_BOOT;
1767         rt->rt6i_protocol = cfg->fc_protocol;
1768
1769         addr_type = ipv6_addr_type(&cfg->fc_dst);
1770
1771         if (addr_type & IPV6_ADDR_MULTICAST)
1772                 rt->dst.input = ip6_mc_input;
1773         else if (cfg->fc_flags & RTF_LOCAL)
1774                 rt->dst.input = ip6_input;
1775         else
1776                 rt->dst.input = ip6_forward;
1777
1778         rt->dst.output = ip6_output;
1779
1780         if (cfg->fc_encap) {
1781                 struct lwtunnel_state *lwtstate;
1782
1783                 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1784                                            cfg->fc_encap, &lwtstate);
1785                 if (err)
1786                         goto out;
1787                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1788                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1789                         rt->dst.lwtstate->orig_output = rt->dst.output;
1790                         rt->dst.output = lwtunnel_output;
1791                 }
1792                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1793                         rt->dst.lwtstate->orig_input = rt->dst.input;
1794                         rt->dst.input = lwtunnel_input;
1795                 }
1796         }
1797
1798         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1799         rt->rt6i_dst.plen = cfg->fc_dst_len;
1800         if (rt->rt6i_dst.plen == 128)
1801                 rt->dst.flags |= DST_HOST;
1802
1803 #ifdef CONFIG_IPV6_SUBTREES
1804         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1805         rt->rt6i_src.plen = cfg->fc_src_len;
1806 #endif
1807
1808         rt->rt6i_metric = cfg->fc_metric;
1809
1810         /* We cannot add true routes via loopback here;
1811            they would result in kernel looping.  Promote them to reject routes.
1812          */
1813         if ((cfg->fc_flags & RTF_REJECT) ||
1814             (dev && (dev->flags & IFF_LOOPBACK) &&
1815              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1816              !(cfg->fc_flags & RTF_LOCAL))) {
1817                 /* hold loopback dev/idev if we haven't done so. */
1818                 if (dev != net->loopback_dev) {
1819                         if (dev) {
1820                                 dev_put(dev);
1821                                 in6_dev_put(idev);
1822                         }
1823                         dev = net->loopback_dev;
1824                         dev_hold(dev);
1825                         idev = in6_dev_get(dev);
1826                         if (!idev) {
1827                                 err = -ENODEV;
1828                                 goto out;
1829                         }
1830                 }
1831                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1832                 switch (cfg->fc_type) {
1833                 case RTN_BLACKHOLE:
1834                         rt->dst.error = -EINVAL;
1835                         rt->dst.output = dst_discard_sk;
1836                         rt->dst.input = dst_discard;
1837                         break;
1838                 case RTN_PROHIBIT:
1839                         rt->dst.error = -EACCES;
1840                         rt->dst.output = ip6_pkt_prohibit_out;
1841                         rt->dst.input = ip6_pkt_prohibit;
1842                         break;
1843                 case RTN_THROW:
1844                 default:
1845                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1846                                         : -ENETUNREACH;
1847                         rt->dst.output = ip6_pkt_discard_out;
1848                         rt->dst.input = ip6_pkt_discard;
1849                         break;
1850                 }
1851                 goto install_route;
1852         }
1853
1854         if (cfg->fc_flags & RTF_GATEWAY) {
1855                 const struct in6_addr *gw_addr;
1856                 int gwa_type;
1857
1858                 gw_addr = &cfg->fc_gateway;
1859                 gwa_type = ipv6_addr_type(gw_addr);
1860
1861                 /* If gw_addr is local we will fail to detect this in case the
1862                  * address is still TENTATIVE (DAD in progress): rt6_lookup()
1863                  * will return the already-added prefix route via the interface
1864                  * the prefix route was assigned to, which might be non-loopback.
1865                  */
1866                 err = -EINVAL;
1867                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1868                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1869                                             dev : NULL, 0, 0))
1870                         goto out;
1871
1872                 rt->rt6i_gateway = *gw_addr;
1873
1874                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1875                         struct rt6_info *grt;
1876
1877                         /* IPv6 strictly forbids using non-link-local
1878                            addresses as the nexthop address.
1879                            Otherwise, the router will not be able to send
1880                            redirects.  That is usually desirable, but in some
1881                            (rare!) circumstances (SIT, PtP, NBMA NOARP links)
1882                            it is handy to allow some exceptions. --ANK
1883                          */
1884                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1885                                 goto out;
1886
1887                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1888
1889                         err = -EHOSTUNREACH;
1890                         if (!grt)
1891                                 goto out;
1892                         if (dev) {
1893                                 if (dev != grt->dst.dev) {
1894                                         ip6_rt_put(grt);
1895                                         goto out;
1896                                 }
1897                         } else {
1898                                 dev = grt->dst.dev;
1899                                 idev = grt->rt6i_idev;
1900                                 dev_hold(dev);
1901                                 in6_dev_hold(grt->rt6i_idev);
1902                         }
1903                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1904                                 err = 0;
1905                         ip6_rt_put(grt);
1906
1907                         if (err)
1908                                 goto out;
1909                 }
1910                 err = -EINVAL;
1911                 if (!dev || (dev->flags & IFF_LOOPBACK))
1912                         goto out;
1913         }
1914
1915         err = -ENODEV;
1916         if (!dev)
1917                 goto out;
1918
1919         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1920                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1921                         err = -EINVAL;
1922                         goto out;
1923                 }
1924                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1925                 rt->rt6i_prefsrc.plen = 128;
1926         } else
1927                 rt->rt6i_prefsrc.plen = 0;
1928
1929         rt->rt6i_flags = cfg->fc_flags;
1930
1931 install_route:
1932         rt->dst.dev = dev;
1933         rt->rt6i_idev = idev;
1934         rt->rt6i_table = table;
1935
1936         cfg->fc_nlinfo.nl_net = dev_net(dev);
1937
1938         err = ip6_convert_metrics(&mxc, cfg);
1939         if (err)
1940                 goto out;
1941
1942         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1943
1944         kfree(mxc.mx);
1945         return err;
1946 out:
1947         if (dev)
1948                 dev_put(dev);
1949         if (idev)
1950                 in6_dev_put(idev);
1951         if (rt)
1952                 dst_free(&rt->dst);
1953         return err;
1954 }
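
/*
 * Usage sketch (illustrative, not part of the original source): a caller
 * holding the RTNL lock could install a simple gateway route roughly as
 * below.  The prefix, gateway and ifindex are hypothetical.
 *
 *	struct fib6_config cfg = {
 *		.fc_table	  = RT6_TABLE_MAIN,
 *		.fc_metric	  = IP6_RT_PRIO_USER,
 *		.fc_ifindex	  = 2,
 *		.fc_dst_len	  = 64,
 *		.fc_flags	  = RTF_UP | RTF_GATEWAY,
 *		.fc_nlinfo.nl_net = &init_net,
 *	};
 *	int err;
 *
 *	cfg.fc_dst = prefix;		(hypothetical 2001:db8::/64)
 *	cfg.fc_gateway = gw;		(hypothetical link-local nexthop)
 *	err = ip6_route_add(&cfg);
 */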
1955
1956 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1957 {
1958         int err;
1959         struct fib6_table *table;
1960         struct net *net = dev_net(rt->dst.dev);
1961
1962         if (rt == net->ipv6.ip6_null_entry) {
1963                 err = -ENOENT;
1964                 goto out;
1965         }
1966
1967         table = rt->rt6i_table;
1968         write_lock_bh(&table->tb6_lock);
1969         err = fib6_del(rt, info);
1970         write_unlock_bh(&table->tb6_lock);
1971
1972 out:
1973         ip6_rt_put(rt);
1974         return err;
1975 }
1976
1977 int ip6_del_rt(struct rt6_info *rt)
1978 {
1979         struct nl_info info = {
1980                 .nl_net = dev_net(rt->dst.dev),
1981         };
1982         return __ip6_del_rt(rt, &info);
1983 }
1984
1985 static int ip6_route_del(struct fib6_config *cfg)
1986 {
1987         struct fib6_table *table;
1988         struct fib6_node *fn;
1989         struct rt6_info *rt;
1990         int err = -ESRCH;
1991
1992         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1993         if (!table)
1994                 return err;
1995
1996         read_lock_bh(&table->tb6_lock);
1997
1998         fn = fib6_locate(&table->tb6_root,
1999                          &cfg->fc_dst, cfg->fc_dst_len,
2000                          &cfg->fc_src, cfg->fc_src_len);
2001
2002         if (fn) {
2003                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2004                         if ((rt->rt6i_flags & RTF_CACHE) &&
2005                             !(cfg->fc_flags & RTF_CACHE))
2006                                 continue;
2007                         if (cfg->fc_ifindex &&
2008                             (!rt->dst.dev ||
2009                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2010                                 continue;
2011                         if (cfg->fc_flags & RTF_GATEWAY &&
2012                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2013                                 continue;
2014                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2015                                 continue;
2016                         dst_hold(&rt->dst);
2017                         read_unlock_bh(&table->tb6_lock);
2018
2019                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2020                 }
2021         }
2022         read_unlock_bh(&table->tb6_lock);
2023
2024         return err;
2025 }
2026
2027 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2028 {
2029         struct net *net = dev_net(skb->dev);
2030         struct netevent_redirect netevent;
2031         struct rt6_info *rt, *nrt = NULL;
2032         struct ndisc_options ndopts;
2033         struct inet6_dev *in6_dev;
2034         struct neighbour *neigh;
2035         struct rd_msg *msg;
2036         int optlen, on_link;
2037         u8 *lladdr;
2038
2039         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2040         optlen -= sizeof(*msg);
2041
2042         if (optlen < 0) {
2043                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2044                 return;
2045         }
2046
2047         msg = (struct rd_msg *)icmp6_hdr(skb);
2048
2049         if (ipv6_addr_is_multicast(&msg->dest)) {
2050                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2051                 return;
2052         }
2053
2054         on_link = 0;
2055         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2056                 on_link = 1;
2057         } else if (ipv6_addr_type(&msg->target) !=
2058                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2059                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2060                 return;
2061         }
2062
2063         in6_dev = __in6_dev_get(skb->dev);
2064         if (!in6_dev)
2065                 return;
2066         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2067                 return;
2068
2069         /* RFC2461 8.1:
2070          *      The IP source address of the Redirect MUST be the same as the current
2071          *      first-hop router for the specified ICMP Destination Address.
2072          */
2073
2074         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2075                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2076                 return;
2077         }
2078
2079         lladdr = NULL;
2080         if (ndopts.nd_opts_tgt_lladdr) {
2081                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2082                                              skb->dev);
2083                 if (!lladdr) {
2084                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2085                         return;
2086                 }
2087         }
2088
2089         rt = (struct rt6_info *) dst;
2090         if (rt == net->ipv6.ip6_null_entry) {
2091                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2092                 return;
2093         }
2094
2095         /* Redirect received -> path was valid.
2096          * Redirects are sent only in response to data packets,
2097          * so this nexthop is apparently reachable. --ANK
2098          */
2099         dst_confirm(&rt->dst);
2100
2101         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2102         if (!neigh)
2103                 return;
2104
2105         /*
2106          *      We have finally decided to accept it.
2107          */
2108
2109         neigh_update(neigh, lladdr, NUD_STALE,
2110                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2111                      NEIGH_UPDATE_F_OVERRIDE|
2112                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2113                                      NEIGH_UPDATE_F_ISROUTER))
2114                      );
2115
2116         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2117         if (!nrt)
2118                 goto out;
2119
2120         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2121         if (on_link)
2122                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2123
2124         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2125
2126         if (ip6_ins_rt(nrt))
2127                 goto out;
2128
2129         netevent.old = &rt->dst;
2130         netevent.new = &nrt->dst;
2131         netevent.daddr = &msg->dest;
2132         netevent.neigh = neigh;
2133         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2134
2135         if (rt->rt6i_flags & RTF_CACHE) {
2136                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2137                 ip6_del_rt(rt);
2138         }
2139
2140 out:
2141         neigh_release(neigh);
2142 }
2143
2144 /*
2145  *      Misc support functions
2146  */
2147
2148 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2149 {
2150         BUG_ON(from->dst.from);
2151
2152         rt->rt6i_flags &= ~RTF_EXPIRES;
2153         dst_hold(&from->dst);
2154         rt->dst.from = &from->dst;
2155         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2156 }
2157
2158 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2159 {
2160         rt->dst.input = ort->dst.input;
2161         rt->dst.output = ort->dst.output;
2162         rt->rt6i_dst = ort->rt6i_dst;
2163         rt->dst.error = ort->dst.error;
2164         rt->rt6i_idev = ort->rt6i_idev;
2165         if (rt->rt6i_idev)
2166                 in6_dev_hold(rt->rt6i_idev);
2167         rt->dst.lastuse = jiffies;
2168         rt->rt6i_gateway = ort->rt6i_gateway;
2169         rt->rt6i_flags = ort->rt6i_flags;
2170         rt6_set_from(rt, ort);
2171         rt->rt6i_metric = ort->rt6i_metric;
2172 #ifdef CONFIG_IPV6_SUBTREES
2173         rt->rt6i_src = ort->rt6i_src;
2174 #endif
2175         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2176         rt->rt6i_table = ort->rt6i_table;
2177         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2178 }
2179
2180 #ifdef CONFIG_IPV6_ROUTE_INFO
2181 static struct rt6_info *rt6_get_route_info(struct net *net,
2182                                            const struct in6_addr *prefix, int prefixlen,
2183                                            const struct in6_addr *gwaddr, int ifindex)
2184 {
2185         struct fib6_node *fn;
2186         struct rt6_info *rt = NULL;
2187         struct fib6_table *table;
2188
2189         table = fib6_get_table(net, RT6_TABLE_INFO);
2190         if (!table)
2191                 return NULL;
2192
2193         read_lock_bh(&table->tb6_lock);
2194         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2195         if (!fn)
2196                 goto out;
2197
2198         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2199                 if (rt->dst.dev->ifindex != ifindex)
2200                         continue;
2201                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2202                         continue;
2203                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2204                         continue;
2205                 dst_hold(&rt->dst);
2206                 break;
2207         }
2208 out:
2209         read_unlock_bh(&table->tb6_lock);
2210         return rt;
2211 }
2212
2213 static struct rt6_info *rt6_add_route_info(struct net *net,
2214                                            const struct in6_addr *prefix, int prefixlen,
2215                                            const struct in6_addr *gwaddr, int ifindex,
2216                                            unsigned int pref)
2217 {
2218         struct fib6_config cfg = {
2219                 .fc_table       = RT6_TABLE_INFO,
2220                 .fc_metric      = IP6_RT_PRIO_USER,
2221                 .fc_ifindex     = ifindex,
2222                 .fc_dst_len     = prefixlen,
2223                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2224                                   RTF_UP | RTF_PREF(pref),
2225                 .fc_nlinfo.portid = 0,
2226                 .fc_nlinfo.nlh = NULL,
2227                 .fc_nlinfo.nl_net = net,
2228         };
2229
2230         cfg.fc_dst = *prefix;
2231         cfg.fc_gateway = *gwaddr;
2232
2233         /* We should treat it as a default route if prefix length is 0. */
2234         if (!prefixlen)
2235                 cfg.fc_flags |= RTF_DEFAULT;
2236
2237         ip6_route_add(&cfg);
2238
2239         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2240 }
2241 #endif
2242
2243 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2244 {
2245         struct rt6_info *rt;
2246         struct fib6_table *table;
2247
2248         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2249         if (!table)
2250                 return NULL;
2251
2252         read_lock_bh(&table->tb6_lock);
2253         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2254                 if (dev == rt->dst.dev &&
2255                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2256                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2257                         break;
2258         }
2259         if (rt)
2260                 dst_hold(&rt->dst);
2261         read_unlock_bh(&table->tb6_lock);
2262         return rt;
2263 }
2264
2265 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2266                                      struct net_device *dev,
2267                                      unsigned int pref)
2268 {
2269         struct fib6_config cfg = {
2270                 .fc_table       = RT6_TABLE_DFLT,
2271                 .fc_metric      = IP6_RT_PRIO_USER,
2272                 .fc_ifindex     = dev->ifindex,
2273                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2274                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2275                 .fc_nlinfo.portid = 0,
2276                 .fc_nlinfo.nlh = NULL,
2277                 .fc_nlinfo.nl_net = dev_net(dev),
2278         };
2279
2280         cfg.fc_gateway = *gwaddr;
2281
2282         ip6_route_add(&cfg);
2283
2284         return rt6_get_dflt_router(gwaddr, dev);
2285 }
2286
2287 void rt6_purge_dflt_routers(struct net *net)
2288 {
2289         struct rt6_info *rt;
2290         struct fib6_table *table;
2291
2292         /* NOTE: Keep consistent with rt6_get_dflt_router */
2293         table = fib6_get_table(net, RT6_TABLE_DFLT);
2294         if (!table)
2295                 return;
2296
2297 restart:
2298         read_lock_bh(&table->tb6_lock);
2299         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2300                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2301                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2302                         dst_hold(&rt->dst);
2303                         read_unlock_bh(&table->tb6_lock);
2304                         ip6_del_rt(rt);
2305                         goto restart;
2306                 }
2307         }
2308         read_unlock_bh(&table->tb6_lock);
2309 }
2310
2311 static void rtmsg_to_fib6_config(struct net *net,
2312                                  struct in6_rtmsg *rtmsg,
2313                                  struct fib6_config *cfg)
2314 {
2315         memset(cfg, 0, sizeof(*cfg));
2316
2317         cfg->fc_table = RT6_TABLE_MAIN;
2318         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2319         cfg->fc_metric = rtmsg->rtmsg_metric;
2320         cfg->fc_expires = rtmsg->rtmsg_info;
2321         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2322         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2323         cfg->fc_flags = rtmsg->rtmsg_flags;
2324
2325         cfg->fc_nlinfo.nl_net = net;
2326
2327         cfg->fc_dst = rtmsg->rtmsg_dst;
2328         cfg->fc_src = rtmsg->rtmsg_src;
2329         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2330 }
2331
2332 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2333 {
2334         struct fib6_config cfg;
2335         struct in6_rtmsg rtmsg;
2336         int err;
2337
2338         switch (cmd) {
2339         case SIOCADDRT:         /* Add a route */
2340         case SIOCDELRT:         /* Delete a route */
2341                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2342                         return -EPERM;
2343                 err = copy_from_user(&rtmsg, arg,
2344                                      sizeof(struct in6_rtmsg));
2345                 if (err)
2346                         return -EFAULT;
2347
2348                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2349
2350                 rtnl_lock();
2351                 switch (cmd) {
2352                 case SIOCADDRT:
2353                         err = ip6_route_add(&cfg);
2354                         break;
2355                 case SIOCDELRT:
2356                         err = ip6_route_del(&cfg);
2357                         break;
2358                 default:
2359                         err = -EINVAL;
2360                 }
2361                 rtnl_unlock();
2362
2363                 return err;
2364         }
2365
2366         return -EINVAL;
2367 }
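
/*
 * Userspace sketch (illustrative): this ioctl path is what legacy
 * "route -A inet6 add" style tools hit.  A minimal caller with
 * CAP_NET_ADMIN might look roughly like this; the prefix and interface
 * name are hypothetical.
 *
 *	struct in6_rtmsg rt = { 0 };
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
 *	rt.rtmsg_dst_len = 64;
 *	rt.rtmsg_ifindex = if_nametoindex("eth0");
 *	rt.rtmsg_metric  = 1;
 *	rt.rtmsg_flags	 = RTF_UP;
 *	ioctl(fd, SIOCADDRT, &rt);
 */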
2368
2369 /*
2370  *      Drop the packet on the floor
2371  */
2372
2373 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2374 {
2375         int type;
2376         struct dst_entry *dst = skb_dst(skb);
2377         switch (ipstats_mib_noroutes) {
2378         case IPSTATS_MIB_INNOROUTES:
2379                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2380                 if (type == IPV6_ADDR_ANY) {
2381                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2382                                       IPSTATS_MIB_INADDRERRORS);
2383                         break;
2384                 }
2385                 /* FALLTHROUGH */
2386         case IPSTATS_MIB_OUTNOROUTES:
2387                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2388                               ipstats_mib_noroutes);
2389                 break;
2390         }
2391         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2392         kfree_skb(skb);
2393         return 0;
2394 }
2395
2396 static int ip6_pkt_discard(struct sk_buff *skb)
2397 {
2398         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2399 }
2400
2401 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2402 {
2403         skb->dev = skb_dst(skb)->dev;
2404         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2405 }
2406
2407 static int ip6_pkt_prohibit(struct sk_buff *skb)
2408 {
2409         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2410 }
2411
2412 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2413 {
2414         skb->dev = skb_dst(skb)->dev;
2415         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2416 }
2417
2418 /*
2419  *      Allocate a dst for local (unicast / anycast) address.
2420  */
2421
2422 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2423                                     const struct in6_addr *addr,
2424                                     bool anycast)
2425 {
2426         struct net *net = dev_net(idev->dev);
2427         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2428                                             DST_NOCOUNT, NULL);
2429         if (!rt)
2430                 return ERR_PTR(-ENOMEM);
2431
2432         in6_dev_hold(idev);
2433
2434         rt->dst.flags |= DST_HOST;
2435         rt->dst.input = ip6_input;
2436         rt->dst.output = ip6_output;
2437         rt->rt6i_idev = idev;
2438
2439         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2440         if (anycast)
2441                 rt->rt6i_flags |= RTF_ANYCAST;
2442         else
2443                 rt->rt6i_flags |= RTF_LOCAL;
2444
2445         rt->rt6i_gateway  = *addr;
2446         rt->rt6i_dst.addr = *addr;
2447         rt->rt6i_dst.plen = 128;
2448         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2449
2450         atomic_set(&rt->dst.__refcnt, 1);
2451
2452         return rt;
2453 }
2454
2455 int ip6_route_get_saddr(struct net *net,
2456                         struct rt6_info *rt,
2457                         const struct in6_addr *daddr,
2458                         unsigned int prefs,
2459                         struct in6_addr *saddr)
2460 {
2461         struct inet6_dev *idev =
2462                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2463         int err = 0;
2464         if (rt && rt->rt6i_prefsrc.plen)
2465                 *saddr = rt->rt6i_prefsrc.addr;
2466         else
2467                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2468                                          daddr, prefs, saddr);
2469         return err;
2470 }
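
/*
 * Note (added for clarity): a preferred source address recorded on the
 * matched route (rt6i_prefsrc, i.e. RTA_PREFSRC) wins outright; only
 * when none is set does the generic source-address selection in
 * ipv6_dev_get_saddr() run against the route's device.
 */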
2471
2472 /* remove deleted ip from prefsrc entries */
2473 struct arg_dev_net_ip {
2474         struct net_device *dev;
2475         struct net *net;
2476         struct in6_addr *addr;
2477 };
2478
2479 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2480 {
2481         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2482         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2483         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2484
2485         if (((void *)rt->dst.dev == dev || !dev) &&
2486             rt != net->ipv6.ip6_null_entry &&
2487             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2488                 /* remove prefsrc entry */
2489                 rt->rt6i_prefsrc.plen = 0;
2490         }
2491         return 0;
2492 }
2493
2494 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2495 {
2496         struct net *net = dev_net(ifp->idev->dev);
2497         struct arg_dev_net_ip adni = {
2498                 .dev = ifp->idev->dev,
2499                 .net = net,
2500                 .addr = &ifp->addr,
2501         };
2502         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2503 }
2504
2505 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2506 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2507
2508 /* Remove routers and update dst entries when the gateway turns into a host. */
2509 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2510 {
2511         struct in6_addr *gateway = (struct in6_addr *)arg;
2512
2513         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2514              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2515              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2516                 return -1;
2517         }
2518         return 0;
2519 }
2520
2521 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2522 {
2523         fib6_clean_all(net, fib6_clean_tohost, gateway);
2524 }
2525
2526 struct arg_dev_net {
2527         struct net_device *dev;
2528         struct net *net;
2529 };
2530
2531 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2532 {
2533         const struct arg_dev_net *adn = arg;
2534         const struct net_device *dev = adn->dev;
2535
2536         if ((rt->dst.dev == dev || !dev) &&
2537             rt != adn->net->ipv6.ip6_null_entry)
2538                 return -1;
2539
2540         return 0;
2541 }
2542
2543 void rt6_ifdown(struct net *net, struct net_device *dev)
2544 {
2545         struct arg_dev_net adn = {
2546                 .dev = dev,
2547                 .net = net,
2548         };
2549
2550         fib6_clean_all(net, fib6_ifdown, &adn);
2551         icmp6_clean_all(fib6_ifdown, &adn);
2552         rt6_uncached_list_flush_dev(net, dev);
2553 }
2554
2555 struct rt6_mtu_change_arg {
2556         struct net_device *dev;
2557         unsigned int mtu;
2558 };
2559
2560 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2561 {
2562         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2563         struct inet6_dev *idev;
2564
2565         /* In IPv6, PMTU discovery is not optional,
2566            so the RTAX_MTU lock cannot disable it.
2567            We still use this lock to block changes
2568            caused by addrconf/ndisc.
2569         */
2570
2571         idev = __in6_dev_get(arg->dev);
2572         if (!idev)
2573                 return 0;
2574
2575         /* For an administrative MTU increase, there is no way to discover
2576            an IPv6 PMTU increase, so the PMTU increase should be applied here.
2577            Since RFC 1981 doesn't cover administrative MTU increases,
2578            updating the PMTU on increase is a MUST (e.g. jumbo frames).
2579          */
2580         /*
2581            If the new MTU is less than the route PMTU, this new MTU will be
2582            the lowest MTU in the path; update the route PMTU to reflect the
2583            decrease.  If the new MTU is greater than the route PMTU, and the
2584            old MTU was the lowest MTU in the path, update the route PMTU to
2585            reflect the increase.  In that case, if another node along the
2586            path still has the lowest MTU, its Packet Too Big message will
2587            trigger PMTU discovery again.
2588          */
2589         if (rt->dst.dev == arg->dev &&
2590             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2591                 if (rt->rt6i_flags & RTF_CACHE) {
2592                         /* For RTF_CACHE with rt6i_pmtu == 0
2593                          * (i.e. a redirected route),
2594                          * the metrics of its rt->dst.from have already
2595                          * been updated.
2596                          */
2597                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2598                                 rt->rt6i_pmtu = arg->mtu;
2599                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2600                            (dst_mtu(&rt->dst) < arg->mtu &&
2601                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2602                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2603                 }
2604         }
2605         return 0;
2606 }
2607
2608 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2609 {
2610         struct rt6_mtu_change_arg arg = {
2611                 .dev = dev,
2612                 .mtu = mtu,
2613         };
2614
2615         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2616 }
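
/*
 * Example (illustrative): when a device's MTU drops, say from 1500 to a
 * hypothetical 1400, the walk above caps the cached PMTU of matching
 * RTF_CACHE entries at the new value and rewrites the RTAX_MTU metric of
 * other routes on that device whose MTU was at least the new one or
 * still tracked the old device MTU.
 */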
2617
2618 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2619         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2620         [RTA_OIF]               = { .type = NLA_U32 },
2621         [RTA_IIF]               = { .type = NLA_U32 },
2622         [RTA_PRIORITY]          = { .type = NLA_U32 },
2623         [RTA_METRICS]           = { .type = NLA_NESTED },
2624         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2625         [RTA_PREF]              = { .type = NLA_U8 },
2626         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2627         [RTA_ENCAP]             = { .type = NLA_NESTED },
2628 };
2629
2630 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2631                               struct fib6_config *cfg)
2632 {
2633         struct rtmsg *rtm;
2634         struct nlattr *tb[RTA_MAX+1];
2635         unsigned int pref;
2636         int err;
2637
2638         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2639         if (err < 0)
2640                 goto errout;
2641
2642         err = -EINVAL;
2643         rtm = nlmsg_data(nlh);
2644         memset(cfg, 0, sizeof(*cfg));
2645
2646         cfg->fc_table = rtm->rtm_table;
2647         cfg->fc_dst_len = rtm->rtm_dst_len;
2648         cfg->fc_src_len = rtm->rtm_src_len;
2649         cfg->fc_flags = RTF_UP;
2650         cfg->fc_protocol = rtm->rtm_protocol;
2651         cfg->fc_type = rtm->rtm_type;
2652
2653         if (rtm->rtm_type == RTN_UNREACHABLE ||
2654             rtm->rtm_type == RTN_BLACKHOLE ||
2655             rtm->rtm_type == RTN_PROHIBIT ||
2656             rtm->rtm_type == RTN_THROW)
2657                 cfg->fc_flags |= RTF_REJECT;
2658
2659         if (rtm->rtm_type == RTN_LOCAL)
2660                 cfg->fc_flags |= RTF_LOCAL;
2661
2662         if (rtm->rtm_flags & RTM_F_CLONED)
2663                 cfg->fc_flags |= RTF_CACHE;
2664
2665         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2666         cfg->fc_nlinfo.nlh = nlh;
2667         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2668
2669         if (tb[RTA_GATEWAY]) {
2670                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2671                 cfg->fc_flags |= RTF_GATEWAY;
2672         }
2673
2674         if (tb[RTA_DST]) {
2675                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2676
2677                 if (nla_len(tb[RTA_DST]) < plen)
2678                         goto errout;
2679
2680                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2681         }
2682
2683         if (tb[RTA_SRC]) {
2684                 int plen = (rtm->rtm_src_len + 7) >> 3;
2685
2686                 if (nla_len(tb[RTA_SRC]) < plen)
2687                         goto errout;
2688
2689                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2690         }
2691
2692         if (tb[RTA_PREFSRC])
2693                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2694
2695         if (tb[RTA_OIF])
2696                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2697
2698         if (tb[RTA_PRIORITY])
2699                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2700
2701         if (tb[RTA_METRICS]) {
2702                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2703                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2704         }
2705
2706         if (tb[RTA_TABLE])
2707                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2708
2709         if (tb[RTA_MULTIPATH]) {
2710                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2711                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2712         }
2713
2714         if (tb[RTA_PREF]) {
2715                 pref = nla_get_u8(tb[RTA_PREF]);
2716                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2717                     pref != ICMPV6_ROUTER_PREF_HIGH)
2718                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2719                 cfg->fc_flags |= RTF_PREF(pref);
2720         }
2721
2722         if (tb[RTA_ENCAP])
2723                 cfg->fc_encap = tb[RTA_ENCAP];
2724
2725         if (tb[RTA_ENCAP_TYPE])
2726                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2727
2728         err = 0;
2729 errout:
2730         return err;
2731 }
2732
2733 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2734 {
2735         struct fib6_config r_cfg;
2736         struct rtnexthop *rtnh;
2737         int remaining;
2738         int attrlen;
2739         int err = 0, last_err = 0;
2740
2741         remaining = cfg->fc_mp_len;
2742 beginning:
2743         rtnh = (struct rtnexthop *)cfg->fc_mp;
2744
2745         /* Parse a Multipath Entry */
2746         while (rtnh_ok(rtnh, remaining)) {
2747                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2748                 if (rtnh->rtnh_ifindex)
2749                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2750
2751                 attrlen = rtnh_attrlen(rtnh);
2752                 if (attrlen > 0) {
2753                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2754
2755                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2756                         if (nla) {
2757                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2758                                 r_cfg.fc_flags |= RTF_GATEWAY;
2759                         }
2760                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2761                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2762                         if (nla)
2763                                 r_cfg.fc_encap_type = nla_get_u16(nla);
2764                 }
2765                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2766                 if (err) {
2767                         last_err = err;
2768                         /* If we are trying to remove a route, do not stop the
2769                          * loop when ip6_route_del() fails (because the next hop
2770                          * is already gone); we should try to remove all next hops.
2771                          */
2772                         if (add) {
2773                                 /* If add fails, we should try to delete all
2774                                  * next hops that have been already added.
2775                                  */
2776                                 add = 0;
2777                                 remaining = cfg->fc_mp_len - remaining;
2778                                 goto beginning;
2779                         }
2780                 }
2781                 /* Because each route is added like a single route, we remove
2782                  * these flags after the first nexthop: if there is a collision,
2783                  * we have already failed to add the first nexthop:
2784                  * fib6_add_rt2node() has rejected it; when replacing, the old
2785                  * nexthops have been replaced by the first new one, and the
2786                  * rest should be added to it.
2787                  */
2788                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2789                                                      NLM_F_REPLACE);
2790                 rtnh = rtnh_next(rtnh, &remaining);
2791         }
2792
2793         return last_err;
2794 }
2795
2796 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2797 {
2798         struct fib6_config cfg;
2799         int err;
2800
2801         err = rtm_to_fib6_config(skb, nlh, &cfg);
2802         if (err < 0)
2803                 return err;
2804
2805         if (cfg.fc_mp)
2806                 return ip6_route_multipath(&cfg, 0);
2807         else
2808                 return ip6_route_del(&cfg);
2809 }
2810
2811 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2812 {
2813         struct fib6_config cfg;
2814         int err;
2815
2816         err = rtm_to_fib6_config(skb, nlh, &cfg);
2817         if (err < 0)
2818                 return err;
2819
2820         if (cfg.fc_mp)
2821                 return ip6_route_multipath(&cfg, 1);
2822         else
2823                 return ip6_route_add(&cfg);
2824 }
2825
2826 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
2827 {
2828         return NLMSG_ALIGN(sizeof(struct rtmsg))
2829                + nla_total_size(16) /* RTA_SRC */
2830                + nla_total_size(16) /* RTA_DST */
2831                + nla_total_size(16) /* RTA_GATEWAY */
2832                + nla_total_size(16) /* RTA_PREFSRC */
2833                + nla_total_size(4) /* RTA_TABLE */
2834                + nla_total_size(4) /* RTA_IIF */
2835                + nla_total_size(4) /* RTA_OIF */
2836                + nla_total_size(4) /* RTA_PRIORITY */
2837                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2838                + nla_total_size(sizeof(struct rta_cacheinfo))
2839                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2840                + nla_total_size(1) /* RTA_PREF */
2841                + lwtunnel_get_encap_size(rt->dst.lwtstate);
2842 }
2843
2844 static int rt6_fill_node(struct net *net,
2845                          struct sk_buff *skb, struct rt6_info *rt,
2846                          struct in6_addr *dst, struct in6_addr *src,
2847                          int iif, int type, u32 portid, u32 seq,
2848                          int prefix, int nowait, unsigned int flags)
2849 {
2850         u32 metrics[RTAX_MAX];
2851         struct rtmsg *rtm;
2852         struct nlmsghdr *nlh;
2853         long expires;
2854         u32 table;
2855
2856         if (prefix) {   /* user wants prefix routes only */
2857                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2858                         /* success since this is not a prefix route */
2859                         return 1;
2860                 }
2861         }
2862
2863         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2864         if (!nlh)
2865                 return -EMSGSIZE;
2866
2867         rtm = nlmsg_data(nlh);
2868         rtm->rtm_family = AF_INET6;
2869         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2870         rtm->rtm_src_len = rt->rt6i_src.plen;
2871         rtm->rtm_tos = 0;
2872         if (rt->rt6i_table)
2873                 table = rt->rt6i_table->tb6_id;
2874         else
2875                 table = RT6_TABLE_UNSPEC;
2876         rtm->rtm_table = table;
2877         if (nla_put_u32(skb, RTA_TABLE, table))
2878                 goto nla_put_failure;
2879         if (rt->rt6i_flags & RTF_REJECT) {
2880                 switch (rt->dst.error) {
2881                 case -EINVAL:
2882                         rtm->rtm_type = RTN_BLACKHOLE;
2883                         break;
2884                 case -EACCES:
2885                         rtm->rtm_type = RTN_PROHIBIT;
2886                         break;
2887                 case -EAGAIN:
2888                         rtm->rtm_type = RTN_THROW;
2889                         break;
2890                 default:
2891                         rtm->rtm_type = RTN_UNREACHABLE;
2892                         break;
2893                 }
2894         }
2895         else if (rt->rt6i_flags & RTF_LOCAL)
2896                 rtm->rtm_type = RTN_LOCAL;
2897         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2898                 rtm->rtm_type = RTN_LOCAL;
2899         else
2900                 rtm->rtm_type = RTN_UNICAST;
2901         rtm->rtm_flags = 0;
2902         if (!netif_carrier_ok(rt->dst.dev)) {
2903                 rtm->rtm_flags |= RTNH_F_LINKDOWN;
2904                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
2905                         rtm->rtm_flags |= RTNH_F_DEAD;
2906         }
2907         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2908         rtm->rtm_protocol = rt->rt6i_protocol;
2909         if (rt->rt6i_flags & RTF_DYNAMIC)
2910                 rtm->rtm_protocol = RTPROT_REDIRECT;
2911         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2912                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2913                         rtm->rtm_protocol = RTPROT_RA;
2914                 else
2915                         rtm->rtm_protocol = RTPROT_KERNEL;
2916         }
2917
2918         if (rt->rt6i_flags & RTF_CACHE)
2919                 rtm->rtm_flags |= RTM_F_CLONED;
2920
2921         if (dst) {
2922                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2923                         goto nla_put_failure;
2924                 rtm->rtm_dst_len = 128;
2925         } else if (rtm->rtm_dst_len)
2926                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2927                         goto nla_put_failure;
2928 #ifdef CONFIG_IPV6_SUBTREES
2929         if (src) {
2930                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2931                         goto nla_put_failure;
2932                 rtm->rtm_src_len = 128;
2933         } else if (rtm->rtm_src_len &&
2934                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2935                 goto nla_put_failure;
2936 #endif
2937         if (iif) {
2938 #ifdef CONFIG_IPV6_MROUTE
2939                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2940                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2941                         if (err <= 0) {
2942                                 if (!nowait) {
2943                                         if (err == 0)
2944                                                 return 0;
2945                                         goto nla_put_failure;
2946                                 } else {
2947                                         if (err == -EMSGSIZE)
2948                                                 goto nla_put_failure;
2949                                 }
2950                         }
2951                 } else
2952 #endif
2953                         if (nla_put_u32(skb, RTA_IIF, iif))
2954                                 goto nla_put_failure;
2955         } else if (dst) {
2956                 struct in6_addr saddr_buf;
2957                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2958                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2959                         goto nla_put_failure;
2960         }
2961
2962         if (rt->rt6i_prefsrc.plen) {
2963                 struct in6_addr saddr_buf;
2964                 saddr_buf = rt->rt6i_prefsrc.addr;
2965                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2966                         goto nla_put_failure;
2967         }
2968
2969         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2970         if (rt->rt6i_pmtu)
2971                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2972         if (rtnetlink_put_metrics(skb, metrics) < 0)
2973                 goto nla_put_failure;
2974
2975         if (rt->rt6i_flags & RTF_GATEWAY) {
2976                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2977                         goto nla_put_failure;
2978         }
2979
2980         if (rt->dst.dev &&
2981             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2982                 goto nla_put_failure;
2983         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2984                 goto nla_put_failure;
2985
2986         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2987
2988         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2989                 goto nla_put_failure;
2990
2991         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2992                 goto nla_put_failure;
2993
2994         lwtunnel_fill_encap(skb, rt->dst.lwtstate);
2995
2996         nlmsg_end(skb, nlh);
2997         return 0;
2998
2999 nla_put_failure:
3000         nlmsg_cancel(skb, nlh);
3001         return -EMSGSIZE;
3002 }
3003
3004 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3005 {
3006         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3007         int prefix;
3008
3009         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3010                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3011                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3012         } else
3013                 prefix = 0;
3014
3015         return rt6_fill_node(arg->net,
3016                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3017                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3018                      prefix, 0, NLM_F_MULTI);
3019 }
3020
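/* RTM_GETROUTE handler: look up a single route for the addresses supplied in
 * the request and unicast the resulting RTM_NEWROUTE message to the caller.
 */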
3021 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3022 {
3023         struct net *net = sock_net(in_skb->sk);
3024         struct nlattr *tb[RTA_MAX+1];
3025         struct rt6_info *rt;
3026         struct sk_buff *skb;
3027         struct rtmsg *rtm;
3028         struct flowi6 fl6;
3029         int err, iif = 0, oif = 0;
3030
3031         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3032         if (err < 0)
3033                 goto errout;
3034
3035         err = -EINVAL;
3036         memset(&fl6, 0, sizeof(fl6));
3037
3038         if (tb[RTA_SRC]) {
3039                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3040                         goto errout;
3041
3042                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3043         }
3044
3045         if (tb[RTA_DST]) {
3046                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3047                         goto errout;
3048
3049                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3050         }
3051
3052         if (tb[RTA_IIF])
3053                 iif = nla_get_u32(tb[RTA_IIF]);
3054
3055         if (tb[RTA_OIF])
3056                 oif = nla_get_u32(tb[RTA_OIF]);
3057
3058         if (tb[RTA_MARK])
3059                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3060
3061         if (iif) {
3062                 struct net_device *dev;
3063                 int flags = 0;
3064
3065                 dev = __dev_get_by_index(net, iif);
3066                 if (!dev) {
3067                         err = -ENODEV;
3068                         goto errout;
3069                 }
3070
3071                 fl6.flowi6_iif = iif;
3072
3073                 if (!ipv6_addr_any(&fl6.saddr))
3074                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3075
3076                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3077                                                                flags);
3078         } else {
3079                 fl6.flowi6_oif = oif;
3080
3081                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3082         }
3083
3084         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3085         if (!skb) {
3086                 ip6_rt_put(rt);
3087                 err = -ENOBUFS;
3088                 goto errout;
3089         }
3090
3091         /* Reserve room for dummy headers; this skb can pass
3092          * through a good chunk of the routing engine.
3093          */
3094         skb_reset_mac_header(skb);
3095         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3096
3097         skb_dst_set(skb, &rt->dst);
3098
3099         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3100                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3101                             nlh->nlmsg_seq, 0, 0, 0);
3102         if (err < 0) {
3103                 kfree_skb(skb);
3104                 goto errout;
3105         }
3106
3107         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3108 errout:
3109         return err;
3110 }
3111
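/* Notify RTNLGRP_IPV6_ROUTE listeners about a route change
 * (RTM_NEWROUTE or RTM_DELROUTE).
 */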
3112 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3113 {
3114         struct sk_buff *skb;
3115         struct net *net = info->nl_net;
3116         u32 seq;
3117         int err;
3118
3119         err = -ENOBUFS;
3120         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3121
3122         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3123         if (!skb)
3124                 goto errout;
3125
3126         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3127                                 event, info->portid, seq, 0, 0, 0);
3128         if (err < 0) {
3129                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3130                 WARN_ON(err == -EMSGSIZE);
3131                 kfree_skb(skb);
3132                 goto errout;
3133         }
3134         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3135                     info->nlh, gfp_any());
3136         return;
3137 errout:
3138         if (err < 0)
3139                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3140 }
3141
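/* When the loopback device registers, attach it (and its inet6_dev) to the
 * namespace's special routes: the null entry, plus the prohibit and blackhole
 * entries when multiple tables are enabled.
 */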
3142 static int ip6_route_dev_notify(struct notifier_block *this,
3143                                 unsigned long event, void *ptr)
3144 {
3145         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3146         struct net *net = dev_net(dev);
3147
3148         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3149                 net->ipv6.ip6_null_entry->dst.dev = dev;
3150                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3151 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3152                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3153                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3154                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3155                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3156 #endif
3157         }
3158
3159         return NOTIFY_OK;
3160 }
3161
3162 /*
3163  *      /proc
3164  */
3165
3166 #ifdef CONFIG_PROC_FS
3167
3168 static const struct file_operations ipv6_route_proc_fops = {
3169         .owner          = THIS_MODULE,
3170         .open           = ipv6_route_open,
3171         .read           = seq_read,
3172         .llseek         = seq_lseek,
3173         .release        = seq_release_net,
3174 };
3175
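/* /proc/net/rt6_stats: per-namespace FIB node and route-entry counters. */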
3176 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3177 {
3178         struct net *net = (struct net *)seq->private;
3179         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3180                    net->ipv6.rt6_stats->fib_nodes,
3181                    net->ipv6.rt6_stats->fib_route_nodes,
3182                    net->ipv6.rt6_stats->fib_rt_alloc,
3183                    net->ipv6.rt6_stats->fib_rt_entries,
3184                    net->ipv6.rt6_stats->fib_rt_cache,
3185                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3186                    net->ipv6.rt6_stats->fib_discarded_routes);
3187
3188         return 0;
3189 }
3190
3191 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3192 {
3193         return single_open_net(inode, file, rt6_stats_seq_show);
3194 }
3195
3196 static const struct file_operations rt6_stats_seq_fops = {
3197         .owner   = THIS_MODULE,
3198         .open    = rt6_stats_seq_open,
3199         .read    = seq_read,
3200         .llseek  = seq_lseek,
3201         .release = single_release_net,
3202 };
3203 #endif  /* CONFIG_PROC_FS */
3204
3205 #ifdef CONFIG_SYSCTL
3206
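/* net.ipv6.route.flush is write-only: writing to it forces an immediate
 * garbage-collection pass over cached routes.
 */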
3207 static
3208 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3209                               void __user *buffer, size_t *lenp, loff_t *ppos)
3210 {
3211         struct net *net;
3212         int delay;
3213         if (!write)
3214                 return -EINVAL;
3215
3216         net = (struct net *)ctl->extra1;
3217         delay = net->ipv6.sysctl.flush_delay;
3218         proc_dointvec(ctl, write, buffer, lenp, ppos);
3219         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3220         return 0;
3221 }
3222
3223 struct ctl_table ipv6_route_table_template[] = {
3224         {
3225                 .procname       =       "flush",
3226                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3227                 .maxlen         =       sizeof(int),
3228                 .mode           =       0200,
3229                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3230         },
3231         {
3232                 .procname       =       "gc_thresh",
3233                 .data           =       &ip6_dst_ops_template.gc_thresh,
3234                 .maxlen         =       sizeof(int),
3235                 .mode           =       0644,
3236                 .proc_handler   =       proc_dointvec,
3237         },
3238         {
3239                 .procname       =       "max_size",
3240                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3241                 .maxlen         =       sizeof(int),
3242                 .mode           =       0644,
3243                 .proc_handler   =       proc_dointvec,
3244         },
3245         {
3246                 .procname       =       "gc_min_interval",
3247                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3248                 .maxlen         =       sizeof(int),
3249                 .mode           =       0644,
3250                 .proc_handler   =       proc_dointvec_jiffies,
3251         },
3252         {
3253                 .procname       =       "gc_timeout",
3254                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3255                 .maxlen         =       sizeof(int),
3256                 .mode           =       0644,
3257                 .proc_handler   =       proc_dointvec_jiffies,
3258         },
3259         {
3260                 .procname       =       "gc_interval",
3261                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3262                 .maxlen         =       sizeof(int),
3263                 .mode           =       0644,
3264                 .proc_handler   =       proc_dointvec_jiffies,
3265         },
3266         {
3267                 .procname       =       "gc_elasticity",
3268                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3269                 .maxlen         =       sizeof(int),
3270                 .mode           =       0644,
3271                 .proc_handler   =       proc_dointvec,
3272         },
3273         {
3274                 .procname       =       "mtu_expires",
3275                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3276                 .maxlen         =       sizeof(int),
3277                 .mode           =       0644,
3278                 .proc_handler   =       proc_dointvec_jiffies,
3279         },
3280         {
3281                 .procname       =       "min_adv_mss",
3282                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3283                 .maxlen         =       sizeof(int),
3284                 .mode           =       0644,
3285                 .proc_handler   =       proc_dointvec,
3286         },
3287         {
3288                 .procname       =       "gc_min_interval_ms",
3289                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3290                 .maxlen         =       sizeof(int),
3291                 .mode           =       0644,
3292                 .proc_handler   =       proc_dointvec_ms_jiffies,
3293         },
3294         { }
3295 };
3296
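/* Clone the sysctl template for a namespace and repoint each entry at the
 * per-namespace data; the flush entry is hidden from non-init user namespaces.
 */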
3297 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3298 {
3299         struct ctl_table *table;
3300
3301         table = kmemdup(ipv6_route_table_template,
3302                         sizeof(ipv6_route_table_template),
3303                         GFP_KERNEL);
3304
3305         if (table) {
3306                 table[0].data = &net->ipv6.sysctl.flush_delay;
3307                 table[0].extra1 = net;
3308                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3309                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3310                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3311                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3312                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3313                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3314                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3315                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3316                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3317
3318                 /* Don't export sysctls to unprivileged users */
3319                 if (net->user_ns != &init_user_ns)
3320                         table[0].procname = NULL;
3321         }
3322
3323         return table;
3324 }
3325 #endif
3326
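/* Per-namespace init: copy the dst_ops template, allocate the special
 * null/prohibit/blackhole routes and seed the routing sysctl defaults.
 */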
3327 static int __net_init ip6_route_net_init(struct net *net)
3328 {
3329         int ret = -ENOMEM;
3330
3331         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3332                sizeof(net->ipv6.ip6_dst_ops));
3333
3334         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3335                 goto out_ip6_dst_ops;
3336
3337         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3338                                            sizeof(*net->ipv6.ip6_null_entry),
3339                                            GFP_KERNEL);
3340         if (!net->ipv6.ip6_null_entry)
3341                 goto out_ip6_dst_entries;
3342         net->ipv6.ip6_null_entry->dst.path =
3343                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3344         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3345         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3346                          ip6_template_metrics, true);
3347
3348 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3349         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3350                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3351                                                GFP_KERNEL);
3352         if (!net->ipv6.ip6_prohibit_entry)
3353                 goto out_ip6_null_entry;
3354         net->ipv6.ip6_prohibit_entry->dst.path =
3355                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3356         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3357         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3358                          ip6_template_metrics, true);
3359
3360         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3361                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3362                                                GFP_KERNEL);
3363         if (!net->ipv6.ip6_blk_hole_entry)
3364                 goto out_ip6_prohibit_entry;
3365         net->ipv6.ip6_blk_hole_entry->dst.path =
3366                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3367         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3368         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3369                          ip6_template_metrics, true);
3370 #endif
3371
3372         net->ipv6.sysctl.flush_delay = 0;
3373         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3374         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3375         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3376         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3377         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3378         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3379         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3380
3381         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3382
3383         ret = 0;
3384 out:
3385         return ret;
3386
3387 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3388 out_ip6_prohibit_entry:
3389         kfree(net->ipv6.ip6_prohibit_entry);
3390 out_ip6_null_entry:
3391         kfree(net->ipv6.ip6_null_entry);
3392 #endif
3393 out_ip6_dst_entries:
3394         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3395 out_ip6_dst_ops:
3396         goto out;
3397 }
3398
3399 static void __net_exit ip6_route_net_exit(struct net *net)
3400 {
3401         kfree(net->ipv6.ip6_null_entry);
3402 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3403         kfree(net->ipv6.ip6_prohibit_entry);
3404         kfree(net->ipv6.ip6_blk_hole_entry);
3405 #endif
3406         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3407 }
3408
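/* Late per-namespace init: create /proc/net/ipv6_route and /proc/net/rt6_stats. */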
3409 static int __net_init ip6_route_net_init_late(struct net *net)
3410 {
3411 #ifdef CONFIG_PROC_FS
3412         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3413         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3414 #endif
3415         return 0;
3416 }
3417
3418 static void __net_exit ip6_route_net_exit_late(struct net *net)
3419 {
3420 #ifdef CONFIG_PROC_FS
3421         remove_proc_entry("ipv6_route", net->proc_net);
3422         remove_proc_entry("rt6_stats", net->proc_net);
3423 #endif
3424 }
3425
3426 static struct pernet_operations ip6_route_net_ops = {
3427         .init = ip6_route_net_init,
3428         .exit = ip6_route_net_exit,
3429 };
3430
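/* Per-namespace inetpeer base used to store IPv6 peer information. */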
3431 static int __net_init ipv6_inetpeer_init(struct net *net)
3432 {
3433         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3434
3435         if (!bp)
3436                 return -ENOMEM;
3437         inet_peer_base_init(bp);
3438         net->ipv6.peers = bp;
3439         return 0;
3440 }
3441
3442 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3443 {
3444         struct inet_peer_base *bp = net->ipv6.peers;
3445
3446         net->ipv6.peers = NULL;
3447         inetpeer_invalidate_tree(bp);
3448         kfree(bp);
3449 }
3450
3451 static struct pernet_operations ipv6_inetpeer_ops = {
3452         .init   =       ipv6_inetpeer_init,
3453         .exit   =       ipv6_inetpeer_exit,
3454 };
3455
3456 static struct pernet_operations ip6_route_net_late_ops = {
3457         .init = ip6_route_net_init_late,
3458         .exit = ip6_route_net_exit_late,
3459 };
3460
3461 static struct notifier_block ip6_route_dev_notifier = {
3462         .notifier_call = ip6_route_dev_notify,
3463         .priority = 0,
3464 };
3465
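/* Subsystem init: create the dst cache, register the pernet operations, the
 * FIB, xfrm and policy-rule infrastructure, the rtnetlink handlers and the
 * device notifier, and initialise the per-cpu uncached route lists.
 */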
3466 int __init ip6_route_init(void)
3467 {
3468         int ret;
3469         int cpu;
3470
3471         ret = -ENOMEM;
3472         ip6_dst_ops_template.kmem_cachep =
3473                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3474                                   SLAB_HWCACHE_ALIGN, NULL);
3475         if (!ip6_dst_ops_template.kmem_cachep)
3476                 goto out;
3477
3478         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3479         if (ret)
3480                 goto out_kmem_cache;
3481
3482         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3483         if (ret)
3484                 goto out_dst_entries;
3485
3486         ret = register_pernet_subsys(&ip6_route_net_ops);
3487         if (ret)
3488                 goto out_register_inetpeer;
3489
3490         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3491
3492         /* The loopback device is registered before this code runs, so the
3493          * loopback reference in rt6_info is not taken automatically; take
3494          * it manually for init_net. */
3495         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3496         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3497 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3498         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3499         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3500         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3501         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3502 #endif
3503         ret = fib6_init();
3504         if (ret)
3505                 goto out_register_subsys;
3506
3507         ret = xfrm6_init();
3508         if (ret)
3509                 goto out_fib6_init;
3510
3511         ret = fib6_rules_init();
3512         if (ret)
3513                 goto xfrm6_init;
3514
3515         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3516         if (ret)
3517                 goto fib6_rules_init;
3518
3519         ret = -ENOBUFS;
3520         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3521             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3522             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3523                 goto out_register_late_subsys;
3524
3525         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3526         if (ret)
3527                 goto out_register_late_subsys;
3528
3529         for_each_possible_cpu(cpu) {
3530                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3531
3532                 INIT_LIST_HEAD(&ul->head);
3533                 spin_lock_init(&ul->lock);
3534         }
3535
3536 out:
3537         return ret;
3538
3539 out_register_late_subsys:
3540         unregister_pernet_subsys(&ip6_route_net_late_ops);
3541 fib6_rules_init:
3542         fib6_rules_cleanup();
3543 xfrm6_init:
3544         xfrm6_fini();
3545 out_fib6_init:
3546         fib6_gc_cleanup();
3547 out_register_subsys:
3548         unregister_pernet_subsys(&ip6_route_net_ops);
3549 out_register_inetpeer:
3550         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3551 out_dst_entries:
3552         dst_entries_destroy(&ip6_dst_blackhole_ops);
3553 out_kmem_cache:
3554         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3555         goto out;
3556 }
3557
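/* Undo ip6_route_init(), tearing everything down in the reverse order. */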
3558 void ip6_route_cleanup(void)
3559 {
3560         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3561         unregister_pernet_subsys(&ip6_route_net_late_ops);
3562         fib6_rules_cleanup();
3563         xfrm6_fini();
3564         fib6_gc_cleanup();
3565         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3566         unregister_pernet_subsys(&ip6_route_net_ops);
3567         dst_entries_destroy(&ip6_dst_blackhole_ops);
3568         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3569 }