2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 #include <net/lwtunnel.h>
63 #include <asm/uaccess.h>
66 #include <linux/sysctl.h>
/* Failure members of enum rt6_nud_state: outcomes of the neighbour (NUD)
 * reachability check used when scoring candidate routes.  Negative values
 * distinguish hard failure, "probe first", and "fall back to round-robin".
 * NOTE(review): the enum head and any success member are elided in this
 * listing — confirm against the full file.
 */
70 RT6_NUD_FAIL_HARD = -3,
71 RT6_NUD_FAIL_PROBE = -2,
72 RT6_NUD_FAIL_DO_RR = -1,
/* Forward declarations for the dst_ops callbacks and local helpers that are
 * defined later in this file.
 */
76 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int ip6_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void ip6_dst_destroy(struct dst_entry *);
82 static void ip6_dst_ifdown(struct dst_entry *,
83 struct net_device *dev, int how);
84 static int ip6_dst_gc(struct dst_ops *ops);
86 static int ip6_pkt_discard(struct sk_buff *skb);
87 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
88 static int ip6_pkt_prohibit(struct sk_buff *skb);
89 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
90 static void ip6_link_failure(struct sk_buff *skb);
91 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
92 struct sk_buff *skb, u32 mtu);
93 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
95 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
96 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
/* Helpers for RFC 4191 Route Information options; only compiled in when
 * router-advertised specific routes are enabled.
 */
98 #ifdef CONFIG_IPV6_ROUTE_INFO
99 static struct rt6_info *rt6_add_route_info(struct net *net,
100 const struct in6_addr *prefix, int prefixlen,
101 const struct in6_addr *gwaddr, int ifindex,
103 static struct rt6_info *rt6_get_route_info(struct net *net,
104 const struct in6_addr *prefix, int prefixlen,
105 const struct in6_addr *gwaddr, int ifindex);
/* Per-cpu list of "uncached" routes (DST_NOCACHE clones that live outside
 * the fib6 tree); each cpu has its own head (and, presumably, a lock —
 * the lock member line is elided in this listing).
 */
108 struct uncached_list {
110 struct list_head head;
113 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Track an uncached route on this cpu's rt6_uncached_list so it can be
 * found when its device goes away.  Marks the dst DST_NOCACHE and records
 * which per-cpu list it was placed on (the route may later be deleted from
 * a different cpu).
 */
115 static void rt6_uncached_list_add(struct rt6_info *rt)
117 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
119 rt->dst.flags |= DST_NOCACHE;
120 rt->rt6i_uncached_list = ul;
122 spin_lock_bh(&ul->lock);
123 list_add_tail(&rt->rt6i_uncached, &ul->head);
124 spin_unlock_bh(&ul->lock);
/* Unlink a route from the uncached list it was added to.  Safe to call on
 * routes that were never added (list_empty check).
 */
127 static void rt6_uncached_list_del(struct rt6_info *rt)
129 if (!list_empty(&rt->rt6i_uncached)) {
130 struct uncached_list *ul = rt->rt6i_uncached_list;
132 spin_lock_bh(&ul->lock);
133 list_del(&rt->rt6i_uncached);
134 spin_unlock_bh(&ul->lock);
/* On device unregister (or netns teardown when dev == NULL), repoint every
 * uncached route that references @dev at the namespace's loopback device so
 * the real device's refcounts can drop.  Both the inet6_dev and the dst.dev
 * references are migrated, each with proper get/put pairing.
 */
138 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
140 struct net_device *loopback_dev = net->loopback_dev;
143 for_each_possible_cpu(cpu) {
144 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
147 spin_lock_bh(&ul->lock);
148 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
149 struct inet6_dev *rt_idev = rt->rt6i_idev;
150 struct net_device *rt_dev = rt->dst.dev;
152 if (rt_idev && (rt_idev->dev == dev || !dev) &&
153 rt_idev->dev != loopback_dev) {
154 rt->rt6i_idev = in6_dev_get(loopback_dev);
155 in6_dev_put(rt_idev);
158 if (rt_dev && (rt_dev == dev || !dev) &&
159 rt_dev != loopback_dev) {
160 rt->dst.dev = loopback_dev;
161 dev_hold(rt->dst.dev);
/* NOTE(review): the dev_put() on the old rt_dev appears to be on a line
 * elided from this listing — verify against the full file.
 */
165 spin_unlock_bh(&ul->lock);
/* For a per-cpu (RTF_PCPU) clone, metrics writes go to the parent route it
 * was cloned from (dst.from), not to the clone itself.
 */
169 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
171 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics: pick where a copy-on-write of the metrics block
 * should land depending on the route flavor (PCPU clone vs RTF_CACHE clone
 * vs plain fib entry).  The RTF_CACHE branch body is elided in this listing.
 */
174 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
176 struct rt6_info *rt = (struct rt6_info *)dst;
178 if (rt->rt6i_flags & RTF_PCPU)
179 return rt6_pcpu_cow_metrics(rt);
180 else if (rt->rt6i_flags & RTF_CACHE)
183 return dst_cow_metrics_generic(dst, old);
/* Choose the address to resolve via neighbour discovery: the route's
 * gateway when one is set, otherwise fall back to the packet's destination
 * address (the skb/daddr fallbacks in between are elided here).
 */
186 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
190 struct in6_addr *p = &rt->rt6i_gateway;
192 if (!ipv6_addr_any(p))
193 return (const void *) p;
195 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup: find (or create) the neighbour entry for the
 * next-hop address on the dst's device.
 */
199 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
203 struct rt6_info *rt = (struct rt6_info *) dst;
206 daddr = choose_neigh_daddr(rt, skb, daddr);
207 n = __ipv6_neigh_lookup(dst->dev, daddr);
210 return neigh_create(&nd_tbl, daddr, dst->dev);
/* Template dst_ops for normal IPv6 routes; copied into each netns'
 * ip6_dst_ops.  Wires the callbacks defined in this file into the generic
 * dst layer.
 */
213 static struct dst_ops ip6_dst_ops_template = {
217 .check = ip6_dst_check,
218 .default_advmss = ip6_default_advmss,
220 .cow_metrics = ipv6_cow_metrics,
221 .destroy = ip6_dst_destroy,
222 .ifdown = ip6_dst_ifdown,
223 .negative_advice = ip6_negative_advice,
224 .link_failure = ip6_link_failure,
225 .update_pmtu = ip6_rt_update_pmtu,
226 .redirect = rt6_do_redirect,
227 .local_out = __ip6_local_out,
228 .neigh_lookup = ip6_neigh_lookup,
/* MTU callback for blackhole dsts: the raw RTAX_MTU metric if set,
 * otherwise the device MTU.
 */
231 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
233 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
235 return mtu ? : dst->dev->mtu;
/* Blackhole dsts deliberately ignore PMTU updates, redirects and metric
 * writes — these are intentionally-empty / dummy implementations.
 */
238 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
239 struct sk_buff *skb, u32 mtu)
243 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
248 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops for blackhole routes (used by ip6_blackhole_route / xfrm). */
254 static struct dst_ops ip6_dst_blackhole_ops = {
256 .destroy = ip6_dst_destroy,
257 .check = ip6_dst_check,
258 .mtu = ip6_blackhole_mtu,
259 .default_advmss = ip6_default_advmss,
260 .update_pmtu = ip6_rt_blackhole_update_pmtu,
261 .redirect = ip6_rt_blackhole_redirect,
262 .cow_metrics = ip6_rt_blackhole_cow_metrics,
263 .neigh_lookup = ip6_neigh_lookup,
/* Initial metrics for the template routes below (hop limit 0 = "use
 * default").
 */
266 static const u32 ip6_template_metrics[RTAX_MAX] = {
267 [RTAX_HOPLIMIT - 1] = 0,
/* The always-present "null" route: matched when nothing else does; input/
 * output handlers discard the packet and report -ENETUNREACH.  Metric is
 * ~0 so any real route wins.
 */
270 static const struct rt6_info ip6_null_entry_template = {
272 .__refcnt = ATOMIC_INIT(1),
274 .obsolete = DST_OBSOLETE_FORCE_CHK,
275 .error = -ENETUNREACH,
276 .input = ip6_pkt_discard,
277 .output = ip6_pkt_discard_out,
279 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
280 .rt6i_protocol = RTPROT_KERNEL,
281 .rt6i_metric = ~(u32) 0,
282 .rt6i_ref = ATOMIC_INIT(1),
/* With policy routing enabled, two more terminal routes exist: "prohibit"
 * (administratively denied — sends an ICMP error via ip6_pkt_prohibit*) and
 * "blackhole" (silently discards via dst_discard).  Shaped like
 * ip6_null_entry_template above.
 */
285 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
287 static const struct rt6_info ip6_prohibit_entry_template = {
289 .__refcnt = ATOMIC_INIT(1),
291 .obsolete = DST_OBSOLETE_FORCE_CHK,
293 .input = ip6_pkt_prohibit,
294 .output = ip6_pkt_prohibit_out,
296 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
297 .rt6i_protocol = RTPROT_KERNEL,
298 .rt6i_metric = ~(u32) 0,
299 .rt6i_ref = ATOMIC_INIT(1),
302 static const struct rt6_info ip6_blk_hole_entry_template = {
304 .__refcnt = ATOMIC_INIT(1),
306 .obsolete = DST_OBSOLETE_FORCE_CHK,
308 .input = dst_discard,
309 .output = dst_discard_sk,
311 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
312 .rt6i_protocol = RTPROT_KERNEL,
313 .rt6i_metric = ~(u32) 0,
314 .rt6i_ref = ATOMIC_INIT(1),
319 /* allocate dst with ip6_dst_ops */
/* Low-level allocator: dst_alloc() an rt6_info against this netns' dst_ops,
 * zero everything past the embedded dst_entry, and init the sibling /
 * uncached list heads.
 */
320 static struct rt6_info *__ip6_dst_alloc(struct net *net,
321 struct net_device *dev,
323 struct fib6_table *table)
325 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
326 0, DST_OBSOLETE_FORCE_CHK, flags);
329 struct dst_entry *dst = &rt->dst;
/* memset only the rt6_info-specific tail; the dst_entry part was already
 * initialised by dst_alloc().
 */
331 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
332 INIT_LIST_HEAD(&rt->rt6i_siblings);
333 INIT_LIST_HEAD(&rt->rt6i_uncached);
/* Full allocator: __ip6_dst_alloc() plus the per-cpu clone pointer array
 * (rt6i_pcpu).  On percpu allocation failure the route is destroyed (the
 * error-path lines between 354 and 358 are elided in this listing).
 */
338 static struct rt6_info *ip6_dst_alloc(struct net *net,
339 struct net_device *dev,
341 struct fib6_table *table)
343 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
346 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
350 for_each_possible_cpu(cpu) {
353 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
354 /* no one shares rt */
358 dst_destroy((struct dst_entry *)rt);
/* dst_ops->destroy: release everything an rt6_info owns — generic metrics,
 * the per-cpu clone array, the uncached-list membership, and the inet6_dev
 * reference (the in6_dev_put and dst.from release lines are elided here).
 */
366 static void ip6_dst_destroy(struct dst_entry *dst)
368 struct rt6_info *rt = (struct rt6_info *)dst;
369 struct dst_entry *from = dst->from;
370 struct inet6_dev *idev;
372 dst_destroy_metrics_generic(dst);
373 free_percpu(rt->rt6i_pcpu);
374 rt6_uncached_list_del(rt);
376 idev = rt->rt6i_idev;
378 rt->rt6i_idev = NULL;
/* dst_ops->ifdown: when the route's device goes down, repoint its inet6_dev
 * reference at the loopback device of the same netns so the device can be
 * released (the in6_dev_put of the old idev is on an elided line).
 */
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
389 struct rt6_info *rt = (struct rt6_info *)dst;
390 struct inet6_dev *idev = rt->rt6i_idev;
391 struct net_device *loopback_dev =
392 dev_net(dev)->loopback_dev;
394 if (dev != loopback_dev) {
395 if (idev && idev->dev == dev) {
396 struct inet6_dev *loopback_idev =
397 in6_dev_get(loopback_dev);
399 rt->rt6i_idev = loopback_idev;
/* A route is expired either by its own RTF_EXPIRES timestamp or,
 * recursively, by expiry of the parent route it was cloned from (dst.from).
 */
406 static bool rt6_check_expired(const struct rt6_info *rt)
408 if (rt->rt6i_flags & RTF_EXPIRES) {
409 if (time_after(jiffies, rt->dst.expires))
411 } else if (rt->dst.from) {
412 return rt6_check_expired((struct rt6_info *) rt->dst.from);
417 /* Multipath route selection:
418 * Hash based function using packet header and flowlabel.
419 * Adapted from fib_info_hashfn()
/* Hash the flow (proto, addresses, L4 ports or ICMP type/code, flow label)
 * into [0, candidate_count) to pick an ECMP sibling deterministically per
 * flow.
 */
421 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
422 const struct flowi6 *fl6)
424 unsigned int val = fl6->flowi6_proto;
426 val ^= ipv6_addr_hash(&fl6->daddr);
427 val ^= ipv6_addr_hash(&fl6->saddr);
429 /* Work only if this not encapsulated */
430 switch (fl6->flowi6_proto) {
434 val ^= (__force u16)fl6->fl6_sport;
435 val ^= (__force u16)fl6->fl6_dport;
439 val ^= (__force u16)fl6->fl6_icmp_type;
440 val ^= (__force u16)fl6->fl6_icmp_code;
443 /* RFC6438 recommands to use flowlabel */
444 val ^= (__force u32)fl6->flowlabel;
446 /* Perhaps, we need to tune, this function? */
447 val = val ^ (val >> 7) ^ (val >> 12);
448 return val % candidate_count;
/* Among an ECMP group (match + its rt6i_siblings), pick the sibling chosen
 * by the flow hash, skipping siblings whose score is negative.  Index 0
 * means "keep the first match" since the sibling list excludes match itself.
 */
451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452 struct flowi6 *fl6, int oif,
455 struct rt6_info *sibling, *next_sibling;
458 route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
459 /* Don't change the route, if route_choosen == 0
460 * (siblings does not include ourself)
463 list_for_each_entry_safe(sibling, next_sibling,
464 &match->rt6i_siblings, rt6i_siblings) {
466 if (route_choosen == 0) {
467 if (rt6_score_route(sibling, oif, strict) < 0)
477 * Route lookup. Any table->tb6_lock is implied.
/* Walk the leaf chain of a fib6 node and pick the entry matching the
 * requested outgoing interface (and/or source address).  Loopback-bound
 * entries are remembered as a fallback in @local.  With
 * RT6_LOOKUP_F_IFACE and no match, the null entry is returned.
 */
480 static inline struct rt6_info *rt6_device_match(struct net *net,
482 const struct in6_addr *saddr,
486 struct rt6_info *local = NULL;
487 struct rt6_info *sprt;
489 if (!oif && ipv6_addr_any(saddr))
492 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
493 struct net_device *dev = sprt->dst.dev;
496 if (dev->ifindex == oif)
498 if (dev->flags & IFF_LOOPBACK) {
499 if (!sprt->rt6i_idev ||
500 sprt->rt6i_idev->dev->ifindex != oif) {
501 if (flags & RT6_LOOKUP_F_IFACE && oif)
503 if (local && (!oif ||
504 local->rt6i_idev->dev->ifindex == oif))
510 if (ipv6_chk_addr(net, saddr, dev,
511 flags & RT6_LOOKUP_F_IFACE))
520 if (flags & RT6_LOOKUP_F_IFACE)
521 return net->ipv6.ip6_null_entry;
527 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work payload for a router reachability probe: which address to
 * solicit and on which device (the device reference is held by the caller).
 */
528 struct __rt6_probe_work {
529 struct work_struct work;
530 struct in6_addr target;
531 struct net_device *dev;
/* Workqueue handler: send a Neighbour Solicitation to the target's
 * solicited-node multicast address, then (on elided lines, presumably)
 * drop the device reference and free the work item.
 */
534 static void rt6_probe_deferred(struct work_struct *w)
536 struct in6_addr mcaddr;
537 struct __rt6_probe_work *work =
538 container_of(w, struct __rt6_probe_work, work);
540 addrconf_addr_solict_mult(&work->target, &mcaddr);
541 ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
/* Router Reachability Probing (RFC 4191 4.2 / RFC 2461 6.3.6): if the
 * gateway's neighbour entry is not VALID and the rate limit
 * (rtr_probe_interval) allows, schedule a deferred NS probe.  Only applies
 * to RTF_GATEWAY routes.
 */
546 static void rt6_probe(struct rt6_info *rt)
548 struct __rt6_probe_work *work;
549 struct neighbour *neigh;
551 * Okay, this does not seem to be appropriate
552 * for now, however, we need to check if it
553 * is really so; aka Router Reachability Probing.
555 * Router Reachability Probe MUST be rate-limited
556 * to no more than one per minute.
558 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
561 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
563 if (neigh->nud_state & NUD_VALID)
567 write_lock(&neigh->lock);
568 if (!(neigh->nud_state & NUD_VALID) &&
571 rt->rt6i_idev->cnf.rtr_probe_interval)) {
572 work = kmalloc(sizeof(*work), GFP_ATOMIC);
574 __neigh_set_probe_once(neigh);
576 write_unlock(&neigh->lock);
/* No neighbour entry yet: probe unconditionally (no rate-limit state to
 * consult).
 */
578 work = kmalloc(sizeof(*work), GFP_ATOMIC);
582 INIT_WORK(&work->work, rt6_probe_deferred);
583 work->target = rt->rt6i_gateway;
/* Hold the device across the deferred work; released by the handler. */
584 dev_hold(rt->dst.dev);
585 work->dev = rt->dst.dev;
586 schedule_work(&work->work);
590 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF stub: probing compiled out. */
593 static inline void rt6_probe(struct rt6_info *rt)
599 * Default Router Selection (RFC 2461 6.3.6)
/* Score the device match: 2 for an exact oif match (elided return),
 * partial credit for loopback routes bound to the right interface,
 * 0 otherwise.
 */
601 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
603 struct net_device *dev = rt->dst.dev;
604 if (!oif || dev->ifindex == oif)
606 if ((dev->flags & IFF_LOOPBACK) &&
607 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Classify the gateway's neighbour state for route scoring.  Non-gateway
 * routes trivially succeed.  With router preferences enabled, an existing
 * but not-yet-FAILED entry still counts as reachable (it will be probed);
 * a missing entry yields SUCCEED or "do round-robin" depending on config.
 */
612 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
614 struct neighbour *neigh;
615 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
617 if (rt->rt6i_flags & RTF_NONEXTHOP ||
618 !(rt->rt6i_flags & RTF_GATEWAY))
619 return RT6_NUD_SUCCEED;
622 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
624 read_lock(&neigh->lock);
625 if (neigh->nud_state & NUD_VALID)
626 ret = RT6_NUD_SUCCEED;
627 #ifdef CONFIG_IPV6_ROUTER_PREF
628 else if (!(neigh->nud_state & NUD_FAILED))
629 ret = RT6_NUD_SUCCEED;
631 ret = RT6_NUD_FAIL_PROBE;
633 read_unlock(&neigh->lock);
635 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
636 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
638 rcu_read_unlock_bh();
/* Compute a route's selection score: device match in the low bits, decoded
 * router preference (RFC 4191) shifted in above, and — when reachability is
 * required — the neighbour-check result folded in (the combining lines
 * after 655 are elided in this listing).  Negative return = reject.
 */
643 static int rt6_score_route(struct rt6_info *rt, int oif,
648 m = rt6_check_dev(rt, oif);
649 if (!m && (strict & RT6_LOOKUP_F_IFACE))
650 return RT6_NUD_FAIL_HARD;
651 #ifdef CONFIG_IPV6_ROUTER_PREF
652 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
654 if (strict & RT6_LOOKUP_F_REACHABLE) {
655 int n = rt6_check_neigh(rt);
/* Compare @rt against the current best (@match, *mpri) and return the new
 * best.  Skips expired routes and (per ignore_routes_with_linkdown) routes
 * whose carrier is down.  A RT6_NUD_FAIL_DO_RR score is accepted at lowest
 * priority but flags *do_rr so the caller rotates the round-robin pointer.
 */
662 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
663 int *mpri, struct rt6_info *match,
667 bool match_do_rr = false;
668 struct inet6_dev *idev = rt->rt6i_idev;
669 struct net_device *dev = rt->dst.dev;
671 if (dev && !netif_carrier_ok(dev) &&
672 idev->cnf.ignore_routes_with_linkdown)
675 if (rt6_check_expired(rt))
678 m = rt6_score_route(rt, oif, strict);
679 if (m == RT6_NUD_FAIL_DO_RR) {
681 m = 0; /* lowest valid score */
682 } else if (m == RT6_NUD_FAIL_HARD) {
686 if (strict & RT6_LOOKUP_F_REACHABLE)
689 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
691 *do_rr = match_do_rr;
/* Scan all same-metric routes of a fib6 node for the best-scoring one,
 * starting at the round-robin head (rr_head) and wrapping around through
 * fn->leaf; @cont collects any later, higher-metric continuation chain.
 */
699 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
700 struct rt6_info *rr_head,
701 u32 metric, int oif, int strict,
704 struct rt6_info *rt, *match, *cont;
709 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
710 if (rt->rt6i_metric != metric) {
715 match = find_match(rt, oif, strict, &mpri, match, do_rr);
718 for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
719 if (rt->rt6i_metric != metric) {
724 match = find_match(rt, oif, strict, &mpri, match, do_rr);
730 for (rt = cont; rt; rt = rt->dst.rt6_next)
731 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Default Router Selection entry point: pick the best route under fn,
 * advancing fn->rr_ptr round-robin when find_rr_leaf requested it.  Falls
 * back to the netns null entry when nothing matched.
 */
736 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
738 struct rt6_info *match, *rt0;
744 fn->rr_ptr = rt0 = fn->leaf;
746 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
750 struct rt6_info *next = rt0->dst.rt6_next;
752 /* no entries matched; do round-robin */
753 if (!next || next->rt6i_metric != rt0->rt6i_metric)
760 net = dev_net(rt0->dst.dev);
761 return match ? match : net->ipv6.ip6_null_entry;
/* True when the route has a gateway or needs no next hop (so its
 * destination is not directly the on-link target address).
 */
764 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
766 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
769 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information option received in a Router
 * Advertisement from @gwaddr on @dev: validate length/prefix_len, decode
 * the preference and lifetime, then add/update/delete the corresponding
 * specific route (lifetime 0 deletes; infinite lifetime clears expiry).
 */
770 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
771 const struct in6_addr *gwaddr)
773 struct net *net = dev_net(dev);
774 struct route_info *rinfo = (struct route_info *) opt;
775 struct in6_addr prefix_buf, *prefix;
777 unsigned long lifetime;
780 if (len < sizeof(struct route_info)) {
784 /* Sanity check for prefix_len and length */
785 if (rinfo->length > 3) {
787 } else if (rinfo->prefix_len > 128) {
789 } else if (rinfo->prefix_len > 64) {
790 if (rinfo->length < 2) {
793 } else if (rinfo->prefix_len > 0) {
794 if (rinfo->length < 1) {
799 pref = rinfo->route_pref;
800 if (pref == ICMPV6_ROUTER_PREF_INVALID)
803 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means a full 16-byte prefix is present; otherwise copy just
 * the significant prefix bits into a local buffer.
 */
805 if (rinfo->length == 3)
806 prefix = (struct in6_addr *)rinfo->prefix;
808 /* this function is safe */
809 ipv6_addr_prefix(&prefix_buf,
810 (struct in6_addr *)rinfo->prefix,
812 prefix = &prefix_buf;
/* prefix_len 0 = default route entry for this router. */
815 if (rinfo->prefix_len == 0)
816 rt = rt6_get_dflt_router(gwaddr, dev);
818 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
819 gwaddr, dev->ifindex);
821 if (rt && !lifetime) {
827 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
830 rt->rt6i_flags = RTF_ROUTEINFO |
831 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
834 if (!addrconf_finite_timeout(lifetime))
835 rt6_clean_expires(rt);
837 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* Walk back up the fib6 tree from @fn looking for the next node that
 * carries route info (RTN_RTINFO), descending into source-address subtrees
 * (CONFIG_IPV6_SUBTREES) where present.  Stops at the tree root.
 */
845 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
846 struct in6_addr *saddr)
848 struct fib6_node *pn;
850 if (fn->fn_flags & RTN_TL_ROOT)
853 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
854 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
857 if (fn->fn_flags & RTN_RTINFO)
/* Flavor of route lookup used by fib6 rules: longest-prefix match in one
 * table, filtered by device/multipath, backtracking on a null-entry result.
 * Takes a reference on the returned dst (dst_use) under tb6_lock.
 */
862 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
863 struct fib6_table *table,
864 struct flowi6 *fl6, int flags)
866 struct fib6_node *fn;
869 read_lock_bh(&table->tb6_lock);
870 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
873 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
874 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
875 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
876 if (rt == net->ipv6.ip6_null_entry) {
877 fn = fib6_backtrack(fn, &fl6->saddr);
881 dst_use(&rt->dst, jiffies);
882 read_unlock_bh(&table->tb6_lock);
/* Public wrapper: run the lookup through the fib6 policy-rule engine. */
887 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
890 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
892 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by daddr/saddr/oif used across the IPv6 stack; builds
 * a flowi6 and delegates to ip6_pol_route_lookup via the rule engine.
 */
894 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
895 const struct in6_addr *saddr, int oif, int strict)
897 struct flowi6 fl6 = {
901 struct dst_entry *dst;
902 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
905 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
906 flags |= RT6_LOOKUP_F_HAS_SADDR;
909 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
911 return (struct rt6_info *) dst;
917 EXPORT_SYMBOL(rt6_lookup);
919 /* ip6_ins_rt is called with FREE table->tb6_lock.
920 It takes new route entry, the addition fails by any reason the
921 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its fib6 table under the table write lock, passing
 * netlink notification info and pre-validated metrics through to fib6_add.
 */
925 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
926 struct mx6_config *mxc)
929 struct fib6_table *table;
931 table = rt->rt6i_table;
932 write_lock_bh(&table->tb6_lock);
933 err = fib6_add(&table->tb6_root, rt, info, mxc);
934 write_unlock_bh(&table->tb6_lock);
/* Kernel-internal insertion: no netlink attributes, empty metrics. */
939 int ip6_ins_rt(struct rt6_info *rt)
941 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
942 struct mx6_config mxc = { .mx = NULL, };
944 return __ip6_ins_rt(rt, &info, &mxc);
/* Clone @ort into an RTF_CACHE host route for @daddr (plen 128).  If @ort
 * is itself a clone, clone from its parent (dst.from) instead.  Direct
 * (non-gateway) clones of a shorter prefix that exactly match daddr are
 * flagged RTF_ANYCAST; with subtrees, the source key is pinned too.
 */
947 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
948 const struct in6_addr *daddr,
949 const struct in6_addr *saddr)
957 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
958 ort = (struct rt6_info *)ort->dst.from;
960 rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
966 ip6_rt_copy_init(rt, ort);
967 rt->rt6i_flags |= RTF_CACHE;
969 rt->dst.flags |= DST_HOST;
970 rt->rt6i_dst.addr = *daddr;
971 rt->rt6i_dst.plen = 128;
973 if (!rt6_is_gw_or_nonexthop(ort)) {
974 if (ort->rt6i_dst.plen != 128 &&
975 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
976 rt->rt6i_flags |= RTF_ANYCAST;
977 #ifdef CONFIG_IPV6_SUBTREES
978 if (rt->rt6i_src.plen && saddr) {
979 rt->rt6i_src.addr = *saddr;
980 rt->rt6i_src.plen = 128;
/* Clone @rt into a per-cpu (RTF_PCPU) copy sharing the parent's flags and
 * protocol; used to avoid refcount contention on hot fib entries.
 */
988 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
990 struct rt6_info *pcpu_rt;
992 pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
993 rt->dst.dev, rt->dst.flags,
998 ip6_rt_copy_init(pcpu_rt, rt);
999 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1000 pcpu_rt->rt6i_flags |= RTF_PCPU;
1004 /* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Return this cpu's cached clone of @rt, creating it on first use.  The
 * cmpxchg handles the race where another context on this cpu installed a
 * clone first — ours is destroyed and theirs returned.  Falls back to the
 * null entry if allocation fails.  Takes a dst reference for the caller.
 */
1005 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1007 struct rt6_info *pcpu_rt, *prev, **p;
1009 p = this_cpu_ptr(rt->rt6i_pcpu);
1015 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1017 struct net *net = dev_net(rt->dst.dev);
1019 pcpu_rt = net->ipv6.ip6_null_entry;
1023 prev = cmpxchg(p, NULL, pcpu_rt);
1025 /* If someone did it before us, return prev instead */
1026 dst_destroy(&pcpu_rt->dst);
1031 dst_hold(&pcpu_rt->dst);
1032 rt6_dst_from_metrics_check(pcpu_rt);
/* Core policy-routing lookup shared by the input and output paths.
 * Selects the best route (reachability-strict first when forwarding is
 * off, then relaxed), resolves ECMP siblings, and returns one of three
 * flavors: the fib entry itself (null/RTF_CACHE), a one-off uncached
 * RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH without a gateway), or this cpu's
 * per-cpu clone.  Always returns with a dst reference held.
 */
1036 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1037 struct flowi6 *fl6, int flags)
1039 struct fib6_node *fn, *saved_fn;
1040 struct rt6_info *rt;
1043 strict |= flags & RT6_LOOKUP_F_IFACE;
1044 if (net->ipv6.devconf_all->forwarding == 0)
1045 strict |= RT6_LOOKUP_F_REACHABLE;
1047 read_lock_bh(&table->tb6_lock);
1049 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1053 rt = rt6_select(fn, oif, strict);
1054 if (rt->rt6i_nsiblings)
1055 rt = rt6_multipath_select(rt, fl6, oif, strict);
1056 if (rt == net->ipv6.ip6_null_entry) {
1057 fn = fib6_backtrack(fn, &fl6->saddr);
1059 goto redo_rt6_select;
1060 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1061 /* also consider unreachable route */
1062 strict &= ~RT6_LOOKUP_F_REACHABLE;
1064 goto redo_rt6_select;
1069 if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1070 dst_use(&rt->dst, jiffies);
1071 read_unlock_bh(&table->tb6_lock);
1073 rt6_dst_from_metrics_check(rt);
1075 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1076 !(rt->rt6i_flags & RTF_GATEWAY))) {
1077 /* Create a RTF_CACHE clone which will not be
1078 * owned by the fib6 tree. It is for the special case where
1079 * the daddr in the skb during the neighbor look-up is different
1080 * from the fl6->daddr used to look-up route here.
1083 struct rt6_info *uncached_rt;
/* Hold the fib entry across the unlocked clone, then release it. */
1085 dst_use(&rt->dst, jiffies);
1086 read_unlock_bh(&table->tb6_lock);
1088 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1089 dst_release(&rt->dst);
1092 rt6_uncached_list_add(uncached_rt);
1094 uncached_rt = net->ipv6.ip6_null_entry;
1096 dst_hold(&uncached_rt->dst);
1100 /* Get a percpu copy */
1102 struct rt6_info *pcpu_rt;
1104 rt->dst.lastuse = jiffies;
1106 pcpu_rt = rt6_get_pcpu_route(rt);
1107 read_unlock_bh(&table->tb6_lock);
/* Input-path flavor: route on the incoming interface index. */
1113 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1114 struct flowi6 *fl6, int flags)
1116 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Input lookup helper: link-local/multicast destinations require a strict
 * interface match, except on PIM register pseudo-devices.
 */
1119 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1120 struct net_device *dev,
1121 struct flowi6 *fl6, int flags)
1123 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1124 flags |= RT6_LOOKUP_F_IFACE;
1126 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/* Attach a route to an incoming skb: build a flowi6 from the IPv6 header
 * (addresses, flow label, mark, next header) and store the lookup result
 * as the skb's dst.
 */
1129 void ip6_route_input(struct sk_buff *skb)
1131 const struct ipv6hdr *iph = ipv6_hdr(skb);
1132 struct net *net = dev_net(skb->dev);
1133 int flags = RT6_LOOKUP_F_HAS_SADDR;
1134 struct flowi6 fl6 = {
1135 .flowi6_iif = skb->dev->ifindex,
1136 .daddr = iph->daddr,
1137 .saddr = iph->saddr,
1138 .flowlabel = ip6_flowinfo(iph),
1139 .flowi6_mark = skb->mark,
1140 .flowi6_proto = iph->nexthdr,
1143 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path flavor: route on the requested outgoing interface index. */
1146 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1147 struct flowi6 *fl6, int flags)
1149 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Public output route lookup: derives strictness from socket binding /
 * destination scope, source-address presence, and the socket's address
 * preferences, then runs the rule engine.
 */
1152 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1157 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1159 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1160 flags |= RT6_LOOKUP_F_IFACE;
1162 if (!ipv6_addr_any(&fl6->saddr))
1163 flags |= RT6_LOOKUP_F_HAS_SADDR;
1165 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1167 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1169 EXPORT_SYMBOL(ip6_route_output);
/* Convert @dst_orig into a blackhole dst (used e.g. by xfrm when a policy
 * requires packets be dropped while resolution is pending): allocate
 * against ip6_dst_blackhole_ops, copy the routing identity (idev, gateway,
 * flags, keys, metrics) from the original, and point input/output at
 * discard handlers.  Consumes the reference on @dst_orig.
 */
1171 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1173 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1174 struct dst_entry *new = NULL;
1176 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1180 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1183 new->input = dst_discard;
1184 new->output = dst_discard_sk;
/* Read-only metrics can be shared by pointer; writable ones are copied. */
1186 if (dst_metrics_read_only(&ort->dst))
1187 new->_metrics = ort->dst._metrics;
1189 dst_copy_metrics(new, &ort->dst);
1190 rt->rt6i_idev = ort->rt6i_idev;
1192 in6_dev_hold(rt->rt6i_idev);
1194 rt->rt6i_gateway = ort->rt6i_gateway;
1195 rt->rt6i_flags = ort->rt6i_flags;
1196 rt->rt6i_metric = 0;
1198 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1199 #ifdef CONFIG_IPV6_SUBTREES
1200 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1206 dst_release(dst_orig);
1207 return new ? new : ERR_PTR(-ENOMEM);
1211 * Destination cache support functions
/* Keep a clone's metrics pointer in sync with its parent (dst.from): if the
 * parent's metrics were COW'd, re-point the clone at the new block.
 */
1214 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1217 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1218 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* A fib-owned route is still valid if it is attached to a tree node whose
 * serial number matches the caller's cookie and it has not expired.
 */
1221 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1223 if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1226 if (rt6_check_expired(rt))
/* Validate a clone via its parent: the clone is good only while the parent
 * fib entry it was taken from still passes rt6_check().
 */
1232 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1234 if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1235 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check: called on every cached-dst use because all IPv6 dsts are
 * created DST_OBSOLETE_FORCE_CHK.  Clones (PCPU/uncached) validate through
 * their parent; fib entries validate directly.
 */
1241 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1243 struct rt6_info *rt;
1245 rt = (struct rt6_info *) dst;
1247 /* All IPV6 dsts are created with ->obsolete set to the value
1248 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1249 * into this function always.
1252 rt6_dst_from_metrics_check(rt);
1254 if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1255 return rt6_dst_from_check(rt, cookie);
1257 return rt6_check(rt, cookie);
/* dst_ops->negative_advice: on TCP's advice that a dst looks bad, drop an
 * expired RTF_CACHE clone (deletion body elided in this listing).
 */
1260 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1262 struct rt6_info *rt = (struct rt6_info *) dst;
1265 if (rt->rt6i_flags & RTF_CACHE) {
1266 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure: report unreachability to the sender and
 * invalidate the offending route — delete a cache clone outright, or bump
 * the node serial number of a default route so cached sockets revalidate.
 */
1278 static void ip6_link_failure(struct sk_buff *skb)
1280 struct rt6_info *rt;
1282 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1284 rt = (struct rt6_info *) skb_dst(skb);
1286 if (rt->rt6i_flags & RTF_CACHE) {
1290 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1291 rt->rt6i_node->fn_sernum = -1;
/* Record a learned path MTU on a cache clone and arm its expiry timer
 * (ip6_rt_mtu_expires).
 */
1296 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1298 struct net *net = dev_net(rt->dst.dev);
1300 rt->rt6i_flags |= RTF_MODIFIED;
1301 rt->rt6i_pmtu = mtu;
1302 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* Apply a Packet Too Big report: clamp to IPV6_MIN_MTU, ignore increases
 * and local routes.  A cache clone is updated in place; otherwise a new
 * RTF_CACHE clone is created for the flow (addresses taken from the ICMP
 * header or, failing that, the socket) and inserted so subsequent lookups
 * see the reduced MTU.
 */
1305 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1306 const struct ipv6hdr *iph, u32 mtu)
1308 struct rt6_info *rt6 = (struct rt6_info *)dst;
1310 if (rt6->rt6i_flags & RTF_LOCAL)
1314 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1315 if (mtu >= dst_mtu(dst))
1318 if (rt6->rt6i_flags & RTF_CACHE) {
1319 rt6_do_update_pmtu(rt6, mtu);
1321 const struct in6_addr *daddr, *saddr;
1322 struct rt6_info *nrt6;
1325 daddr = &iph->daddr;
1326 saddr = &iph->saddr;
1328 daddr = &sk->sk_v6_daddr;
1329 saddr = &inet6_sk(sk)->saddr;
1333 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1335 rt6_do_update_pmtu(nrt6, mtu);
1337 /* ip6_ins_rt(nrt6) will bump the
1338 * rt6->rt6i_node->fn_sernum
1339 * which will fail the next rt6_check() and
1340 * invalidate the sk->sk_dst_cache.
/* dst_ops->update_pmtu: thin adapter extracting the IPv6 header from the
 * triggering skb when one is available.
 */
1347 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1348 struct sk_buff *skb, u32 mtu)
1350 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Handle an ICMPv6 Packet Too Big for an arbitrary packet: rebuild the
 * flow from the embedded header, look the route up, and apply the new MTU.
 */
1353 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1356 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1357 struct dst_entry *dst;
1360 memset(&fl6, 0, sizeof(fl6));
1361 fl6.flowi6_oif = oif;
1362 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1363 fl6.daddr = iph->daddr;
1364 fl6.saddr = iph->saddr;
1365 fl6.flowlabel = ip6_flowinfo(iph);
1367 dst = ip6_route_output(net, NULL, &fl6);
1369 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1372 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-scoped variant: use the socket's netns, bound device and mark. */
1374 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1376 ip6_update_pmtu(skb, sock_net(sk), mtu,
1377 sk->sk_bound_dev_if, sk->sk_mark);
1379 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1381 /* Handle redirects */
1382 struct ip6rd_flowi {
1384 struct in6_addr gateway;
1387 static struct rt6_info *__ip6_route_redirect(struct net *net,
1388 struct fib6_table *table,
1392 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1393 struct rt6_info *rt;
1394 struct fib6_node *fn;
1396 /* Get the "current" route for this destination and
1397 * check if the redirect has come from approriate router.
1399 * RFC 4861 specifies that redirects should only be
1400 * accepted if they come from the nexthop to the target.
1401 * Due to the way the routes are chosen, this notion
1402 * is a bit fuzzy and one might need to check all possible
1406 read_lock_bh(&table->tb6_lock);
1407 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1409 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1410 if (rt6_check_expired(rt))
1414 if (!(rt->rt6i_flags & RTF_GATEWAY))
1416 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1418 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1424 rt = net->ipv6.ip6_null_entry;
1425 else if (rt->dst.error) {
1426 rt = net->ipv6.ip6_null_entry;
1430 if (rt == net->ipv6.ip6_null_entry) {
1431 fn = fib6_backtrack(fn, &fl6->saddr);
1439 read_unlock_bh(&table->tb6_lock);
1444 static struct dst_entry *ip6_route_redirect(struct net *net,
1445 const struct flowi6 *fl6,
1446 const struct in6_addr *gateway)
1448 int flags = RT6_LOOKUP_F_HAS_SADDR;
1449 struct ip6rd_flowi rdfl;
1452 rdfl.gateway = *gateway;
1454 return fib6_rule_lookup(net, &rdfl.fl6,
1455 flags, __ip6_route_redirect);
1458 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1460 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1461 struct dst_entry *dst;
1464 memset(&fl6, 0, sizeof(fl6));
1465 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1466 fl6.flowi6_oif = oif;
1467 fl6.flowi6_mark = mark;
1468 fl6.daddr = iph->daddr;
1469 fl6.saddr = iph->saddr;
1470 fl6.flowlabel = ip6_flowinfo(iph);
1472 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1473 rt6_do_redirect(dst, NULL, skb);
1476 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Like ip6_redirect(), but for Redirects whose payload lacks the original
 * packet header: the destination is taken from the rd_msg itself and the
 * source from the Redirect's IPv6 header.
 */
1478 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1481 const struct ipv6hdr *iph = ipv6_hdr(skb);
1482 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1483 struct dst_entry *dst;
1486 memset(&fl6, 0, sizeof(fl6));
1487 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1488 fl6.flowi6_oif = oif;
1489 fl6.flowi6_mark = mark;
1490 fl6.daddr = msg->dest;
/* NOTE(review): saddr is set from iph->daddr (our own address as the
 * Redirect's destination) — intentional here, not a copy of ip6_redirect().
 */
1491 fl6.saddr = iph->daddr;
1493 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1494 rt6_do_redirect(dst, NULL, skb);
/* Socket-context convenience wrapper: redirect using the socket's
 * bound device and mark.
 */
1498 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1500 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1502 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops.default_advmss: advertised TCP MSS for this route.
 * Path MTU minus IPv6+TCP headers, clamped below by the
 * ip6_rt_min_advmss sysctl and above by the non-jumbo maximum.
 */
1504 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1506 struct net_device *dev = dst->dev;
1507 unsigned int mtu = dst_mtu(dst);
1508 struct net *net = dev_net(dev);
1510 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1512 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1513 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1516 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1517 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1518 * IPV6_MAXPLEN is also valid and means: "any MSS,
1519 * rely only on pmtu discovery"
1521 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops.mtu: effective MTU for the route.  Preference order:
 * per-route learned PMTU (rt6i_pmtu), then the RTAX_MTU metric,
 * then the device's IPv6 MTU; always capped at IP6_MAX_MTU.
 */
1526 static unsigned int ip6_mtu(const struct dst_entry *dst)
1528 const struct rt6_info *rt = (const struct rt6_info *)dst;
1529 unsigned int mtu = rt->rt6i_pmtu;
1530 struct inet6_dev *idev;
1535 mtu = dst_metric_raw(dst, RTAX_MTU);
/* Fall back to the device MTU; __in6_dev_get() requires RCU read side. */
1542 idev = __in6_dev_get(dst->dev);
1544 mtu = idev->cnf.mtu6;
1548 return min_t(unsigned int, mtu, IP6_MAX_MTU);
/* Ephemeral dsts used for ICMPv6 output are chained on this list
 * (protected by icmp6_dst_lock) and reaped by icmp6_dst_gc().
 */
1551 static struct dst_entry *icmp6_dst_gc_list;
1552 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Allocate a host route used only to send an ICMPv6 packet; it is not
 * inserted into the FIB, only onto the private gc list above.
 * Returns an ERR_PTR on failure.
 */
1554 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1557 struct dst_entry *dst;
1558 struct rt6_info *rt;
1559 struct inet6_dev *idev = in6_dev_get(dev);
1560 struct net *net = dev_net(dev);
1562 if (unlikely(!idev))
1563 return ERR_PTR(-ENODEV);
1565 rt = ip6_dst_alloc(net, dev, 0, NULL);
1566 if (unlikely(!rt)) {
1568 dst = ERR_PTR(-ENOMEM);
1572 rt->dst.flags |= DST_HOST;
1573 rt->dst.output = ip6_output;
1574 atomic_set(&rt->dst.__refcnt, 1);
1575 rt->rt6i_gateway = fl6->daddr;
1576 rt->rt6i_dst.addr = fl6->daddr;
1577 rt->rt6i_dst.plen = 128;
1578 rt->rt6i_idev = idev;
/* HOPLIMIT 0 metric: use the device/namespace default hop limit. */
1579 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1581 spin_lock_bh(&icmp6_dst_lock);
1582 rt->dst.next = icmp6_dst_gc_list;
1583 icmp6_dst_gc_list = &rt->dst;
1584 spin_unlock_bh(&icmp6_dst_lock);
/* Make sure the fib6 gc timer is running so these entries get reaped. */
1586 fib6_force_start_gc(net);
1588 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* Walk icmp6_dst_gc_list and free entries whose refcount dropped to
 * zero; called from the fib6 gc path.
 */
1594 int icmp6_dst_gc(void)
1596 struct dst_entry *dst, **pprev;
1599 spin_lock_bh(&icmp6_dst_lock);
1600 pprev = &icmp6_dst_gc_list;
1602 while ((dst = *pprev) != NULL) {
1603 if (!atomic_read(&dst->__refcnt)) {
1612 spin_unlock_bh(&icmp6_dst_lock);
/* Run @func over every entry on the ICMPv6 dst list (same callback
 * signature as the fib6 cleaners), removing entries it selects.
 */
1617 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1620 struct dst_entry *dst, **pprev;
1622 spin_lock_bh(&icmp6_dst_lock);
1623 pprev = &icmp6_dst_gc_list;
1624 while ((dst = *pprev) != NULL) {
1625 struct rt6_info *rt = (struct rt6_info *) dst;
1626 if (func(rt, arg)) {
1633 spin_unlock_bh(&icmp6_dst_lock);
/* dst_ops.gc: throttled garbage collection of IPv6 routes.
 * Skips work when called again within ip6_rt_gc_min_interval and the
 * entry count is under ip6_rt_max_size; otherwise runs fib6_run_gc()
 * with an adaptive expiry that decays by the elasticity sysctl.
 * Returns nonzero when the table is still over rt_max_size (allocation
 * should fail).
 */
1636 static int ip6_dst_gc(struct dst_ops *ops)
1638 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1639 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1640 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1641 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1642 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1643 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1646 entries = dst_entries_get_fast(ops);
1647 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1648 entries <= rt_max_size)
1651 net->ipv6.ip6_rt_gc_expire++;
1652 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1653 entries = dst_entries_get_slow(ops);
1654 if (entries < ops->gc_thresh)
1655 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
/* Exponential decay of the expiry pressure. */
1657 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1658 return entries > rt_max_size;
/* Convert the netlink RTA_METRICS blob in @cfg into an mx6_config
 * (RTAX_MAX-sized u32 array plus validity bitmap) for route insertion.
 * RTAX_CC_ALGO is special-cased: the congestion-control name string is
 * mapped to its key.  Caller owns mxc->mx on success.
 */
1661 static int ip6_convert_metrics(struct mx6_config *mxc,
1662 const struct fib6_config *cfg)
1671 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1675 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1676 int type = nla_type(nla);
1681 if (unlikely(type > RTAX_MAX))
1683 if (type == RTAX_CC_ALGO) {
1684 char tmp[TCP_CA_NAME_MAX];
1686 nla_strlcpy(tmp, nla, sizeof(tmp));
1687 val = tcp_ca_get_key_by_name(tmp);
1688 if (val == TCP_CA_UNSPEC)
1691 val = nla_get_u32(nla);
/* mx_valid bits are 0-based while RTAX_* starts at 1. */
1695 __set_bit(type - 1, mxc->mx_valid);
/* Add an IPv6 route described by @cfg (from netlink or ioctl).
 * Validates the config, picks/creates the fib6 table, allocates and
 * fills an rt6_info, resolves device/gateway, then inserts it via
 * __ip6_ins_rt().  Returns 0 or a negative errno.
 */
1707 int ip6_route_add(struct fib6_config *cfg)
1710 struct net *net = cfg->fc_nlinfo.nl_net;
1711 struct rt6_info *rt = NULL;
1712 struct net_device *dev = NULL;
1713 struct inet6_dev *idev = NULL;
1714 struct fib6_table *table;
1715 struct mx6_config mxc = { .mx = NULL, };
/* Sanity: prefix lengths cannot exceed 128 bits. */
1718 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
/* Source routing requires subtree support. */
1720 #ifndef CONFIG_IPV6_SUBTREES
1721 if (cfg->fc_src_len)
1724 if (cfg->fc_ifindex) {
1726 dev = dev_get_by_index(net, cfg->fc_ifindex);
1729 idev = in6_dev_get(dev);
1734 if (cfg->fc_metric == 0)
1735 cfg->fc_metric = IP6_RT_PRIO_USER;
/* Without NLM_F_CREATE only look up an existing table (warn-but-allow
 * legacy behavior if it is missing); otherwise create on demand.
 */
1738 if (cfg->fc_nlinfo.nlh &&
1739 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1740 table = fib6_get_table(net, cfg->fc_table);
1742 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1743 table = fib6_new_table(net, cfg->fc_table);
1746 table = fib6_new_table(net, cfg->fc_table);
/* addrconf routes are accounted; others use DST_NOCOUNT. */
1752 rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1759 if (cfg->fc_flags & RTF_EXPIRES)
1760 rt6_set_expires(rt, jiffies +
1761 clock_t_to_jiffies(cfg->fc_expires));
1763 rt6_clean_expires(rt);
1765 if (cfg->fc_protocol == RTPROT_UNSPEC)
1766 cfg->fc_protocol = RTPROT_BOOT;
1767 rt->rt6i_protocol = cfg->fc_protocol;
1769 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Choose the input handler from the destination type. */
1771 if (addr_type & IPV6_ADDR_MULTICAST)
1772 rt->dst.input = ip6_mc_input;
1773 else if (cfg->fc_flags & RTF_LOCAL)
1774 rt->dst.input = ip6_input;
1776 rt->dst.input = ip6_forward;
1778 rt->dst.output = ip6_output;
/* Lightweight tunnel encapsulation: wrap input/output handlers. */
1780 if (cfg->fc_encap) {
1781 struct lwtunnel_state *lwtstate;
1783 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1784 cfg->fc_encap, &lwtstate);
1787 rt->dst.lwtstate = lwtstate_get(lwtstate);
1788 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1789 rt->dst.lwtstate->orig_output = rt->dst.output;
1790 rt->dst.output = lwtunnel_output;
1792 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1793 rt->dst.lwtstate->orig_input = rt->dst.input;
1794 rt->dst.input = lwtunnel_input;
1798 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1799 rt->rt6i_dst.plen = cfg->fc_dst_len;
1800 if (rt->rt6i_dst.plen == 128)
1801 rt->dst.flags |= DST_HOST;
1803 #ifdef CONFIG_IPV6_SUBTREES
1804 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1805 rt->rt6i_src.plen = cfg->fc_src_len;
1808 rt->rt6i_metric = cfg->fc_metric;
1810 /* We cannot add true routes via loopback here,
1811 they would result in kernel looping; promote them to reject routes
1813 if ((cfg->fc_flags & RTF_REJECT) ||
1814 (dev && (dev->flags & IFF_LOOPBACK) &&
1815 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1816 !(cfg->fc_flags & RTF_LOCAL))) {
1817 /* hold loopback dev/idev if we haven't done so. */
1818 if (dev != net->loopback_dev) {
1823 dev = net->loopback_dev;
1825 idev = in6_dev_get(dev);
1831 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* Error code / handlers depend on the reject variant. */
1832 switch (cfg->fc_type) {
1834 rt->dst.error = -EINVAL;
1835 rt->dst.output = dst_discard_sk;
1836 rt->dst.input = dst_discard;
1839 rt->dst.error = -EACCES;
1840 rt->dst.output = ip6_pkt_prohibit_out;
1841 rt->dst.input = ip6_pkt_prohibit;
1845 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1847 rt->dst.output = ip6_pkt_discard_out;
1848 rt->dst.input = ip6_pkt_discard;
1854 if (cfg->fc_flags & RTF_GATEWAY) {
1855 const struct in6_addr *gw_addr;
1858 gw_addr = &cfg->fc_gateway;
1859 gwa_type = ipv6_addr_type(gw_addr);
1861 /* if gw_addr is local we will fail to detect this in case
1862 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1863 * will return already-added prefix route via interface that
1864 * prefix route was assigned to, which might be non-loopback.
1867 if (ipv6_chk_addr_and_flags(net, gw_addr,
1868 gwa_type & IPV6_ADDR_LINKLOCAL ?
1872 rt->rt6i_gateway = *gw_addr;
1874 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1875 struct rt6_info *grt;
1877 /* IPv6 strictly inhibits using not link-local
1878 addresses as nexthop address.
1879 Otherwise, router will not able to send redirects.
1880 It is very good, but in some (rare!) circumstances
1881 (SIT, PtP, NBMA NOARP links) it is handy to allow
1882 some exceptions. --ANK
1884 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* The non-link-local gateway must itself be reachable on-link. */
1887 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1889 err = -EHOSTUNREACH;
1893 if (dev != grt->dst.dev) {
1899 idev = grt->rt6i_idev;
1901 in6_dev_hold(grt->rt6i_idev);
1903 if (!(grt->rt6i_flags & RTF_GATEWAY))
1911 if (!dev || (dev->flags & IFF_LOOPBACK))
/* Preferred source address, if given, must be assigned on @dev. */
1919 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1920 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1924 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1925 rt->rt6i_prefsrc.plen = 128;
1927 rt->rt6i_prefsrc.plen = 0;
1929 rt->rt6i_flags = cfg->fc_flags;
1933 rt->rt6i_idev = idev;
1934 rt->rt6i_table = table;
1936 cfg->fc_nlinfo.nl_net = dev_net(dev);
1938 err = ip6_convert_metrics(&mxc, cfg);
1942 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
/* Delete @rt from its fib6 table under the table write lock; @info
 * carries the netlink notification context.  The null entry is never
 * deletable.
 */
1956 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1959 struct fib6_table *table;
1960 struct net *net = dev_net(rt->dst.dev);
1962 if (rt == net->ipv6.ip6_null_entry) {
1967 table = rt->rt6i_table;
1968 write_lock_bh(&table->tb6_lock);
1969 err = fib6_del(rt, info);
1970 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper used by in-kernel callers: delete with a minimal
 * nl_info (namespace only, no netlink attributes).
 */
1977 int ip6_del_rt(struct rt6_info *rt)
1979 struct nl_info info = {
1980 .nl_net = dev_net(rt->dst.dev),
1982 return __ip6_del_rt(rt, &info);
/* Delete the route matching @cfg: locate the fib6 node for the
 * dst/src prefixes, then scan its leaf chain for an entry matching the
 * optional ifindex / gateway / metric filters.
 */
1985 static int ip6_route_del(struct fib6_config *cfg)
1987 struct fib6_table *table;
1988 struct fib6_node *fn;
1989 struct rt6_info *rt;
1992 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1996 read_lock_bh(&table->tb6_lock);
1998 fn = fib6_locate(&table->tb6_root,
1999 &cfg->fc_dst, cfg->fc_dst_len,
2000 &cfg->fc_src, cfg->fc_src_len);
2003 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
/* Cached clones are only removed when RTF_CACHE was requested. */
2004 if ((rt->rt6i_flags & RTF_CACHE) &&
2005 !(cfg->fc_flags & RTF_CACHE))
2007 if (cfg->fc_ifindex &&
2009 rt->dst.dev->ifindex != cfg->fc_ifindex))
2011 if (cfg->fc_flags & RTF_GATEWAY &&
2012 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2014 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Drop the read lock before deleting; __ip6_del_rt takes the
 * write lock itself.
 */
2017 read_unlock_bh(&table->tb6_lock);
2019 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2022 read_unlock_bh(&table->tb6_lock);
/* Core ICMPv6 Redirect handler (RFC 4861 sec. 8): validate the Redirect
 * message carried in @skb, update the neighbour cache for the new
 * first hop, and install a cached clone (RTF_DYNAMIC|RTF_CACHE) of the
 * affected route pointing at the new gateway.
 */
2027 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2029 struct net *net = dev_net(skb->dev);
2030 struct netevent_redirect netevent;
2031 struct rt6_info *rt, *nrt = NULL;
2032 struct ndisc_options ndopts;
2033 struct inet6_dev *in6_dev;
2034 struct neighbour *neigh;
2036 int optlen, on_link;
/* Length of the ND options area after the fixed rd_msg header. */
2039 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2040 optlen -= sizeof(*msg);
2043 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2047 msg = (struct rd_msg *)icmp6_hdr(skb);
2049 if (ipv6_addr_is_multicast(&msg->dest)) {
2050 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination is directly on-link. */
2055 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2057 } else if (ipv6_addr_type(&msg->target) !=
2058 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2059 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2063 in6_dev = __in6_dev_get(skb->dev);
/* Routers and redirect-disabled hosts ignore Redirects. */
2066 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2070 * The IP source address of the Redirect MUST be the same as the current
2071 * first-hop router for the specified ICMP Destination Address.
2074 if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2075 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2080 if (ndopts.nd_opts_tgt_lladdr) {
2081 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2084 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2089 rt = (struct rt6_info *) dst;
2090 if (rt == net->ipv6.ip6_null_entry) {
2091 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2095 /* Redirect received -> path was valid.
2096 * Look, redirects are sent only in response to data packets,
2097 * so that this nexthop apparently is reachable. --ANK
2099 dst_confirm(&rt->dst);
2101 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2106 * We have finally decided to accept it.
2109 neigh_update(neigh, lladdr, NUD_STALE,
2110 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2111 NEIGH_UPDATE_F_OVERRIDE|
2112 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2113 NEIGH_UPDATE_F_ISROUTER))
/* Clone the route for msg->dest via the new next hop. */
2116 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2120 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2122 nrt->rt6i_flags &= ~RTF_GATEWAY;
2124 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2126 if (ip6_ins_rt(nrt))
/* Tell interested subsystems (e.g. offload drivers) about the switch. */
2129 netevent.old = &rt->dst;
2130 netevent.new = &nrt->dst;
2131 netevent.daddr = &msg->dest;
2132 netevent.neigh = neigh;
2133 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2135 if (rt->rt6i_flags & RTF_CACHE) {
2136 rt = (struct rt6_info *) dst_clone(&rt->dst);
2141 neigh_release(neigh);
2145 * Misc support functions
/* Link clone @rt to its parent @from: hold a ref on the parent's dst
 * and share its metrics (read-only).  @from must not itself be a clone.
 */
2148 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2150 BUG_ON(from->dst.from);
2152 rt->rt6i_flags &= ~RTF_EXPIRES;
2153 dst_hold(&from->dst);
2154 rt->dst.from = &from->dst;
2155 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Initialise a freshly allocated rt6_info @rt as a copy of @ort:
 * handlers, addresses, flags, device references, and the from-link
 * (via rt6_set_from) which shares the parent's metrics.
 */
2158 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2160 rt->dst.input = ort->dst.input;
2161 rt->dst.output = ort->dst.output;
2162 rt->rt6i_dst = ort->rt6i_dst;
2163 rt->dst.error = ort->dst.error;
2164 rt->rt6i_idev = ort->rt6i_idev;
2166 in6_dev_hold(rt->rt6i_idev);
2167 rt->dst.lastuse = jiffies;
2168 rt->rt6i_gateway = ort->rt6i_gateway;
2169 rt->rt6i_flags = ort->rt6i_flags;
2170 rt6_set_from(rt, ort);
2171 rt->rt6i_metric = ort->rt6i_metric;
2172 #ifdef CONFIG_IPV6_SUBTREES
2173 rt->rt6i_src = ort->rt6i_src;
2175 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2176 rt->rt6i_table = ort->rt6i_table;
2177 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2180 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an RA Route-Information route (RTF_ROUTEINFO|RTF_GATEWAY) for
 * @prefix/@prefixlen via @gwaddr on @ifindex in the INFO table.
 */
2181 static struct rt6_info *rt6_get_route_info(struct net *net,
2182 const struct in6_addr *prefix, int prefixlen,
2183 const struct in6_addr *gwaddr, int ifindex)
2185 struct fib6_node *fn;
2186 struct rt6_info *rt = NULL;
2187 struct fib6_table *table;
2189 table = fib6_get_table(net, RT6_TABLE_INFO);
2193 read_lock_bh(&table->tb6_lock);
2194 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2198 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2199 if (rt->dst.dev->ifindex != ifindex)
2201 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2203 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2209 read_unlock_bh(&table->tb6_lock);
/* Install a route learned from an RA Route Information option, then
 * return it via rt6_get_route_info() (ip6_route_add does not hand the
 * new entry back directly).
 */
2213 static struct rt6_info *rt6_add_route_info(struct net *net,
2214 const struct in6_addr *prefix, int prefixlen,
2215 const struct in6_addr *gwaddr, int ifindex,
2218 struct fib6_config cfg = {
2219 .fc_table = RT6_TABLE_INFO,
2220 .fc_metric = IP6_RT_PRIO_USER,
2221 .fc_ifindex = ifindex,
2222 .fc_dst_len = prefixlen,
2223 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2224 RTF_UP | RTF_PREF(pref),
2225 .fc_nlinfo.portid = 0,
2226 .fc_nlinfo.nlh = NULL,
2227 .fc_nlinfo.nl_net = net,
2230 cfg.fc_dst = *prefix;
2231 cfg.fc_gateway = *gwaddr;
2233 /* We should treat it as a default route if prefix length is 0. */
2235 cfg.fc_flags |= RTF_DEFAULT;
2237 ip6_route_add(&cfg);
2239 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/* Look up an RA-learned default route via gateway @addr on @dev in the
 * DFLT table.
 */
2243 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2245 struct rt6_info *rt;
2246 struct fib6_table *table;
2248 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2252 read_lock_bh(&table->tb6_lock);
2253 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2254 if (dev == rt->dst.dev &&
2255 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2256 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2261 read_unlock_bh(&table->tb6_lock);
/* Install a default router learned from a Router Advertisement and
 * return the inserted entry via rt6_get_dflt_router().
 */
2265 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2266 struct net_device *dev,
2269 struct fib6_config cfg = {
2270 .fc_table = RT6_TABLE_DFLT,
2271 .fc_metric = IP6_RT_PRIO_USER,
2272 .fc_ifindex = dev->ifindex,
2273 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2274 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2275 .fc_nlinfo.portid = 0,
2276 .fc_nlinfo.nlh = NULL,
2277 .fc_nlinfo.nl_net = dev_net(dev),
2280 cfg.fc_gateway = *gwaddr;
2282 ip6_route_add(&cfg);
2284 return rt6_get_dflt_router(gwaddr, dev);
/* Remove all RA-learned default/addrconf routes, except on interfaces
 * with accept_ra == 2 (accept RAs even while forwarding).  The loop
 * restarts after each deletion because the lock is dropped.
 */
2287 void rt6_purge_dflt_routers(struct net *net)
2289 struct rt6_info *rt;
2290 struct fib6_table *table;
2292 /* NOTE: Keep consistent with rt6_get_dflt_router */
2293 table = fib6_get_table(net, RT6_TABLE_DFLT);
2298 read_lock_bh(&table->tb6_lock);
2299 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2300 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2301 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2303 read_unlock_bh(&table->tb6_lock);
2308 read_unlock_bh(&table->tb6_lock);
/* Translate a legacy ioctl in6_rtmsg into a fib6_config (always targets
 * the MAIN table; no netlink attributes involved).
 */
2311 static void rtmsg_to_fib6_config(struct net *net,
2312 struct in6_rtmsg *rtmsg,
2313 struct fib6_config *cfg)
2315 memset(cfg, 0, sizeof(*cfg));
2317 cfg->fc_table = RT6_TABLE_MAIN;
2318 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2319 cfg->fc_metric = rtmsg->rtmsg_metric;
2320 cfg->fc_expires = rtmsg->rtmsg_info;
2321 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2322 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2323 cfg->fc_flags = rtmsg->rtmsg_flags;
2325 cfg->fc_nlinfo.nl_net = net;
2327 cfg->fc_dst = rtmsg->rtmsg_dst;
2328 cfg->fc_src = rtmsg->rtmsg_src;
2329 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* SIOCADDRT/SIOCDELRT ioctl entry point: requires CAP_NET_ADMIN,
 * copies the userspace in6_rtmsg, converts it and adds/deletes the
 * route.
 */
2332 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2334 struct fib6_config cfg;
2335 struct in6_rtmsg rtmsg;
2339 case SIOCADDRT: /* Add a route */
2340 case SIOCDELRT: /* Delete a route */
2341 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2343 err = copy_from_user(&rtmsg, arg,
2344 sizeof(struct in6_rtmsg));
2348 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2353 err = ip6_route_add(&cfg);
2356 err = ip6_route_del(&cfg);
2370 * Drop the packet on the floor
/* Common drop path for reject routes: bump the appropriate SNMP
 * counter (ADDRERRORS for unspecified input dst, else the supplied
 * no-route counter) and emit an ICMPv6 Destination Unreachable with
 * @code.
 */
2373 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2376 struct dst_entry *dst = skb_dst(skb);
2377 switch (ipstats_mib_noroutes) {
2378 case IPSTATS_MIB_INNOROUTES:
2379 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2380 if (type == IPV6_ADDR_ANY) {
2381 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2382 IPSTATS_MIB_INADDRERRORS);
2386 case IPSTATS_MIB_OUTNOROUTES:
2387 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2388 ipstats_mib_noroutes);
2391 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input handler for blackhole/unreachable routes. */
2396 static int ip6_pkt_discard(struct sk_buff *skb)
2398 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output counterpart; skb->dev must be set for icmpv6_send(). */
2401 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2403 skb->dev = skb_dst(skb)->dev;
2404 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst.input handler for administratively prohibited routes. */
2407 static int ip6_pkt_prohibit(struct sk_buff *skb)
2409 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst.output counterpart of ip6_pkt_prohibit(). */
2412 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2414 skb->dev = skb_dst(skb)->dev;
2415 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2419 * Allocate a dst for local (unicast / anycast) address.
/* Builds a host route (plen 128) through the loopback device for a
 * locally configured address, flagged RTF_LOCAL or RTF_ANYCAST, in the
 * LOCAL table.  Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
2422 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2423 const struct in6_addr *addr,
2426 struct net *net = dev_net(idev->dev);
2427 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2430 return ERR_PTR(-ENOMEM);
2434 rt->dst.flags |= DST_HOST;
2435 rt->dst.input = ip6_input;
2436 rt->dst.output = ip6_output;
2437 rt->rt6i_idev = idev;
2439 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2441 rt->rt6i_flags |= RTF_ANYCAST;
2443 rt->rt6i_flags |= RTF_LOCAL;
2445 rt->rt6i_gateway = *addr;
2446 rt->rt6i_dst.addr = *addr;
2447 rt->rt6i_dst.plen = 128;
2448 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2450 atomic_set(&rt->dst.__refcnt, 1);
/* Pick a source address for @daddr: prefer the route's configured
 * prefsrc, otherwise fall back to normal source-address selection on
 * the route's device.
 */
2455 int ip6_route_get_saddr(struct net *net,
2456 struct rt6_info *rt,
2457 const struct in6_addr *daddr,
2459 struct in6_addr *saddr)
2461 struct inet6_dev *idev =
2462 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2464 if (rt && rt->rt6i_prefsrc.plen)
2465 *saddr = rt->rt6i_prefsrc.addr;
2467 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2468 daddr, prefs, saddr);
2472 /* remove deleted ip from prefsrc entries */
2473 struct arg_dev_net_ip {
2474 struct net_device *dev;
2476 struct in6_addr *addr;
/* fib6_clean_all() callback: clear the prefsrc of any route whose
 * preferred source equals the deleted address (on the given device, or
 * on any device when @dev is NULL).
 */
2479 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2481 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2482 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2483 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2485 if (((void *)rt->dst.dev == dev || !dev) &&
2486 rt != net->ipv6.ip6_null_entry &&
2487 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2488 /* remove prefsrc entry */
2489 rt->rt6i_prefsrc.plen = 0;
/* Called when address @ifp is removed: scrub it from all routes'
 * prefsrc fields.
 */
2494 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2496 struct net *net = dev_net(ifp->idev->dev);
2497 struct arg_dev_net_ip adni = {
2498 .dev = ifp->idev->dev,
2502 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2505 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2506 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2508 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: select RA default routes and cached
 * gateway clones whose gateway matches the address that stopped being
 * a router.
 */
2509 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2511 struct in6_addr *gateway = (struct in6_addr *)arg;
2513 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2514 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2515 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2521 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2523 fib6_clean_all(net, fib6_clean_tohost, gateway);
2526 struct arg_dev_net {
2527 struct net_device *dev;
/* fib6_clean_all()/icmp6_clean_all() callback: select every route on
 * @dev (or all routes when @dev is NULL) except the null entry.
 */
2531 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2533 const struct arg_dev_net *adn = arg;
2534 const struct net_device *dev = adn->dev;
2536 if ((rt->dst.dev == dev || !dev) &&
2537 rt != adn->net->ipv6.ip6_null_entry)
/* Device-down handler: purge FIB routes, ICMPv6 dsts and uncached
 * entries referencing @dev.
 */
2543 void rt6_ifdown(struct net *net, struct net_device *dev)
2545 struct arg_dev_net adn = {
2550 fib6_clean_all(net, fib6_ifdown, &adn);
2551 icmp6_clean_all(fib6_ifdown, &adn);
2552 rt6_uncached_list_flush_dev(net, dev);
2555 struct rt6_mtu_change_arg {
2556 struct net_device *dev;
/* fib6_clean_all() callback: propagate a device MTU change to routes
 * on that device, updating either the cached rt6i_pmtu (for clones) or
 * the RTAX_MTU metric, unless the metric is locked.
 */
2560 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2562 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2563 struct inet6_dev *idev;
2565 /* In IPv6 pmtu discovery is not optional,
2566 so that RTAX_MTU lock cannot disable it.
2567 We still use this lock to block changes
2568 caused by addrconf/ndisc.
2571 idev = __in6_dev_get(arg->dev);
2575 /* For administrative MTU increase, there is no way to discover
2576 IPv6 PMTU increase, so PMTU increase should be updated here.
2577 Since RFC 1981 doesn't include administrative MTU increase
2578 update PMTU increase is a MUST. (i.e. jumbo frame)
2581 If new MTU is less than route PMTU, this new MTU will be the
2582 lowest MTU in the path, update the route PMTU to reflect PMTU
2583 decreases; if new MTU is greater than route PMTU, and the
2584 old MTU is the lowest MTU in the path, update the route PMTU
2585 to reflect the increase. In this case if the other nodes' MTU
2586 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2589 if (rt->dst.dev == arg->dev &&
2590 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2591 if (rt->rt6i_flags & RTF_CACHE) {
2592 /* For RTF_CACHE with rt6i_pmtu == 0
2593 * (i.e. a redirected route),
2594 * the metrics of its rt->dst.from has already
2597 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2598 rt->rt6i_pmtu = arg->mtu;
2599 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2600 (dst_mtu(&rt->dst) < arg->mtu &&
2601 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2602 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Entry point: walk every route in @dev's namespace applying the
 * new MTU.
 */
2608 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2610 struct rt6_mtu_change_arg arg = {
2615 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_NEWROUTE/RTM_DELROUTE. */
2618 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2619 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2620 [RTA_OIF] = { .type = NLA_U32 },
2621 [RTA_IIF] = { .type = NLA_U32 },
2622 [RTA_PRIORITY] = { .type = NLA_U32 },
2623 [RTA_METRICS] = { .type = NLA_NESTED },
2624 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2625 [RTA_PREF] = { .type = NLA_U8 },
2626 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2627 [RTA_ENCAP] = { .type = NLA_NESTED },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config, validating attributes against rtm_ipv6_policy.
 */
2630 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2631 struct fib6_config *cfg)
2634 struct nlattr *tb[RTA_MAX+1];
2638 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2643 rtm = nlmsg_data(nlh);
2644 memset(cfg, 0, sizeof(*cfg));
2646 cfg->fc_table = rtm->rtm_table;
2647 cfg->fc_dst_len = rtm->rtm_dst_len;
2648 cfg->fc_src_len = rtm->rtm_src_len;
2649 cfg->fc_flags = RTF_UP;
2650 cfg->fc_protocol = rtm->rtm_protocol;
2651 cfg->fc_type = rtm->rtm_type;
/* All four reject variants map to RTF_REJECT; fc_type keeps the
 * specific one for error-code selection in ip6_route_add().
 */
2653 if (rtm->rtm_type == RTN_UNREACHABLE ||
2654 rtm->rtm_type == RTN_BLACKHOLE ||
2655 rtm->rtm_type == RTN_PROHIBIT ||
2656 rtm->rtm_type == RTN_THROW)
2657 cfg->fc_flags |= RTF_REJECT;
2659 if (rtm->rtm_type == RTN_LOCAL)
2660 cfg->fc_flags |= RTF_LOCAL;
2662 if (rtm->rtm_flags & RTM_F_CLONED)
2663 cfg->fc_flags |= RTF_CACHE;
2665 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2666 cfg->fc_nlinfo.nlh = nlh;
2667 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2669 if (tb[RTA_GATEWAY]) {
2670 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2671 cfg->fc_flags |= RTF_GATEWAY;
/* Destination/source prefixes are sent truncated to plen bytes. */
2675 int plen = (rtm->rtm_dst_len + 7) >> 3;
2677 if (nla_len(tb[RTA_DST]) < plen)
2680 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2684 int plen = (rtm->rtm_src_len + 7) >> 3;
2686 if (nla_len(tb[RTA_SRC]) < plen)
2689 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2692 if (tb[RTA_PREFSRC])
2693 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2696 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2698 if (tb[RTA_PRIORITY])
2699 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
/* Metrics and multipath blobs are kept as pointers into the
 * message; they are decoded later (ip6_convert_metrics /
 * ip6_route_multipath).
 */
2701 if (tb[RTA_METRICS]) {
2702 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2703 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2707 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2709 if (tb[RTA_MULTIPATH]) {
2710 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2711 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2715 pref = nla_get_u8(tb[RTA_PREF]);
2716 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2717 pref != ICMPV6_ROUTER_PREF_HIGH)
2718 pref = ICMPV6_ROUTER_PREF_MEDIUM;
2719 cfg->fc_flags |= RTF_PREF(pref);
2723 cfg->fc_encap = tb[RTA_ENCAP];
2725 if (tb[RTA_ENCAP_TYPE])
2726 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
/* Add (@add != 0) or delete each nexthop of an RTA_MULTIPATH route as
 * an individual single-path route.  On a failed add, previously added
 * nexthops are rolled back; on delete, failures do not stop the loop.
 */
2733 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2735 struct fib6_config r_cfg;
2736 struct rtnexthop *rtnh;
2739 int err = 0, last_err = 0;
2741 remaining = cfg->fc_mp_len;
2743 rtnh = (struct rtnexthop *)cfg->fc_mp;
2745 /* Parse a Multipath Entry */
2746 while (rtnh_ok(rtnh, remaining)) {
/* Start from the shared config, then override per-nexthop. */
2747 memcpy(&r_cfg, cfg, sizeof(*cfg));
2748 if (rtnh->rtnh_ifindex)
2749 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2751 attrlen = rtnh_attrlen(rtnh);
2753 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2755 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2757 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2758 r_cfg.fc_flags |= RTF_GATEWAY;
2760 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2761 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2763 r_cfg.fc_encap_type = nla_get_u16(nla);
2765 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2768 /* If we are trying to remove a route, do not stop the
2769 * loop when ip6_route_del() fails (because next hop is
2770 * already gone), we should try to remove all next hops.
2773 /* If add fails, we should try to delete all
2774 * next hops that have been already added.
2777 remaining = cfg->fc_mp_len - remaining;
2781 /* Because each route is added like a single route we remove
2782 * these flags after the first nexthop: if there is a collision,
2783 * we have already failed to add the first nexthop:
2784 * fib6_add_rt2node() has rejected it; when replacing, old
2785 * nexthops have been replaced by first new, the rest should
2788 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2790 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE handler: parse the message, then delete either each
 * multipath nexthop or the single route.
 */
2796 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2798 struct fib6_config cfg;
2801 err = rtm_to_fib6_config(skb, nlh, &cfg);
2806 return ip6_route_multipath(&cfg, 0);
2808 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: mirror of the above for route addition. */
2811 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2813 struct fib6_config cfg;
2816 err = rtm_to_fib6_config(skb, nlh, &cfg);
2821 return ip6_route_multipath(&cfg, 1);
2823 return ip6_route_add(&cfg);
/* Worst-case netlink message size for dumping @rt via rt6_fill_node();
 * used to size notification skbs.
 */
2826 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
2828 return NLMSG_ALIGN(sizeof(struct rtmsg))
2829 + nla_total_size(16) /* RTA_SRC */
2830 + nla_total_size(16) /* RTA_DST */
2831 + nla_total_size(16) /* RTA_GATEWAY */
2832 + nla_total_size(16) /* RTA_PREFSRC */
2833 + nla_total_size(4) /* RTA_TABLE */
2834 + nla_total_size(4) /* RTA_IIF */
2835 + nla_total_size(4) /* RTA_OIF */
2836 + nla_total_size(4) /* RTA_PRIORITY */
2837 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2838 + nla_total_size(sizeof(struct rta_cacheinfo))
2839 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2840 + nla_total_size(1) /* RTA_PREF */
2841 + lwtunnel_get_encap_size(rt->dst.lwtstate);
/* Serialise @rt into a netlink RTM message on @skb.  @dst/@src, when
 * non-NULL, override the route's own prefixes (used for route-get
 * replies); @prefix restricts the dump to RTF_PREFIX_RT entries;
 * @nowait is passed through to ip6mr_get_route().  Returns 0 or
 * -EMSGSIZE on overflow.
 */
2844 static int rt6_fill_node(struct net *net,
2845 struct sk_buff *skb, struct rt6_info *rt,
2846 struct in6_addr *dst, struct in6_addr *src,
2847 int iif, int type, u32 portid, u32 seq,
2848 int prefix, int nowait, unsigned int flags)
2850 u32 metrics[RTAX_MAX];
2852 struct nlmsghdr *nlh;
2856 if (prefix) { /* user wants prefix routes only */
2857 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2858 /* success since this is not a prefix route */
2863 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2867 rtm = nlmsg_data(nlh);
2868 rtm->rtm_family = AF_INET6;
2869 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2870 rtm->rtm_src_len = rt->rt6i_src.plen;
2873 table = rt->rt6i_table->tb6_id;
2875 table = RT6_TABLE_UNSPEC;
2876 rtm->rtm_table = table;
2877 if (nla_put_u32(skb, RTA_TABLE, table))
2878 goto nla_put_failure;
/* Map the route's reject error code back to an RTN_* type. */
2879 if (rt->rt6i_flags & RTF_REJECT) {
2880 switch (rt->dst.error) {
2882 rtm->rtm_type = RTN_BLACKHOLE;
2885 rtm->rtm_type = RTN_PROHIBIT;
2888 rtm->rtm_type = RTN_THROW;
2891 rtm->rtm_type = RTN_UNREACHABLE;
2895 else if (rt->rt6i_flags & RTF_LOCAL)
2896 rtm->rtm_type = RTN_LOCAL;
2897 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2898 rtm->rtm_type = RTN_LOCAL;
2900 rtm->rtm_type = RTN_UNICAST;
2902 if (!netif_carrier_ok(rt->dst.dev)) {
2903 rtm->rtm_flags |= RTNH_F_LINKDOWN;
2904 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
2905 rtm->rtm_flags |= RTNH_F_DEAD;
2907 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2908 rtm->rtm_protocol = rt->rt6i_protocol;
2909 if (rt->rt6i_flags & RTF_DYNAMIC)
2910 rtm->rtm_protocol = RTPROT_REDIRECT;
2911 else if (rt->rt6i_flags & RTF_ADDRCONF) {
2912 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2913 rtm->rtm_protocol = RTPROT_RA;
2915 rtm->rtm_protocol = RTPROT_KERNEL;
2918 if (rt->rt6i_flags & RTF_CACHE)
2919 rtm->rtm_flags |= RTM_F_CLONED;
/* Explicit @dst (route-get) is reported as a /128. */
2922 if (nla_put_in6_addr(skb, RTA_DST, dst))
2923 goto nla_put_failure;
2924 rtm->rtm_dst_len = 128;
2925 } else if (rtm->rtm_dst_len)
2926 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2927 goto nla_put_failure;
2928 #ifdef CONFIG_IPV6_SUBTREES
2930 if (nla_put_in6_addr(skb, RTA_SRC, src))
2931 goto nla_put_failure;
2932 rtm->rtm_src_len = 128;
2933 } else if (rtm->rtm_src_len &&
2934 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2935 goto nla_put_failure;
2938 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through the mroute table. */
2939 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2940 int err = ip6mr_get_route(net, skb, rtm, nowait);
2945 goto nla_put_failure;
2947 if (err == -EMSGSIZE)
2948 goto nla_put_failure;
2953 if (nla_put_u32(skb, RTA_IIF, iif))
2954 goto nla_put_failure;
2956 struct in6_addr saddr_buf;
2957 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2958 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2959 goto nla_put_failure;
2962 if (rt->rt6i_prefsrc.plen) {
2963 struct in6_addr saddr_buf;
2964 saddr_buf = rt->rt6i_prefsrc.addr;
2965 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2966 goto nla_put_failure;
/* Report the learned PMTU in place of the stored MTU metric. */
2969 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2971 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2972 if (rtnetlink_put_metrics(skb, metrics) < 0)
2973 goto nla_put_failure;
2975 if (rt->rt6i_flags & RTF_GATEWAY) {
2976 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2977 goto nla_put_failure;
2981 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2982 goto nla_put_failure;
2983 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2984 goto nla_put_failure;
2986 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2988 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2989 goto nla_put_failure;
2991 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2992 goto nla_put_failure;
2994 lwtunnel_fill_encap(skb, rt->dst.lwtstate);
2996 nlmsg_end(skb, nlh);
3000 nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - per-route callback for the fib6 tree walk driving an
 * RTM_GETROUTE dump.  Honors the RTM_F_PREFIX request flag (prefix routes
 * only) and emits one RTM_NEWROUTE record via rt6_fill_node().
 * NOTE(review): this extract is elided — declarations (e.g. 'prefix') and
 * closing braces are missing from view; verify against the full source.
 */
3004 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3006 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
/* Only trust rtm_flags if the request actually carried a full rtmsg. */
3009 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3010 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3011 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
/* NLM_F_MULTI: this message is part of a multipart dump. */
3015 	return rt6_fill_node(arg->net,
3016 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3017 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3018 		     prefix, 0, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - RTM_GETROUTE handler: resolve a single route for
 * the src/dst/iif/oif/mark given in the netlink attributes and unicast
 * the result back to the requester as an RTM_NEWROUTE message.
 * NOTE(review): extract is elided — error-handling branches, 'fl6'/'rtm'
 * declarations and several gotos are not visible here.
 */
3021 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3023 	struct net *net = sock_net(in_skb->sk);
3024 	struct nlattr *tb[RTA_MAX+1];
3025 	struct rt6_info *rt;
3026 	struct sk_buff *skb;
3029 	int err, iif = 0, oif = 0;
3031 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
/* Build the flow key from the request attributes. */
3036 	memset(&fl6, 0, sizeof(fl6));
/* Reject truncated address attributes before copying them. */
3039 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3042 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3046 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3049 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3053 		iif = nla_get_u32(tb[RTA_IIF]);
3056 		oif = nla_get_u32(tb[RTA_OIF]);
3059 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
/* An input interface was given: emulate the input path lookup. */
3062 		struct net_device *dev;
3065 		dev = __dev_get_by_index(net, iif);
3071 		fl6.flowi6_iif = iif;
3073 		if (!ipv6_addr_any(&fl6.saddr))
3074 			flags |= RT6_LOOKUP_F_HAS_SADDR;
3076 		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
/* Otherwise fall back to an output-path route lookup. */
3079 		fl6.flowi6_oif = oif;
3081 		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3084 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3091 	/* Reserve room for dummy headers, this skb can pass
3092 	   through good chunk of routing engine.
3094 	skb_reset_mac_header(skb);
3095 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* Attach the looked-up route; the skb now owns the dst reference. */
3097 	skb_dst_set(skb, &rt->dst);
3099 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3100 		      RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3101 		      nlh->nlmsg_seq, 0, 0, 0);
/* Send the reply only to the requesting socket. */
3107 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/*
 * inet6_rt_notify - broadcast a route change (add/delete) to the
 * RTNLGRP_IPV6_ROUTE multicast group.  The skb is sized up front by
 * rt6_nlmsg_size(), so a -EMSGSIZE from rt6_fill_node() indicates a bug
 * in that size computation (hence the WARN_ON).
 */
3112 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3114 	struct sk_buff *skb;
3115 	struct net *net = info->nl_net;
/* Echo the request's sequence number when one was supplied. */
3120 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* gfp_any(): safe in both process and softirq context. */
3122 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3126 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3127 		     event, info->portid, seq, 0, 0, 0);
3129 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3130 		WARN_ON(err == -EMSGSIZE);
3134 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3135 		    info->nlh, gfp_any());
/* On failure, record the error on the multicast group's listeners. */
3139 	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify - netdevice notifier: when a netns registers its
 * loopback device, point the per-netns special routes (null, and with
 * multiple tables also prohibit/blackhole) at it so they have a valid
 * device and inet6_dev reference.
 */
3142 static int ip6_route_dev_notify(struct notifier_block *this,
3143 				unsigned long event, void *ptr)
3145 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3146 	struct net *net = dev_net(dev);
3148 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3149 		net->ipv6.ip6_null_entry->dst.dev = dev;
/* in6_dev_get() takes a reference per special route. */
3150 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3151 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3152 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3153 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3154 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3155 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3166 #ifdef CONFIG_PROC_FS
/* File operations for /proc/net/ipv6_route (seq_file based). */
3168 static const struct file_operations ipv6_route_proc_fops = {
3169 	.owner		= THIS_MODULE,
3170 	.open		= ipv6_route_open,
3172 	.llseek		= seq_lseek,
3173 	.release	= seq_release_net,
/*
 * rt6_stats_seq_show - print one line of per-netns fib6 statistics for
 * /proc/net/rt6_stats: node/route counts, allocation/cache counters,
 * live dst entries and discarded routes, all as %04x hex fields.
 */
3176 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3178 	struct net *net = (struct net *)seq->private;
3179 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3180 		   net->ipv6.rt6_stats->fib_nodes,
3181 		   net->ipv6.rt6_stats->fib_route_nodes,
3182 		   net->ipv6.rt6_stats->fib_rt_alloc,
3183 		   net->ipv6.rt6_stats->fib_rt_entries,
3184 		   net->ipv6.rt6_stats->fib_rt_cache,
/* dst_entries_get_slow(): exact (summed) dst entry count. */
3185 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3186 		   net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats: single-record, netns-aware seq file. */
3191 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3193 	return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats. */
3196 static const struct file_operations rt6_stats_seq_fops = {
3197 	.owner	 = THIS_MODULE,
3198 	.open	 = rt6_stats_seq_open,
3200 	.llseek	 = seq_lseek,
3201 	.release = single_release_net,
3203 #endif /* CONFIG_PROC_FS */
3205 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush: writing
 * triggers an immediate fib6 garbage-collection run.  A positive
 * flush_delay expires only entries older than that delay; <= 0 forces a
 * full, immediate flush.
 */
3208 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3209 			      void __user *buffer, size_t *lenp, loff_t *ppos)
/* ctl->extra1 carries the owning netns (set in ipv6_route_sysctl_init). */
3216 	net = (struct net *)ctl->extra1;
/* Snapshot the delay before proc_dointvec() may rewrite it. */
3217 	delay = net->ipv6.sysctl.flush_delay;
3218 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3219 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/*
 * Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers reference init_net / template defaults here; they are rebound
 * to the actual netns fields in ipv6_route_sysctl_init().  The slot
 * order must stay in sync with the table[N].data assignments there.
 */
3223 struct ctl_table ipv6_route_table_template[] = {
/* Write-only trigger: flush the route cache (see ipv6_sysctl_rtcache_flush). */
3225 		.procname	=	"flush",
3226 		.data		=	&init_net.ipv6.sysctl.flush_delay,
3227 		.maxlen		=	sizeof(int),
3229 		.proc_handler	=	ipv6_sysctl_rtcache_flush
3232 		.procname	=	"gc_thresh",
3233 		.data		=	&ip6_dst_ops_template.gc_thresh,
3234 		.maxlen		=	sizeof(int),
3236 		.proc_handler	=	proc_dointvec,
3239 		.procname	=	"max_size",
3240 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
3241 		.maxlen		=	sizeof(int),
3243 		.proc_handler	=	proc_dointvec,
/* Jiffies-valued knobs use the *_jiffies handlers for unit conversion. */
3246 		.procname	=	"gc_min_interval",
3247 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3248 		.maxlen		=	sizeof(int),
3250 		.proc_handler	=	proc_dointvec_jiffies,
3253 		.procname	=	"gc_timeout",
3254 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3255 		.maxlen		=	sizeof(int),
3257 		.proc_handler	=	proc_dointvec_jiffies,
3260 		.procname	=	"gc_interval",
3261 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
3262 		.maxlen		=	sizeof(int),
3264 		.proc_handler	=	proc_dointvec_jiffies,
3267 		.procname	=	"gc_elasticity",
3268 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3269 		.maxlen		=	sizeof(int),
3271 		.proc_handler	=	proc_dointvec,
3274 		.procname	=	"mtu_expires",
3275 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3276 		.maxlen		=	sizeof(int),
3278 		.proc_handler	=	proc_dointvec_jiffies,
3281 		.procname	=	"min_adv_mss",
3282 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
3283 		.maxlen		=	sizeof(int),
3285 		.proc_handler	=	proc_dointvec,
/* Millisecond view of gc_min_interval; shares its backing field. */
3288 		.procname	=	"gc_min_interval_ms",
3289 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3290 		.maxlen		=	sizeof(int),
3292 		.proc_handler	=	proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init - duplicate the sysctl template for a netns and
 * rebind each entry's .data pointer to that netns' own fields.  The
 * indices must match the slot order in ipv6_route_table_template.
 * Returns the table for the caller to register (caller owns/frees it).
 */
3297 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3299 	struct ctl_table *table;
3301 	table = kmemdup(ipv6_route_table_template,
3302 			sizeof(ipv6_route_table_template),
3306 		table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 lets the flush handler recover the owning netns. */
3307 		table[0].extra1 = net;
3308 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3309 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3310 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3311 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3312 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3313 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3314 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3315 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
/* gc_min_interval_ms aliases the same field as gc_min_interval. */
3316 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3318 		/* Don't export sysctls to unprivileged users */
3319 		if (net->user_ns != &init_user_ns)
3320 			table[0].procname = NULL;
/*
 * ip6_route_net_init - per-netns setup: clone the dst_ops template,
 * allocate the special routes (null, plus prohibit/blackhole when
 * CONFIG_IPV6_MULTIPLE_TABLES) from their templates, and seed the
 * per-netns sysctl defaults.  Unwinds allocations via goto labels on
 * failure.  NOTE(review): extract is elided — GFP flags, some labels
 * and the final return are not visible here.
 */
3327 static int __net_init ip6_route_net_init(struct net *net)
3331 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3332 	       sizeof(net->ipv6.ip6_dst_ops));
3334 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3335 		goto out_ip6_dst_ops;
/* Each special route is a kmemdup'd copy of a file-scope template. */
3337 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3338 					   sizeof(*net->ipv6.ip6_null_entry),
3340 	if (!net->ipv6.ip6_null_entry)
3341 		goto out_ip6_dst_entries;
/* dst.path points back at the route itself (it is its own path). */
3342 	net->ipv6.ip6_null_entry->dst.path =
3343 		(struct dst_entry *)net->ipv6.ip6_null_entry;
3344 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3345 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3346 			 ip6_template_metrics, true);
3348 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3349 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3350 					       sizeof(*net->ipv6.ip6_prohibit_entry),
3352 	if (!net->ipv6.ip6_prohibit_entry)
3353 		goto out_ip6_null_entry;
3354 	net->ipv6.ip6_prohibit_entry->dst.path =
3355 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3356 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3357 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3358 			 ip6_template_metrics, true);
3360 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3361 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
3363 	if (!net->ipv6.ip6_blk_hole_entry)
3364 		goto out_ip6_prohibit_entry;
3365 	net->ipv6.ip6_blk_hole_entry->dst.path =
3366 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3367 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3368 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3369 			 ip6_template_metrics, true);
/* Default tunables; user-visible via the sysctl table above. */
3372 	net->ipv6.sysctl.flush_delay = 0;
3373 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
3374 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3375 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3376 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3377 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3378 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* IPV6_MIN_MTU minus IPv6 (40) and TCP (20) headers. */
3379 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3381 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwind: free in reverse order of allocation. */
3387 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3388 out_ip6_prohibit_entry:
3389 	kfree(net->ipv6.ip6_prohibit_entry);
3391 	kfree(net->ipv6.ip6_null_entry);
3393 out_ip6_dst_entries:
3394 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_exit - per-netns teardown: free the special routes
 * allocated in ip6_route_net_init() and destroy the dst entry counter.
 */
3399 static void __net_exit ip6_route_net_exit(struct net *net)
3401 	kfree(net->ipv6.ip6_null_entry);
3402 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3403 	kfree(net->ipv6.ip6_prohibit_entry);
3404 	kfree(net->ipv6.ip6_blk_hole_entry);
3406 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_init_late - late per-netns init: create the
 * /proc/net/ipv6_route and /proc/net/rt6_stats entries.
 */
3409 static int __net_init ip6_route_net_init_late(struct net *net)
3411 #ifdef CONFIG_PROC_FS
3412 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3413 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-netns teardown: remove the proc entries created above. */
3418 static void __net_exit ip6_route_net_exit_late(struct net *net)
3420 #ifdef CONFIG_PROC_FS
3421 	remove_proc_entry("ipv6_route", net->proc_net);
3422 	remove_proc_entry("rt6_stats", net->proc_net);
/* Main per-netns ops: special routes, dst counters, sysctl defaults. */
3426 static struct pernet_operations ip6_route_net_ops = {
3427 	.init = ip6_route_net_init,
3428 	.exit = ip6_route_net_exit,
/*
 * ipv6_inetpeer_init - allocate and initialize the per-netns IPv6
 * inetpeer base.  NOTE(review): NULL-check/return lines elided here.
 */
3431 static int __net_init ipv6_inetpeer_init(struct net *net)
3433 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3437 	inet_peer_base_init(bp);
3438 	net->ipv6.peers = bp;
/*
 * ipv6_inetpeer_exit - tear down the per-netns inetpeer base: detach it
 * first, then invalidate the tree.  NOTE(review): the kfree(bp) that
 * normally follows is elided from this extract.
 */
3442 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3444 	struct inet_peer_base *bp = net->ipv6.peers;
3446 	net->ipv6.peers = NULL;
3447 	inetpeer_invalidate_tree(bp);
/* Per-netns ops for the IPv6 inetpeer base. */
3451 static struct pernet_operations ipv6_inetpeer_ops = {
3452 	.init	=	ipv6_inetpeer_init,
3453 	.exit	=	ipv6_inetpeer_exit,
/* Late per-netns ops: proc entries only (must run after fib setup). */
3456 static struct pernet_operations ip6_route_net_late_ops = {
3457 	.init = ip6_route_net_init_late,
3458 	.exit = ip6_route_net_exit_late,
/* Netdevice notifier wiring loopback into the special routes. */
3461 static struct notifier_block ip6_route_dev_notifier = {
3462 	.notifier_call = ip6_route_dev_notify,
/*
 * ip6_route_init - module init: create the rt6_info slab cache, register
 * the pernet subsystems (inetpeer, main, late), wire init_net's special
 * routes to loopback, set up fib6/rules, register the rtnetlink route
 * handlers and the device notifier, and init the per-cpu uncached list.
 * Unwinds in reverse order on failure.  NOTE(review): extract is elided —
 * several error checks, fib6_init(), xfrm6 setup and returns are missing
 * from view.
 */
3466 int __init ip6_route_init(void)
3472 	ip6_dst_ops_template.kmem_cachep =
3473 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3474 				  SLAB_HWCACHE_ALIGN, NULL);
3475 	if (!ip6_dst_ops_template.kmem_cachep)
3478 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
3480 		goto out_kmem_cache;
3482 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3484 		goto out_dst_entries;
3486 	ret = register_pernet_subsys(&ip6_route_net_ops);
3488 		goto out_register_inetpeer;
/* Blackhole dsts share the same slab as regular rt6_info. */
3490 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3492 	/* Registering of the loopback is done before this portion of code,
3493 	 * the loopback reference in rt6_info will not be taken, do it
3494 	 * manually for init_net */
3495 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3496 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3497 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3498 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3499 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3500 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3501 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3505 		goto out_register_subsys;
3511 	ret = fib6_rules_init();
3515 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
3517 		goto fib6_rules_init;
/* Register the NEW/DEL/GET route message handlers with rtnetlink. */
3520 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3521 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3522 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3523 		goto out_register_late_subsys;
3525 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3527 		goto out_register_late_subsys;
/* Initialize the per-cpu list/lock for uncached routes. */
3529 	for_each_possible_cpu(cpu) {
3530 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3532 		INIT_LIST_HEAD(&ul->head);
3533 		spin_lock_init(&ul->lock);
/* Error unwind labels: undo registrations in reverse order. */
3539 out_register_late_subsys:
3540 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3542 	fib6_rules_cleanup();
3547 out_register_subsys:
3548 	unregister_pernet_subsys(&ip6_route_net_ops);
3549 out_register_inetpeer:
3550 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
3552 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3554 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3558 void ip6_route_cleanup(void)
3560 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3561 unregister_pernet_subsys(&ip6_route_net_late_ops);
3562 fib6_rules_cleanup();
3565 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3566 unregister_pernet_subsys(&ip6_route_net_ops);
3567 dst_entries_destroy(&ip6_dst_blackhole_ops);
3568 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);