ARM: LPC32xx: Fix reset function
[cascardo/linux.git] / net / ipv6 / ip6_fib.c
1 /*
2  *      Linux INET6 implementation
3  *      Forwarding Information Database
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  *
13  *      Changes:
14  *      Yuji SEKIYA @USAGI:     Support default route on router node;
15  *                              remove ip6_null_entry from the top of
16  *                              routing table.
17  *      Ville Nuorvala:         Fixed routing subtrees.
18  */
19
20 #define pr_fmt(fmt) "IPv6: " fmt
21
22 #include <linux/errno.h>
23 #include <linux/types.h>
24 #include <linux/net.h>
25 #include <linux/route.h>
26 #include <linux/netdevice.h>
27 #include <linux/in6.h>
28 #include <linux/init.h>
29 #include <linux/list.h>
30 #include <linux/slab.h>
31
32 #include <net/ipv6.h>
33 #include <net/ndisc.h>
34 #include <net/addrconf.h>
35
36 #include <net/ip6_fib.h>
37 #include <net/ip6_route.h>
38
39 #define RT6_DEBUG 2
40
41 #if RT6_DEBUG >= 3
42 #define RT6_TRACE(x...) pr_debug(x)
43 #else
44 #define RT6_TRACE(x...) do { ; } while (0)
45 #endif
46
47 static struct kmem_cache *fib6_node_kmem __read_mostly;
48
49 enum fib_walk_state_t {
50 #ifdef CONFIG_IPV6_SUBTREES
51         FWS_S,
52 #endif
53         FWS_L,
54         FWS_R,
55         FWS_C,
56         FWS_U
57 };
58
59 struct fib6_cleaner_t {
60         struct fib6_walker_t w;
61         struct net *net;
62         int (*func)(struct rt6_info *, void *arg);
63         void *arg;
64 };
65
66 static DEFINE_RWLOCK(fib6_walker_lock);
67
68 #ifdef CONFIG_IPV6_SUBTREES
69 #define FWS_INIT FWS_S
70 #else
71 #define FWS_INIT FWS_L
72 #endif
73
74 static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
75 static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
76 static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
77 static int fib6_walk(struct fib6_walker_t *w);
78 static int fib6_walk_continue(struct fib6_walker_t *w);
79
80 /*
81  *      A routing update causes an increase of the serial number on the
82  *      affected subtree. This allows for cached routes to be asynchronously
83  *      tested when modifications are made to the destination cache as a
84  *      result of redirects, path MTU changes, etc.
85  */
86
87 static __u32 rt_sernum;
88
89 static void fib6_gc_timer_cb(unsigned long arg);
90
91 static LIST_HEAD(fib6_walkers);
92 #define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
93
94 static inline void fib6_walker_link(struct fib6_walker_t *w)
95 {
96         write_lock_bh(&fib6_walker_lock);
97         list_add(&w->lh, &fib6_walkers);
98         write_unlock_bh(&fib6_walker_lock);
99 }
100
101 static inline void fib6_walker_unlink(struct fib6_walker_t *w)
102 {
103         write_lock_bh(&fib6_walker_lock);
104         list_del(&w->lh);
105         write_unlock_bh(&fib6_walker_lock);
106 }
107 static __inline__ u32 fib6_new_sernum(void)
108 {
109         u32 n = ++rt_sernum;
110         if ((__s32)n <= 0)
111                 rt_sernum = n = 1;
112         return n;
113 }
114
115 /*
116  *      Auxiliary address test functions for the radix tree.
117  *
118  *      These assume a 32bit processor (although it will work on
119  *      64bit processors)
120  */
121
122 /*
123  *      test bit
124  */
125 #if defined(__LITTLE_ENDIAN)
126 # define BITOP_BE32_SWIZZLE     (0x1F & ~7)
127 #else
128 # define BITOP_BE32_SWIZZLE     0
129 #endif
130
131 static __inline__ __be32 addr_bit_set(const void *token, int fn_bit)
132 {
133         const __be32 *addr = token;
134         /*
135          * Here,
136          *      1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
137          * is optimized version of
138          *      htonl(1 << ((~fn_bit)&0x1F))
139          * See include/asm-generic/bitops/le.h.
140          */
141         return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
142                addr[fn_bit >> 5];
143 }
144
145 static __inline__ struct fib6_node *node_alloc(void)
146 {
147         struct fib6_node *fn;
148
149         fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
150
151         return fn;
152 }
153
154 static __inline__ void node_free(struct fib6_node *fn)
155 {
156         kmem_cache_free(fib6_node_kmem, fn);
157 }
158
159 static __inline__ void rt6_release(struct rt6_info *rt)
160 {
161         if (atomic_dec_and_test(&rt->rt6i_ref))
162                 dst_free(&rt->dst);
163 }
164
165 static void fib6_link_table(struct net *net, struct fib6_table *tb)
166 {
167         unsigned int h;
168
169         /*
170          * Initialize table lock at a single place to give lockdep a key,
171          * tables aren't visible prior to being linked to the list.
172          */
173         rwlock_init(&tb->tb6_lock);
174
175         h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
176
177         /*
178          * No protection necessary, this is the only list mutatation
179          * operation, tables never disappear once they exist.
180          */
181         hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
182 }
183
184 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
185
186 static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
187 {
188         struct fib6_table *table;
189
190         table = kzalloc(sizeof(*table), GFP_ATOMIC);
191         if (table) {
192                 table->tb6_id = id;
193                 table->tb6_root.leaf = net->ipv6.ip6_null_entry;
194                 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
195                 inet_peer_base_init(&table->tb6_peers);
196         }
197
198         return table;
199 }
200
201 struct fib6_table *fib6_new_table(struct net *net, u32 id)
202 {
203         struct fib6_table *tb;
204
205         if (id == 0)
206                 id = RT6_TABLE_MAIN;
207         tb = fib6_get_table(net, id);
208         if (tb)
209                 return tb;
210
211         tb = fib6_alloc_table(net, id);
212         if (tb)
213                 fib6_link_table(net, tb);
214
215         return tb;
216 }
217
218 struct fib6_table *fib6_get_table(struct net *net, u32 id)
219 {
220         struct fib6_table *tb;
221         struct hlist_head *head;
222         unsigned int h;
223
224         if (id == 0)
225                 id = RT6_TABLE_MAIN;
226         h = id & (FIB6_TABLE_HASHSZ - 1);
227         rcu_read_lock();
228         head = &net->ipv6.fib_table_hash[h];
229         hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
230                 if (tb->tb6_id == id) {
231                         rcu_read_unlock();
232                         return tb;
233                 }
234         }
235         rcu_read_unlock();
236
237         return NULL;
238 }
239
240 static void __net_init fib6_tables_init(struct net *net)
241 {
242         fib6_link_table(net, net->ipv6.fib6_main_tbl);
243         fib6_link_table(net, net->ipv6.fib6_local_tbl);
244 }
245 #else
246
247 struct fib6_table *fib6_new_table(struct net *net, u32 id)
248 {
249         return fib6_get_table(net, id);
250 }
251
252 struct fib6_table *fib6_get_table(struct net *net, u32 id)
253 {
254           return net->ipv6.fib6_main_tbl;
255 }
256
257 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
258                                    int flags, pol_lookup_t lookup)
259 {
260         return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
261 }
262
263 static void __net_init fib6_tables_init(struct net *net)
264 {
265         fib6_link_table(net, net->ipv6.fib6_main_tbl);
266 }
267
268 #endif
269
270 static int fib6_dump_node(struct fib6_walker_t *w)
271 {
272         int res;
273         struct rt6_info *rt;
274
275         for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
276                 res = rt6_dump_route(rt, w->args);
277                 if (res < 0) {
278                         /* Frame is full, suspend walking */
279                         w->leaf = rt;
280                         return 1;
281                 }
282                 WARN_ON(res == 0);
283         }
284         w->leaf = NULL;
285         return 0;
286 }
287
288 static void fib6_dump_end(struct netlink_callback *cb)
289 {
290         struct fib6_walker_t *w = (void *)cb->args[2];
291
292         if (w) {
293                 if (cb->args[4]) {
294                         cb->args[4] = 0;
295                         fib6_walker_unlink(w);
296                 }
297                 cb->args[2] = 0;
298                 kfree(w);
299         }
300         cb->done = (void *)cb->args[3];
301         cb->args[1] = 3;
302 }
303
304 static int fib6_dump_done(struct netlink_callback *cb)
305 {
306         fib6_dump_end(cb);
307         return cb->done ? cb->done(cb) : 0;
308 }
309
310 static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
311                            struct netlink_callback *cb)
312 {
313         struct fib6_walker_t *w;
314         int res;
315
316         w = (void *)cb->args[2];
317         w->root = &table->tb6_root;
318
319         if (cb->args[4] == 0) {
320                 w->count = 0;
321                 w->skip = 0;
322
323                 read_lock_bh(&table->tb6_lock);
324                 res = fib6_walk(w);
325                 read_unlock_bh(&table->tb6_lock);
326                 if (res > 0) {
327                         cb->args[4] = 1;
328                         cb->args[5] = w->root->fn_sernum;
329                 }
330         } else {
331                 if (cb->args[5] != w->root->fn_sernum) {
332                         /* Begin at the root if the tree changed */
333                         cb->args[5] = w->root->fn_sernum;
334                         w->state = FWS_INIT;
335                         w->node = w->root;
336                         w->skip = w->count;
337                 } else
338                         w->skip = 0;
339
340                 read_lock_bh(&table->tb6_lock);
341                 res = fib6_walk_continue(w);
342                 read_unlock_bh(&table->tb6_lock);
343                 if (res <= 0) {
344                         fib6_walker_unlink(w);
345                         cb->args[4] = 0;
346                 }
347         }
348
349         return res;
350 }
351
352 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
353 {
354         struct net *net = sock_net(skb->sk);
355         unsigned int h, s_h;
356         unsigned int e = 0, s_e;
357         struct rt6_rtnl_dump_arg arg;
358         struct fib6_walker_t *w;
359         struct fib6_table *tb;
360         struct hlist_head *head;
361         int res = 0;
362
363         s_h = cb->args[0];
364         s_e = cb->args[1];
365
366         w = (void *)cb->args[2];
367         if (!w) {
368                 /* New dump:
369                  *
370                  * 1. hook callback destructor.
371                  */
372                 cb->args[3] = (long)cb->done;
373                 cb->done = fib6_dump_done;
374
375                 /*
376                  * 2. allocate and initialize walker.
377                  */
378                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
379                 if (!w)
380                         return -ENOMEM;
381                 w->func = fib6_dump_node;
382                 cb->args[2] = (long)w;
383         }
384
385         arg.skb = skb;
386         arg.cb = cb;
387         arg.net = net;
388         w->args = &arg;
389
390         rcu_read_lock();
391         for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
392                 e = 0;
393                 head = &net->ipv6.fib_table_hash[h];
394                 hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
395                         if (e < s_e)
396                                 goto next;
397                         res = fib6_dump_table(tb, skb, cb);
398                         if (res != 0)
399                                 goto out;
400 next:
401                         e++;
402                 }
403         }
404 out:
405         rcu_read_unlock();
406         cb->args[1] = e;
407         cb->args[0] = h;
408
409         res = res < 0 ? res : skb->len;
410         if (res <= 0)
411                 fib6_dump_end(cb);
412         return res;
413 }
414
415 /*
416  *      Routing Table
417  *
418  *      return the appropriate node for a routing tree "add" operation
419  *      by either creating and inserting or by returning an existing
420  *      node.
421  */
422
423 static struct fib6_node *fib6_add_1(struct fib6_node *root,
424                                      struct in6_addr *addr, int plen,
425                                      int offset, int allow_create,
426                                      int replace_required)
427 {
428         struct fib6_node *fn, *in, *ln;
429         struct fib6_node *pn = NULL;
430         struct rt6key *key;
431         int     bit;
432         __be32  dir = 0;
433         __u32   sernum = fib6_new_sernum();
434
435         RT6_TRACE("fib6_add_1\n");
436
437         /* insert node in tree */
438
439         fn = root;
440
441         do {
442                 key = (struct rt6key *)((u8 *)fn->leaf + offset);
443
444                 /*
445                  *      Prefix match
446                  */
447                 if (plen < fn->fn_bit ||
448                     !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
449                         if (!allow_create) {
450                                 if (replace_required) {
451                                         pr_warn("Can't replace route, no match found\n");
452                                         return ERR_PTR(-ENOENT);
453                                 }
454                                 pr_warn("NLM_F_CREATE should be set when creating new route\n");
455                         }
456                         goto insert_above;
457                 }
458
459                 /*
460                  *      Exact match ?
461                  */
462
463                 if (plen == fn->fn_bit) {
464                         /* clean up an intermediate node */
465                         if (!(fn->fn_flags & RTN_RTINFO)) {
466                                 rt6_release(fn->leaf);
467                                 fn->leaf = NULL;
468                         }
469
470                         fn->fn_sernum = sernum;
471
472                         return fn;
473                 }
474
475                 /*
476                  *      We have more bits to go
477                  */
478
479                 /* Try to walk down on tree. */
480                 fn->fn_sernum = sernum;
481                 dir = addr_bit_set(addr, fn->fn_bit);
482                 pn = fn;
483                 fn = dir ? fn->right : fn->left;
484         } while (fn);
485
486         if (!allow_create) {
487                 /* We should not create new node because
488                  * NLM_F_REPLACE was specified without NLM_F_CREATE
489                  * I assume it is safe to require NLM_F_CREATE when
490                  * REPLACE flag is used! Later we may want to remove the
491                  * check for replace_required, because according
492                  * to netlink specification, NLM_F_CREATE
493                  * MUST be specified if new route is created.
494                  * That would keep IPv6 consistent with IPv4
495                  */
496                 if (replace_required) {
497                         pr_warn("Can't replace route, no match found\n");
498                         return ERR_PTR(-ENOENT);
499                 }
500                 pr_warn("NLM_F_CREATE should be set when creating new route\n");
501         }
502         /*
503          *      We walked to the bottom of tree.
504          *      Create new leaf node without children.
505          */
506
507         ln = node_alloc();
508
509         if (!ln)
510                 return ERR_PTR(-ENOMEM);
511         ln->fn_bit = plen;
512
513         ln->parent = pn;
514         ln->fn_sernum = sernum;
515
516         if (dir)
517                 pn->right = ln;
518         else
519                 pn->left  = ln;
520
521         return ln;
522
523
524 insert_above:
525         /*
526          * split since we don't have a common prefix anymore or
527          * we have a less significant route.
528          * we've to insert an intermediate node on the list
529          * this new node will point to the one we need to create
530          * and the current
531          */
532
533         pn = fn->parent;
534
535         /* find 1st bit in difference between the 2 addrs.
536
537            See comment in __ipv6_addr_diff: bit may be an invalid value,
538            but if it is >= plen, the value is ignored in any case.
539          */
540
541         bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
542
543         /*
544          *              (intermediate)[in]
545          *                /        \
546          *      (new leaf node)[ln] (old node)[fn]
547          */
548         if (plen > bit) {
549                 in = node_alloc();
550                 ln = node_alloc();
551
552                 if (!in || !ln) {
553                         if (in)
554                                 node_free(in);
555                         if (ln)
556                                 node_free(ln);
557                         return ERR_PTR(-ENOMEM);
558                 }
559
560                 /*
561                  * new intermediate node.
562                  * RTN_RTINFO will
563                  * be off since that an address that chooses one of
564                  * the branches would not match less specific routes
565                  * in the other branch
566                  */
567
568                 in->fn_bit = bit;
569
570                 in->parent = pn;
571                 in->leaf = fn->leaf;
572                 atomic_inc(&in->leaf->rt6i_ref);
573
574                 in->fn_sernum = sernum;
575
576                 /* update parent pointer */
577                 if (dir)
578                         pn->right = in;
579                 else
580                         pn->left  = in;
581
582                 ln->fn_bit = plen;
583
584                 ln->parent = in;
585                 fn->parent = in;
586
587                 ln->fn_sernum = sernum;
588
589                 if (addr_bit_set(addr, bit)) {
590                         in->right = ln;
591                         in->left  = fn;
592                 } else {
593                         in->left  = ln;
594                         in->right = fn;
595                 }
596         } else { /* plen <= bit */
597
598                 /*
599                  *              (new leaf node)[ln]
600                  *                /        \
601                  *           (old node)[fn] NULL
602                  */
603
604                 ln = node_alloc();
605
606                 if (!ln)
607                         return ERR_PTR(-ENOMEM);
608
609                 ln->fn_bit = plen;
610
611                 ln->parent = pn;
612
613                 ln->fn_sernum = sernum;
614
615                 if (dir)
616                         pn->right = ln;
617                 else
618                         pn->left  = ln;
619
620                 if (addr_bit_set(&key->addr, plen))
621                         ln->right = fn;
622                 else
623                         ln->left  = fn;
624
625                 fn->parent = ln;
626         }
627         return ln;
628 }
629
630 static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt)
631 {
632         return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
633                RTF_GATEWAY;
634 }
635
636 static int fib6_commit_metrics(struct dst_entry *dst,
637                                struct nlattr *mx, int mx_len)
638 {
639         struct nlattr *nla;
640         int remaining;
641         u32 *mp;
642
643         if (dst->flags & DST_HOST) {
644                 mp = dst_metrics_write_ptr(dst);
645         } else {
646                 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
647                 if (!mp)
648                         return -ENOMEM;
649                 dst_init_metrics(dst, mp, 0);
650         }
651
652         nla_for_each_attr(nla, mx, mx_len, remaining) {
653                 int type = nla_type(nla);
654
655                 if (type) {
656                         if (type > RTAX_MAX)
657                                 return -EINVAL;
658
659                         mp[type - 1] = nla_get_u32(nla);
660                 }
661         }
662         return 0;
663 }
664
665 /*
666  *      Insert routing information in a node.
667  */
668
669 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
670                             struct nl_info *info, struct nlattr *mx, int mx_len)
671 {
672         struct rt6_info *iter = NULL;
673         struct rt6_info **ins;
674         int replace = (info->nlh &&
675                        (info->nlh->nlmsg_flags & NLM_F_REPLACE));
676         int add = (!info->nlh ||
677                    (info->nlh->nlmsg_flags & NLM_F_CREATE));
678         int found = 0;
679         bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
680         int err;
681
682         ins = &fn->leaf;
683
684         for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
685                 /*
686                  *      Search for duplicates
687                  */
688
689                 if (iter->rt6i_metric == rt->rt6i_metric) {
690                         /*
691                          *      Same priority level
692                          */
693                         if (info->nlh &&
694                             (info->nlh->nlmsg_flags & NLM_F_EXCL))
695                                 return -EEXIST;
696                         if (replace) {
697                                 found++;
698                                 break;
699                         }
700
701                         if (iter->dst.dev == rt->dst.dev &&
702                             iter->rt6i_idev == rt->rt6i_idev &&
703                             ipv6_addr_equal(&iter->rt6i_gateway,
704                                             &rt->rt6i_gateway)) {
705                                 if (rt->rt6i_nsiblings)
706                                         rt->rt6i_nsiblings = 0;
707                                 if (!(iter->rt6i_flags & RTF_EXPIRES))
708                                         return -EEXIST;
709                                 if (!(rt->rt6i_flags & RTF_EXPIRES))
710                                         rt6_clean_expires(iter);
711                                 else
712                                         rt6_set_expires(iter, rt->dst.expires);
713                                 return -EEXIST;
714                         }
715                         /* If we have the same destination and the same metric,
716                          * but not the same gateway, then the route we try to
717                          * add is sibling to this route, increment our counter
718                          * of siblings, and later we will add our route to the
719                          * list.
720                          * Only static routes (which don't have flag
721                          * RTF_EXPIRES) are used for ECMPv6.
722                          *
723                          * To avoid long list, we only had siblings if the
724                          * route have a gateway.
725                          */
726                         if (rt_can_ecmp &&
727                             rt6_qualify_for_ecmp(iter))
728                                 rt->rt6i_nsiblings++;
729                 }
730
731                 if (iter->rt6i_metric > rt->rt6i_metric)
732                         break;
733
734                 ins = &iter->dst.rt6_next;
735         }
736
737         /* Reset round-robin state, if necessary */
738         if (ins == &fn->leaf)
739                 fn->rr_ptr = NULL;
740
741         /* Link this route to others same route. */
742         if (rt->rt6i_nsiblings) {
743                 unsigned int rt6i_nsiblings;
744                 struct rt6_info *sibling, *temp_sibling;
745
746                 /* Find the first route that have the same metric */
747                 sibling = fn->leaf;
748                 while (sibling) {
749                         if (sibling->rt6i_metric == rt->rt6i_metric &&
750                             rt6_qualify_for_ecmp(sibling)) {
751                                 list_add_tail(&rt->rt6i_siblings,
752                                               &sibling->rt6i_siblings);
753                                 break;
754                         }
755                         sibling = sibling->dst.rt6_next;
756                 }
757                 /* For each sibling in the list, increment the counter of
758                  * siblings. BUG() if counters does not match, list of siblings
759                  * is broken!
760                  */
761                 rt6i_nsiblings = 0;
762                 list_for_each_entry_safe(sibling, temp_sibling,
763                                          &rt->rt6i_siblings, rt6i_siblings) {
764                         sibling->rt6i_nsiblings++;
765                         BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
766                         rt6i_nsiblings++;
767                 }
768                 BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
769         }
770
771         /*
772          *      insert node
773          */
774         if (!replace) {
775                 if (!add)
776                         pr_warn("NLM_F_CREATE should be set when creating new route\n");
777
778 add:
779                 if (mx) {
780                         err = fib6_commit_metrics(&rt->dst, mx, mx_len);
781                         if (err)
782                                 return err;
783                 }
784                 rt->dst.rt6_next = iter;
785                 *ins = rt;
786                 rt->rt6i_node = fn;
787                 atomic_inc(&rt->rt6i_ref);
788                 inet6_rt_notify(RTM_NEWROUTE, rt, info);
789                 info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
790
791                 if (!(fn->fn_flags & RTN_RTINFO)) {
792                         info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
793                         fn->fn_flags |= RTN_RTINFO;
794                 }
795
796         } else {
797                 if (!found) {
798                         if (add)
799                                 goto add;
800                         pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
801                         return -ENOENT;
802                 }
803                 if (mx) {
804                         err = fib6_commit_metrics(&rt->dst, mx, mx_len);
805                         if (err)
806                                 return err;
807                 }
808                 *ins = rt;
809                 rt->rt6i_node = fn;
810                 rt->dst.rt6_next = iter->dst.rt6_next;
811                 atomic_inc(&rt->rt6i_ref);
812                 inet6_rt_notify(RTM_NEWROUTE, rt, info);
813                 rt6_release(iter);
814                 if (!(fn->fn_flags & RTN_RTINFO)) {
815                         info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
816                         fn->fn_flags |= RTN_RTINFO;
817                 }
818         }
819
820         return 0;
821 }
822
823 static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt)
824 {
825         if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
826             (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
827                 mod_timer(&net->ipv6.ip6_fib_timer,
828                           jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
829 }
830
831 void fib6_force_start_gc(struct net *net)
832 {
833         if (!timer_pending(&net->ipv6.ip6_fib_timer))
834                 mod_timer(&net->ipv6.ip6_fib_timer,
835                           jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
836 }
837
838 /*
839  *      Add routing information to the routing tree.
840  *      <destination addr>/<source addr>
841  *      with source addr info in sub-trees
842  */
843
844 int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
845              struct nlattr *mx, int mx_len)
846 {
847         struct fib6_node *fn, *pn = NULL;
848         int err = -ENOMEM;
849         int allow_create = 1;
850         int replace_required = 0;
851
852         if (info->nlh) {
853                 if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
854                         allow_create = 0;
855                 if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
856                         replace_required = 1;
857         }
858         if (!allow_create && !replace_required)
859                 pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
860
861         fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
862                         offsetof(struct rt6_info, rt6i_dst), allow_create,
863                         replace_required);
864         if (IS_ERR(fn)) {
865                 err = PTR_ERR(fn);
866                 fn = NULL;
867                 goto out;
868         }
869
870         pn = fn;
871
872 #ifdef CONFIG_IPV6_SUBTREES
873         if (rt->rt6i_src.plen) {
874                 struct fib6_node *sn;
875
876                 if (!fn->subtree) {
877                         struct fib6_node *sfn;
878
879                         /*
880                          * Create subtree.
881                          *
882                          *              fn[main tree]
883                          *              |
884                          *              sfn[subtree root]
885                          *                 \
886                          *                  sn[new leaf node]
887                          */
888
889                         /* Create subtree root node */
890                         sfn = node_alloc();
891                         if (!sfn)
892                                 goto st_failure;
893
894                         sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
895                         atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
896                         sfn->fn_flags = RTN_ROOT;
897                         sfn->fn_sernum = fib6_new_sernum();
898
899                         /* Now add the first leaf node to new subtree */
900
901                         sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
902                                         rt->rt6i_src.plen,
903                                         offsetof(struct rt6_info, rt6i_src),
904                                         allow_create, replace_required);
905
906                         if (IS_ERR(sn)) {
907                                 /* If it is failed, discard just allocated
908                                    root, and then (in st_failure) stale node
909                                    in main tree.
910                                  */
911                                 node_free(sfn);
912                                 err = PTR_ERR(sn);
913                                 goto st_failure;
914                         }
915
916                         /* Now link new subtree to main tree */
917                         sfn->parent = fn;
918                         fn->subtree = sfn;
919                 } else {
920                         sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
921                                         rt->rt6i_src.plen,
922                                         offsetof(struct rt6_info, rt6i_src),
923                                         allow_create, replace_required);
924
925                         if (IS_ERR(sn)) {
926                                 err = PTR_ERR(sn);
927                                 goto st_failure;
928                         }
929                 }
930
931                 if (!fn->leaf) {
932                         fn->leaf = rt;
933                         atomic_inc(&rt->rt6i_ref);
934                 }
935                 fn = sn;
936         }
937 #endif
938
939         err = fib6_add_rt2node(fn, rt, info, mx, mx_len);
940         if (!err) {
941                 fib6_start_gc(info->nl_net, rt);
942                 if (!(rt->rt6i_flags & RTF_CACHE))
943                         fib6_prune_clones(info->nl_net, pn);
944         }
945
946 out:
947         if (err) {
948 #ifdef CONFIG_IPV6_SUBTREES
949                 /*
950                  * If fib6_add_1 has cleared the old leaf pointer in the
951                  * super-tree leaf node we have to find a new one for it.
952                  */
953                 if (pn != fn && pn->leaf == rt) {
954                         pn->leaf = NULL;
955                         atomic_dec(&rt->rt6i_ref);
956                 }
957                 if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
958                         pn->leaf = fib6_find_prefix(info->nl_net, pn);
959 #if RT6_DEBUG >= 2
960                         if (!pn->leaf) {
961                                 WARN_ON(pn->leaf == NULL);
962                                 pn->leaf = info->nl_net->ipv6.ip6_null_entry;
963                         }
964 #endif
965                         atomic_inc(&pn->leaf->rt6i_ref);
966                 }
967 #endif
968                 dst_free(&rt->dst);
969         }
970         return err;
971
972 #ifdef CONFIG_IPV6_SUBTREES
973         /* Subtree creation failed, probably main tree node
974            is orphan. If it is, shoot it.
975          */
976 st_failure:
977         if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
978                 fib6_repair_tree(info->nl_net, fn);
979         dst_free(&rt->dst);
980         return err;
981 #endif
982 }
983
984 /*
985  *      Routing tree lookup
986  *
987  */
988
989 struct lookup_args {
990         int                     offset;         /* key offset on rt6_info       */
991         const struct in6_addr   *addr;          /* search key                   */
992 };
993
994 static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
995                                        struct lookup_args *args)
996 {
997         struct fib6_node *fn;
998         __be32 dir;
999
1000         if (unlikely(args->offset == 0))
1001                 return NULL;
1002
1003         /*
1004          *      Descend on a tree
1005          */
1006
1007         fn = root;
1008
1009         for (;;) {
1010                 struct fib6_node *next;
1011
1012                 dir = addr_bit_set(args->addr, fn->fn_bit);
1013
1014                 next = dir ? fn->right : fn->left;
1015
1016                 if (next) {
1017                         fn = next;
1018                         continue;
1019                 }
1020                 break;
1021         }
1022
1023         while (fn) {
1024                 if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
1025                         struct rt6key *key;
1026
1027                         key = (struct rt6key *) ((u8 *) fn->leaf +
1028                                                  args->offset);
1029
1030                         if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
1031 #ifdef CONFIG_IPV6_SUBTREES
1032                                 if (fn->subtree) {
1033                                         struct fib6_node *sfn;
1034                                         sfn = fib6_lookup_1(fn->subtree,
1035                                                             args + 1);
1036                                         if (!sfn)
1037                                                 goto backtrack;
1038                                         fn = sfn;
1039                                 }
1040 #endif
1041                                 if (fn->fn_flags & RTN_RTINFO)
1042                                         return fn;
1043                         }
1044                 }
1045 #ifdef CONFIG_IPV6_SUBTREES
1046 backtrack:
1047 #endif
1048                 if (fn->fn_flags & RTN_ROOT)
1049                         break;
1050
1051                 fn = fn->parent;
1052         }
1053
1054         return NULL;
1055 }
1056
1057 struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
1058                               const struct in6_addr *saddr)
1059 {
1060         struct fib6_node *fn;
1061         struct lookup_args args[] = {
1062                 {
1063                         .offset = offsetof(struct rt6_info, rt6i_dst),
1064                         .addr = daddr,
1065                 },
1066 #ifdef CONFIG_IPV6_SUBTREES
1067                 {
1068                         .offset = offsetof(struct rt6_info, rt6i_src),
1069                         .addr = saddr,
1070                 },
1071 #endif
1072                 {
1073                         .offset = 0,    /* sentinel */
1074                 }
1075         };
1076
1077         fn = fib6_lookup_1(root, daddr ? args : args + 1);
1078         if (!fn || fn->fn_flags & RTN_TL_ROOT)
1079                 fn = root;
1080
1081         return fn;
1082 }
1083
1084 /*
1085  *      Get node with specified destination prefix (and source prefix,
1086  *      if subtrees are used)
1087  */
1088
1089
1090 static struct fib6_node *fib6_locate_1(struct fib6_node *root,
1091                                        const struct in6_addr *addr,
1092                                        int plen, int offset)
1093 {
1094         struct fib6_node *fn;
1095
1096         for (fn = root; fn ; ) {
1097                 struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
1098
1099                 /*
1100                  *      Prefix match
1101                  */
1102                 if (plen < fn->fn_bit ||
1103                     !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
1104                         return NULL;
1105
1106                 if (plen == fn->fn_bit)
1107                         return fn;
1108
1109                 /*
1110                  *      We have more bits to go
1111                  */
1112                 if (addr_bit_set(addr, fn->fn_bit))
1113                         fn = fn->right;
1114                 else
1115                         fn = fn->left;
1116         }
1117         return NULL;
1118 }
1119
1120 struct fib6_node *fib6_locate(struct fib6_node *root,
1121                               const struct in6_addr *daddr, int dst_len,
1122                               const struct in6_addr *saddr, int src_len)
1123 {
1124         struct fib6_node *fn;
1125
1126         fn = fib6_locate_1(root, daddr, dst_len,
1127                            offsetof(struct rt6_info, rt6i_dst));
1128
1129 #ifdef CONFIG_IPV6_SUBTREES
1130         if (src_len) {
1131                 WARN_ON(saddr == NULL);
1132                 if (fn && fn->subtree)
1133                         fn = fib6_locate_1(fn->subtree, saddr, src_len,
1134                                            offsetof(struct rt6_info, rt6i_src));
1135         }
1136 #endif
1137
1138         if (fn && fn->fn_flags & RTN_RTINFO)
1139                 return fn;
1140
1141         return NULL;
1142 }
1143
1144
1145 /*
1146  *      Deletion
1147  *
1148  */
1149
1150 static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
1151 {
1152         if (fn->fn_flags & RTN_ROOT)
1153                 return net->ipv6.ip6_null_entry;
1154
1155         while (fn) {
1156                 if (fn->left)
1157                         return fn->left->leaf;
1158                 if (fn->right)
1159                         return fn->right->leaf;
1160
1161                 fn = FIB6_SUBTREE(fn);
1162         }
1163         return NULL;
1164 }
1165
1166 /*
1167  *      Called to trim the tree of intermediate nodes when possible. "fn"
1168  *      is the node we want to try and remove.
1169  */
1170
1171 static struct fib6_node *fib6_repair_tree(struct net *net,
1172                                            struct fib6_node *fn)
1173 {
1174         int children;
1175         int nstate;
1176         struct fib6_node *child, *pn;
1177         struct fib6_walker_t *w;
1178         int iter = 0;
1179
1180         for (;;) {
1181                 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
1182                 iter++;
1183
1184                 WARN_ON(fn->fn_flags & RTN_RTINFO);
1185                 WARN_ON(fn->fn_flags & RTN_TL_ROOT);
1186                 WARN_ON(fn->leaf != NULL);
1187
1188                 children = 0;
1189                 child = NULL;
1190                 if (fn->right)
1191                         child = fn->right, children |= 1;
1192                 if (fn->left)
1193                         child = fn->left, children |= 2;
1194
1195                 if (children == 3 || FIB6_SUBTREE(fn)
1196 #ifdef CONFIG_IPV6_SUBTREES
1197                     /* Subtree root (i.e. fn) may have one child */
1198                     || (children && fn->fn_flags & RTN_ROOT)
1199 #endif
1200                     ) {
1201                         fn->leaf = fib6_find_prefix(net, fn);
1202 #if RT6_DEBUG >= 2
1203                         if (!fn->leaf) {
1204                                 WARN_ON(!fn->leaf);
1205                                 fn->leaf = net->ipv6.ip6_null_entry;
1206                         }
1207 #endif
1208                         atomic_inc(&fn->leaf->rt6i_ref);
1209                         return fn->parent;
1210                 }
1211
1212                 pn = fn->parent;
1213 #ifdef CONFIG_IPV6_SUBTREES
1214                 if (FIB6_SUBTREE(pn) == fn) {
1215                         WARN_ON(!(fn->fn_flags & RTN_ROOT));
1216                         FIB6_SUBTREE(pn) = NULL;
1217                         nstate = FWS_L;
1218                 } else {
1219                         WARN_ON(fn->fn_flags & RTN_ROOT);
1220 #endif
1221                         if (pn->right == fn)
1222                                 pn->right = child;
1223                         else if (pn->left == fn)
1224                                 pn->left = child;
1225 #if RT6_DEBUG >= 2
1226                         else
1227                                 WARN_ON(1);
1228 #endif
1229                         if (child)
1230                                 child->parent = pn;
1231                         nstate = FWS_R;
1232 #ifdef CONFIG_IPV6_SUBTREES
1233                 }
1234 #endif
1235
1236                 read_lock(&fib6_walker_lock);
1237                 FOR_WALKERS(w) {
1238                         if (!child) {
1239                                 if (w->root == fn) {
1240                                         w->root = w->node = NULL;
1241                                         RT6_TRACE("W %p adjusted by delroot 1\n", w);
1242                                 } else if (w->node == fn) {
1243                                         RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
1244                                         w->node = pn;
1245                                         w->state = nstate;
1246                                 }
1247                         } else {
1248                                 if (w->root == fn) {
1249                                         w->root = child;
1250                                         RT6_TRACE("W %p adjusted by delroot 2\n", w);
1251                                 }
1252                                 if (w->node == fn) {
1253                                         w->node = child;
1254                                         if (children&2) {
1255                                                 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1256                                                 w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
1257                                         } else {
1258                                                 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1259                                                 w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
1260                                         }
1261                                 }
1262                         }
1263                 }
1264                 read_unlock(&fib6_walker_lock);
1265
1266                 node_free(fn);
1267                 if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
1268                         return pn;
1269
1270                 rt6_release(pn->leaf);
1271                 pn->leaf = NULL;
1272                 fn = pn;
1273         }
1274 }
1275
1276 static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
1277                            struct nl_info *info)
1278 {
1279         struct fib6_walker_t *w;
1280         struct rt6_info *rt = *rtp;
1281         struct net *net = info->nl_net;
1282
1283         RT6_TRACE("fib6_del_route\n");
1284
1285         /* Unlink it */
1286         *rtp = rt->dst.rt6_next;
1287         rt->rt6i_node = NULL;
1288         net->ipv6.rt6_stats->fib_rt_entries--;
1289         net->ipv6.rt6_stats->fib_discarded_routes++;
1290
1291         /* Reset round-robin state, if necessary */
1292         if (fn->rr_ptr == rt)
1293                 fn->rr_ptr = NULL;
1294
1295         /* Remove this entry from other siblings */
1296         if (rt->rt6i_nsiblings) {
1297                 struct rt6_info *sibling, *next_sibling;
1298
1299                 list_for_each_entry_safe(sibling, next_sibling,
1300                                          &rt->rt6i_siblings, rt6i_siblings)
1301                         sibling->rt6i_nsiblings--;
1302                 rt->rt6i_nsiblings = 0;
1303                 list_del_init(&rt->rt6i_siblings);
1304         }
1305
1306         /* Adjust walkers */
1307         read_lock(&fib6_walker_lock);
1308         FOR_WALKERS(w) {
1309                 if (w->state == FWS_C && w->leaf == rt) {
1310                         RT6_TRACE("walker %p adjusted by delroute\n", w);
1311                         w->leaf = rt->dst.rt6_next;
1312                         if (!w->leaf)
1313                                 w->state = FWS_U;
1314                 }
1315         }
1316         read_unlock(&fib6_walker_lock);
1317
1318         rt->dst.rt6_next = NULL;
1319
1320         /* If it was last route, expunge its radix tree node */
1321         if (!fn->leaf) {
1322                 fn->fn_flags &= ~RTN_RTINFO;
1323                 net->ipv6.rt6_stats->fib_route_nodes--;
1324                 fn = fib6_repair_tree(net, fn);
1325         }
1326
1327         if (atomic_read(&rt->rt6i_ref) != 1) {
1328                 /* This route is used as dummy address holder in some split
1329                  * nodes. It is not leaked, but it still holds other resources,
1330                  * which must be released in time. So, scan ascendant nodes
1331                  * and replace dummy references to this route with references
1332                  * to still alive ones.
1333                  */
1334                 while (fn) {
1335                         if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) {
1336                                 fn->leaf = fib6_find_prefix(net, fn);
1337                                 atomic_inc(&fn->leaf->rt6i_ref);
1338                                 rt6_release(rt);
1339                         }
1340                         fn = fn->parent;
1341                 }
1342                 /* No more references are possible at this point. */
1343                 BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
1344         }
1345
1346         inet6_rt_notify(RTM_DELROUTE, rt, info);
1347         rt6_release(rt);
1348 }
1349
1350 int fib6_del(struct rt6_info *rt, struct nl_info *info)
1351 {
1352         struct net *net = info->nl_net;
1353         struct fib6_node *fn = rt->rt6i_node;
1354         struct rt6_info **rtp;
1355
1356 #if RT6_DEBUG >= 2
1357         if (rt->dst.obsolete > 0) {
1358                 WARN_ON(fn != NULL);
1359                 return -ENOENT;
1360         }
1361 #endif
1362         if (!fn || rt == net->ipv6.ip6_null_entry)
1363                 return -ENOENT;
1364
1365         WARN_ON(!(fn->fn_flags & RTN_RTINFO));
1366
1367         if (!(rt->rt6i_flags & RTF_CACHE)) {
1368                 struct fib6_node *pn = fn;
1369 #ifdef CONFIG_IPV6_SUBTREES
1370                 /* clones of this route might be in another subtree */
1371                 if (rt->rt6i_src.plen) {
1372                         while (!(pn->fn_flags & RTN_ROOT))
1373                                 pn = pn->parent;
1374                         pn = pn->parent;
1375                 }
1376 #endif
1377                 fib6_prune_clones(info->nl_net, pn);
1378         }
1379
1380         /*
1381          *      Walk the leaf entries looking for ourself
1382          */
1383
1384         for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
1385                 if (*rtp == rt) {
1386                         fib6_del_route(fn, rtp, info);
1387                         return 0;
1388                 }
1389         }
1390         return -ENOENT;
1391 }
1392
1393 /*
1394  *      Tree traversal function.
1395  *
1396  *      Certainly, it is not interrupt safe.
1397  *      However, it is internally reenterable wrt itself and fib6_add/fib6_del.
1398  *      It means, that we can modify tree during walking
1399  *      and use this function for garbage collection, clone pruning,
1400  *      cleaning tree when a device goes down etc. etc.
1401  *
1402  *      It guarantees that every node will be traversed,
1403  *      and that it will be traversed only once.
1404  *
1405  *      Callback function w->func may return:
1406  *      0 -> continue walking.
1407  *      positive value -> walking is suspended (used by tree dumps,
1408  *      and probably by gc, if it will be split to several slices)
1409  *      negative value -> terminate walking.
1410  *
1411  *      The function itself returns:
1412  *      0   -> walk is complete.
1413  *      >0  -> walk is incomplete (i.e. suspended)
1414  *      <0  -> walk is terminated by an error.
1415  */
1416
1417 static int fib6_walk_continue(struct fib6_walker_t *w)
1418 {
1419         struct fib6_node *fn, *pn;
1420
1421         for (;;) {
1422                 fn = w->node;
1423                 if (!fn)
1424                         return 0;
1425
1426                 if (w->prune && fn != w->root &&
1427                     fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
1428                         w->state = FWS_C;
1429                         w->leaf = fn->leaf;
1430                 }
1431                 switch (w->state) {
1432 #ifdef CONFIG_IPV6_SUBTREES
1433                 case FWS_S:
1434                         if (FIB6_SUBTREE(fn)) {
1435                                 w->node = FIB6_SUBTREE(fn);
1436                                 continue;
1437                         }
1438                         w->state = FWS_L;
1439 #endif
1440                 case FWS_L:
1441                         if (fn->left) {
1442                                 w->node = fn->left;
1443                                 w->state = FWS_INIT;
1444                                 continue;
1445                         }
1446                         w->state = FWS_R;
1447                 case FWS_R:
1448                         if (fn->right) {
1449                                 w->node = fn->right;
1450                                 w->state = FWS_INIT;
1451                                 continue;
1452                         }
1453                         w->state = FWS_C;
1454                         w->leaf = fn->leaf;
1455                 case FWS_C:
1456                         if (w->leaf && fn->fn_flags & RTN_RTINFO) {
1457                                 int err;
1458
1459                                 if (w->skip) {
1460                                         w->skip--;
1461                                         goto skip;
1462                                 }
1463
1464                                 err = w->func(w);
1465                                 if (err)
1466                                         return err;
1467
1468                                 w->count++;
1469                                 continue;
1470                         }
1471 skip:
1472                         w->state = FWS_U;
1473                 case FWS_U:
1474                         if (fn == w->root)
1475                                 return 0;
1476                         pn = fn->parent;
1477                         w->node = pn;
1478 #ifdef CONFIG_IPV6_SUBTREES
1479                         if (FIB6_SUBTREE(pn) == fn) {
1480                                 WARN_ON(!(fn->fn_flags & RTN_ROOT));
1481                                 w->state = FWS_L;
1482                                 continue;
1483                         }
1484 #endif
1485                         if (pn->left == fn) {
1486                                 w->state = FWS_R;
1487                                 continue;
1488                         }
1489                         if (pn->right == fn) {
1490                                 w->state = FWS_C;
1491                                 w->leaf = w->node->leaf;
1492                                 continue;
1493                         }
1494 #if RT6_DEBUG >= 2
1495                         WARN_ON(1);
1496 #endif
1497                 }
1498         }
1499 }
1500
1501 static int fib6_walk(struct fib6_walker_t *w)
1502 {
1503         int res;
1504
1505         w->state = FWS_INIT;
1506         w->node = w->root;
1507
1508         fib6_walker_link(w);
1509         res = fib6_walk_continue(w);
1510         if (res <= 0)
1511                 fib6_walker_unlink(w);
1512         return res;
1513 }
1514
1515 static int fib6_clean_node(struct fib6_walker_t *w)
1516 {
1517         int res;
1518         struct rt6_info *rt;
1519         struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w);
1520         struct nl_info info = {
1521                 .nl_net = c->net,
1522         };
1523
1524         for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
1525                 res = c->func(rt, c->arg);
1526                 if (res < 0) {
1527                         w->leaf = rt;
1528                         res = fib6_del(rt, &info);
1529                         if (res) {
1530 #if RT6_DEBUG >= 2
1531                                 pr_debug("%s: del failed: rt=%p@%p err=%d\n",
1532                                          __func__, rt, rt->rt6i_node, res);
1533 #endif
1534                                 continue;
1535                         }
1536                         return 0;
1537                 }
1538                 WARN_ON(res != 0);
1539         }
1540         w->leaf = rt;
1541         return 0;
1542 }
1543
1544 /*
1545  *      Convenient frontend to tree walker.
1546  *
1547  *      func is called on each route.
1548  *              It may return -1 -> delete this route.
1549  *                            0  -> continue walking
1550  *
1551  *      prune==1 -> only immediate children of node (certainly,
1552  *      ignoring pure split nodes) will be scanned.
1553  */
1554
1555 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
1556                             int (*func)(struct rt6_info *, void *arg),
1557                             int prune, void *arg)
1558 {
1559         struct fib6_cleaner_t c;
1560
1561         c.w.root = root;
1562         c.w.func = fib6_clean_node;
1563         c.w.prune = prune;
1564         c.w.count = 0;
1565         c.w.skip = 0;
1566         c.func = func;
1567         c.arg = arg;
1568         c.net = net;
1569
1570         fib6_walk(&c.w);
1571 }
1572
1573 void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
1574                     void *arg)
1575 {
1576         struct fib6_table *table;
1577         struct hlist_head *head;
1578         unsigned int h;
1579
1580         rcu_read_lock();
1581         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
1582                 head = &net->ipv6.fib_table_hash[h];
1583                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
1584                         write_lock_bh(&table->tb6_lock);
1585                         fib6_clean_tree(net, &table->tb6_root,
1586                                         func, 0, arg);
1587                         write_unlock_bh(&table->tb6_lock);
1588                 }
1589         }
1590         rcu_read_unlock();
1591 }
1592
1593 static int fib6_prune_clone(struct rt6_info *rt, void *arg)
1594 {
1595         if (rt->rt6i_flags & RTF_CACHE) {
1596                 RT6_TRACE("pruning clone %p\n", rt);
1597                 return -1;
1598         }
1599
1600         return 0;
1601 }
1602
1603 static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
1604 {
1605         fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL);
1606 }
1607
1608 /*
1609  *      Garbage collection
1610  */
1611
1612 static struct fib6_gc_args
1613 {
1614         int                     timeout;
1615         int                     more;
1616 } gc_args;
1617
1618 static int fib6_age(struct rt6_info *rt, void *arg)
1619 {
1620         unsigned long now = jiffies;
1621
1622         /*
1623          *      check addrconf expiration here.
1624          *      Routes are expired even if they are in use.
1625          *
1626          *      Also age clones. Note, that clones are aged out
1627          *      only if they are not in use now.
1628          */
1629
1630         if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
1631                 if (time_after(now, rt->dst.expires)) {
1632                         RT6_TRACE("expiring %p\n", rt);
1633                         return -1;
1634                 }
1635                 gc_args.more++;
1636         } else if (rt->rt6i_flags & RTF_CACHE) {
1637                 if (atomic_read(&rt->dst.__refcnt) == 0 &&
1638                     time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
1639                         RT6_TRACE("aging clone %p\n", rt);
1640                         return -1;
1641                 } else if (rt->rt6i_flags & RTF_GATEWAY) {
1642                         struct neighbour *neigh;
1643                         __u8 neigh_flags = 0;
1644
1645                         neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1646                         if (neigh) {
1647                                 neigh_flags = neigh->flags;
1648                                 neigh_release(neigh);
1649                         }
1650                         if (!(neigh_flags & NTF_ROUTER)) {
1651                                 RT6_TRACE("purging route %p via non-router but gateway\n",
1652                                           rt);
1653                                 return -1;
1654                         }
1655                 }
1656                 gc_args.more++;
1657         }
1658
1659         return 0;
1660 }
1661
1662 static DEFINE_SPINLOCK(fib6_gc_lock);
1663
1664 void fib6_run_gc(unsigned long expires, struct net *net, bool force)
1665 {
1666         unsigned long now;
1667
1668         if (force) {
1669                 spin_lock_bh(&fib6_gc_lock);
1670         } else if (!spin_trylock_bh(&fib6_gc_lock)) {
1671                 mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
1672                 return;
1673         }
1674         gc_args.timeout = expires ? (int)expires :
1675                           net->ipv6.sysctl.ip6_rt_gc_interval;
1676
1677         gc_args.more = icmp6_dst_gc();
1678
1679         fib6_clean_all(net, fib6_age, NULL);
1680         now = jiffies;
1681         net->ipv6.ip6_rt_last_gc = now;
1682
1683         if (gc_args.more)
1684                 mod_timer(&net->ipv6.ip6_fib_timer,
1685                           round_jiffies(now
1686                                         + net->ipv6.sysctl.ip6_rt_gc_interval));
1687         else
1688                 del_timer(&net->ipv6.ip6_fib_timer);
1689         spin_unlock_bh(&fib6_gc_lock);
1690 }
1691
1692 static void fib6_gc_timer_cb(unsigned long arg)
1693 {
1694         fib6_run_gc(0, (struct net *)arg, true);
1695 }
1696
1697 static int __net_init fib6_net_init(struct net *net)
1698 {
1699         size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
1700
1701         setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
1702
1703         net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
1704         if (!net->ipv6.rt6_stats)
1705                 goto out_timer;
1706
1707         /* Avoid false sharing : Use at least a full cache line */
1708         size = max_t(size_t, size, L1_CACHE_BYTES);
1709
1710         net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
1711         if (!net->ipv6.fib_table_hash)
1712                 goto out_rt6_stats;
1713
1714         net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
1715                                           GFP_KERNEL);
1716         if (!net->ipv6.fib6_main_tbl)
1717                 goto out_fib_table_hash;
1718
1719         net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
1720         net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
1721         net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
1722                 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
1723         inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
1724
1725 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1726         net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
1727                                            GFP_KERNEL);
1728         if (!net->ipv6.fib6_local_tbl)
1729                 goto out_fib6_main_tbl;
1730         net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
1731         net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
1732         net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
1733                 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
1734         inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
1735 #endif
1736         fib6_tables_init(net);
1737
1738         return 0;
1739
1740 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1741 out_fib6_main_tbl:
1742         kfree(net->ipv6.fib6_main_tbl);
1743 #endif
1744 out_fib_table_hash:
1745         kfree(net->ipv6.fib_table_hash);
1746 out_rt6_stats:
1747         kfree(net->ipv6.rt6_stats);
1748 out_timer:
1749         return -ENOMEM;
1750 }
1751
1752 static void fib6_net_exit(struct net *net)
1753 {
1754         rt6_ifdown(net, NULL);
1755         del_timer_sync(&net->ipv6.ip6_fib_timer);
1756
1757 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1758         inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
1759         kfree(net->ipv6.fib6_local_tbl);
1760 #endif
1761         inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
1762         kfree(net->ipv6.fib6_main_tbl);
1763         kfree(net->ipv6.fib_table_hash);
1764         kfree(net->ipv6.rt6_stats);
1765 }
1766
1767 static struct pernet_operations fib6_net_ops = {
1768         .init = fib6_net_init,
1769         .exit = fib6_net_exit,
1770 };
1771
1772 int __init fib6_init(void)
1773 {
1774         int ret = -ENOMEM;
1775
1776         fib6_node_kmem = kmem_cache_create("fib6_nodes",
1777                                            sizeof(struct fib6_node),
1778                                            0, SLAB_HWCACHE_ALIGN,
1779                                            NULL);
1780         if (!fib6_node_kmem)
1781                 goto out;
1782
1783         ret = register_pernet_subsys(&fib6_net_ops);
1784         if (ret)
1785                 goto out_kmem_cache_create;
1786
1787         ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
1788                               NULL);
1789         if (ret)
1790                 goto out_unregister_subsys;
1791 out:
1792         return ret;
1793
1794 out_unregister_subsys:
1795         unregister_pernet_subsys(&fib6_net_ops);
1796 out_kmem_cache_create:
1797         kmem_cache_destroy(fib6_node_kmem);
1798         goto out;
1799 }
1800
1801 void fib6_gc_cleanup(void)
1802 {
1803         unregister_pernet_subsys(&fib6_net_ops);
1804         kmem_cache_destroy(fib6_node_kmem);
1805 }
1806
1807 #ifdef CONFIG_PROC_FS
1808
1809 struct ipv6_route_iter {
1810         struct seq_net_private p;
1811         struct fib6_walker_t w;
1812         loff_t skip;
1813         struct fib6_table *tbl;
1814         __u32 sernum;
1815 };
1816
1817 static int ipv6_route_seq_show(struct seq_file *seq, void *v)
1818 {
1819         struct rt6_info *rt = v;
1820         struct ipv6_route_iter *iter = seq->private;
1821
1822         seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
1823
1824 #ifdef CONFIG_IPV6_SUBTREES
1825         seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
1826 #else
1827         seq_puts(seq, "00000000000000000000000000000000 00 ");
1828 #endif
1829         if (rt->rt6i_flags & RTF_GATEWAY)
1830                 seq_printf(seq, "%pi6", &rt->rt6i_gateway);
1831         else
1832                 seq_puts(seq, "00000000000000000000000000000000");
1833
1834         seq_printf(seq, " %08x %08x %08x %08x %8s\n",
1835                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
1836                    rt->dst.__use, rt->rt6i_flags,
1837                    rt->dst.dev ? rt->dst.dev->name : "");
1838         iter->w.leaf = NULL;
1839         return 0;
1840 }
1841
1842 static int ipv6_route_yield(struct fib6_walker_t *w)
1843 {
1844         struct ipv6_route_iter *iter = w->args;
1845
1846         if (!iter->skip)
1847                 return 1;
1848
1849         do {
1850                 iter->w.leaf = iter->w.leaf->dst.rt6_next;
1851                 iter->skip--;
1852                 if (!iter->skip && iter->w.leaf)
1853                         return 1;
1854         } while (iter->w.leaf);
1855
1856         return 0;
1857 }
1858
1859 static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
1860 {
1861         memset(&iter->w, 0, sizeof(iter->w));
1862         iter->w.func = ipv6_route_yield;
1863         iter->w.root = &iter->tbl->tb6_root;
1864         iter->w.state = FWS_INIT;
1865         iter->w.node = iter->w.root;
1866         iter->w.args = iter;
1867         iter->sernum = iter->w.root->fn_sernum;
1868         INIT_LIST_HEAD(&iter->w.lh);
1869         fib6_walker_link(&iter->w);
1870 }
1871
1872 static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
1873                                                     struct net *net)
1874 {
1875         unsigned int h;
1876         struct hlist_node *node;
1877
1878         if (tbl) {
1879                 h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
1880                 node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
1881         } else {
1882                 h = 0;
1883                 node = NULL;
1884         }
1885
1886         while (!node && h < FIB6_TABLE_HASHSZ) {
1887                 node = rcu_dereference_bh(
1888                         hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
1889         }
1890         return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
1891 }
1892
1893 static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
1894 {
1895         if (iter->sernum != iter->w.root->fn_sernum) {
1896                 iter->sernum = iter->w.root->fn_sernum;
1897                 iter->w.state = FWS_INIT;
1898                 iter->w.node = iter->w.root;
1899                 WARN_ON(iter->w.skip);
1900                 iter->w.skip = iter->w.count;
1901         }
1902 }
1903
1904 static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1905 {
1906         int r;
1907         struct rt6_info *n;
1908         struct net *net = seq_file_net(seq);
1909         struct ipv6_route_iter *iter = seq->private;
1910
1911         if (!v)
1912                 goto iter_table;
1913
1914         n = ((struct rt6_info *)v)->dst.rt6_next;
1915         if (n) {
1916                 ++*pos;
1917                 return n;
1918         }
1919
1920 iter_table:
1921         ipv6_route_check_sernum(iter);
1922         read_lock(&iter->tbl->tb6_lock);
1923         r = fib6_walk_continue(&iter->w);
1924         read_unlock(&iter->tbl->tb6_lock);
1925         if (r > 0) {
1926                 if (v)
1927                         ++*pos;
1928                 return iter->w.leaf;
1929         } else if (r < 0) {
1930                 fib6_walker_unlink(&iter->w);
1931                 return NULL;
1932         }
1933         fib6_walker_unlink(&iter->w);
1934
1935         iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
1936         if (!iter->tbl)
1937                 return NULL;
1938
1939         ipv6_route_seq_setup_walk(iter);
1940         goto iter_table;
1941 }
1942
1943 static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
1944         __acquires(RCU_BH)
1945 {
1946         struct net *net = seq_file_net(seq);
1947         struct ipv6_route_iter *iter = seq->private;
1948
1949         rcu_read_lock_bh();
1950         iter->tbl = ipv6_route_seq_next_table(NULL, net);
1951         iter->skip = *pos;
1952
1953         if (iter->tbl) {
1954                 ipv6_route_seq_setup_walk(iter);
1955                 return ipv6_route_seq_next(seq, NULL, pos);
1956         } else {
1957                 return NULL;
1958         }
1959 }
1960
1961 static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
1962 {
1963         struct fib6_walker_t *w = &iter->w;
1964         return w->node && !(w->state == FWS_U && w->node == w->root);
1965 }
1966
1967 static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
1968         __releases(RCU_BH)
1969 {
1970         struct ipv6_route_iter *iter = seq->private;
1971
1972         if (ipv6_route_iter_active(iter))
1973                 fib6_walker_unlink(&iter->w);
1974
1975         rcu_read_unlock_bh();
1976 }
1977
1978 static const struct seq_operations ipv6_route_seq_ops = {
1979         .start  = ipv6_route_seq_start,
1980         .next   = ipv6_route_seq_next,
1981         .stop   = ipv6_route_seq_stop,
1982         .show   = ipv6_route_seq_show
1983 };
1984
1985 int ipv6_route_open(struct inode *inode, struct file *file)
1986 {
1987         return seq_open_net(inode, file, &ipv6_route_seq_ops,
1988                             sizeof(struct ipv6_route_iter));
1989 }
1990
1991 #endif /* CONFIG_PROC_FS */