2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally described in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
26 * Code from fib_hash has been reused which includes the following header:
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
33 * IPv4 FIB: lookup engine and maintenance routines.
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
43 * Substantial contributions to this work comes from:
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
51 #define VERSION "0.409"
53 #include <asm/uaccess.h>
54 #include <linux/bitops.h>
55 #include <linux/types.h>
56 #include <linux/kernel.h>
58 #include <linux/string.h>
59 #include <linux/socket.h>
60 #include <linux/sockios.h>
61 #include <linux/errno.h>
63 #include <linux/inet.h>
64 #include <linux/inetdevice.h>
65 #include <linux/netdevice.h>
66 #include <linux/if_arp.h>
67 #include <linux/proc_fs.h>
68 #include <linux/rcupdate.h>
69 #include <linux/skbuff.h>
70 #include <linux/netlink.h>
71 #include <linux/init.h>
72 #include <linux/list.h>
73 #include <linux/slab.h>
74 #include <linux/export.h>
75 #include <net/net_namespace.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
84 #define MAX_STAT_DEPTH 32
86 #define KEYLENGTH (8*sizeof(t_key))
88 typedef unsigned int t_key;
90 #define IS_TNODE(n) ((n)->bits)
91 #define IS_LEAF(n) (!(n)->bits)
95 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
96 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
97 struct tnode __rcu *parent;
100 struct tnode *tnode_free;
102 unsigned int full_children; /* KEYLENGTH bits needed */
103 unsigned int empty_children; /* KEYLENGTH bits needed */
104 struct rt_trie_node __rcu *child[0];
107 struct rt_trie_node {
111 struct tnode __rcu *parent;
119 struct tnode __rcu *parent;
121 struct hlist_head list;
125 struct hlist_node hlist;
127 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
128 struct list_head falh;
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
144 unsigned int totdepth;
145 unsigned int maxdepth;
148 unsigned int nullpointers;
149 unsigned int prefixes;
150 unsigned int nodesizes[MAX_STAT_DEPTH];
154 struct rt_trie_node __rcu *trie;
155 #ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats __percpu *stats;
160 static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162 static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 /* tnodes to free after resize(); protected by RTNL */
166 static struct tnode *tnode_free_head;
167 static size_t tnode_free_size;
170 * synchronize_rcu after call_rcu for that many pages; it should be especially
171 * useful before resizing the root node with PREEMPT_NONE configs; the value was
172 * obtained experimentally, aiming to avoid visible slowdown.
174 static const int sync_pages = 128;
176 static struct kmem_cache *fn_alias_kmem __read_mostly;
177 static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 /* caller must hold RTNL */
180 #define node_parent(n) rtnl_dereference((n)->parent)
182 /* caller must hold RCU read lock or RTNL */
183 #define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
185 /* wrapper for rcu_assign_pointer */
186 static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
189 rcu_assign_pointer(node->parent, ptr);
192 #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
194 /* This provides us with the number of children in this node, in the case of a
195 * leaf this will return 0 meaning none of the children are accessible.
197 static inline int tnode_child_length(const struct tnode *tn)
199 return (1ul << tn->bits) & ~(1ul);
203 * caller must hold RTNL
205 static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
207 BUG_ON(i >= tnode_child_length(tn));
209 return rtnl_dereference(tn->child[i]);
213 * caller must hold RCU read lock or RTNL
215 static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
217 BUG_ON(i >= tnode_child_length(tn));
219 return rcu_dereference_rtnl(tn->child[i]);
222 static inline t_key mask_pfx(t_key k, unsigned int l)
224 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
227 static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
229 if (offset < KEYLENGTH)
230 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
235 static inline int tkey_equals(t_key a, t_key b)
240 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
242 if (bits == 0 || offset >= KEYLENGTH)
244 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
245 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
248 static inline int tkey_mismatch(t_key a, int offset, t_key b)
255 while ((diff << i) >> (KEYLENGTH-1) == 0)
261 To understand this stuff, an understanding of keys and all their bits is
262 necessary. Every node in the trie has a key associated with it, but not
263 all of the bits in that key are significant.
265 Consider a node 'n' and its parent 'tp'.
267 If n is a leaf, every bit in its key is significant. Its presence is
268 necessitated by path compression, since during a tree traversal (when
269 searching for a leaf - unless we are doing an insertion) we will completely
270 ignore all skipped bits we encounter. Thus we need to verify, at the end of
271 a potentially successful search, that we have indeed been walking the
274 Note that we can never "miss" the correct key in the tree if present by
275 following the wrong path. Path compression ensures that segments of the key
276 that are the same for all keys with a given prefix are skipped, but the
277 skipped part *is* identical for each node in the subtrie below the skipped
278 bit! trie_insert() in this implementation takes care of that - note the
279 call to tkey_sub_equals() in trie_insert().
281 if n is an internal node - a 'tnode' here, the various parts of its key
282 have many different meanings.
285 _________________________________________________________________
286 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
287 -----------------------------------------------------------------
288 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
290 _________________________________________________________________
291 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
292 -----------------------------------------------------------------
293 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
300 First, let's just ignore the bits that come before the parent tp, that is
301 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
302 not use them for anything.
304 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
305 index into the parent's child array. That is, they will be used to find
306 'n' among tp's children.
308 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
311 All the bits we have seen so far are significant to the node n. The rest
312 of the bits are really not needed or indeed known in n->key.
314 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
315 n's child array, and will of course be different for each child.
318 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
323 static const int halve_threshold = 25;
324 static const int inflate_threshold = 50;
325 static const int halve_threshold_root = 15;
326 static const int inflate_threshold_root = 30;
328 static void __alias_free_mem(struct rcu_head *head)
330 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
331 kmem_cache_free(fn_alias_kmem, fa);
334 static inline void alias_free_mem_rcu(struct fib_alias *fa)
336 call_rcu(&fa->rcu, __alias_free_mem);
339 static void __leaf_free_rcu(struct rcu_head *head)
341 struct leaf *l = container_of(head, struct leaf, rcu);
342 kmem_cache_free(trie_leaf_kmem, l);
345 static inline void free_leaf(struct leaf *l)
347 call_rcu(&l->rcu, __leaf_free_rcu);
350 static inline void free_leaf_info(struct leaf_info *leaf)
352 kfree_rcu(leaf, rcu);
355 static struct tnode *tnode_alloc(size_t size)
357 if (size <= PAGE_SIZE)
358 return kzalloc(size, GFP_KERNEL);
360 return vzalloc(size);
363 static void __tnode_free_rcu(struct rcu_head *head)
365 struct tnode *tn = container_of(head, struct tnode, rcu);
366 size_t size = sizeof(struct tnode) +
367 (sizeof(struct rt_trie_node *) << tn->bits);
369 if (size <= PAGE_SIZE)
375 static inline void tnode_free(struct tnode *tn)
378 free_leaf((struct leaf *) tn);
380 call_rcu(&tn->rcu, __tnode_free_rcu);
383 static void tnode_free_safe(struct tnode *tn)
386 tn->tnode_free = tnode_free_head;
387 tnode_free_head = tn;
388 tnode_free_size += sizeof(struct tnode) +
389 (sizeof(struct rt_trie_node *) << tn->bits);
392 static void tnode_free_flush(void)
396 while ((tn = tnode_free_head)) {
397 tnode_free_head = tn->tnode_free;
398 tn->tnode_free = NULL;
402 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
408 static struct leaf *leaf_new(t_key key)
410 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
413 /* set key and pos to reflect full key value
414 * any trailing zeros in the key should be ignored
415 * as the nodes are searched
419 /* set bits to 0 indicating we are not a tnode */
422 INIT_HLIST_HEAD(&l->list);
427 static struct leaf_info *leaf_info_new(int plen)
429 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
432 li->mask_plen = ntohl(inet_make_mask(plen));
433 INIT_LIST_HEAD(&li->falh);
438 static struct tnode *tnode_new(t_key key, int pos, int bits)
440 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
441 struct tnode *tn = tnode_alloc(sz);
442 unsigned int shift = pos + bits;
444 /* verify bits and pos their msb bits clear and values are valid */
445 BUG_ON(!bits || (shift > KEYLENGTH));
451 tn->key = mask_pfx(key, pos);
452 tn->full_children = 0;
453 tn->empty_children = 1<<bits;
456 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
457 sizeof(struct rt_trie_node *) << bits);
462 * Check whether a tnode 'n' is "full", i.e. it is an internal node
463 * and no bits are skipped. See discussion in dyntree paper p. 6
466 static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
468 return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
/* Store child @n at slot @i; -1 asks the reorg helper to recompute
 * the "was full" state itself.
 */
static inline void put_child(struct tnode *tn, int i,
			     struct rt_trie_node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
478 * Add a child at position i overwriting the old value.
479 * Update the value of full_children and empty_children.
482 static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
485 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
488 BUG_ON(i >= 1<<tn->bits);
490 /* update emptyChildren */
491 if (n == NULL && chi != NULL)
492 tn->empty_children++;
493 else if (n != NULL && chi == NULL)
494 tn->empty_children--;
496 /* update fullChildren */
498 wasfull = tnode_full(tn, chi);
500 isfull = tnode_full(tn, n);
501 if (wasfull && !isfull)
503 else if (!wasfull && isfull)
506 node_set_parent(n, tn);
508 rcu_assign_pointer(tn->child[i], n);
512 static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
514 struct rt_trie_node *n = NULL;
515 struct tnode *old_tn;
516 int inflate_threshold_use;
517 int halve_threshold_use;
523 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
524 tn, inflate_threshold, halve_threshold);
527 if (tn->empty_children > (tnode_child_length(tn) - 1))
531 if (tn->empty_children == (tnode_child_length(tn) - 1))
534 * Double as long as the resulting node has a number of
535 * nonempty nodes that are above the threshold.
539 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
540 * the Helsinki University of Technology and Matti Tikkanen of Nokia
541 * Telecommunications, page 6:
542 * "A node is doubled if the ratio of non-empty children to all
543 * children in the *doubled* node is at least 'high'."
545 * 'high' in this instance is the variable 'inflate_threshold'. It
546 * is expressed as a percentage, so we multiply it with
547 * tnode_child_length() and instead of multiplying by 2 (since the
548 * child array will be doubled by inflate()) and multiplying
549 * the left-hand side by 100 (to handle the percentage thing) we
550 * multiply the left-hand side by 50.
552 * The left-hand side may look a bit weird: tnode_child_length(tn)
553 * - tn->empty_children is of course the number of non-null children
554 * in the current node. tn->full_children is the number of "full"
555 * children, that is non-null tnodes with a skip value of 0.
556 * All of those will be doubled in the resulting inflated tnode, so
557 * we just count them one extra time here.
559 * A clearer way to write this would be:
561 * to_be_doubled = tn->full_children;
562 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
565 * new_child_length = tnode_child_length(tn) * 2;
567 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
569 * if (new_fill_factor >= inflate_threshold)
571 * ...and so on, tho it would mess up the while () loop.
574 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
578 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
579 * inflate_threshold * new_child_length
581 * expand not_to_be_doubled and to_be_doubled, and shorten:
582 * 100 * (tnode_child_length(tn) - tn->empty_children +
583 * tn->full_children) >= inflate_threshold * new_child_length
585 * expand new_child_length:
586 * 100 * (tnode_child_length(tn) - tn->empty_children +
587 * tn->full_children) >=
588 * inflate_threshold * tnode_child_length(tn) * 2
591 * 50 * (tn->full_children + tnode_child_length(tn) -
592 * tn->empty_children) >= inflate_threshold *
593 * tnode_child_length(tn)
597 /* Keep root node larger */
599 if (!node_parent(tn)) {
600 inflate_threshold_use = inflate_threshold_root;
601 halve_threshold_use = halve_threshold_root;
603 inflate_threshold_use = inflate_threshold;
604 halve_threshold_use = halve_threshold;
608 while ((tn->full_children > 0 && max_work-- &&
609 50 * (tn->full_children + tnode_child_length(tn)
610 - tn->empty_children)
611 >= inflate_threshold_use * tnode_child_length(tn))) {
618 #ifdef CONFIG_IP_FIB_TRIE_STATS
619 this_cpu_inc(t->stats->resize_node_skipped);
625 /* Return if at least one inflate is run */
626 if (max_work != MAX_WORK)
627 return (struct rt_trie_node *) tn;
630 * Halve as long as the number of empty children in this
631 * node is above threshold.
635 while (tn->bits > 1 && max_work-- &&
636 100 * (tnode_child_length(tn) - tn->empty_children) <
637 halve_threshold_use * tnode_child_length(tn)) {
643 #ifdef CONFIG_IP_FIB_TRIE_STATS
644 this_cpu_inc(t->stats->resize_node_skipped);
651 /* Only one child remains */
652 if (tn->empty_children == (tnode_child_length(tn) - 1)) {
655 for (i = tnode_child_length(tn); !n && i;)
656 n = tnode_get_child(tn, --i);
658 /* compress one level */
659 node_set_parent(n, NULL);
663 return (struct rt_trie_node *) tn;
667 static void tnode_clean_free(struct tnode *tn)
670 struct tnode *tofree;
672 for (i = 0; i < tnode_child_length(tn); i++) {
673 tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
680 static struct tnode *inflate(struct trie *t, struct tnode *tn)
682 struct tnode *oldtnode = tn;
683 int olen = tnode_child_length(tn);
686 pr_debug("In inflate\n");
688 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
691 return ERR_PTR(-ENOMEM);
694 * Preallocate and store tnodes before the actual work so we
695 * don't get into an inconsistent state if memory allocation
696 * fails. In case of failure we return the oldnode and inflate
697 * of tnode is ignored.
700 for (i = 0; i < olen; i++) {
703 inode = (struct tnode *) tnode_get_child(oldtnode, i);
706 inode->pos == oldtnode->pos + oldtnode->bits &&
708 struct tnode *left, *right;
709 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
711 left = tnode_new(inode->key&(~m), inode->pos + 1,
716 right = tnode_new(inode->key|m, inode->pos + 1,
724 put_child(tn, 2*i, (struct rt_trie_node *) left);
725 put_child(tn, 2*i+1, (struct rt_trie_node *) right);
729 for (i = 0; i < olen; i++) {
731 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
732 struct tnode *left, *right;
739 /* A leaf or an internal node with skipped bits */
741 if (IS_LEAF(node) || (node->pos > (tn->pos + tn->bits - 1))) {
743 tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
748 /* An internal node with two children */
749 inode = (struct tnode *) node;
751 if (inode->bits == 1) {
752 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
753 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
755 tnode_free_safe(inode);
759 /* An internal node with more than two children */
761 /* We will replace this node 'inode' with two new
762 * ones, 'left' and 'right', each with half of the
763 * original children. The two new nodes will have
764 * a position one bit further down the key and this
765 * means that the "significant" part of their keys
766 * (see the discussion near the top of this file)
767 * will differ by one bit, which will be "0" in
768 * left's key and "1" in right's key. Since we are
769 * moving the key position by one step, the bit that
770 * we are moving away from - the bit at position
771 * (inode->pos) - is the one that will differ between
772 * left and right. So... we synthesize that bit in the
774 * The mask 'm' below will be a single "one" bit at
775 * the position (inode->pos)
778 /* Use the old key, but set the new significant
782 left = (struct tnode *) tnode_get_child(tn, 2*i);
783 put_child(tn, 2*i, NULL);
787 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
788 put_child(tn, 2*i+1, NULL);
792 size = tnode_child_length(left);
793 for (j = 0; j < size; j++) {
794 put_child(left, j, rtnl_dereference(inode->child[j]));
795 put_child(right, j, rtnl_dereference(inode->child[j + size]));
797 put_child(tn, 2*i, resize(t, left));
798 put_child(tn, 2*i+1, resize(t, right));
800 tnode_free_safe(inode);
802 tnode_free_safe(oldtnode);
805 tnode_clean_free(tn);
806 return ERR_PTR(-ENOMEM);
809 static struct tnode *halve(struct trie *t, struct tnode *tn)
811 struct tnode *oldtnode = tn;
812 struct rt_trie_node *left, *right;
814 int olen = tnode_child_length(tn);
816 pr_debug("In halve\n");
818 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
821 return ERR_PTR(-ENOMEM);
824 * Preallocate and store tnodes before the actual work so we
825 * don't get into an inconsistent state if memory allocation
826 * fails. In case of failure we return the oldnode and halve
827 * of tnode is ignored.
830 for (i = 0; i < olen; i += 2) {
831 left = tnode_get_child(oldtnode, i);
832 right = tnode_get_child(oldtnode, i+1);
834 /* Two nonempty children */
838 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
843 put_child(tn, i/2, (struct rt_trie_node *)newn);
848 for (i = 0; i < olen; i += 2) {
849 struct tnode *newBinNode;
851 left = tnode_get_child(oldtnode, i);
852 right = tnode_get_child(oldtnode, i+1);
854 /* At least one of the children is empty */
856 if (right == NULL) /* Both are empty */
858 put_child(tn, i/2, right);
863 put_child(tn, i/2, left);
867 /* Two nonempty children */
868 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
869 put_child(tn, i/2, NULL);
870 put_child(newBinNode, 0, left);
871 put_child(newBinNode, 1, right);
872 put_child(tn, i/2, resize(t, newBinNode));
874 tnode_free_safe(oldtnode);
877 tnode_clean_free(tn);
878 return ERR_PTR(-ENOMEM);
881 /* readside must use rcu_read_lock currently dump routines
882 via get_fa_head and dump */
884 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
886 struct hlist_head *head = &l->list;
887 struct leaf_info *li;
889 hlist_for_each_entry_rcu(li, head, hlist)
890 if (li->plen == plen)
896 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
898 struct leaf_info *li = find_leaf_info(l, plen);
906 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
908 struct leaf_info *li = NULL, *last = NULL;
910 if (hlist_empty(head)) {
911 hlist_add_head_rcu(&new->hlist, head);
913 hlist_for_each_entry(li, head, hlist) {
914 if (new->plen > li->plen)
920 hlist_add_behind_rcu(&new->hlist, &last->hlist);
922 hlist_add_before_rcu(&new->hlist, &li->hlist);
926 /* rcu_read_lock needs to be hold by caller from readside */
929 fib_find_node(struct trie *t, u32 key)
933 struct rt_trie_node *n;
936 n = rcu_dereference_rtnl(t->trie);
938 while (n && IS_TNODE(n)) {
939 tn = (struct tnode *) n;
941 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
942 pos = tn->pos + tn->bits;
943 n = tnode_get_child_rcu(tn,
944 tkey_extract_bits(key,
950 /* Case we have found a leaf. Compare prefixes */
952 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
953 return (struct leaf *)n;
958 static void trie_rebalance(struct trie *t, struct tnode *tn)
966 while (tn != NULL && (tp = node_parent(tn)) != NULL) {
967 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
968 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
969 tn = (struct tnode *)resize(t, tn);
971 tnode_put_child_reorg(tp, cindex,
972 (struct rt_trie_node *)tn, wasfull);
974 tp = node_parent(tn);
976 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
984 /* Handle last (top) tnode */
986 tn = (struct tnode *)resize(t, tn);
988 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
992 /* only used from updater-side */
994 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
997 struct tnode *tp = NULL, *tn = NULL;
998 struct rt_trie_node *n;
1001 struct list_head *fa_head = NULL;
1002 struct leaf_info *li;
1006 n = rtnl_dereference(t->trie);
1008 /* If we point to NULL, stop. Either the tree is empty and we should
1009 * just put a new leaf in it, or we have reached an empty child slot,
1010 * and we should just put our new leaf in that.
1011 * If we point to a T_TNODE, check if it matches our key. Note that
1012 * a T_TNODE might be skipping any number of bits - its 'pos' need
1013 * not be the parent's 'pos'+'bits'!
1015 * If it does match the current key, get pos/bits from it, extract
1016 * the index from our key, push the T_TNODE and walk the tree.
1018 * If it doesn't, we have to replace it with a new T_TNODE.
1020 * If we point to a T_LEAF, it might or might not have the same key
1021 * as we do. If it does, just change the value, update the T_LEAF's
1022 * value, and return it.
1023 * If it doesn't, we need to replace it with a T_TNODE.
1026 while (n && IS_TNODE(n)) {
1027 tn = (struct tnode *) n;
1029 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1031 pos = tn->pos + tn->bits;
1032 n = tnode_get_child(tn,
1033 tkey_extract_bits(key,
1037 BUG_ON(n && node_parent(n) != tn);
1043 * n ----> NULL, LEAF or TNODE
1045 * tp is n's (parent) ----> NULL or TNODE
1048 BUG_ON(tp && IS_LEAF(tp));
1050 /* Case 1: n is a leaf. Compare prefixes */
1052 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1053 l = (struct leaf *) n;
1054 li = leaf_info_new(plen);
1059 fa_head = &li->falh;
1060 insert_leaf_info(&l->list, li);
1068 li = leaf_info_new(plen);
1075 fa_head = &li->falh;
1076 insert_leaf_info(&l->list, li);
1078 if (t->trie && n == NULL) {
1079 /* Case 2: n is NULL, and will just insert a new leaf */
1081 node_set_parent((struct rt_trie_node *)l, tp);
1083 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1084 put_child(tp, cindex, (struct rt_trie_node *)l);
1086 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1088 * Add a new tnode here
1089 * first tnode need some special handling
1093 pos = tp ? tp->pos+tp->bits : 0;
1094 newpos = tkey_mismatch(key, pos, n->key);
1095 tn = tnode_new(n->key, newpos, 1);
1098 tn = tnode_new(key, newpos, 1); /* First tnode */
1107 node_set_parent((struct rt_trie_node *)tn, tp);
1109 missbit = tkey_extract_bits(key, newpos, 1);
1110 put_child(tn, missbit, (struct rt_trie_node *)l);
1111 put_child(tn, 1-missbit, n);
1114 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1115 put_child(tp, cindex, (struct rt_trie_node *)tn);
1117 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1123 if (tp && tp->pos + tp->bits > 32)
1124 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1125 tp, tp->pos, tp->bits, key, plen);
1127 /* Rebalance the trie */
1129 trie_rebalance(t, tp);
1135 * Caller must hold RTNL.
1137 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1139 struct trie *t = (struct trie *) tb->tb_data;
1140 struct fib_alias *fa, *new_fa;
1141 struct list_head *fa_head = NULL;
1142 struct fib_info *fi;
1143 int plen = cfg->fc_dst_len;
1144 u8 tos = cfg->fc_tos;
1152 key = ntohl(cfg->fc_dst);
1154 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1156 mask = ntohl(inet_make_mask(plen));
1163 fi = fib_create_info(cfg);
1169 l = fib_find_node(t, key);
1173 fa_head = get_fa_head(l, plen);
1174 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1177 /* Now fa, if non-NULL, points to the first fib alias
1178 * with the same keys [prefix,tos,priority], if such key already
1179 * exists or to the node before which we will insert new one.
1181 * If fa is NULL, we will need to allocate a new one and
1182 * insert to the head of f.
1184 * If f is NULL, no fib node matched the destination key
1185 * and we need to allocate a new one of those as well.
1188 if (fa && fa->fa_tos == tos &&
1189 fa->fa_info->fib_priority == fi->fib_priority) {
1190 struct fib_alias *fa_first, *fa_match;
1193 if (cfg->fc_nlflags & NLM_F_EXCL)
1197 * 1. Find exact match for type, scope, fib_info to avoid
1199 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1203 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1204 list_for_each_entry_continue(fa, fa_head, fa_list) {
1205 if (fa->fa_tos != tos)
1207 if (fa->fa_info->fib_priority != fi->fib_priority)
1209 if (fa->fa_type == cfg->fc_type &&
1210 fa->fa_info == fi) {
1216 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1217 struct fib_info *fi_drop;
1227 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1231 fi_drop = fa->fa_info;
1232 new_fa->fa_tos = fa->fa_tos;
1233 new_fa->fa_info = fi;
1234 new_fa->fa_type = cfg->fc_type;
1235 state = fa->fa_state;
1236 new_fa->fa_state = state & ~FA_S_ACCESSED;
1238 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1239 alias_free_mem_rcu(fa);
1241 fib_release_info(fi_drop);
1242 if (state & FA_S_ACCESSED)
1243 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1244 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1245 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1249 /* Error if we find a perfect match which
1250 * uses the same scope, type, and nexthop
1256 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1260 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1264 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1268 new_fa->fa_info = fi;
1269 new_fa->fa_tos = tos;
1270 new_fa->fa_type = cfg->fc_type;
1271 new_fa->fa_state = 0;
1273 * Insert new entry to the list.
1277 fa_head = fib_insert_node(t, key, plen);
1278 if (unlikely(!fa_head)) {
1280 goto out_free_new_fa;
1285 tb->tb_num_default++;
1287 list_add_tail_rcu(&new_fa->fa_list,
1288 (fa ? &fa->fa_list : fa_head));
1290 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1291 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1292 &cfg->fc_nlinfo, 0);
1297 kmem_cache_free(fn_alias_kmem, new_fa);
1299 fib_release_info(fi);
1304 /* should be called with rcu_read_lock */
1305 static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1306 t_key key, const struct flowi4 *flp,
1307 struct fib_result *res, int fib_flags)
1309 struct leaf_info *li;
1310 struct hlist_head *hhead = &l->list;
1312 hlist_for_each_entry_rcu(li, hhead, hlist) {
1313 struct fib_alias *fa;
1315 if (l->key != (key & li->mask_plen))
1318 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1319 struct fib_info *fi = fa->fa_info;
1322 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1326 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1328 fib_alias_accessed(fa);
1329 err = fib_props[fa->fa_type].error;
1331 #ifdef CONFIG_IP_FIB_TRIE_STATS
1332 this_cpu_inc(t->stats->semantic_match_passed);
1336 if (fi->fib_flags & RTNH_F_DEAD)
1338 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1339 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1341 if (nh->nh_flags & RTNH_F_DEAD)
1343 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1346 #ifdef CONFIG_IP_FIB_TRIE_STATS
1347 this_cpu_inc(t->stats->semantic_match_passed);
1349 res->prefixlen = li->plen;
1350 res->nh_sel = nhsel;
1351 res->type = fa->fa_type;
1352 res->scope = fa->fa_info->fib_scope;
1355 res->fa_head = &li->falh;
1356 if (!(fib_flags & FIB_LOOKUP_NOREF))
1357 atomic_inc(&fi->fib_clntref);
1362 #ifdef CONFIG_IP_FIB_TRIE_STATS
1363 this_cpu_inc(t->stats->semantic_match_miss);
/*
 * fib_table_lookup - longest-matching-prefix lookup of flp->daddr in the
 * LC-trie behind @tb, filling in @res on a semantic match.
 *
 * Walks down the trie indexing by bits of the (host-order) destination key;
 * on a dead end it backtracks by chopping trailing index bits
 * (chopped_off/cindex) and climbing to the parent tnode, shrinking
 * current_prefix_length so only shorter prefixes are considered.
 *
 * NOTE(review): this extract elides intermediate source lines (locals such
 * as pn/cindex/ret, loop braces and gotos are not visible here) — the
 * annotations below describe only the visible statements.
 */
1370 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1371 struct fib_result *res, int fib_flags)
1373 struct trie *t = (struct trie *) tb->tb_data;
1374 #ifdef CONFIG_IP_FIB_TRIE_STATS
1375 struct trie_use_stats __percpu *stats = t->stats;
1378 struct rt_trie_node *n;
1380 unsigned int pos, bits;
/* Lookup key is the destination address converted to host byte order. */
1381 t_key key = ntohl(flp->daddr);
1382 unsigned int chopped_off;
/* Start assuming a full-length (32-bit) prefix can still match. */
1384 unsigned int current_prefix_length = KEYLENGTH;
1386 t_key pref_mismatch;
/* RCU-protected read of the trie root; caller holds rcu_read_lock(). */
1390 n = rcu_dereference(t->trie);
1394 #ifdef CONFIG_IP_FIB_TRIE_STATS
1395 this_cpu_inc(stats->gets);
/* Root is a leaf: the whole trie is a single node — check it directly. */
1400 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1404 pn = (struct tnode *) n;
/* Child index = key bits at this tnode's position, masked so bits past
 * current_prefix_length (already invalidated by backtracking) read as 0. */
1412 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1415 n = tnode_get_child_rcu(pn, cindex);
1418 #ifdef CONFIG_IP_FIB_TRIE_STATS
1419 this_cpu_inc(stats->null_node_hit);
/* Reached a leaf during descent: do the semantic (prefix/tos/nh) check. */
1425 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1431 cn = (struct tnode *)n;
1434 * It's a tnode, and we can do some extra checks here if we
1435 * like, to avoid descending into a dead-end branch.
1436 * This tnode is in the parent's child array at index
1437 * key[p_pos..p_pos+p_bits] but potentially with some bits
1438 * chopped off, so in reality the index may be just a
1439 * subprefix, padded with zero at the end.
1440 * We can also take a look at any skipped bits in this
1441 * tnode - everything up to p_pos is supposed to be ok,
1442 * and the non-chopped bits of the index (se previous
1443 * paragraph) are also guaranteed ok, but the rest is
1444 * considered unknown.
1446 * The skipped bits are key[pos+bits..cn->pos].
1449 /* If current_prefix_length < pos+bits, we are already doing
1450 * actual prefix matching, which means everything from
1451 * pos+(bits-chopped_off) onward must be zero along some
1452 * branch of this subtree - otherwise there is *no* valid
1453 * prefix present. Here we can only check the skipped
1454 * bits. Remember, since we have already indexed into the
1455 * parent's child array, we know that the bits we chopped of
1459 /* NOTA BENE: Checking only skipped bits
1460 for the new node here */
1462 if (current_prefix_length < pos+bits) {
1463 if (tkey_extract_bits(cn->key, current_prefix_length,
1464 cn->pos - current_prefix_length)
1470 * If chopped_off=0, the index is fully validated and we
1471 * only need to look at the skipped bits for this, the new,
1472 * tnode. What we actually want to do is to find out if
1473 * these skipped bits match our key perfectly, or if we will
1474 * have to count on finding a matching prefix further down,
1475 * because if we do, we would like to have some way of
1476 * verifying the existence of such a prefix at this point.
1479 /* The only thing we can do at this point is to verify that
1480 * any such matching prefix can indeed be a prefix to our
1481 * key, and if the bits in the node we are inspecting that
1482 * do not match our key are not ZERO, this cannot be true.
1483 * Thus, find out where there is a mismatch (before cn->pos)
1484 * and verify that all the mismatching bits are zero in the
1489 * Note: We aren't very concerned about the piece of
1490 * the key that precede pn->pos+pn->bits, since these
1491 * have already been checked. The bits after cn->pos
1492 * aren't checked since these are by definition
1493 * "unknown" at this point. Thus, what we want to see
1494 * is if we are about to enter the "prefix matching"
1495 * state, and in that case verify that the skipped
1496 * bits that will prevail throughout this subtree are
1497 * zero, as they have to be if we are to find a
/* XOR isolates differing bits; mask_pfx keeps only bits before cn->pos
 * (the skipped bits this tnode compresses over). */
1501 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1504 * In short: If skipped bits in this node do not match
1505 * the search key, enter the "prefix matching"
1508 if (pref_mismatch) {
1509 /* fls(x) = __fls(x) + 1 */
/* mp = bit position (from the top) of the first mismatch. */
1510 int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
/* Node bits after the mismatch must be zero for any prefix to exist. */
1512 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1515 if (current_prefix_length >= cn->pos)
1516 current_prefix_length = mp;
1519 pn = (struct tnode *)n; /* Descend */
/* Backtracking: skip over trailing zero bits of cindex, since clearing
 * a zero bit would not produce a new (smaller) child index. */
1526 /* As zero don't change the child key (cindex) */
1527 while ((chopped_off <= pn->bits)
1528 && !(cindex & (1<<(chopped_off-1))))
1531 /* Decrease current_... with bits chopped off */
1532 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1533 current_prefix_length = pn->pos + pn->bits
1537 * Either we do the actual chop off according or if we have
1538 * chopped off all bits in this tnode walk up to our parent.
1541 if (chopped_off <= pn->bits) {
/* Clear the chopped bit to retry a smaller child index in this tnode. */
1542 cindex &= ~(1 << (chopped_off-1));
/* All bits exhausted here: climb to the parent and continue there. */
1544 struct tnode *parent = node_parent_rcu(pn);
1548 /* Get Child's index */
1549 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1553 #ifdef CONFIG_IP_FIB_TRIE_STATS
1554 this_cpu_inc(stats->backtrack);
1565 EXPORT_SYMBOL_GPL(fib_table_lookup);
1568 * Remove the leaf and return parent.
/*
 * trie_leaf_remove - unlink leaf @l from trie @t and rebalance.
 *
 * If the leaf has a parent tnode, the child slot holding it is cleared and
 * the trie is rebalanced from that parent; otherwise the leaf was the root
 * and t->trie is reset to NULL.
 *
 * NOTE(review): the if/else around the tp branches and the final free of
 * the leaf are elided in this extract — verify against the full source.
 */
1570 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1572 struct tnode *tp = node_parent(l);
1574 pr_debug("entering trie_leaf_remove(%p)\n", l);
/* Index of @l within its parent's child array. */
1577 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1578 put_child(tp, cindex, NULL);
1579 trie_rebalance(t, tp);
/* Root case: trie becomes empty. */
1581 RCU_INIT_POINTER(t->trie, NULL);
1587 * Caller must hold RTNL.
/*
 * fib_table_delete - delete the route described by @cfg from table @tb.
 *
 * Finds the leaf for cfg->fc_dst, the leaf_info for the prefix length,
 * then scans the alias list for an entry matching tos/type/scope/prefsrc/
 * protocol/nexthop. On a match the alias is removed (RCU), empty
 * leaf_info/leaf structures are torn down, and the route cache is flushed
 * if the alias had been used for lookups (FA_S_ACCESSED).
 *
 * NOTE(review): error-return paths (missing leaf/li/fa) and some locals
 * are elided in this extract.
 */
1589 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1591 struct trie *t = (struct trie *) tb->tb_data;
1593 int plen = cfg->fc_dst_len;
1594 u8 tos = cfg->fc_tos;
1595 struct fib_alias *fa, *fa_to_delete;
1596 struct list_head *fa_head;
1598 struct leaf_info *li;
/* Destination and mask in host byte order for trie operations. */
1603 key = ntohl(cfg->fc_dst);
1604 mask = ntohl(inet_make_mask(plen));
1610 l = fib_find_node(t, key);
1615 li = find_leaf_info(l, plen);
1620 fa_head = &li->falh;
1621 fa = fib_find_alias(fa_head, tos, 0);
1626 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1628 fa_to_delete = NULL;
/* Step back one entry so list_for_each_entry_continue() starts at fa. */
1629 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1630 list_for_each_entry_continue(fa, fa_head, fa_list) {
1631 struct fib_info *fi = fa->fa_info;
1633 if (fa->fa_tos != tos)
/* A wildcard (zero) cfg field matches anything; otherwise require
 * exact agreement between the config and the stored alias/info. */
1636 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1637 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1638 fa->fa_info->fib_scope == cfg->fc_scope) &&
1639 (!cfg->fc_prefsrc ||
1640 fi->fib_prefsrc == cfg->fc_prefsrc) &&
1641 (!cfg->fc_protocol ||
1642 fi->fib_protocol == cfg->fc_protocol) &&
1643 fib_nh_match(cfg, fi) == 0) {
/* Notify userspace listeners of the deletion before unlinking. */
1653 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1654 &cfg->fc_nlinfo, 0);
1656 list_del_rcu(&fa->fa_list);
1659 tb->tb_num_default--;
/* Prefix now has no aliases: drop its leaf_info. */
1661 if (list_empty(fa_head)) {
1662 hlist_del_rcu(&li->hlist);
/* Leaf has no leaf_info entries left: remove the leaf itself. */
1666 if (hlist_empty(&l->list))
1667 trie_leaf_remove(t, l);
1669 if (fa->fa_state & FA_S_ACCESSED)
1670 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1672 fib_release_info(fa->fa_info);
/* Alias memory is freed after an RCU grace period. */
1673 alias_free_mem_rcu(fa);
/*
 * trie_flush_list - drop every alias on @head whose fib_info is marked
 * RTNH_F_DEAD.  Returns the number of aliases removed (the `found`
 * counter increment is elided in this extract).
 */
1677 static int trie_flush_list(struct list_head *head)
1679 struct fib_alias *fa, *fa_node;
/* _safe variant: entries are deleted while iterating. */
1682 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1683 struct fib_info *fi = fa->fa_info;
1685 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1686 list_del_rcu(&fa->fa_list);
1687 fib_release_info(fa->fa_info);
1688 alias_free_mem_rcu(fa);
/*
 * trie_flush_leaf - flush dead aliases from every leaf_info hanging off
 * leaf @l, removing leaf_info entries that become empty.  Returns the
 * total number of aliases flushed.
 */
1695 static int trie_flush_leaf(struct leaf *l)
1698 struct hlist_head *lih = &l->list;
1699 struct hlist_node *tmp;
1700 struct leaf_info *li = NULL;
1702 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1703 found += trie_flush_list(&li->falh);
/* All aliases for this prefix length are gone: unlink the leaf_info.
 * NOTE(review): the matching free of li is elided in this extract. */
1705 if (list_empty(&li->falh)) {
1706 hlist_del_rcu(&li->hlist);
1714 * Scan for the next right leaf starting at node p->child[idx]
1715 * Since we have back pointer, no recursion necessary.
/*
 * leaf_walk_rcu - iterative in-order walk: find the next leaf to the
 * right of child @c under tnode @p, using parent pointers instead of
 * recursion.  Returns NULL once the root has been exhausted.
 */
1717 static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
/* Resume just past c's slot in p's child array. */
1723 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1727 while (idx < 1u << p->bits) {
1728 c = tnode_get_child_rcu(p, idx++);
1733 return (struct leaf *) c;
1735 /* Rescan start scanning in new node */
1736 p = (struct tnode *) c;
1740 /* Node empty, walk back up to parent */
1741 c = (struct rt_trie_node *) p;
1742 } while ((p = node_parent_rcu(c)) != NULL);
1744 return NULL; /* Root of trie */
/*
 * trie_firstleaf - return the leftmost leaf of @t, or the root itself
 * when the trie is a single leaf.  Elided here: the NULL-root check.
 */
1747 static struct leaf *trie_firstleaf(struct trie *t)
1749 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1754 if (IS_LEAF(n)) /* trie is just a leaf */
1755 return (struct leaf *) n;
1757 return leaf_walk_rcu(n, NULL);
/*
 * trie_nextleaf - successor of leaf @l in walk order, NULL at the end.
 */
1760 static struct leaf *trie_nextleaf(struct leaf *l)
1762 struct rt_trie_node *c = (struct rt_trie_node *) l;
1763 struct tnode *p = node_parent_rcu(c);
1766 return NULL; /* trie with just one leaf */
1768 return leaf_walk_rcu(p, c);
/*
 * trie_leafindex - return the @index'th leaf (0-based) by linear walk;
 * the return statement is elided in this extract.
 */
1771 static struct leaf *trie_leafindex(struct trie *t, int index)
1773 struct leaf *l = trie_firstleaf(t);
1775 while (l && index-- > 0)
1776 l = trie_nextleaf(l);
1783 * Caller must hold RTNL.
/*
 * fib_table_flush - remove all dead routes from table @tb.
 *
 * Walks every leaf, flushing dead aliases; a leaf is only removed one
 * iteration *after* it was flushed (tracked in ll) so the walk can still
 * advance past it, with a final check after the loop.  Returns the
 * number of routes flushed.
 */
1785 int fib_table_flush(struct fib_table *tb)
1787 struct trie *t = (struct trie *) tb->tb_data;
1788 struct leaf *l, *ll = NULL;
1791 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1792 found += trie_flush_leaf(l);
/* Remove the *previous* leaf if flushing left it empty. */
1794 if (ll && hlist_empty(&ll->list))
1795 trie_leaf_remove(t, ll);
/* Handle the last leaf visited by the loop. */
1799 if (ll && hlist_empty(&ll->list))
1800 trie_leaf_remove(t, ll);
1802 pr_debug("trie_flush found=%d\n", found);
/*
 * fib_free_table - release per-table resources (per-cpu stats when
 * CONFIG_IP_FIB_TRIE_STATS; the kfree of tb itself is elided here).
 */
1806 void fib_free_table(struct fib_table *tb)
1808 #ifdef CONFIG_IP_FIB_TRIE_STATS
1809 struct trie *t = (struct trie *)tb->tb_data;
1811 free_percpu(t->stats);
1812 #endif /* CONFIG_IP_FIB_TRIE_STATS */
/*
 * fn_trie_dump_fa - emit one netlink RTM_NEWROUTE record per alias on
 * @fah into @skb, resuming from the position saved in cb->args.
 * Returns < 0 when the skb fills up so the dump can be continued later.
 */
1816 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1817 struct fib_table *tb,
1818 struct sk_buff *skb, struct netlink_callback *cb)
1821 struct fib_alias *fa;
/* Netlink carries the prefix in network byte order. */
1822 __be32 xkey = htonl(key);
1827 /* rcu_read_lock is hold by caller */
1829 list_for_each_entry_rcu(fa, fah, fa_list) {
1835 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1843 fa->fa_info, NLM_F_MULTI) < 0) {
/*
 * fn_trie_dump_leaf - dump every non-empty leaf_info of @l, delegating
 * per-prefix work to fn_trie_dump_fa().
 */
1853 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1854 struct sk_buff *skb, struct netlink_callback *cb)
1856 struct leaf_info *li;
1862 /* rcu_read_lock is hold by caller */
1863 hlist_for_each_entry_rcu(li, &l->list, hlist) {
1872 if (list_empty(&li->falh))
1875 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
/*
 * fib_table_dump - netlink dump entry point for table @tb.
 *
 * Resumable: cb->args[2] holds the last dumped key, cb->args[3] the leaf
 * count, so a continued dump can restart at (or near) where it stopped.
 */
1886 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1887 struct netlink_callback *cb)
1890 struct trie *t = (struct trie *) tb->tb_data;
1891 t_key key = cb->args[2];
1892 int count = cb->args[3];
1895 /* Dump starting at last key.
1896 * Note: 0.0.0.0/0 (ie default) is first key.
1899 l = trie_firstleaf(t);
1901 /* Normally, continue from last key, but if that is missing
1902 * fallback to using slow rescan
1904 l = fib_find_node(t, key);
1906 l = trie_leafindex(t, count);
/* Remember resume position before attempting the dump. */
1910 cb->args[2] = l->key;
1911 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1912 cb->args[3] = count;
1918 l = trie_nextleaf(l);
/* Reset per-leaf state (args[4..]) for the next leaf's dump. */
1919 memset(&cb->args[4], 0,
1920 sizeof(cb->args) - 4*sizeof(cb->args[0]));
1922 cb->args[3] = count;
/*
 * fib_trie_init - boot-time creation of the slab caches used for
 * fib_alias entries and trie leaves.  SLAB_PANIC: failure here is fatal.
 */
1928 void __init fib_trie_init(void)
1930 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1931 sizeof(struct fib_alias),
1932 0, SLAB_PANIC, NULL);
/* One cache serves both leaf and leaf_info; size to the larger. */
1934 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1935 max(sizeof(struct leaf),
1936 sizeof(struct leaf_info)),
1937 0, SLAB_PANIC, NULL);
/*
 * fib_trie_table - allocate a fib_table with an embedded, empty trie in
 * its tb_data area for routing-table @id.  The NULL check on kmalloc
 * and the return statement are elided in this extract.
 */
1941 struct fib_table *fib_trie_table(u32 id)
1943 struct fib_table *tb;
/* Single allocation: table header plus trie in tb_data[]. */
1946 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1952 tb->tb_default = -1;
1953 tb->tb_num_default = 0;
1955 t = (struct trie *) tb->tb_data;
1956 RCU_INIT_POINTER(t->trie, NULL);
1957 #ifdef CONFIG_IP_FIB_TRIE_STATS
1958 t->stats = alloc_percpu(struct trie_use_stats);
1968 #ifdef CONFIG_PROC_FS
1969 /* Depth first Trie walk iterator */
/* Iterator state for the /proc/net/fib_trie depth-first walk:
 * current table, current tnode, and (elided here) index/depth fields. */
1970 struct fib_trie_iter {
1971 struct seq_net_private p;
1972 struct fib_table *tb;
1973 struct tnode *tnode;
/*
 * fib_trie_get_next - advance the depth-first iterator and return the
 * next node (tnode or leaf), or NULL when the current trie is done.
 */
1978 static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
1980 struct tnode *tn = iter->tnode;
1981 unsigned int cindex = iter->index;
1984 /* A single entry routing table */
1988 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1989 iter->tnode, iter->index, iter->depth);
/* Scan remaining children of the current tnode. */
1991 while (cindex < (1<<tn->bits)) {
1992 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
/* Leaf found: remember where to resume, then yield it (elided). */
1997 iter->index = cindex + 1;
1999 /* push down one level */
2000 iter->tnode = (struct tnode *) n;
2010 /* Current node exhausted, pop back up */
2011 p = node_parent_rcu((struct rt_trie_node *)tn);
/* Resume in the parent just past the slot we came from. */
2013 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
/*
 * fib_trie_get_first - position @iter at the root of @t and return it;
 * the empty-trie and single-leaf handling is elided in this extract.
 */
2023 static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2026 struct rt_trie_node *n;
2031 n = rcu_dereference(t->trie);
2036 iter->tnode = (struct tnode *) n;
/*
 * trie_collect_stats - walk the whole trie gathering the aggregate
 * counters (leaves, prefixes, tnodes, depths, node sizes, null slots)
 * reported by /proc/net/fib_triestats.
 */
2048 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2050 struct rt_trie_node *n;
2051 struct fib_trie_iter iter;
2053 memset(s, 0, sizeof(*s));
2056 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2058 struct leaf *l = (struct leaf *)n;
2059 struct leaf_info *li;
2062 s->totdepth += iter.depth;
2063 if (iter.depth > s->maxdepth)
2064 s->maxdepth = iter.depth;
/* Each leaf_info represents one prefix (counter update elided). */
2066 hlist_for_each_entry_rcu(li, &l->list, hlist)
2069 const struct tnode *tn = (const struct tnode *) n;
/* Histogram of tnode sizes, clamped to MAX_STAT_DEPTH buckets. */
2073 if (tn->bits < MAX_STAT_DEPTH)
2074 s->nodesizes[tn->bits]++;
2076 for (i = 0; i < (1<<tn->bits); i++)
2085 * This outputs /proc/net/fib_triestats
/*
 * trie_show_stats - pretty-print the counters collected by
 * trie_collect_stats() plus an estimated memory footprint.
 */
2087 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2089 unsigned int i, max, pointers, bytes, avdepth;
/* Average depth scaled by 100 so it can be shown with two decimals. */
2092 avdepth = stat->totdepth*100 / stat->leaves;
2096 seq_printf(seq, "\tAver depth: %u.%02d\n",
2097 avdepth / 100, avdepth % 100);
2098 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2100 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2101 bytes = sizeof(struct leaf) * stat->leaves;
2103 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2104 bytes += sizeof(struct leaf_info) * stat->prefixes;
2106 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2107 bytes += sizeof(struct tnode) * stat->tnodes;
/* Trim trailing empty buckets from the size histogram. */
2109 max = MAX_STAT_DEPTH;
2110 while (max > 0 && stat->nodesizes[max-1] == 0)
2114 for (i = 1; i < max; i++)
2115 if (stat->nodesizes[i] != 0) {
2116 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
/* A tnode with b bits holds 2^b child pointers. */
2117 pointers += (1<<i) * stat->nodesizes[i];
2119 seq_putc(seq, '\n');
2120 seq_printf(seq, "\tPointers: %u\n", pointers);
2122 bytes += sizeof(struct rt_trie_node *) * pointers;
2123 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2124 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2127 #ifdef CONFIG_IP_FIB_TRIE_STATS
/*
 * trie_show_usage - sum the per-cpu lookup counters into one snapshot
 * and print them; totals are approximate since counters keep changing.
 */
2128 static void trie_show_usage(struct seq_file *seq,
2129 const struct trie_use_stats __percpu *stats)
2131 struct trie_use_stats s = { 0 };
2134 /* loop through all of the CPUs and gather up the stats */
2135 for_each_possible_cpu(cpu) {
2136 const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2138 s.gets += pcpu->gets;
2139 s.backtrack += pcpu->backtrack;
2140 s.semantic_match_passed += pcpu->semantic_match_passed;
2141 s.semantic_match_miss += pcpu->semantic_match_miss;
2142 s.null_node_hit += pcpu->null_node_hit;
2143 s.resize_node_skipped += pcpu->resize_node_skipped;
2146 seq_printf(seq, "\nCounters:\n---------\n");
2147 seq_printf(seq, "gets = %u\n", s.gets);
2148 seq_printf(seq, "backtracks = %u\n", s.backtrack);
2149 seq_printf(seq, "semantic match passed = %u\n",
2150 s.semantic_match_passed);
2151 seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2152 seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2153 seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
2155 #endif /* CONFIG_IP_FIB_TRIE_STATS */
/*
 * fib_table_print - print a human-readable heading for table @tb
 * ("Local:"/"Main:" for the well-known ids, numeric id otherwise).
 */
2157 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2159 if (tb->tb_id == RT_TABLE_LOCAL)
2160 seq_puts(seq, "Local:\n");
2161 else if (tb->tb_id == RT_TABLE_MAIN)
2162 seq_puts(seq, "Main:\n");
2164 seq_printf(seq, "Id %d:\n", tb->tb_id);
/*
 * fib_triestat_seq_show - /proc/net/fib_triestats: per-table structural
 * stats plus (when enabled) usage counters, for every table in @net.
 */
2168 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2170 struct net *net = (struct net *)seq->private;
2174 "Basic info: size of leaf:"
2175 " %Zd bytes, size of tnode: %Zd bytes.\n",
2176 sizeof(struct leaf), sizeof(struct tnode));
/* Visit every hash bucket of the per-netns table list. */
2178 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2179 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2180 struct fib_table *tb;
2182 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2183 struct trie *t = (struct trie *) tb->tb_data;
2184 struct trie_stat stat;
2189 fib_table_print(seq, tb);
2191 trie_collect_stats(t, &stat);
2192 trie_show_stats(seq, &stat);
2193 #ifdef CONFIG_IP_FIB_TRIE_STATS
2194 trie_show_usage(seq, t->stats);
/* seq_file open hook: single-shot, netns-aware show function. */
2202 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2204 return single_open_net(inode, file, fib_triestat_seq_show);
2207 static const struct file_operations fib_triestat_fops = {
2208 .owner = THIS_MODULE,
2209 .open = fib_triestat_seq_open,
2211 .llseek = seq_lseek,
2212 .release = single_release_net,
/*
 * fib_trie_get_idx - seek the depth-first walk to position @pos across
 * all tables in the netns; the countdown against pos and the NULL
 * return are elided in this extract.
 */
2215 static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2217 struct fib_trie_iter *iter = seq->private;
2218 struct net *net = seq_file_net(seq);
2222 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2223 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2224 struct fib_table *tb;
2226 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2227 struct rt_trie_node *n;
2229 for (n = fib_trie_get_first(iter,
2230 (struct trie *) tb->tb_data);
2231 n; n = fib_trie_get_next(iter))
/* seq_file start: the rcu_read_lock() taken here is elided in this
 * extract; pairs with fib_trie_seq_stop(). */
2242 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2246 return fib_trie_get_idx(seq, *pos);
/*
 * fib_trie_seq_next - advance to the next node, crossing table and
 * hash-bucket boundaries when the current trie is exhausted.
 */
2249 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2251 struct fib_trie_iter *iter = seq->private;
2252 struct net *net = seq_file_net(seq);
2253 struct fib_table *tb = iter->tb;
2254 struct hlist_node *tb_node;
2256 struct rt_trie_node *n;
2259 /* next node in same table */
2260 n = fib_trie_get_next(iter);
2264 /* walk rest of this hash chain */
2265 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2266 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2267 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2268 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2273 /* new hash chain */
2274 while (++h < FIB_TABLE_HASHSZ) {
2275 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2276 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2277 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
/* seq_file stop: releases the RCU read lock (body elided here). */
2289 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
/* Emit @n levels of indentation for the tree display (body elided). */
2295 static void seq_indent(struct seq_file *seq, int n)
/*
 * rtn_scope - symbolic name for a route scope, falling back to a
 * formatted "scope=N" string in @buf for unknown values.
 */
2301 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2304 case RT_SCOPE_UNIVERSE: return "universe";
2305 case RT_SCOPE_SITE: return "site";
2306 case RT_SCOPE_LINK: return "link";
2307 case RT_SCOPE_HOST: return "host";
2308 case RT_SCOPE_NOWHERE: return "nowhere";
2310 snprintf(buf, len, "scope=%d", s);
/* Human-readable names for the RTN_* route types, indexed by type. */
2315 static const char *const rtn_type_names[__RTN_MAX] = {
2316 [RTN_UNSPEC] = "UNSPEC",
2317 [RTN_UNICAST] = "UNICAST",
2318 [RTN_LOCAL] = "LOCAL",
2319 [RTN_BROADCAST] = "BROADCAST",
2320 [RTN_ANYCAST] = "ANYCAST",
2321 [RTN_MULTICAST] = "MULTICAST",
2322 [RTN_BLACKHOLE] = "BLACKHOLE",
2323 [RTN_UNREACHABLE] = "UNREACHABLE",
2324 [RTN_PROHIBIT] = "PROHIBIT",
2325 [RTN_THROW] = "THROW",
2327 [RTN_XRESOLVE] = "XRESOLVE",
/*
 * rtn_type - symbolic name for route type @t, or a formatted "type N"
 * string in @buf for out-of-range/unnamed values (return of buf elided
 * in this extract).
 */
2330 static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2332 if (t < __RTN_MAX && rtn_type_names[t])
2333 return rtn_type_names[t];
2334 snprintf(buf, len, "type %u", t);
2338 /* Pretty print the trie */
/*
 * fib_trie_seq_show - render one node of /proc/net/fib_trie: internal
 * tnodes as "+--" lines with pos/bits/child counts, leaves as "|--"
 * lines followed by one line per prefix/alias.
 */
2339 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2341 const struct fib_trie_iter *iter = seq->private;
2342 struct rt_trie_node *n = v;
/* Root node: precede it with the table heading. */
2344 if (!node_parent_rcu(n))
2345 fib_table_print(seq, iter->tb);
2348 struct tnode *tn = (struct tnode *) n;
/* Keys are stored host-order; convert for %pI4 display. */
2349 __be32 prf = htonl(tn->key);
2351 seq_indent(seq, iter->depth-1);
2352 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2353 &prf, tn->pos, tn->bits, tn->full_children,
2354 tn->empty_children);
2357 struct leaf *l = (struct leaf *) n;
2358 struct leaf_info *li;
2359 __be32 val = htonl(l->key);
2361 seq_indent(seq, iter->depth);
2362 seq_printf(seq, " |-- %pI4\n", &val);
2364 hlist_for_each_entry_rcu(li, &l->list, hlist) {
2365 struct fib_alias *fa;
2367 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
/* Scratch space for rtn_scope()/rtn_type() fallback strings. */
2368 char buf1[32], buf2[32];
2370 seq_indent(seq, iter->depth+1);
2371 seq_printf(seq, " /%d %s %s", li->plen,
2372 rtn_scope(buf1, sizeof(buf1),
2373 fa->fa_info->fib_scope),
2374 rtn_type(buf2, sizeof(buf2),
2377 seq_printf(seq, " tos=%d", fa->fa_tos);
2378 seq_putc(seq, '\n');
2386 static const struct seq_operations fib_trie_seq_ops = {
2387 .start = fib_trie_seq_start,
2388 .next = fib_trie_seq_next,
2389 .stop = fib_trie_seq_stop,
2390 .show = fib_trie_seq_show,
/* Open hook: netns-aware seq_file with a fib_trie_iter as private data. */
2393 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2395 return seq_open_net(inode, file, &fib_trie_seq_ops,
2396 sizeof(struct fib_trie_iter));
2399 static const struct file_operations fib_trie_fops = {
2400 .owner = THIS_MODULE,
2401 .open = fib_trie_seq_open,
2403 .llseek = seq_lseek,
2404 .release = seq_release_net,
/* Iterator for /proc/net/route: walks only the MAIN table's trie, and
 * (fields elided here) caches pos/key of the last leaf found so a
 * sequential read can resume without rescanning from the start. */
2407 struct fib_route_iter {
2408 struct seq_net_private p;
2409 struct trie *main_trie;
/*
 * fib_route_get_idx - return the leaf at position @pos, preferring the
 * cached position from the previous call over a linear walk.
 */
2414 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2416 struct leaf *l = NULL;
2417 struct trie *t = iter->main_trie;
2419 /* use cache location of last found key */
2420 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2424 l = trie_firstleaf(t);
2427 while (l && pos-- > 0) {
2429 l = trie_nextleaf(l);
/* Cache the found key, or reset the cache on a miss. */
2433 iter->key = pos; /* remember it */
2435 iter->pos = 0; /* forget it */
/* seq_file start: look up the MAIN table, emit SEQ_START_TOKEN for the
 * header line at *pos == 0, else seek to *pos - 1.  NOTE(review): the
 * rcu_read_lock() and missing-table handling are elided here. */
2443 struct fib_route_iter *iter = seq->private;
2444 struct fib_table *tb;
2447 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2451 iter->main_trie = (struct trie *) tb->tb_data;
2453 return SEQ_START_TOKEN;
2455 return fib_route_get_idx(iter, *pos - 1);
/* seq_file next: first leaf after the header token, successor leaf
 * otherwise (cache updates elided in this extract). */
2458 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2460 struct fib_route_iter *iter = seq->private;
2464 if (v == SEQ_START_TOKEN) {
2466 l = trie_firstleaf(iter->main_trie);
2469 l = trie_nextleaf(l);
/* seq_file stop: drops the RCU read lock (body elided here). */
2479 static void fib_route_seq_stop(struct seq_file *seq, void *v)
/*
 * fib_flag_trans - translate route type/mask/nexthop into the RTF_*
 * flag bits shown in /proc/net/route (RTF_UP/RTF_HOST handling and the
 * return are elided in this extract).
 */
2485 static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2487 unsigned int flags = 0;
2489 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2491 if (fi && fi->fib_nh->nh_gw)
2492 flags |= RTF_GATEWAY;
/* /32 host route check. */
2493 if (mask == htonl(0xFFFFFFFF))
2500 * This outputs /proc/net/route.
2501 * The format of the file is not supposed to be changed
2502 * and needs to be same as fib_hash output to avoid breaking
/*
 * fib_route_seq_show - print one leaf of the MAIN table as one or more
 * fixed-width /proc/net/route lines (one per unicast alias), or the
 * column-header line for SEQ_START_TOKEN.
 */
2505 static int fib_route_seq_show(struct seq_file *seq, void *v)
2508 struct leaf_info *li;
2510 if (v == SEQ_START_TOKEN) {
2511 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2512 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2517 hlist_for_each_entry_rcu(li, &l->list, hlist) {
2518 struct fib_alias *fa;
2519 __be32 mask, prefix;
2521 mask = inet_make_mask(li->plen);
2522 prefix = htonl(l->key);
2524 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2525 const struct fib_info *fi = fa->fa_info;
2526 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
/* Broadcast/multicast entries are not part of this legacy format. */
2528 if (fa->fa_type == RTN_BROADCAST
2529 || fa->fa_type == RTN_MULTICAST)
/* Legacy ABI: every record is padded to exactly 127 columns. */
2532 seq_setwidth(seq, 127);
2536 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2537 "%d\t%08X\t%d\t%u\t%u",
2538 fi->fib_dev ? fi->fib_dev->name : "*",
2540 fi->fib_nh->nh_gw, flags, 0, 0,
/* Historical quirk: MTU column reports advmss + 40 when set. */
2544 fi->fib_advmss + 40 : 0),
/* No fib_info (e.g. blackhole-style entries): print placeholders. */
2549 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2550 "%d\t%08X\t%d\t%u\t%u",
2551 prefix, 0, flags, 0, 0, 0,
2561 static const struct seq_operations fib_route_seq_ops = {
2562 .start = fib_route_seq_start,
2563 .next = fib_route_seq_next,
2564 .stop = fib_route_seq_stop,
2565 .show = fib_route_seq_show,
/* Open hook: netns-aware seq_file with a fib_route_iter as private. */
2568 static int fib_route_seq_open(struct inode *inode, struct file *file)
2570 return seq_open_net(inode, file, &fib_route_seq_ops,
2571 sizeof(struct fib_route_iter))
2574 static const struct file_operations fib_route_fops = {
2575 .owner = THIS_MODULE,
2576 .open = fib_route_seq_open,
2578 .llseek = seq_lseek,
2579 .release = seq_release_net,
/*
 * fib_proc_init - per-netns creation of the three proc entries, with
 * goto-style unwind: each later failure removes the entries created
 * before it (the goto labels themselves are elided in this extract).
 */
2582 int __net_init fib_proc_init(struct net *net)
2584 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2587 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2588 &fib_triestat_fops))
2591 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
/* Unwind path: remove in reverse order of creation. */
2597 remove_proc_entry("fib_triestat", net->proc_net);
2599 remove_proc_entry("fib_trie", net->proc_net);
/*
 * fib_proc_exit - per-netns teardown mirror of fib_proc_init().
 */
2604 void __net_exit fib_proc_exit(struct net *net)
2606 remove_proc_entry("fib_trie", net->proc_net);
2607 remove_proc_entry("fib_triestat", net->proc_net);
2608 remove_proc_entry("route", net->proc_net);
2611 #endif /* CONFIG_PROC_FS */