2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally described in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
26 * Code from fib_hash has been reused which includes the following header:
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
33 * IPv4 FIB: lookup engine and maintenance routines.
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
43 * Substantial contributions to this work comes from:
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
51 #define VERSION "0.409"
53 #include <asm/uaccess.h>
54 #include <linux/bitops.h>
55 #include <linux/types.h>
56 #include <linux/kernel.h>
58 #include <linux/string.h>
59 #include <linux/socket.h>
60 #include <linux/sockios.h>
61 #include <linux/errno.h>
63 #include <linux/inet.h>
64 #include <linux/inetdevice.h>
65 #include <linux/netdevice.h>
66 #include <linux/if_arp.h>
67 #include <linux/proc_fs.h>
68 #include <linux/rcupdate.h>
69 #include <linux/skbuff.h>
70 #include <linux/netlink.h>
71 #include <linux/init.h>
72 #include <linux/list.h>
73 #include <linux/slab.h>
74 #include <linux/export.h>
75 #include <net/net_namespace.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
/* Depth bound for the per-trie statistics histogram (nodesizes[]). */
84 #define MAX_STAT_DEPTH 32
/* Key width in bits; t_key is a 32-bit unsigned, so KEYLENGTH == 32. */
86 #define KEYLENGTH (8*sizeof(t_key))
88 typedef unsigned int t_key;
/* bits > 0 marks an internal node (tnode); bits == 0 marks a leaf. */
90 #define IS_TNODE(n) ((n)->bits)
91 #define IS_LEAF(n) (!(n)->bits)
/* Number of key bits below node _kv's child-index field. */
93 #define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
/* Child index of _key within node _kv: the XOR cancels the shared prefix,
 * the shift discards all bits below the index field. */
94 #define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
98 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
99 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
100 struct tnode __rcu *parent;
103 /* The fields in this struct are valid if bits > 0 (TNODE) */
105 unsigned int full_children; /* KEYLENGTH bits needed */
106 unsigned int empty_children; /* KEYLENGTH bits needed */
107 struct tnode __rcu *child[0];
109 /* This list pointer if valid if bits == 0 (LEAF) */
110 struct hlist_head list;
115 struct hlist_node hlist;
117 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
118 struct list_head falh;
122 #ifdef CONFIG_IP_FIB_TRIE_STATS
123 struct trie_use_stats {
125 unsigned int backtrack;
126 unsigned int semantic_match_passed;
127 unsigned int semantic_match_miss;
128 unsigned int null_node_hit;
129 unsigned int resize_node_skipped;
134 unsigned int totdepth;
135 unsigned int maxdepth;
138 unsigned int nullpointers;
139 unsigned int prefixes;
140 unsigned int nodesizes[MAX_STAT_DEPTH];
144 struct tnode __rcu *trie;
145 #ifdef CONFIG_IP_FIB_TRIE_STATS
146 struct trie_use_stats __percpu *stats;
150 static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
152 static struct tnode *resize(struct trie *t, struct tnode *tn);
153 static struct tnode *inflate(struct trie *t, struct tnode *tn);
154 static struct tnode *halve(struct trie *t, struct tnode *tn);
155 /* tnodes to free after resize(); protected by RTNL */
156 static struct callback_head *tnode_free_head;
157 static size_t tnode_free_size;
160 * synchronize_rcu after call_rcu for that many pages; it should be especially
161 * useful before resizing the root node with PREEMPT_NONE configs; the value was
162 * obtained experimentally, aiming to avoid visible slowdown.
164 static const int sync_pages = 128;
166 static struct kmem_cache *fn_alias_kmem __read_mostly;
167 static struct kmem_cache *trie_leaf_kmem __read_mostly;
169 /* caller must hold RTNL */
170 #define node_parent(n) rtnl_dereference((n)->parent)
172 /* caller must hold RCU read lock or RTNL */
173 #define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
175 /* wrapper for rcu_assign_pointer */
/* Publishes tp as n's parent with the release semantics RCU readers need.
 * NOTE(review): the brace lines of this inline are missing from this dump. */
176 static inline void node_set_parent(struct tnode *n, struct tnode *tp)
179 rcu_assign_pointer(n->parent, tp);
/* Initializer for a node not yet visible to readers: no memory barrier,
 * so only valid before the node is published into the trie. */
182 #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
184 /* This provides us with the number of children in this node, in the case of a
185 * leaf this will return 0 meaning none of the children are accessible.
/* For a leaf, bits == 0, so (1 << 0) & ~1ul == 0; for a tnode with
 * bits >= 1 the & ~1ul is a no-op and this is simply 2^bits. */
187 static inline int tnode_child_length(const struct tnode *tn)
189 return (1ul << tn->bits) & ~(1ul);
193 * caller must hold RTNL
/* Fetch child i of tn under RTNL protection; out-of-range i is a hard bug. */
195 static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
197 BUG_ON(i >= tnode_child_length(tn));
199 return rtnl_dereference(tn->child[i]);
203 * caller must hold RCU read lock or RTNL
/* Same as tnode_get_child() but usable from the RCU read side. */
205 static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
207 BUG_ON(i >= tnode_child_length(tn));
209 return rcu_dereference_rtnl(tn->child[i]);
/* Keep the top l bits of k, zeroing the rest.  The l == 0 special case
 * avoids the undefined full-width shift (KEYLENGTH - 0 == 32). */
212 static inline t_key mask_pfx(t_key k, unsigned int l)
214 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
/* Extract 'bits' bits of a, MSB-first, starting at bit 'offset'.
 * NOTE(review): the fallback branch for offset >= KEYLENGTH (returning 0)
 * is missing from this dump — confirm against the upstream file. */
217 static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
219 if (offset < KEYLENGTH)
220 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
/* Full-key equality test.  NOTE(review): the body (return a == b;) is
 * missing from this dump. */
225 static inline int tkey_equals(t_key a, t_key b)
/* Compare 'bits' bits of a and b starting at 'offset'; the shifts strip
 * everything outside the compared window before testing for zero. */
230 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
232 if (bits == 0 || offset >= KEYLENGTH)
234 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
235 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
/* Position of the first differing bit of a and b at or after 'offset'.
 * NOTE(review): the diff/i setup lines are missing from this dump; the
 * visible loop scans the XOR of the keys MSB-first. */
238 static inline int tkey_mismatch(t_key a, int offset, t_key b)
245 while ((diff << i) >> (KEYLENGTH-1) == 0)
251 To understand this stuff, an understanding of keys and all their bits is
252 necessary. Every node in the trie has a key associated with it, but not
253 all of the bits in that key are significant.
255 Consider a node 'n' and its parent 'tp'.
257 If n is a leaf, every bit in its key is significant. Its presence is
258 necessitated by path compression, since during a tree traversal (when
259 searching for a leaf - unless we are doing an insertion) we will completely
260 ignore all skipped bits we encounter. Thus we need to verify, at the end of
261 a potentially successful search, that we have indeed been walking the
264 Note that we can never "miss" the correct key in the tree if present by
265 following the wrong path. Path compression ensures that segments of the key
266 that are the same for all keys with a given prefix are skipped, but the
267 skipped part *is* identical for each node in the subtrie below the skipped
268 bit! trie_insert() in this implementation takes care of that - note the
269 call to tkey_sub_equals() in trie_insert().
271 if n is an internal node - a 'tnode' here, the various parts of its key
272 have many different meanings.
275 _________________________________________________________________
276 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
277 -----------------------------------------------------------------
278 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
280 _________________________________________________________________
281 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
282 -----------------------------------------------------------------
283 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
290 First, let's just ignore the bits that come before the parent tp, that is
291 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
292 not use them for anything.
294 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
295 index into the parent's child array. That is, they will be used to find
296 'n' among tp's children.
298 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
301 All the bits we have seen so far are significant to the node n. The rest
302 of the bits are really not needed or indeed known in n->key.
304 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
305 n's child array, and will of course be different for each child.
308 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
313 static const int halve_threshold = 25;
314 static const int inflate_threshold = 50;
315 static const int halve_threshold_root = 15;
316 static const int inflate_threshold_root = 30;
/* RCU callback: return a fib_alias to its slab cache once readers are done. */
318 static void __alias_free_mem(struct rcu_head *head)
320 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
321 kmem_cache_free(fn_alias_kmem, fa);
/* Defer freeing of fa until after a grace period. */
324 static inline void alias_free_mem_rcu(struct fib_alias *fa)
326 call_rcu(&fa->rcu, __alias_free_mem);
/* Largest tn->bits for which the child array still fits in one page and
 * can therefore come from kmalloc rather than vmalloc. */
329 #define TNODE_KMALLOC_MAX \
330 ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
332 static void __node_free_rcu(struct rcu_head *head)
334 struct tnode *n = container_of(head, struct tnode, rcu);
337 kmem_cache_free(trie_leaf_kmem, n);
338 else if (n->bits <= TNODE_KMALLOC_MAX)
344 #define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
346 static inline void free_leaf_info(struct leaf_info *leaf)
348 kfree_rcu(leaf, rcu);
/* Allocate a zeroed tnode of 'size' bytes: small nodes from kmalloc,
 * anything larger than a page from vmalloc.  node_free()/__node_free_rcu
 * must mirror this split when releasing. */
351 static struct tnode *tnode_alloc(size_t size)
353 if (size <= PAGE_SIZE)
354 return kzalloc(size, GFP_KERNEL);
356 return vzalloc(size);
359 static void tnode_free_safe(struct tnode *tn)
362 tn->rcu.next = tnode_free_head;
363 tnode_free_head = &tn->rcu;
366 static void tnode_free_flush(void)
368 struct callback_head *head;
370 while ((head = tnode_free_head)) {
371 struct tnode *tn = container_of(head, struct tnode, rcu);
373 tnode_free_head = head->next;
374 tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
379 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
385 static struct tnode *leaf_new(t_key key)
387 struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
390 /* set key and pos to reflect full key value
391 * any trailing zeros in the key should be ignored
392 * as the nodes are searched
396 /* set bits to 0 indicating we are not a tnode */
399 INIT_HLIST_HEAD(&l->list);
/* Allocate a leaf_info for prefix length plen, precomputing the host-order
 * netmask used by the lookup fast path (check_leaf()).
 * NOTE(review): the NULL-check guarding the initialization and the
 * li->plen assignment are missing from this dump. */
404 static struct leaf_info *leaf_info_new(int plen)
406 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
409 li->mask_plen = ntohl(inet_make_mask(plen));
410 INIT_LIST_HEAD(&li->falh);
415 static struct tnode *tnode_new(t_key key, int pos, int bits)
417 size_t sz = offsetof(struct tnode, child[1 << bits]);
418 struct tnode *tn = tnode_alloc(sz);
419 unsigned int shift = pos + bits;
421 /* verify bits and pos their msb bits clear and values are valid */
422 BUG_ON(!bits || (shift > KEYLENGTH));
428 tn->key = mask_pfx(key, pos);
429 tn->full_children = 0;
430 tn->empty_children = 1<<bits;
433 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
434 sizeof(struct tnode *) << bits);
439 * Check whether a tnode 'n' is "full", i.e. it is an internal node
440 * and no bits are skipped. See discussion in dyntree paper p. 6
/* "Full" means n's index field starts exactly where tn's ends, i.e. no
 * path-compressed (skipped) bits lie between parent and child. */
443 static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
445 return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
/* Convenience wrapper: store child n at slot i, letting
 * tnode_put_child_reorg() recompute wasfull itself (-1). */
448 static inline void put_child(struct tnode *tn, int i,
451 tnode_put_child_reorg(tn, i, n, -1);
455 * Add a child at position i overwriting the old value.
456 * Update the value of full_children and empty_children.
459 static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
462 struct tnode *chi = rtnl_dereference(tn->child[i]);
465 BUG_ON(i >= 1<<tn->bits);
467 /* update emptyChildren */
468 if (n == NULL && chi != NULL)
469 tn->empty_children++;
470 else if (n != NULL && chi == NULL)
471 tn->empty_children--;
473 /* update fullChildren */
475 wasfull = tnode_full(tn, chi);
477 isfull = tnode_full(tn, n);
478 if (wasfull && !isfull)
480 else if (!wasfull && isfull)
483 node_set_parent(n, tn);
485 rcu_assign_pointer(tn->child[i], n);
489 static struct tnode *resize(struct trie *t, struct tnode *tn)
491 struct tnode *old_tn, *n = NULL;
492 int inflate_threshold_use;
493 int halve_threshold_use;
499 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
500 tn, inflate_threshold, halve_threshold);
503 if (tn->empty_children > (tnode_child_length(tn) - 1))
507 if (tn->empty_children == (tnode_child_length(tn) - 1))
510 * Double as long as the resulting node has a number of
511 * nonempty nodes that are above the threshold.
515 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
516 * the Helsinki University of Technology and Matti Tikkanen of Nokia
517 * Telecommunications, page 6:
518 * "A node is doubled if the ratio of non-empty children to all
519 * children in the *doubled* node is at least 'high'."
521 * 'high' in this instance is the variable 'inflate_threshold'. It
522 * is expressed as a percentage, so we multiply it with
523 * tnode_child_length() and instead of multiplying by 2 (since the
524 * child array will be doubled by inflate()) and multiplying
525 * the left-hand side by 100 (to handle the percentage thing) we
526 * multiply the left-hand side by 50.
528 * The left-hand side may look a bit weird: tnode_child_length(tn)
529 * - tn->empty_children is of course the number of non-null children
530 * in the current node. tn->full_children is the number of "full"
531 * children, that is non-null tnodes with a skip value of 0.
532 * All of those will be doubled in the resulting inflated tnode, so
533 * we just count them one extra time here.
535 * A clearer way to write this would be:
537 * to_be_doubled = tn->full_children;
538 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
541 * new_child_length = tnode_child_length(tn) * 2;
543 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
545 * if (new_fill_factor >= inflate_threshold)
547 * ...and so on, tho it would mess up the while () loop.
550 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
554 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
555 * inflate_threshold * new_child_length
557 * expand not_to_be_doubled and to_be_doubled, and shorten:
558 * 100 * (tnode_child_length(tn) - tn->empty_children +
559 * tn->full_children) >= inflate_threshold * new_child_length
561 * expand new_child_length:
562 * 100 * (tnode_child_length(tn) - tn->empty_children +
563 * tn->full_children) >=
564 * inflate_threshold * tnode_child_length(tn) * 2
567 * 50 * (tn->full_children + tnode_child_length(tn) -
568 * tn->empty_children) >= inflate_threshold *
569 * tnode_child_length(tn)
573 /* Keep root node larger */
575 if (!node_parent(tn)) {
576 inflate_threshold_use = inflate_threshold_root;
577 halve_threshold_use = halve_threshold_root;
579 inflate_threshold_use = inflate_threshold;
580 halve_threshold_use = halve_threshold;
584 while ((tn->full_children > 0 && max_work-- &&
585 50 * (tn->full_children + tnode_child_length(tn)
586 - tn->empty_children)
587 >= inflate_threshold_use * tnode_child_length(tn))) {
594 #ifdef CONFIG_IP_FIB_TRIE_STATS
595 this_cpu_inc(t->stats->resize_node_skipped);
601 /* Return if at least one inflate is run */
602 if (max_work != MAX_WORK)
606 * Halve as long as the number of empty children in this
607 * node is above threshold.
611 while (tn->bits > 1 && max_work-- &&
612 100 * (tnode_child_length(tn) - tn->empty_children) <
613 halve_threshold_use * tnode_child_length(tn)) {
619 #ifdef CONFIG_IP_FIB_TRIE_STATS
620 this_cpu_inc(t->stats->resize_node_skipped);
627 /* Only one child remains */
628 if (tn->empty_children == (tnode_child_length(tn) - 1)) {
631 for (i = tnode_child_length(tn); !n && i;)
632 n = tnode_get_child(tn, --i);
634 /* compress one level */
635 node_set_parent(n, NULL);
643 static void tnode_clean_free(struct tnode *tn)
645 struct tnode *tofree;
648 for (i = 0; i < tnode_child_length(tn); i++) {
649 tofree = rtnl_dereference(tn->child[i]);
656 static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
658 int olen = tnode_child_length(oldtnode);
662 pr_debug("In inflate\n");
664 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
667 return ERR_PTR(-ENOMEM);
670 * Preallocate and store tnodes before the actual work so we
671 * don't get into an inconsistent state if memory allocation
672 * fails. In case of failure we return the oldnode and inflate
673 * of tnode is ignored.
676 for (i = 0; i < olen; i++) {
679 inode = tnode_get_child(oldtnode, i);
680 if (tnode_full(oldtnode, inode) && inode->bits > 1) {
681 struct tnode *left, *right;
682 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
684 left = tnode_new(inode->key&(~m), inode->pos + 1,
689 right = tnode_new(inode->key|m, inode->pos + 1,
697 put_child(tn, 2*i, left);
698 put_child(tn, 2*i+1, right);
702 for (i = 0; i < olen; i++) {
703 struct tnode *inode = tnode_get_child(oldtnode, i);
704 struct tnode *left, *right;
711 /* A leaf or an internal node with skipped bits */
712 if (!tnode_full(oldtnode, inode)) {
714 tkey_extract_bits(inode->key, tn->pos, tn->bits),
719 /* An internal node with two children */
720 if (inode->bits == 1) {
721 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
722 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
724 tnode_free_safe(inode);
728 /* An internal node with more than two children */
730 /* We will replace this node 'inode' with two new
731 * ones, 'left' and 'right', each with half of the
732 * original children. The two new nodes will have
733 * a position one bit further down the key and this
734 * means that the "significant" part of their keys
735 * (see the discussion near the top of this file)
736 * will differ by one bit, which will be "0" in
737 * left's key and "1" in right's key. Since we are
738 * moving the key position by one step, the bit that
739 * we are moving away from - the bit at position
740 * (inode->pos) - is the one that will differ between
741 * left and right. So... we synthesize that bit in the
743 * The mask 'm' below will be a single "one" bit at
744 * the position (inode->pos)
747 /* Use the old key, but set the new significant
751 left = tnode_get_child(tn, 2*i);
752 put_child(tn, 2*i, NULL);
756 right = tnode_get_child(tn, 2*i+1);
757 put_child(tn, 2*i+1, NULL);
761 size = tnode_child_length(left);
762 for (j = 0; j < size; j++) {
763 put_child(left, j, rtnl_dereference(inode->child[j]));
764 put_child(right, j, rtnl_dereference(inode->child[j + size]));
766 put_child(tn, 2*i, resize(t, left));
767 put_child(tn, 2*i+1, resize(t, right));
769 tnode_free_safe(inode);
771 tnode_free_safe(oldtnode);
774 tnode_clean_free(tn);
775 return ERR_PTR(-ENOMEM);
778 static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
780 int olen = tnode_child_length(oldtnode);
781 struct tnode *tn, *left, *right;
784 pr_debug("In halve\n");
786 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
789 return ERR_PTR(-ENOMEM);
792 * Preallocate and store tnodes before the actual work so we
793 * don't get into an inconsistent state if memory allocation
794 * fails. In case of failure we return the oldnode and halve
795 * of tnode is ignored.
798 for (i = 0; i < olen; i += 2) {
799 left = tnode_get_child(oldtnode, i);
800 right = tnode_get_child(oldtnode, i+1);
802 /* Two nonempty children */
806 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
811 put_child(tn, i/2, newn);
816 for (i = 0; i < olen; i += 2) {
817 struct tnode *newBinNode;
819 left = tnode_get_child(oldtnode, i);
820 right = tnode_get_child(oldtnode, i+1);
822 /* At least one of the children is empty */
824 if (right == NULL) /* Both are empty */
826 put_child(tn, i/2, right);
831 put_child(tn, i/2, left);
835 /* Two nonempty children */
836 newBinNode = tnode_get_child(tn, i/2);
837 put_child(tn, i/2, NULL);
838 put_child(newBinNode, 0, left);
839 put_child(newBinNode, 1, right);
840 put_child(tn, i/2, resize(t, newBinNode));
842 tnode_free_safe(oldtnode);
845 tnode_clean_free(tn);
846 return ERR_PTR(-ENOMEM);
849 /* readside must use rcu_read_lock currently dump routines
850 via get_fa_head and dump */
852 static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
854 struct hlist_head *head = &l->list;
855 struct leaf_info *li;
857 hlist_for_each_entry_rcu(li, head, hlist)
858 if (li->plen == plen)
864 static inline struct list_head *get_fa_head(struct tnode *l, int plen)
866 struct leaf_info *li = find_leaf_info(l, plen);
874 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
876 struct leaf_info *li = NULL, *last = NULL;
878 if (hlist_empty(head)) {
879 hlist_add_head_rcu(&new->hlist, head);
881 hlist_for_each_entry(li, head, hlist) {
882 if (new->plen > li->plen)
888 hlist_add_behind_rcu(&new->hlist, &last->hlist);
890 hlist_add_before_rcu(&new->hlist, &li->hlist);
894 /* rcu_read_lock needs to be hold by caller from readside */
896 static struct tnode *fib_find_node(struct trie *t, u32 key)
898 struct tnode *n = rcu_dereference_rtnl(t->trie);
901 while (n && IS_TNODE(n)) {
902 if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
903 pos = n->pos + n->bits;
904 n = tnode_get_child_rcu(n,
905 tkey_extract_bits(key,
911 /* Case we have found a leaf. Compare prefixes */
913 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
919 static void trie_rebalance(struct trie *t, struct tnode *tn)
927 while (tn != NULL && (tp = node_parent(tn)) != NULL) {
928 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
929 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
932 tnode_put_child_reorg(tp, cindex, tn, wasfull);
934 tp = node_parent(tn);
936 rcu_assign_pointer(t->trie, tn);
944 /* Handle last (top) tnode */
948 rcu_assign_pointer(t->trie, tn);
952 /* only used from updater-side */
954 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
957 struct tnode *tp = NULL, *tn = NULL;
961 struct list_head *fa_head = NULL;
962 struct leaf_info *li;
966 n = rtnl_dereference(t->trie);
968 /* If we point to NULL, stop. Either the tree is empty and we should
969 * just put a new leaf in if, or we have reached an empty child slot,
970 * and we should just put our new leaf in that.
971 * If we point to a T_TNODE, check if it matches our key. Note that
972 * a T_TNODE might be skipping any number of bits - its 'pos' need
973 * not be the parent's 'pos'+'bits'!
975 * If it does match the current key, get pos/bits from it, extract
976 * the index from our key, push the T_TNODE and walk the tree.
978 * If it doesn't, we have to replace it with a new T_TNODE.
980 * If we point to a T_LEAF, it might or might not have the same key
981 * as we do. If it does, just change the value, update the T_LEAF's
982 * value, and return it.
983 * If it doesn't, we need to replace it with a T_TNODE.
986 while (n && IS_TNODE(n)) {
987 if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
989 pos = n->pos + n->bits;
990 n = tnode_get_child(n,
991 tkey_extract_bits(key,
995 BUG_ON(n && node_parent(n) != tp);
1001 * n ----> NULL, LEAF or TNODE
1003 * tp is n's (parent) ----> NULL or TNODE
1006 BUG_ON(tp && IS_LEAF(tp));
1008 /* Case 1: n is a leaf. Compare prefixes */
1010 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1011 li = leaf_info_new(plen);
1016 fa_head = &li->falh;
1017 insert_leaf_info(&n->list, li);
1025 li = leaf_info_new(plen);
1032 fa_head = &li->falh;
1033 insert_leaf_info(&l->list, li);
1035 if (t->trie && n == NULL) {
1036 /* Case 2: n is NULL, and will just insert a new leaf */
1038 node_set_parent(l, tp);
1040 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1041 put_child(tp, cindex, l);
1043 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1045 * Add a new tnode here
1046 * first tnode need some special handling
1050 pos = tp ? tp->pos+tp->bits : 0;
1051 newpos = tkey_mismatch(key, pos, n->key);
1052 tn = tnode_new(n->key, newpos, 1);
1055 tn = tnode_new(key, newpos, 1); /* First tnode */
1064 node_set_parent(tn, tp);
1066 missbit = tkey_extract_bits(key, newpos, 1);
1067 put_child(tn, missbit, l);
1068 put_child(tn, 1-missbit, n);
1071 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1072 put_child(tp, cindex, tn);
1074 rcu_assign_pointer(t->trie, tn);
1080 if (tp && tp->pos + tp->bits > 32)
1081 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1082 tp, tp->pos, tp->bits, key, plen);
1084 /* Rebalance the trie */
1086 trie_rebalance(t, tp);
1092 * Caller must hold RTNL.
1094 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1096 struct trie *t = (struct trie *) tb->tb_data;
1097 struct fib_alias *fa, *new_fa;
1098 struct list_head *fa_head = NULL;
1099 struct fib_info *fi;
1100 int plen = cfg->fc_dst_len;
1101 u8 tos = cfg->fc_tos;
1109 key = ntohl(cfg->fc_dst);
1111 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1113 mask = ntohl(inet_make_mask(plen));
1120 fi = fib_create_info(cfg);
1126 l = fib_find_node(t, key);
1130 fa_head = get_fa_head(l, plen);
1131 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1134 /* Now fa, if non-NULL, points to the first fib alias
1135 * with the same keys [prefix,tos,priority], if such key already
1136 * exists or to the node before which we will insert new one.
1138 * If fa is NULL, we will need to allocate a new one and
1139 * insert to the head of f.
1141 * If f is NULL, no fib node matched the destination key
1142 * and we need to allocate a new one of those as well.
1145 if (fa && fa->fa_tos == tos &&
1146 fa->fa_info->fib_priority == fi->fib_priority) {
1147 struct fib_alias *fa_first, *fa_match;
1150 if (cfg->fc_nlflags & NLM_F_EXCL)
1154 * 1. Find exact match for type, scope, fib_info to avoid
1156 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1160 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1161 list_for_each_entry_continue(fa, fa_head, fa_list) {
1162 if (fa->fa_tos != tos)
1164 if (fa->fa_info->fib_priority != fi->fib_priority)
1166 if (fa->fa_type == cfg->fc_type &&
1167 fa->fa_info == fi) {
1173 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1174 struct fib_info *fi_drop;
1184 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1188 fi_drop = fa->fa_info;
1189 new_fa->fa_tos = fa->fa_tos;
1190 new_fa->fa_info = fi;
1191 new_fa->fa_type = cfg->fc_type;
1192 state = fa->fa_state;
1193 new_fa->fa_state = state & ~FA_S_ACCESSED;
1195 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1196 alias_free_mem_rcu(fa);
1198 fib_release_info(fi_drop);
1199 if (state & FA_S_ACCESSED)
1200 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1201 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1202 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1206 /* Error if we find a perfect match which
1207 * uses the same scope, type, and nexthop
1213 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1217 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1221 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1225 new_fa->fa_info = fi;
1226 new_fa->fa_tos = tos;
1227 new_fa->fa_type = cfg->fc_type;
1228 new_fa->fa_state = 0;
1230 * Insert new entry to the list.
1234 fa_head = fib_insert_node(t, key, plen);
1235 if (unlikely(!fa_head)) {
1237 goto out_free_new_fa;
1242 tb->tb_num_default++;
1244 list_add_tail_rcu(&new_fa->fa_list,
1245 (fa ? &fa->fa_list : fa_head));
1247 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1248 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1249 &cfg->fc_nlinfo, 0);
1254 kmem_cache_free(fn_alias_kmem, new_fa);
1256 fib_release_info(fi);
1261 /* should be called with rcu_read_lock */
1262 static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
1263 t_key key, const struct flowi4 *flp,
1264 struct fib_result *res, int fib_flags)
1266 struct leaf_info *li;
1267 struct hlist_head *hhead = &l->list;
1269 hlist_for_each_entry_rcu(li, hhead, hlist) {
1270 struct fib_alias *fa;
1272 if (l->key != (key & li->mask_plen))
1275 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1276 struct fib_info *fi = fa->fa_info;
1279 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1283 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1285 fib_alias_accessed(fa);
1286 err = fib_props[fa->fa_type].error;
1287 if (unlikely(err < 0)) {
1288 #ifdef CONFIG_IP_FIB_TRIE_STATS
1289 this_cpu_inc(t->stats->semantic_match_passed);
1293 if (fi->fib_flags & RTNH_F_DEAD)
1295 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1296 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1298 if (nh->nh_flags & RTNH_F_DEAD)
1300 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1303 #ifdef CONFIG_IP_FIB_TRIE_STATS
1304 this_cpu_inc(t->stats->semantic_match_passed);
1306 res->prefixlen = li->plen;
1307 res->nh_sel = nhsel;
1308 res->type = fa->fa_type;
1309 res->scope = fi->fib_scope;
1312 res->fa_head = &li->falh;
1313 if (!(fib_flags & FIB_LOOKUP_NOREF))
1314 atomic_inc(&fi->fib_clntref);
1319 #ifdef CONFIG_IP_FIB_TRIE_STATS
1320 this_cpu_inc(t->stats->semantic_match_miss);
/* Nonzero iff key disagrees with n's prefix in a significant bit.
 * (prefix | -prefix) masks the lowest set bit of the prefix and every bit
 * above it — exactly the bits that are meaningful for this node — so any
 * surviving XOR bit is a genuine mismatch. */
1327 static inline t_key prefix_mismatch(t_key key, struct tnode *n)
1329 t_key prefix = n->key;
1331 return (key ^ prefix) & (prefix | -prefix);
1334 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1335 struct fib_result *res, int fib_flags)
1337 struct trie *t = (struct trie *)tb->tb_data;
1338 #ifdef CONFIG_IP_FIB_TRIE_STATS
1339 struct trie_use_stats __percpu *stats = t->stats;
1341 const t_key key = ntohl(flp->daddr);
1342 struct tnode *n, *pn;
1348 n = rcu_dereference(t->trie);
1352 #ifdef CONFIG_IP_FIB_TRIE_STATS
1353 this_cpu_inc(stats->gets);
1359 /* Step 1: Travel to the longest prefix match in the trie */
1361 unsigned long index = get_index(key, n);
1363 /* This bit of code is a bit tricky but it combines multiple
1364 * checks into a single check. The prefix consists of the
1365 * prefix plus zeros for the "bits" in the prefix. The index
1366 * is the difference between the key and this value. From
1367 * this we can actually derive several pieces of data.
1368 * if !(index >> bits)
1369 * we know the value is child index
1371 * we have a mismatch in skip bits and failed
1373 if (index >> n->bits)
1376 /* we have found a leaf. Prefixes have already been compared */
1380 /* only record pn and cindex if we are going to be chopping
1381 * bits later. Otherwise we are just wasting cycles.
1388 n = rcu_dereference(n->child[index]);
1393 /* Step 2: Sort out leaves and begin backtracing for longest prefix */
1395 /* record the pointer where our next node pointer is stored */
1396 struct tnode __rcu **cptr = n->child;
1398 /* This test verifies that none of the bits that differ
1399 * between the key and the prefix exist in the region of
1400 * the lsb and higher in the prefix.
1402 if (unlikely(prefix_mismatch(key, n)))
1405 /* exit out and process leaf */
1406 if (unlikely(IS_LEAF(n)))
1409 /* Don't bother recording parent info. Since we are in
1410 * prefix match mode we will have to come back to wherever
1411 * we started this traversal anyway
1414 while ((n = rcu_dereference(*cptr)) == NULL) {
1416 #ifdef CONFIG_IP_FIB_TRIE_STATS
1418 this_cpu_inc(stats->null_node_hit);
1420 /* If we are at cindex 0 there are no more bits for
1421 * us to strip at this level so we must ascend back
1422 * up one level to see if there are any more bits to
1423 * be stripped there.
1426 t_key pkey = pn->key;
1428 pn = node_parent_rcu(pn);
1431 #ifdef CONFIG_IP_FIB_TRIE_STATS
1432 this_cpu_inc(stats->backtrack);
1434 /* Get Child's index */
1435 cindex = get_index(pkey, pn);
1438 /* strip the least significant bit from the cindex */
1439 cindex &= cindex - 1;
1441 /* grab pointer for next child node */
1442 cptr = &pn->child[cindex];
1447 /* Step 3: Process the leaf, if that fails fall back to backtracing */
1448 ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
1449 if (unlikely(ret > 0))
1455 EXPORT_SYMBOL_GPL(fib_table_lookup);
1458 * Remove the leaf and return parent.
1460 static void trie_leaf_remove(struct trie *t, struct tnode *l)
1462 struct tnode *tp = node_parent(l);
1464 pr_debug("entering trie_leaf_remove(%p)\n", l);
1467 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1468 put_child(tp, cindex, NULL);
1469 trie_rebalance(t, tp);
1471 RCU_INIT_POINTER(t->trie, NULL);
1477 * Caller must hold RTNL.
1479 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1481 struct trie *t = (struct trie *) tb->tb_data;
1483 int plen = cfg->fc_dst_len;
1484 u8 tos = cfg->fc_tos;
1485 struct fib_alias *fa, *fa_to_delete;
1486 struct list_head *fa_head;
1488 struct leaf_info *li;
1493 key = ntohl(cfg->fc_dst);
1494 mask = ntohl(inet_make_mask(plen));
1500 l = fib_find_node(t, key);
1505 li = find_leaf_info(l, plen);
1510 fa_head = &li->falh;
1511 fa = fib_find_alias(fa_head, tos, 0);
1516 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1518 fa_to_delete = NULL;
1519 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1520 list_for_each_entry_continue(fa, fa_head, fa_list) {
1521 struct fib_info *fi = fa->fa_info;
1523 if (fa->fa_tos != tos)
1526 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1527 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1528 fa->fa_info->fib_scope == cfg->fc_scope) &&
1529 (!cfg->fc_prefsrc ||
1530 fi->fib_prefsrc == cfg->fc_prefsrc) &&
1531 (!cfg->fc_protocol ||
1532 fi->fib_protocol == cfg->fc_protocol) &&
1533 fib_nh_match(cfg, fi) == 0) {
1543 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1544 &cfg->fc_nlinfo, 0);
1546 list_del_rcu(&fa->fa_list);
1549 tb->tb_num_default--;
1551 if (list_empty(fa_head)) {
1552 hlist_del_rcu(&li->hlist);
1556 if (hlist_empty(&l->list))
1557 trie_leaf_remove(t, l);
1559 if (fa->fa_state & FA_S_ACCESSED)
1560 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1562 fib_release_info(fa->fa_info);
1563 alias_free_mem_rcu(fa);
1567 static int trie_flush_list(struct list_head *head)
1569 struct fib_alias *fa, *fa_node;
1572 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1573 struct fib_info *fi = fa->fa_info;
1575 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1576 list_del_rcu(&fa->fa_list);
1577 fib_release_info(fa->fa_info);
1578 alias_free_mem_rcu(fa);
1585 static int trie_flush_leaf(struct tnode *l)
1588 struct hlist_head *lih = &l->list;
1589 struct hlist_node *tmp;
1590 struct leaf_info *li = NULL;
1592 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1593 found += trie_flush_list(&li->falh);
1595 if (list_empty(&li->falh)) {
1596 hlist_del_rcu(&li->hlist);
1604 * Scan for the next right leaf starting at node p->child[idx]
1605 * Since we have back pointer, no recursion necessary.
1607 static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1613 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1617 while (idx < 1u << p->bits) {
1618 c = tnode_get_child_rcu(p, idx++);
1625 /* Rescan start scanning in new node */
1630 /* Node empty, walk back up to parent */
1632 } while ((p = node_parent_rcu(c)) != NULL);
1634 return NULL; /* Root of trie */
1637 static struct tnode *trie_firstleaf(struct trie *t)
1639 struct tnode *n = rcu_dereference_rtnl(t->trie);
1644 if (IS_LEAF(n)) /* trie is just a leaf */
1647 return leaf_walk_rcu(n, NULL);
1650 static struct tnode *trie_nextleaf(struct tnode *l)
1652 struct tnode *p = node_parent_rcu(l);
1655 return NULL; /* trie with just one leaf */
1657 return leaf_walk_rcu(p, l);
1660 static struct tnode *trie_leafindex(struct trie *t, int index)
1662 struct tnode *l = trie_firstleaf(t);
1664 while (l && index-- > 0)
1665 l = trie_nextleaf(l);
1672 * Caller must hold RTNL.
1674 int fib_table_flush(struct fib_table *tb)
1676 struct trie *t = (struct trie *) tb->tb_data;
1677 struct tnode *l, *ll = NULL;
1680 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1681 found += trie_flush_leaf(l);
1683 if (ll && hlist_empty(&ll->list))
1684 trie_leaf_remove(t, ll);
1688 if (ll && hlist_empty(&ll->list))
1689 trie_leaf_remove(t, ll);
1691 pr_debug("trie_flush found=%d\n", found);
1695 void fib_free_table(struct fib_table *tb)
1697 #ifdef CONFIG_IP_FIB_TRIE_STATS
1698 struct trie *t = (struct trie *)tb->tb_data;
1700 free_percpu(t->stats);
1701 #endif /* CONFIG_IP_FIB_TRIE_STATS */
1705 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1706 struct fib_table *tb,
1707 struct sk_buff *skb, struct netlink_callback *cb)
1710 struct fib_alias *fa;
1711 __be32 xkey = htonl(key);
1716 /* rcu_read_lock is hold by caller */
1718 list_for_each_entry_rcu(fa, fah, fa_list) {
1724 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1732 fa->fa_info, NLM_F_MULTI) < 0) {
1742 static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1743 struct sk_buff *skb, struct netlink_callback *cb)
1745 struct leaf_info *li;
1751 /* rcu_read_lock is hold by caller */
1752 hlist_for_each_entry_rcu(li, &l->list, hlist) {
1761 if (list_empty(&li->falh))
1764 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1775 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1776 struct netlink_callback *cb)
1779 struct trie *t = (struct trie *) tb->tb_data;
1780 t_key key = cb->args[2];
1781 int count = cb->args[3];
1784 /* Dump starting at last key.
1785 * Note: 0.0.0.0/0 (ie default) is first key.
1788 l = trie_firstleaf(t);
1790 /* Normally, continue from last key, but if that is missing
1791 * fallback to using slow rescan
1793 l = fib_find_node(t, key);
1795 l = trie_leafindex(t, count);
1799 cb->args[2] = l->key;
1800 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1801 cb->args[3] = count;
1807 l = trie_nextleaf(l);
1808 memset(&cb->args[4], 0,
1809 sizeof(cb->args) - 4*sizeof(cb->args[0]));
1811 cb->args[3] = count;
1817 void __init fib_trie_init(void)
1819 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1820 sizeof(struct fib_alias),
1821 0, SLAB_PANIC, NULL);
1823 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1824 max(sizeof(struct tnode),
1825 sizeof(struct leaf_info)),
1826 0, SLAB_PANIC, NULL);
1830 struct fib_table *fib_trie_table(u32 id)
1832 struct fib_table *tb;
1835 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1841 tb->tb_default = -1;
1842 tb->tb_num_default = 0;
1844 t = (struct trie *) tb->tb_data;
1845 RCU_INIT_POINTER(t->trie, NULL);
1846 #ifdef CONFIG_IP_FIB_TRIE_STATS
1847 t->stats = alloc_percpu(struct trie_use_stats);
1857 #ifdef CONFIG_PROC_FS
1858 /* Depth first Trie walk iterator */
1859 struct fib_trie_iter {
1860 struct seq_net_private p;
1861 struct fib_table *tb;
1862 struct tnode *tnode;
1867 static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1869 struct tnode *tn = iter->tnode;
1870 unsigned int cindex = iter->index;
1873 /* A single entry routing table */
1877 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1878 iter->tnode, iter->index, iter->depth);
1880 while (cindex < (1<<tn->bits)) {
1881 struct tnode *n = tnode_get_child_rcu(tn, cindex);
1886 iter->index = cindex + 1;
1888 /* push down one level */
1899 /* Current node exhausted, pop back up */
1900 p = node_parent_rcu(tn);
1902 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
1912 static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1920 n = rcu_dereference(t->trie);
1937 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
1940 struct fib_trie_iter iter;
1942 memset(s, 0, sizeof(*s));
1945 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
1947 struct leaf_info *li;
1950 s->totdepth += iter.depth;
1951 if (iter.depth > s->maxdepth)
1952 s->maxdepth = iter.depth;
1954 hlist_for_each_entry_rcu(li, &n->list, hlist)
1960 if (n->bits < MAX_STAT_DEPTH)
1961 s->nodesizes[n->bits]++;
1963 for (i = 0; i < tnode_child_length(n); i++)
1964 if (!rcu_access_pointer(n->child[i]))
1972 * This outputs /proc/net/fib_triestats
1974 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
1976 unsigned int i, max, pointers, bytes, avdepth;
1979 avdepth = stat->totdepth*100 / stat->leaves;
1983 seq_printf(seq, "\tAver depth: %u.%02d\n",
1984 avdepth / 100, avdepth % 100);
1985 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
1987 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
1988 bytes = sizeof(struct tnode) * stat->leaves;
1990 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
1991 bytes += sizeof(struct leaf_info) * stat->prefixes;
1993 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
1994 bytes += sizeof(struct tnode) * stat->tnodes;
1996 max = MAX_STAT_DEPTH;
1997 while (max > 0 && stat->nodesizes[max-1] == 0)
2001 for (i = 1; i < max; i++)
2002 if (stat->nodesizes[i] != 0) {
2003 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
2004 pointers += (1<<i) * stat->nodesizes[i];
2006 seq_putc(seq, '\n');
2007 seq_printf(seq, "\tPointers: %u\n", pointers);
2009 bytes += sizeof(struct tnode *) * pointers;
2010 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2011 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2014 #ifdef CONFIG_IP_FIB_TRIE_STATS
2015 static void trie_show_usage(struct seq_file *seq,
2016 const struct trie_use_stats __percpu *stats)
2018 struct trie_use_stats s = { 0 };
2021 /* loop through all of the CPUs and gather up the stats */
2022 for_each_possible_cpu(cpu) {
2023 const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2025 s.gets += pcpu->gets;
2026 s.backtrack += pcpu->backtrack;
2027 s.semantic_match_passed += pcpu->semantic_match_passed;
2028 s.semantic_match_miss += pcpu->semantic_match_miss;
2029 s.null_node_hit += pcpu->null_node_hit;
2030 s.resize_node_skipped += pcpu->resize_node_skipped;
2033 seq_printf(seq, "\nCounters:\n---------\n");
2034 seq_printf(seq, "gets = %u\n", s.gets);
2035 seq_printf(seq, "backtracks = %u\n", s.backtrack);
2036 seq_printf(seq, "semantic match passed = %u\n",
2037 s.semantic_match_passed);
2038 seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2039 seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2040 seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
2042 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2044 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2046 if (tb->tb_id == RT_TABLE_LOCAL)
2047 seq_puts(seq, "Local:\n");
2048 else if (tb->tb_id == RT_TABLE_MAIN)
2049 seq_puts(seq, "Main:\n");
2051 seq_printf(seq, "Id %d:\n", tb->tb_id);
2055 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2057 struct net *net = (struct net *)seq->private;
2061 "Basic info: size of leaf:"
2062 " %Zd bytes, size of tnode: %Zd bytes.\n",
2063 sizeof(struct tnode), sizeof(struct tnode));
2065 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2066 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2067 struct fib_table *tb;
2069 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2070 struct trie *t = (struct trie *) tb->tb_data;
2071 struct trie_stat stat;
2076 fib_table_print(seq, tb);
2078 trie_collect_stats(t, &stat);
2079 trie_show_stats(seq, &stat);
2080 #ifdef CONFIG_IP_FIB_TRIE_STATS
2081 trie_show_usage(seq, t->stats);
2089 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2091 return single_open_net(inode, file, fib_triestat_seq_show);
2094 static const struct file_operations fib_triestat_fops = {
2095 .owner = THIS_MODULE,
2096 .open = fib_triestat_seq_open,
2098 .llseek = seq_lseek,
2099 .release = single_release_net,
2102 static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2104 struct fib_trie_iter *iter = seq->private;
2105 struct net *net = seq_file_net(seq);
2109 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2110 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2111 struct fib_table *tb;
2113 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2116 for (n = fib_trie_get_first(iter,
2117 (struct trie *) tb->tb_data);
2118 n; n = fib_trie_get_next(iter))
2129 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2133 return fib_trie_get_idx(seq, *pos);
2136 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2138 struct fib_trie_iter *iter = seq->private;
2139 struct net *net = seq_file_net(seq);
2140 struct fib_table *tb = iter->tb;
2141 struct hlist_node *tb_node;
2146 /* next node in same table */
2147 n = fib_trie_get_next(iter);
2151 /* walk rest of this hash chain */
2152 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2153 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2154 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2155 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2160 /* new hash chain */
2161 while (++h < FIB_TABLE_HASHSZ) {
2162 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2163 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2164 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2176 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2182 static void seq_indent(struct seq_file *seq, int n)
2188 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2191 case RT_SCOPE_UNIVERSE: return "universe";
2192 case RT_SCOPE_SITE: return "site";
2193 case RT_SCOPE_LINK: return "link";
2194 case RT_SCOPE_HOST: return "host";
2195 case RT_SCOPE_NOWHERE: return "nowhere";
2197 snprintf(buf, len, "scope=%d", s);
2202 static const char *const rtn_type_names[__RTN_MAX] = {
2203 [RTN_UNSPEC] = "UNSPEC",
2204 [RTN_UNICAST] = "UNICAST",
2205 [RTN_LOCAL] = "LOCAL",
2206 [RTN_BROADCAST] = "BROADCAST",
2207 [RTN_ANYCAST] = "ANYCAST",
2208 [RTN_MULTICAST] = "MULTICAST",
2209 [RTN_BLACKHOLE] = "BLACKHOLE",
2210 [RTN_UNREACHABLE] = "UNREACHABLE",
2211 [RTN_PROHIBIT] = "PROHIBIT",
2212 [RTN_THROW] = "THROW",
2214 [RTN_XRESOLVE] = "XRESOLVE",
2217 static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2219 if (t < __RTN_MAX && rtn_type_names[t])
2220 return rtn_type_names[t];
2221 snprintf(buf, len, "type %u", t);
2225 /* Pretty print the trie */
2226 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2228 const struct fib_trie_iter *iter = seq->private;
2229 struct tnode *n = v;
2231 if (!node_parent_rcu(n))
2232 fib_table_print(seq, iter->tb);
2235 __be32 prf = htonl(n->key);
2237 seq_indent(seq, iter->depth - 1);
2238 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2239 &prf, n->pos, n->bits, n->full_children,
2242 struct leaf_info *li;
2243 __be32 val = htonl(n->key);
2245 seq_indent(seq, iter->depth);
2246 seq_printf(seq, " |-- %pI4\n", &val);
2248 hlist_for_each_entry_rcu(li, &n->list, hlist) {
2249 struct fib_alias *fa;
2251 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2252 char buf1[32], buf2[32];
2254 seq_indent(seq, iter->depth+1);
2255 seq_printf(seq, " /%d %s %s", li->plen,
2256 rtn_scope(buf1, sizeof(buf1),
2257 fa->fa_info->fib_scope),
2258 rtn_type(buf2, sizeof(buf2),
2261 seq_printf(seq, " tos=%d", fa->fa_tos);
2262 seq_putc(seq, '\n');
2270 static const struct seq_operations fib_trie_seq_ops = {
2271 .start = fib_trie_seq_start,
2272 .next = fib_trie_seq_next,
2273 .stop = fib_trie_seq_stop,
2274 .show = fib_trie_seq_show,
2277 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2279 return seq_open_net(inode, file, &fib_trie_seq_ops,
2280 sizeof(struct fib_trie_iter));
2283 static const struct file_operations fib_trie_fops = {
2284 .owner = THIS_MODULE,
2285 .open = fib_trie_seq_open,
2287 .llseek = seq_lseek,
2288 .release = seq_release_net,
2291 struct fib_route_iter {
2292 struct seq_net_private p;
2293 struct trie *main_trie;
2298 static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2300 struct tnode *l = NULL;
2301 struct trie *t = iter->main_trie;
2303 /* use cache location of last found key */
2304 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2308 l = trie_firstleaf(t);
2311 while (l && pos-- > 0) {
2313 l = trie_nextleaf(l);
2317 iter->key = pos; /* remember it */
2319 iter->pos = 0; /* forget it */
2324 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2327 struct fib_route_iter *iter = seq->private;
2328 struct fib_table *tb;
2331 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2335 iter->main_trie = (struct trie *) tb->tb_data;
2337 return SEQ_START_TOKEN;
2339 return fib_route_get_idx(iter, *pos - 1);
2342 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2344 struct fib_route_iter *iter = seq->private;
2345 struct tnode *l = v;
2348 if (v == SEQ_START_TOKEN) {
2350 l = trie_firstleaf(iter->main_trie);
2353 l = trie_nextleaf(l);
2363 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2369 static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2371 unsigned int flags = 0;
2373 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2375 if (fi && fi->fib_nh->nh_gw)
2376 flags |= RTF_GATEWAY;
2377 if (mask == htonl(0xFFFFFFFF))
2384 * This outputs /proc/net/route.
2385 * The format of the file is not supposed to be changed
2386 * and needs to be same as fib_hash output to avoid breaking
2389 static int fib_route_seq_show(struct seq_file *seq, void *v)
2391 struct tnode *l = v;
2392 struct leaf_info *li;
2394 if (v == SEQ_START_TOKEN) {
2395 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2396 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2401 hlist_for_each_entry_rcu(li, &l->list, hlist) {
2402 struct fib_alias *fa;
2403 __be32 mask, prefix;
2405 mask = inet_make_mask(li->plen);
2406 prefix = htonl(l->key);
2408 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2409 const struct fib_info *fi = fa->fa_info;
2410 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2412 if (fa->fa_type == RTN_BROADCAST
2413 || fa->fa_type == RTN_MULTICAST)
2416 seq_setwidth(seq, 127);
2420 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2421 "%d\t%08X\t%d\t%u\t%u",
2422 fi->fib_dev ? fi->fib_dev->name : "*",
2424 fi->fib_nh->nh_gw, flags, 0, 0,
2428 fi->fib_advmss + 40 : 0),
2433 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2434 "%d\t%08X\t%d\t%u\t%u",
2435 prefix, 0, flags, 0, 0, 0,
2445 static const struct seq_operations fib_route_seq_ops = {
2446 .start = fib_route_seq_start,
2447 .next = fib_route_seq_next,
2448 .stop = fib_route_seq_stop,
2449 .show = fib_route_seq_show,
2452 static int fib_route_seq_open(struct inode *inode, struct file *file)
2454 return seq_open_net(inode, file, &fib_route_seq_ops,
2455 sizeof(struct fib_route_iter));
2458 static const struct file_operations fib_route_fops = {
2459 .owner = THIS_MODULE,
2460 .open = fib_route_seq_open,
2462 .llseek = seq_lseek,
2463 .release = seq_release_net,
2466 int __net_init fib_proc_init(struct net *net)
2468 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2471 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2472 &fib_triestat_fops))
2475 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2481 remove_proc_entry("fib_triestat", net->proc_net);
2483 remove_proc_entry("fib_trie", net->proc_net);
2488 void __net_exit fib_proc_exit(struct net *net)
2490 remove_proc_entry("fib_trie", net->proc_net);
2491 remove_proc_entry("fib_triestat", net->proc_net);
2492 remove_proc_entry("route", net->proc_net);
2495 #endif /* CONFIG_PROC_FS */