net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely
 *                                      with BSD; our system is still very
 *                                      different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based
 *                                      routing, routing caches and better
 *                                      behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
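
/* Editor's note (illustrative, not from the original source): with the
 * default HZ == 1000, ip_rt_redirect_load is HZ / 50 == 20 jiffies (20 ms),
 * so the k-th consecutive redirect to a peer is held back until
 * rate_last + (20 << k) jiffies, and ip_rt_redirect_silence is
 * (HZ / 50) << 10 == 20480 jiffies, i.e. ~20.5 s of quiet before the
 * redirect counter is reset.  See ip_rt_send_redirect() below.
 */
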
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
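
/* Illustration (editor's addition): the table is indexed by the legacy TOS
 * bits of the IPv4 header, shifted down past the low bit, as done by
 * rt_tos2priority() in <net/route.h>:
 *
 *      static inline char rt_tos2priority(u8 tos)
 *      {
 *              return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *      }
 *
 * For example IPTOS_LOWDELAY (0x10) selects slot 8, TC_PRIO_INTERACTIVE,
 * and IPTOS_THROUGHPUT (0x08) selects slot 4, TC_PRIO_BULK.
 */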

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
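
/* Usage sketch (editor's addition, not part of the original file): a sender
 * that has just built an IPv4 header asks for an ID covering "segs" GSO
 * segments; the ip_select_ident() helpers in <net/ip.h> reduce to roughly:
 *
 *      __ip_select_ident(net, ip_hdr(skb), segs);
 *
 * Each bucket hands out "segs" consecutive IDs, plus the random delta added
 * above when the bucket was idle, so observed IDs no longer reveal how many
 * packets were sent in between.
 */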

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

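/* Worked example (editor's addition), using the defaults above at HZ == 1000:
 * after the first redirect rate_tokens == 1, so the next one is sent no
 * earlier than rate_last + (20 << 1) jiffies (40 ms), then 80 ms, 160 ms and
 * so on.  After ip_rt_redirect_number (9) unanswered redirects we go silent;
 * only ~20.5 s (ip_rt_redirect_silence) without triggering packets resets
 * rate_tokens so redirects may be sent again.
 */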
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
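
/* Editor's note: the peer limiter above is a token bucket.  Tokens accrue at
 * one per jiffy since rate_last, capped at ip_rt_error_burst (5 * HZ), and
 * each ICMP error sent costs ip_rt_error_cost (HZ) tokens; with HZ == 1000
 * that permits a burst of five errors, then roughly one per second per
 * source address.
 */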

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}
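
/* Editor's sketch (assumption, not in the original file): dst_check() in
 * <net/dst.h> calls ->check() whenever dst->obsolete is set, so every user
 * of a cached route revalidates it on use, along the lines of:
 *
 *      struct dst_entry *dst = dst_check(&rt->dst, 0);
 *      if (!dst)
 *              the caller falls back to a fresh route lookup
 *
 * Returning NULL here (obsolete no longer DST_OBSOLETE_FORCE_CHK, or a
 * generation-id mismatch) is what pushes callers back into the slow path.
 */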

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
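
/* Editor's note: the effective MTU above is resolved in order: a live PMTU
 * exception (rt_pmtu, until dst.expires), then the RTAX_MTU route metric,
 * then the device MTU, clamped to 576 for locked-metric routes that use a
 * gateway, capped at IP_MAX_MTU and reduced by any lwtunnel encapsulation
 * headroom.  E.g. a route with rt_pmtu == 1400 reports 1400 until the
 * exception expires, after which it falls back to the interface MTU.
 */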

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}
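
/* Editor's sketch: exceptions are written by update_or_create_fnhe() (from
 * ICMP redirects and PMTU updates above) and consumed on the lookup paths,
 * typically along the lines of:
 *
 *      fnhe = find_exception(nh, daddr);
 *      if (fnhe)
 *              rt_bind_exception(rt, fnhe, daddr);
 *
 * giving a per-nexthop, per-destination override of gateway and PMTU that
 * is checked before the generic per-nexthop cached route.
 */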

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type,
                            bool nopolicy, bool noxfrm, bool will_cache)
{
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                       (nopolicy ? DST_NOPOLICY : 0) |
                       (noxfrm ? DST_NOXFRM : 0));

        if (rt) {
                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                rt->rt_flags = flags;
                rt->rt_type = type;
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
                rt->rt_gateway = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_table_id = 0;
                INIT_LIST_HEAD(&rt->rt_uncached);

                rt->dst.output = ip_output;
                if (flags & RTCF_LOCAL)
                        rt->dst.input = ip_local_deliver;
        }

        return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
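
/* Editor's note: callers choose the caching policy up front.  The multicast
 * input path below passes will_cache == false, so the dst is allocated with
 * DST_HOST | DST_NOCACHE and lives only as long as the skb, while cacheable
 * unicast routes pass will_cache == true and are later stored through
 * rt_cache_route() or rt_bind_exception().
 */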

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        unsigned int flags = RTCF_MULTICAST;
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (!in_dev)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
                goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        if (our)
                flags |= RTCF_LOCAL;

        rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;
        rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}
1542
1543
1544 static void ip_handle_martian_source(struct net_device *dev,
1545                                      struct in_device *in_dev,
1546                                      struct sk_buff *skb,
1547                                      __be32 daddr,
1548                                      __be32 saddr)
1549 {
1550         RT_CACHE_STAT_INC(in_martian_src);
1551 #ifdef CONFIG_IP_ROUTE_VERBOSE
1552         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1553                 /*
1554                  *      RFC1812 recommendation: if the source is martian,
1555                  *      the only hint is the MAC header.
1556                  */
1557                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1558                         &daddr, &saddr, dev->name);
1559                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1560                         print_hex_dump(KERN_WARNING, "ll header: ",
1561                                        DUMP_PREFIX_OFFSET, 16, 1,
1562                                        skb_mac_header(skb),
1563                                        dev->hard_header_len, true);
1564                 }
1565         }
1566 #endif
1567 }
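/* With log_martians enabled, the warning above comes out roughly as follows
 * (addresses and device are made up for illustration; note the argument
 * order in the pr_warn() above: the packet's daddr is printed first, then
 * the offending saddr):
 *
 *	martian source 203.0.113.7 from 127.0.0.1, on dev eth0
 *	ll header: 00000000: ff ff ff ff ff ff 52 54 00 12 34 56 08 00  ......RT..4V..
 */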
1568
1569 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1570 {
1571         struct fnhe_hash_bucket *hash;
1572         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1573         u32 hval = fnhe_hashfun(daddr);
1574
1575         spin_lock_bh(&fnhe_lock);
1576
1577         hash = rcu_dereference_protected(nh->nh_exceptions,
1578                                          lockdep_is_held(&fnhe_lock));
1579         hash += hval;
1580
1581         fnhe_p = &hash->chain;
1582         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1583         while (fnhe) {
1584                 if (fnhe->fnhe_daddr == daddr) {
1585                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1586                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1587                         fnhe_flush_routes(fnhe);
1588                         kfree_rcu(fnhe, rcu);
1589                         break;
1590                 }
1591                 fnhe_p = &fnhe->fnhe_next;
1592                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1593                                                  lockdep_is_held(&fnhe_lock));
1594         }
1595
1596         spin_unlock_bh(&fnhe_lock);
1597 }
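/* The loop above is the classic pointer-to-pointer unlink for a singly
 * linked RCU list. A generic sketch of the same pattern (hypothetical
 * "struct node" with "next" and "rcu" members, shown for illustration):
 *
 *	struct node __rcu **pp = &head;
 *	struct node *n;
 *
 *	while ((n = rcu_dereference_protected(*pp, lockdep_is_held(&lock)))) {
 *		if (match(n)) {
 *			rcu_assign_pointer(*pp, rcu_dereference_protected(
 *				n->next, lockdep_is_held(&lock)));
 *			kfree_rcu(n, rcu);
 *			break;
 *		}
 *		pp = &n->next;
 *	}
 *
 * Concurrent readers under rcu_read_lock() never observe a half-unlinked
 * entry, and the node is not freed until a grace period has elapsed.
 */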
1598
1599 /* called in rcu_read_lock() section */
1600 static int __mkroute_input(struct sk_buff *skb,
1601                            const struct fib_result *res,
1602                            struct in_device *in_dev,
1603                            __be32 daddr, __be32 saddr, u32 tos)
1604 {
1605         struct fib_nh_exception *fnhe;
1606         struct rtable *rth;
1607         int err;
1608         struct in_device *out_dev;
1609         bool do_cache;
1610         u32 itag = 0;
1611
1612         /* get a working reference to the output device */
1613         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1614         if (!out_dev) {
1615                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1616                 return -EINVAL;
1617         }
1618
1619         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1620                                   in_dev->dev, in_dev, &itag);
1621         if (err < 0) {
1622                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1623                                          saddr);
1624
1625                 goto cleanup;
1626         }
1627
1628         do_cache = res->fi && !itag;
1629         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1630             skb->protocol == htons(ETH_P_IP) &&
1631             (IN_DEV_SHARED_MEDIA(out_dev) ||
1632              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1633                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1634
1635         if (skb->protocol != htons(ETH_P_IP)) {
1636                 /* Not IP (i.e. ARP). Do not create a route if it is
1637                  * invalid for proxy arp. DNAT routes are always valid.
1638                  *
1639                  * The proxy arp feature has been extended to allow ARP
1640                  * replies back out the same interface, to support
1641                  * Private VLAN switch technologies. See arp.c.
1642                  */
1643                 if (out_dev == in_dev &&
1644                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1645                         err = -EINVAL;
1646                         goto cleanup;
1647                 }
1648         }
1649
1650         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1651         if (do_cache) {
1652                 if (fnhe) {
1653                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1654                         if (rth && rth->dst.expires &&
1655                             time_after(jiffies, rth->dst.expires)) {
1656                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1657                                 fnhe = NULL;
1658                         } else {
1659                                 goto rt_cache;
1660                         }
1661                 }
1662
1663                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1664
1665 rt_cache:
1666                 if (rt_cache_valid(rth)) {
1667                         skb_dst_set_noref(skb, &rth->dst);
1668                         goto out;
1669                 }
1670         }
1671
1672         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1673                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1674                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1675         if (!rth) {
1676                 err = -ENOBUFS;
1677                 goto cleanup;
1678         }
1679
1680         rth->rt_is_input = 1;
1681         if (res->table)
1682                 rth->rt_table_id = res->table->tb_id;
1683         RT_CACHE_STAT_INC(in_slow_tot);
1684
1685         rth->dst.input = ip_forward;
1686
1687         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1688         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1689                 rth->dst.lwtstate->orig_output = rth->dst.output;
1690                 rth->dst.output = lwtunnel_output;
1691         }
1692         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1693                 rth->dst.lwtstate->orig_input = rth->dst.input;
1694                 rth->dst.input = lwtunnel_input;
1695         }
1696         skb_dst_set(skb, &rth->dst);
1697 out:
1698         err = 0;
1699  cleanup:
1700         return err;
1701 }
1702
1703 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1704
1705 /* To make ICMP packets follow the right flow, the multipath hash is
1706  * calculated from the inner IP addresses in reverse order.
1707  */
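/* Worked example (illustrative, not from the original source): a forward
 * flow A -> B is hashed as fib_multipath_hash(A, B), and its reply flow
 * B -> A as fib_multipath_hash(B, A). An ICMP error generated on the
 * forward path carries the offending A -> B header as its payload, so
 * hashing the inner addresses reversed, fib_multipath_hash(B, A), selects
 * the same nexthop the reply flow would use, and the error reaches the
 * sender A along a consistent path.
 */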
1708 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1709 {
1710         const struct iphdr *outer_iph = ip_hdr(skb);
1711         struct icmphdr _icmph;
1712         const struct icmphdr *icmph;
1713         struct iphdr _inner_iph;
1714         const struct iphdr *inner_iph;
1715
1716         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1717                 goto standard_hash;
1718
1719         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1720                                    &_icmph);
1721         if (!icmph)
1722                 goto standard_hash;
1723
1724         if (icmph->type != ICMP_DEST_UNREACH &&
1725             icmph->type != ICMP_REDIRECT &&
1726             icmph->type != ICMP_TIME_EXCEEDED &&
1727             icmph->type != ICMP_PARAMETERPROB) {
1728                 goto standard_hash;
1729         }
1730
1731         inner_iph = skb_header_pointer(skb,
1732                                        outer_iph->ihl * 4 + sizeof(_icmph),
1733                                        sizeof(_inner_iph), &_inner_iph);
1734         if (!inner_iph)
1735                 goto standard_hash;
1736
1737         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1738
1739 standard_hash:
1740         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1741 }
1742
1743 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1744
1745 static int ip_mkroute_input(struct sk_buff *skb,
1746                             struct fib_result *res,
1747                             const struct flowi4 *fl4,
1748                             struct in_device *in_dev,
1749                             __be32 daddr, __be32 saddr, u32 tos)
1750 {
1751 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1752         if (res->fi && res->fi->fib_nhs > 1) {
1753                 int h;
1754
1755                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1756                         h = ip_multipath_icmp_hash(skb);
1757                 else
1758                         h = fib_multipath_hash(saddr, daddr);
1759                 fib_select_multipath(res, h);
1760         }
1761 #endif
1762
1763         /* create a routing cache entry */
1764         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1765 }
1766
1767 /*
1768  *      NOTE. We drop all packets that have a local source
1769  *      address, because every properly looped-back packet
1770  *      must already have the correct destination attached by the output routine.
1771  *
1772  *      This approach solves two big problems:
1773  *      1. Non-simplex devices are handled properly.
1774  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1775  *      called with rcu_read_lock()
1776  */
1777
1778 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1779                                u8 tos, struct net_device *dev)
1780 {
1781         struct fib_result res;
1782         struct in_device *in_dev = __in_dev_get_rcu(dev);
1783         struct ip_tunnel_info *tun_info;
1784         struct flowi4   fl4;
1785         unsigned int    flags = 0;
1786         u32             itag = 0;
1787         struct rtable   *rth;
1788         int             err = -EINVAL;
1789         struct net    *net = dev_net(dev);
1790         bool do_cache;
1791
1792         /* IP on this device is disabled. */
1793
1794         if (!in_dev)
1795                 goto out;
1796
1797         /* Check for the weirdest martians, which cannot be detected
1798            by fib_lookup.
1799          */
1800
1801         tun_info = skb_tunnel_info(skb);
1802         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1803                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1804         else
1805                 fl4.flowi4_tun_key.tun_id = 0;
1806         skb_dst_drop(skb);
1807
1808         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1809                 goto martian_source;
1810
1811         res.fi = NULL;
1812         res.table = NULL;
1813         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1814                 goto brd_input;
1815
1816         /* Accept zero addresses only for the limited broadcast;
1817          * I do not even know whether to fix this or not. Waiting for complaints :-)
1818          */
1819         if (ipv4_is_zeronet(saddr))
1820                 goto martian_source;
1821
1822         if (ipv4_is_zeronet(daddr))
1823                 goto martian_destination;
1824
1825         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1826          * and calls it at most once when daddr and/or saddr are loopback addresses.
1827          */
1828         if (ipv4_is_loopback(daddr)) {
1829                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1830                         goto martian_destination;
1831         } else if (ipv4_is_loopback(saddr)) {
1832                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1833                         goto martian_source;
1834         }
1835
1836         /*
1837          *      Now we are ready to route the packet.
1838          */
1839         fl4.flowi4_oif = 0;
1840         fl4.flowi4_iif = dev->ifindex;
1841         fl4.flowi4_mark = skb->mark;
1842         fl4.flowi4_tos = tos;
1843         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1844         fl4.flowi4_flags = 0;
1845         fl4.daddr = daddr;
1846         fl4.saddr = saddr;
1847         err = fib_lookup(net, &fl4, &res, 0);
1848         if (err != 0) {
1849                 if (!IN_DEV_FORWARD(in_dev))
1850                         err = -EHOSTUNREACH;
1851                 goto no_route;
1852         }
1853
1854         if (res.type == RTN_BROADCAST)
1855                 goto brd_input;
1856
1857         if (res.type == RTN_LOCAL) {
1858                 err = fib_validate_source(skb, saddr, daddr, tos,
1859                                           0, dev, in_dev, &itag);
1860                 if (err < 0)
1861                         goto martian_source;
1862                 goto local_input;
1863         }
1864
1865         if (!IN_DEV_FORWARD(in_dev)) {
1866                 err = -EHOSTUNREACH;
1867                 goto no_route;
1868         }
1869         if (res.type != RTN_UNICAST)
1870                 goto martian_destination;
1871
1872         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1873 out:    return err;
1874
1875 brd_input:
1876         if (skb->protocol != htons(ETH_P_IP))
1877                 goto e_inval;
1878
1879         if (!ipv4_is_zeronet(saddr)) {
1880                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1881                                           in_dev, &itag);
1882                 if (err < 0)
1883                         goto martian_source;
1884         }
1885         flags |= RTCF_BROADCAST;
1886         res.type = RTN_BROADCAST;
1887         RT_CACHE_STAT_INC(in_brd);
1888
1889 local_input:
1890         do_cache = false;
1891         if (res.fi) {
1892                 if (!itag) {
1893                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1894                         if (rt_cache_valid(rth)) {
1895                                 skb_dst_set_noref(skb, &rth->dst);
1896                                 err = 0;
1897                                 goto out;
1898                         }
1899                         do_cache = true;
1900                 }
1901         }
1902
1903         rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1904                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1905         if (!rth)
1906                 goto e_nobufs;
1907
1908         rth->dst.output = ip_rt_bug;
1909 #ifdef CONFIG_IP_ROUTE_CLASSID
1910         rth->dst.tclassid = itag;
1911 #endif
1912         rth->rt_is_input = 1;
1913         if (res.table)
1914                 rth->rt_table_id = res.table->tb_id;
1915
1916         RT_CACHE_STAT_INC(in_slow_tot);
1917         if (res.type == RTN_UNREACHABLE) {
1918                 rth->dst.input = ip_error;
1919                 rth->dst.error = -err;
1920                 rth->rt_flags &= ~RTCF_LOCAL;
1921         }
1922         if (do_cache) {
1923                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1924                         rth->dst.flags |= DST_NOCACHE;
1925                         rt_add_uncached_list(rth);
1926                 }
1927         }
1928         skb_dst_set(skb, &rth->dst);
1929         err = 0;
1930         goto out;
1931
1932 no_route:
1933         RT_CACHE_STAT_INC(in_no_route);
1934         res.type = RTN_UNREACHABLE;
1935         res.fi = NULL;
1936         res.table = NULL;
1937         goto local_input;
1938
1939         /*
1940          *      Do not cache martian addresses: they should be logged (RFC1812)
1941          */
1942 martian_destination:
1943         RT_CACHE_STAT_INC(in_martian_dst);
1944 #ifdef CONFIG_IP_ROUTE_VERBOSE
1945         if (IN_DEV_LOG_MARTIANS(in_dev))
1946                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1947                                      &daddr, &saddr, dev->name);
1948 #endif
1949
1950 e_inval:
1951         err = -EINVAL;
1952         goto out;
1953
1954 e_nobufs:
1955         err = -ENOBUFS;
1956         goto out;
1957
1958 martian_source:
1959         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1960         goto out;
1961 }
1962
1963 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1964                          u8 tos, struct net_device *dev)
1965 {
1966         int res;
1967
1968         rcu_read_lock();
1969
1970         /* Multicast recognition logic was moved from the route cache to here.
1971            The problem was that too many Ethernet cards have broken/missing
1972            hardware multicast filters :-( As a result, a host on a multicast
1973            network acquires a lot of useless route cache entries, e.g. for
1974            SDR messages from all over the world. Now we try to get rid of them.
1975            Really, provided the software IP multicast filter is organized
1976            reasonably (at least, hashed), it does not result in a slowdown
1977            compared with route cache reject entries.
1978            Note that multicast routers are not affected, because a
1979            route cache entry is created eventually.
1980          */
1981         if (ipv4_is_multicast(daddr)) {
1982                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1983
1984                 if (in_dev) {
1985                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1986                                                   ip_hdr(skb)->protocol);
1987                         if (our
1988 #ifdef CONFIG_IP_MROUTE
1989                                 ||
1990                             (!ipv4_is_local_multicast(daddr) &&
1991                              IN_DEV_MFORWARD(in_dev))
1992 #endif
1993                            ) {
1994                                 int res = ip_route_input_mc(skb, daddr, saddr,
1995                                                             tos, dev, our);
1996                                 rcu_read_unlock();
1997                                 return res;
1998                         }
1999                 }
2000                 rcu_read_unlock();
2001                 return -EINVAL;
2002         }
2003         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2004         rcu_read_unlock();
2005         return res;
2006 }
2007 EXPORT_SYMBOL(ip_route_input_noref);
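/* Typical caller sketch (it mirrors ip_rcv_finish(); shown for
 * illustration): route a freshly received packet by its IP header fields.
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 */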
2008
2009 /* called with rcu_read_lock() */
2010 static struct rtable *__mkroute_output(const struct fib_result *res,
2011                                        const struct flowi4 *fl4, int orig_oif,
2012                                        struct net_device *dev_out,
2013                                        unsigned int flags)
2014 {
2015         struct fib_info *fi = res->fi;
2016         struct fib_nh_exception *fnhe;
2017         struct in_device *in_dev;
2018         u16 type = res->type;
2019         struct rtable *rth;
2020         bool do_cache;
2021
2022         in_dev = __in_dev_get_rcu(dev_out);
2023         if (!in_dev)
2024                 return ERR_PTR(-EINVAL);
2025
2026         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2027                 if (ipv4_is_loopback(fl4->saddr) &&
2028                     !(dev_out->flags & IFF_LOOPBACK) &&
2029                     !netif_is_l3_master(dev_out))
2030                         return ERR_PTR(-EINVAL);
2031
2032         if (ipv4_is_lbcast(fl4->daddr))
2033                 type = RTN_BROADCAST;
2034         else if (ipv4_is_multicast(fl4->daddr))
2035                 type = RTN_MULTICAST;
2036         else if (ipv4_is_zeronet(fl4->daddr))
2037                 return ERR_PTR(-EINVAL);
2038
2039         if (dev_out->flags & IFF_LOOPBACK)
2040                 flags |= RTCF_LOCAL;
2041
2042         do_cache = true;
2043         if (type == RTN_BROADCAST) {
2044                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2045                 fi = NULL;
2046         } else if (type == RTN_MULTICAST) {
2047                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2048                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2049                                      fl4->flowi4_proto))
2050                         flags &= ~RTCF_LOCAL;
2051                 else
2052                         do_cache = false;
2053                 /* If the multicast route does not exist, use the
2054                  * default one, but do not use a gateway in this case.
2055                  * Yes, it is a hack.
2056                  */
2057                 if (fi && res->prefixlen < 4)
2058                         fi = NULL;
2059         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2060                    (orig_oif != dev_out->ifindex)) {
2061                 /* For local routes that require a particular output interface
2062                  * we do not want to cache the result.  Caching the result
2063                  * causes incorrect behaviour when there are multiple source
2064                  * addresses on the interface, the end result being that if the
2065                  * intended recipient is waiting on that interface for the
2066                  * packet, he won't receive it because it will be delivered on
2067                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2068                  * be set to the loopback interface as well.
2069                  */
2070                 fi = NULL;
2071         }
2072
2073         fnhe = NULL;
2074         do_cache &= fi != NULL;
2075         if (do_cache) {
2076                 struct rtable __rcu **prth;
2077                 struct fib_nh *nh = &FIB_RES_NH(*res);
2078
2079                 fnhe = find_exception(nh, fl4->daddr);
2080                 if (fnhe) {
2081                         prth = &fnhe->fnhe_rth_output;
2082                         rth = rcu_dereference(*prth);
2083                         if (rth && rth->dst.expires &&
2084                             time_after(jiffies, rth->dst.expires)) {
2085                                 ip_del_fnhe(nh, fl4->daddr);
2086                                 fnhe = NULL;
2087                         } else {
2088                                 goto rt_cache;
2089                         }
2090                 }
2091
2092                 if (unlikely(fl4->flowi4_flags &
2093                              FLOWI_FLAG_KNOWN_NH &&
2094                              !(nh->nh_gw &&
2095                                nh->nh_scope == RT_SCOPE_LINK))) {
2096                         do_cache = false;
2097                         goto add;
2098                 }
2099                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2100                 rth = rcu_dereference(*prth);
2101
2102 rt_cache:
2103                 if (rt_cache_valid(rth)) {
2104                         dst_hold(&rth->dst);
2105                         return rth;
2106                 }
2107         }
2108
2109 add:
2110         rth = rt_dst_alloc(dev_out, flags, type,
2111                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2112                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2113                            do_cache);
2114         if (!rth)
2115                 return ERR_PTR(-ENOBUFS);
2116
2117         rth->rt_iif     = orig_oif ? : 0;
2118         if (res->table)
2119                 rth->rt_table_id = res->table->tb_id;
2120
2121         RT_CACHE_STAT_INC(out_slow_tot);
2122
2123         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2124                 if (flags & RTCF_LOCAL &&
2125                     !(dev_out->flags & IFF_LOOPBACK)) {
2126                         rth->dst.output = ip_mc_output;
2127                         RT_CACHE_STAT_INC(out_slow_mc);
2128                 }
2129 #ifdef CONFIG_IP_MROUTE
2130                 if (type == RTN_MULTICAST) {
2131                         if (IN_DEV_MFORWARD(in_dev) &&
2132                             !ipv4_is_local_multicast(fl4->daddr)) {
2133                                 rth->dst.input = ip_mr_input;
2134                                 rth->dst.output = ip_mc_output;
2135                         }
2136                 }
2137 #endif
2138         }
2139
2140         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2141         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2142                 rth->dst.output = lwtunnel_output;
2143
2144         return rth;
2145 }
2146
2147 /*
2148  * Major route resolver routine.
2149  */
2150
2151 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2152                                           int mp_hash)
2153 {
2154         struct net_device *dev_out = NULL;
2155         __u8 tos = RT_FL_TOS(fl4);
2156         unsigned int flags = 0;
2157         struct fib_result res;
2158         struct rtable *rth;
2159         int orig_oif;
2160         int err = -ENETUNREACH;
2161
2162         res.tclassid    = 0;
2163         res.fi          = NULL;
2164         res.table       = NULL;
2165
2166         orig_oif = fl4->flowi4_oif;
2167
2168         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2169         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2170         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2171                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2172
2173         rcu_read_lock();
2174         if (fl4->saddr) {
2175                 rth = ERR_PTR(-EINVAL);
2176                 if (ipv4_is_multicast(fl4->saddr) ||
2177                     ipv4_is_lbcast(fl4->saddr) ||
2178                     ipv4_is_zeronet(fl4->saddr))
2179                         goto out;
2180
2181                 /* I removed the check for oif == dev_out->oif here.
2182                    It was wrong for two reasons:
2183                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2184                       is assigned to multiple interfaces.
2185                    2. Moreover, we are allowed to send packets with the saddr
2186                       of another iface. --ANK
2187                  */
2188
2189                 if (fl4->flowi4_oif == 0 &&
2190                     (ipv4_is_multicast(fl4->daddr) ||
2191                      ipv4_is_lbcast(fl4->daddr))) {
2192                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2193                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2194                         if (!dev_out)
2195                                 goto out;
2196
2197                         /* Special hack: the user can direct multicasts
2198                            and limited broadcast via the necessary interface
2199                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2200                            This hack is not just for fun, it allows
2201                            vic, vat and friends to work.
2202                            They bind a socket to loopback, set the ttl to zero
2203                            and expect that it will work.
2204                            From the viewpoint of the routing cache they are broken,
2205                            because we are not allowed to build a multicast path
2206                            with a loopback source addr (look, the routing cache
2207                            cannot know that the ttl is zero, meaning the packet
2208                            will never leave this host and the route is in fact valid).
2209                            Luckily, this hack is a good workaround.
2210                          */
2211
2212                         fl4->flowi4_oif = dev_out->ifindex;
2213                         goto make_route;
2214                 }
2215
2216                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2217                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2218                         if (!__ip_dev_find(net, fl4->saddr, false))
2219                                 goto out;
2220                 }
2221         }
2222
2223
2224         if (fl4->flowi4_oif) {
2225                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2226                 rth = ERR_PTR(-ENODEV);
2227                 if (!dev_out)
2228                         goto out;
2229
2230                 /* RACE: Check return value of inet_select_addr instead. */
2231                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2232                         rth = ERR_PTR(-ENETUNREACH);
2233                         goto out;
2234                 }
2235                 if (ipv4_is_local_multicast(fl4->daddr) ||
2236                     ipv4_is_lbcast(fl4->daddr) ||
2237                     fl4->flowi4_proto == IPPROTO_IGMP) {
2238                         if (!fl4->saddr)
2239                                 fl4->saddr = inet_select_addr(dev_out, 0,
2240                                                               RT_SCOPE_LINK);
2241                         goto make_route;
2242                 }
2243                 if (!fl4->saddr) {
2244                         if (ipv4_is_multicast(fl4->daddr))
2245                                 fl4->saddr = inet_select_addr(dev_out, 0,
2246                                                               fl4->flowi4_scope);
2247                         else if (!fl4->daddr)
2248                                 fl4->saddr = inet_select_addr(dev_out, 0,
2249                                                               RT_SCOPE_HOST);
2250                 }
2251         }
2252
2253         if (!fl4->daddr) {
2254                 fl4->daddr = fl4->saddr;
2255                 if (!fl4->daddr)
2256                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2257                 dev_out = net->loopback_dev;
2258                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2259                 res.type = RTN_LOCAL;
2260                 flags |= RTCF_LOCAL;
2261                 goto make_route;
2262         }
2263
2264         err = fib_lookup(net, fl4, &res, 0);
2265         if (err) {
2266                 res.fi = NULL;
2267                 res.table = NULL;
2268                 if (fl4->flowi4_oif) {
2269                         /* Apparently, the routing tables are wrong. Assume
2270                            that the destination is on-link.
2271
2272                            WHY? DW.
2273                            Because we are allowed to send to an iface
2274                            even if it has NO routes and NO assigned
2275                            addresses. When oif is specified, the routing
2276                            tables are looked up with only one purpose:
2277                            to check whether the destination is gatewayed rather
2278                            than direct. Moreover, if MSG_DONTROUTE is set,
2279                            we send the packet, ignoring both routing tables
2280                            and ifaddr state. --ANK
2281
2282
2283                            We could do this even if oif is unknown
2284                            (IPv6 likely does), but we do not.
2285                          */
2286
2287                         if (fl4->saddr == 0)
2288                                 fl4->saddr = inet_select_addr(dev_out, 0,
2289                                                               RT_SCOPE_LINK);
2290                         res.type = RTN_UNICAST;
2291                         goto make_route;
2292                 }
2293                 rth = ERR_PTR(err);
2294                 goto out;
2295         }
2296
2297         if (res.type == RTN_LOCAL) {
2298                 if (!fl4->saddr) {
2299                         if (res.fi->fib_prefsrc)
2300                                 fl4->saddr = res.fi->fib_prefsrc;
2301                         else
2302                                 fl4->saddr = fl4->daddr;
2303                 }
2304
2305                 /* L3 master device is the loopback for that domain */
2306                 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2307                 fl4->flowi4_oif = dev_out->ifindex;
2308                 flags |= RTCF_LOCAL;
2309                 goto make_route;
2310         }
2311
2312         fib_select_path(net, &res, fl4, mp_hash);
2313
2314         dev_out = FIB_RES_DEV(res);
2315         fl4->flowi4_oif = dev_out->ifindex;
2316
2317
2318 make_route:
2319         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2320
2321 out:
2322         rcu_read_unlock();
2323         return rth;
2324 }
2325 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
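/* Output lookup sketch (illustrative; ip_route_output_key() is the common
 * wrapper around this resolver): fill in a flowi4 key, resolve it, and
 * release the route when done.
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */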
2326
2327 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2328 {
2329         return NULL;
2330 }
2331
2332 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2333 {
2334         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2335
2336         return mtu ? : dst->dev->mtu;
2337 }
2338
2339 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2340                                           struct sk_buff *skb, u32 mtu)
2341 {
2342 }
2343
2344 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2345                                        struct sk_buff *skb)
2346 {
2347 }
2348
2349 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2350                                           unsigned long old)
2351 {
2352         return NULL;
2353 }
2354
2355 static struct dst_ops ipv4_dst_blackhole_ops = {
2356         .family                 =       AF_INET,
2357         .check                  =       ipv4_blackhole_dst_check,
2358         .mtu                    =       ipv4_blackhole_mtu,
2359         .default_advmss         =       ipv4_default_advmss,
2360         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2361         .redirect               =       ipv4_rt_blackhole_redirect,
2362         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2363         .neigh_lookup           =       ipv4_neigh_lookup,
2364 };
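/* Blackhole dsts look like ordinary routes to the stack's bookkeeping, but
 * they ignore PMTU updates and redirects, and their ->check() always fails,
 * so cached users re-resolve. They are handed out via ipv4_blackhole_route()
 * below when the xfrm layer needs a dst that silently discards packets,
 * e.g. while IPsec states are still being resolved; a hedged annotation,
 * not authoritative documentation.
 */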
2365
2366 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2367 {
2368         struct rtable *ort = (struct rtable *) dst_orig;
2369         struct rtable *rt;
2370
2371         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2372         if (rt) {
2373                 struct dst_entry *new = &rt->dst;
2374
2375                 new->__use = 1;
2376                 new->input = dst_discard;
2377                 new->output = dst_discard_out;
2378
2379                 new->dev = ort->dst.dev;
2380                 if (new->dev)
2381                         dev_hold(new->dev);
2382
2383                 rt->rt_is_input = ort->rt_is_input;
2384                 rt->rt_iif = ort->rt_iif;
2385                 rt->rt_pmtu = ort->rt_pmtu;
2386
2387                 rt->rt_genid = rt_genid_ipv4(net);
2388                 rt->rt_flags = ort->rt_flags;
2389                 rt->rt_type = ort->rt_type;
2390                 rt->rt_gateway = ort->rt_gateway;
2391                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2392
2393                 INIT_LIST_HEAD(&rt->rt_uncached);
2394                 dst_free(new);
2395         }
2396
2397         dst_release(dst_orig);
2398
2399         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2400 }
2401
2402 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2403                                     const struct sock *sk)
2404 {
2405         struct rtable *rt = __ip_route_output_key(net, flp4);
2406
2407         if (IS_ERR(rt))
2408                 return rt;
2409
2410         if (flp4->flowi4_proto)
2411                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2412                                                         flowi4_to_flowi(flp4),
2413                                                         sk, 0);
2414
2415         return rt;
2416 }
2417 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2418
2419 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2420                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2421                         u32 seq, int event, int nowait, unsigned int flags)
2422 {
2423         struct rtable *rt = skb_rtable(skb);
2424         struct rtmsg *r;
2425         struct nlmsghdr *nlh;
2426         unsigned long expires = 0;
2427         u32 error;
2428         u32 metrics[RTAX_MAX];
2429
2430         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2431         if (!nlh)
2432                 return -EMSGSIZE;
2433
2434         r = nlmsg_data(nlh);
2435         r->rtm_family    = AF_INET;
2436         r->rtm_dst_len  = 32;
2437         r->rtm_src_len  = 0;
2438         r->rtm_tos      = fl4->flowi4_tos;
2439         r->rtm_table    = table_id;
2440         if (nla_put_u32(skb, RTA_TABLE, table_id))
2441                 goto nla_put_failure;
2442         r->rtm_type     = rt->rt_type;
2443         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2444         r->rtm_protocol = RTPROT_UNSPEC;
2445         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2446         if (rt->rt_flags & RTCF_NOTIFY)
2447                 r->rtm_flags |= RTM_F_NOTIFY;
2448         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2449                 r->rtm_flags |= RTCF_DOREDIRECT;
2450
2451         if (nla_put_in_addr(skb, RTA_DST, dst))
2452                 goto nla_put_failure;
2453         if (src) {
2454                 r->rtm_src_len = 32;
2455                 if (nla_put_in_addr(skb, RTA_SRC, src))
2456                         goto nla_put_failure;
2457         }
2458         if (rt->dst.dev &&
2459             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2460                 goto nla_put_failure;
2461 #ifdef CONFIG_IP_ROUTE_CLASSID
2462         if (rt->dst.tclassid &&
2463             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2464                 goto nla_put_failure;
2465 #endif
2466         if (!rt_is_input_route(rt) &&
2467             fl4->saddr != src) {
2468                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2469                         goto nla_put_failure;
2470         }
2471         if (rt->rt_uses_gateway &&
2472             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2473                 goto nla_put_failure;
2474
2475         expires = rt->dst.expires;
2476         if (expires) {
2477                 unsigned long now = jiffies;
2478
2479                 if (time_before(now, expires))
2480                         expires -= now;
2481                 else
2482                         expires = 0;
2483         }
2484
2485         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2486         if (rt->rt_pmtu && expires)
2487                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2488         if (rtnetlink_put_metrics(skb, metrics) < 0)
2489                 goto nla_put_failure;
2490
2491         if (fl4->flowi4_mark &&
2492             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2493                 goto nla_put_failure;
2494
2495         error = rt->dst.error;
2496
2497         if (rt_is_input_route(rt)) {
2498 #ifdef CONFIG_IP_MROUTE
2499                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2500                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2501                         int err = ipmr_get_route(net, skb,
2502                                                  fl4->saddr, fl4->daddr,
2503                                                  r, nowait);
2504                         if (err <= 0) {
2505                                 if (!nowait) {
2506                                         if (err == 0)
2507                                                 return 0;
2508                                         goto nla_put_failure;
2509                                 } else {
2510                                         if (err == -EMSGSIZE)
2511                                                 goto nla_put_failure;
2512                                         error = err;
2513                                 }
2514                         }
2515                 } else
2516 #endif
2517                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2518                                 goto nla_put_failure;
2519         }
2520
2521         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2522                 goto nla_put_failure;
2523
2524         nlmsg_end(skb, nlh);
2525         return 0;
2526
2527 nla_put_failure:
2528         nlmsg_cancel(skb, nlh);
2529         return -EMSGSIZE;
2530 }
2531
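/* This handler serves RTM_GETROUTE requests; iproute2's "ip route get" is
 * the usual client. A userspace view of the round trip (illustrative
 * addresses from the documentation ranges):
 *
 *	$ ip route get 192.0.2.1
 *	192.0.2.1 via 198.51.100.1 dev eth0 src 198.51.100.2
 *
 * The kernel performs the same input/output lookup it would for a real
 * packet and replies with a single RTM_NEWROUTE message built by
 * rt_fill_info().
 */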
2532 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2533 {
2534         struct net *net = sock_net(in_skb->sk);
2535         struct rtmsg *rtm;
2536         struct nlattr *tb[RTA_MAX+1];
2537         struct rtable *rt = NULL;
2538         struct flowi4 fl4;
2539         __be32 dst = 0;
2540         __be32 src = 0;
2541         u32 iif;
2542         int err;
2543         int mark;
2544         struct sk_buff *skb;
2545         u32 table_id = RT_TABLE_MAIN;
2546
2547         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2548         if (err < 0)
2549                 goto errout;
2550
2551         rtm = nlmsg_data(nlh);
2552
2553         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2554         if (!skb) {
2555                 err = -ENOBUFS;
2556                 goto errout;
2557         }
2558
2559         /* Reserve room for dummy headers; this skb can pass
2560            through a good chunk of the routing engine.
2561          */
2562         skb_reset_mac_header(skb);
2563         skb_reset_network_header(skb);
2564
2565         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2566         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2567         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2568
2569         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2570         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2571         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2572         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2573
2574         memset(&fl4, 0, sizeof(fl4));
2575         fl4.daddr = dst;
2576         fl4.saddr = src;
2577         fl4.flowi4_tos = rtm->rtm_tos;
2578         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2579         fl4.flowi4_mark = mark;
2580
2581         if (iif) {
2582                 struct net_device *dev;
2583
2584                 dev = __dev_get_by_index(net, iif);
2585                 if (!dev) {
2586                         err = -ENODEV;
2587                         goto errout_free;
2588                 }
2589
2590                 skb->protocol   = htons(ETH_P_IP);
2591                 skb->dev        = dev;
2592                 skb->mark       = mark;
2593                 local_bh_disable();
2594                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2595                 local_bh_enable();
2596
2597                 rt = skb_rtable(skb);
2598                 if (err == 0 && rt->dst.error)
2599                         err = -rt->dst.error;
2600         } else {
2601                 rt = ip_route_output_key(net, &fl4);
2602
2603                 err = 0;
2604                 if (IS_ERR(rt))
2605                         err = PTR_ERR(rt);
2606         }
2607
2608         if (err)
2609                 goto errout_free;
2610
2611         skb_dst_set(skb, &rt->dst);
2612         if (rtm->rtm_flags & RTM_F_NOTIFY)
2613                 rt->rt_flags |= RTCF_NOTIFY;
2614
2615         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2616                 table_id = rt->rt_table_id;
2617
2618         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2619                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2620                            RTM_NEWROUTE, 0, 0);
2621         if (err < 0)
2622                 goto errout_free;
2623
2624         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2625 errout:
2626         return err;
2627
2628 errout_free:
2629         kfree_skb(skb);
2630         goto errout;
2631 }
2632
2633 void ip_rt_multicast_event(struct in_device *in_dev)
2634 {
2635         rt_cache_flush(dev_net(in_dev->dev));
2636 }
2637
2638 #ifdef CONFIG_SYSCTL
2639 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2640 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2641 static int ip_rt_gc_elasticity __read_mostly    = 8;
2642
2643 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2644                                         void __user *buffer,
2645                                         size_t *lenp, loff_t *ppos)
2646 {
2647         struct net *net = (struct net *)__ctl->extra1;
2648
2649         if (write) {
2650                 rt_cache_flush(net);
2651                 fnhe_genid_bump(net);
2652                 return 0;
2653         }
2654
2655         return -EINVAL;
2656 }
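/* The flush file is write-only (mode 0200 in the table below); writing any
 * value bumps the generation counters, invalidating every cached dst and
 * fib_nh exception. From userspace (illustrative):
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */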
2657
2658 static struct ctl_table ipv4_route_table[] = {
2659         {
2660                 .procname       = "gc_thresh",
2661                 .data           = &ipv4_dst_ops.gc_thresh,
2662                 .maxlen         = sizeof(int),
2663                 .mode           = 0644,
2664                 .proc_handler   = proc_dointvec,
2665         },
2666         {
2667                 .procname       = "max_size",
2668                 .data           = &ip_rt_max_size,
2669                 .maxlen         = sizeof(int),
2670                 .mode           = 0644,
2671                 .proc_handler   = proc_dointvec,
2672         },
2673         {
2674                 /*  Deprecated. Use gc_min_interval_ms */
2675
2676                 .procname       = "gc_min_interval",
2677                 .data           = &ip_rt_gc_min_interval,
2678                 .maxlen         = sizeof(int),
2679                 .mode           = 0644,
2680                 .proc_handler   = proc_dointvec_jiffies,
2681         },
2682         {
2683                 .procname       = "gc_min_interval_ms",
2684                 .data           = &ip_rt_gc_min_interval,
2685                 .maxlen         = sizeof(int),
2686                 .mode           = 0644,
2687                 .proc_handler   = proc_dointvec_ms_jiffies,
2688         },
2689         {
2690                 .procname       = "gc_timeout",
2691                 .data           = &ip_rt_gc_timeout,
2692                 .maxlen         = sizeof(int),
2693                 .mode           = 0644,
2694                 .proc_handler   = proc_dointvec_jiffies,
2695         },
2696         {
2697                 .procname       = "gc_interval",
2698                 .data           = &ip_rt_gc_interval,
2699                 .maxlen         = sizeof(int),
2700                 .mode           = 0644,
2701                 .proc_handler   = proc_dointvec_jiffies,
2702         },
2703         {
2704                 .procname       = "redirect_load",
2705                 .data           = &ip_rt_redirect_load,
2706                 .maxlen         = sizeof(int),
2707                 .mode           = 0644,
2708                 .proc_handler   = proc_dointvec,
2709         },
2710         {
2711                 .procname       = "redirect_number",
2712                 .data           = &ip_rt_redirect_number,
2713                 .maxlen         = sizeof(int),
2714                 .mode           = 0644,
2715                 .proc_handler   = proc_dointvec,
2716         },
2717         {
2718                 .procname       = "redirect_silence",
2719                 .data           = &ip_rt_redirect_silence,
2720                 .maxlen         = sizeof(int),
2721                 .mode           = 0644,
2722                 .proc_handler   = proc_dointvec,
2723         },
2724         {
2725                 .procname       = "error_cost",
2726                 .data           = &ip_rt_error_cost,
2727                 .maxlen         = sizeof(int),
2728                 .mode           = 0644,
2729                 .proc_handler   = proc_dointvec,
2730         },
2731         {
2732                 .procname       = "error_burst",
2733                 .data           = &ip_rt_error_burst,
2734                 .maxlen         = sizeof(int),
2735                 .mode           = 0644,
2736                 .proc_handler   = proc_dointvec,
2737         },
2738         {
2739                 .procname       = "gc_elasticity",
2740                 .data           = &ip_rt_gc_elasticity,
2741                 .maxlen         = sizeof(int),
2742                 .mode           = 0644,
2743                 .proc_handler   = proc_dointvec,
2744         },
2745         {
2746                 .procname       = "mtu_expires",
2747                 .data           = &ip_rt_mtu_expires,
2748                 .maxlen         = sizeof(int),
2749                 .mode           = 0644,
2750                 .proc_handler   = proc_dointvec_jiffies,
2751         },
2752         {
2753                 .procname       = "min_pmtu",
2754                 .data           = &ip_rt_min_pmtu,
2755                 .maxlen         = sizeof(int),
2756                 .mode           = 0644,
2757                 .proc_handler   = proc_dointvec,
2758         },
2759         {
2760                 .procname       = "min_adv_mss",
2761                 .data           = &ip_rt_min_advmss,
2762                 .maxlen         = sizeof(int),
2763                 .mode           = 0644,
2764                 .proc_handler   = proc_dointvec,
2765         },
2766         { }
2767 };
2768
2769 static struct ctl_table ipv4_route_flush_table[] = {
2770         {
2771                 .procname       = "flush",
2772                 .maxlen         = sizeof(int),
2773                 .mode           = 0200,
2774                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2775         },
2776         { },
2777 };
2778
2779 static __net_init int sysctl_route_net_init(struct net *net)
2780 {
2781         struct ctl_table *tbl;
2782
2783         tbl = ipv4_route_flush_table;
2784         if (!net_eq(net, &init_net)) {
2785                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2786                 if (!tbl)
2787                         goto err_dup;
2788
2789                 /* Don't export sysctls to unprivileged users */
2790                 if (net->user_ns != &init_user_ns)
2791                         tbl[0].procname = NULL;
2792         }
2793         tbl[0].extra1 = net;
2794
2795         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2796         if (!net->ipv4.route_hdr)
2797                 goto err_reg;
2798         return 0;
2799
2800 err_reg:
2801         if (tbl != ipv4_route_flush_table)
2802                 kfree(tbl);
2803 err_dup:
2804         return -ENOMEM;
2805 }
2806
2807 static __net_exit void sysctl_route_net_exit(struct net *net)
2808 {
2809         struct ctl_table *tbl;
2810
2811         tbl = net->ipv4.route_hdr->ctl_table_arg;
2812         unregister_net_sysctl_table(net->ipv4.route_hdr);
2813         BUG_ON(tbl == ipv4_route_flush_table);
2814         kfree(tbl);
2815 }
2816
2817 static __net_initdata struct pernet_operations sysctl_route_ops = {
2818         .init = sysctl_route_net_init,
2819         .exit = sysctl_route_net_exit,
2820 };
2821 #endif
2822
2823 static __net_init int rt_genid_init(struct net *net)
2824 {
2825         atomic_set(&net->ipv4.rt_genid, 0);
2826         atomic_set(&net->fnhe_genid, 0);
2827         get_random_bytes(&net->ipv4.dev_addr_genid,
2828                          sizeof(net->ipv4.dev_addr_genid));
2829         return 0;
2830 }
2831
2832 static __net_initdata struct pernet_operations rt_genid_ops = {
2833         .init = rt_genid_init,
2834 };
2835
2836 static int __net_init ipv4_inetpeer_init(struct net *net)
2837 {
2838         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2839
2840         if (!bp)
2841                 return -ENOMEM;
2842         inet_peer_base_init(bp);
2843         net->ipv4.peers = bp;
2844         return 0;
2845 }
2846
2847 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2848 {
2849         struct inet_peer_base *bp = net->ipv4.peers;
2850
2851         net->ipv4.peers = NULL;
2852         inetpeer_invalidate_tree(bp);
2853         kfree(bp);
2854 }
2855
2856 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2857         .init   =       ipv4_inetpeer_init,
2858         .exit   =       ipv4_inetpeer_exit,
2859 };
2860
2861 #ifdef CONFIG_IP_ROUTE_CLASSID
2862 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2863 #endif /* CONFIG_IP_ROUTE_CLASSID */
2864
2865 int __init ip_rt_init(void)
2866 {
2867         int rc = 0;
2868         int cpu;
2869
2870         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2871         if (!ip_idents)
2872                 panic("IP: failed to allocate ip_idents\n");
2873
2874         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2875
2876         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2877         if (!ip_tstamps)
2878                 panic("IP: failed to allocate ip_tstamps\n");
2879
2880         for_each_possible_cpu(cpu) {
2881                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2882
2883                 INIT_LIST_HEAD(&ul->head);
2884                 spin_lock_init(&ul->lock);
2885         }
2886 #ifdef CONFIG_IP_ROUTE_CLASSID
2887         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2888         if (!ip_rt_acct)
2889                 panic("IP: failed to allocate ip_rt_acct\n");
2890 #endif
2891
2892         ipv4_dst_ops.kmem_cachep =
2893                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2894                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2895
2896         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2897
2898         if (dst_entries_init(&ipv4_dst_ops) < 0)
2899                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2900
2901         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2902                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2903
2904         ipv4_dst_ops.gc_thresh = ~0;
2905         ip_rt_max_size = INT_MAX;
2906
2907         devinet_init();
2908         ip_fib_init();
2909
2910         if (ip_rt_proc_init())
2911                 pr_err("Unable to create route proc files\n");
2912 #ifdef CONFIG_XFRM
2913         xfrm_init();
2914         xfrm4_init();
2915 #endif
2916         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2917
2918 #ifdef CONFIG_SYSCTL
2919         register_pernet_subsys(&sysctl_route_ops);
2920 #endif
2921         register_pernet_subsys(&rt_genid_ops);
2922         register_pernet_subsys(&ipv4_inetpeer_ops);
2923         return rc;
2924 }
2925
2926 #ifdef CONFIG_SYSCTL
2927 /*
2928  * We really need to sanitize the damn ipv4 init order, then all
2929  * this nonsense will go away.
2930  */
2931 void __init ip_static_sysctl_init(void)
2932 {
2933         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2934 }
2935 #endif