net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
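
/* A note on the defaults above: ip_rt_min_pmtu is 512 + 20 + 20, i.e. 512
 * bytes of payload plus a 20-byte IP header and a 20-byte TCP header, so a
 * learned path MTU is never clamped below 552 bytes, and each learned value
 * is kept for ip_rt_mtu_expires (ten minutes) before being retried.
 */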

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
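
/* The table is indexed with the four TOS bits shifted down by one, as
 * rt_tos2priority() in include/net/route.h does:
 * ip_tos2prio[IPTOS_TOS(tos) >> 1].  Since ECN_OR_COST(X) expands to
 * TC_PRIO_##X, the odd (ECN/cost) entries deliberately map to the same
 * band as their even neighbours.
 */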

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
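
/* The effect of the perturbation above: if this bucket was last used
 * (now - old) jiffies ago, up to that many extra identifiers are skipped
 * at random before the block of 'segs' IDs is handed out, so an observer
 * sampling the ID counter cannot reliably count packets sent in between.
 */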

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}
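
/* Note that a supplied socket takes precedence over the packet header:
 * its bound device, mark, RT_CONN_FLAGS() tos and protocol replace the
 * values derived from the skb, and hdrincl raw sockets are keyed as
 * IPPROTO_RAW.
 */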

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
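
/* Exceptions are hashed per nexthop (fib_nh) into FNHE_HASH_SIZE buckets
 * keyed only by destination address; fnhe_hashrnd is drawn once at first
 * use so remote hosts cannot predict bucket placement.
 */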

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}
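
/* Both users of update_or_create_fnhe() live in this file:
 * __ip_do_redirect() records a new gateway (gw set, pmtu 0) and
 * __ip_rt_update_pmtu() records a learned path MTU with an expiry;
 * a single exception entry can end up carrying both pieces of state.
 */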

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}
/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
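
/* Concretely, with the defaults declared at the top of this file: redirect
 * number k is resent only once jiffies passes rate_last plus
 * (ip_rt_redirect_load << k), so on a HZ=1000 system the gaps grow as
 * 40 ms, 80 ms, 160 ms, ...; after ip_rt_redirect_number (9) ignored
 * redirects we go quiet until ip_rt_redirect_silence ((HZ / 50) << 10,
 * roughly 20 seconds) passes without redirected traffic.
 */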

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the time of the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
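
/* The inet_peer bucket above is a classic token bucket: rate_tokens gains
 * one token per jiffy elapsed since rate_last, capped at ip_rt_error_burst
 * (5 * HZ), and each ICMP error sent costs ip_rt_error_cost (HZ) tokens,
 * which allows a burst of about five messages per source and a sustained
 * rate of roughly one per second.
 */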

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}
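
/* MTU resolution order, as implemented above: a still-valid learned path
 * MTU wins; otherwise the RTAX_MTU metric, if set; otherwise the device
 * MTU, clamped to 576 when the metric is locked on a route via a gateway,
 * and always bounded by IP_MAX_MTU.
 */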

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
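
/* rt_cache_route() is lock-free: cmpxchg() installs rt either in the
 * nexthop's input slot or in this CPU's output slot. If another CPU won
 * the race, the old pointer is left alone and we return false, in which
 * case rt_set_nexthop() below falls back to DST_NOCACHE and the uncached
 * list.
 */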

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
        lwtunnel_state_put(rt->rt_lwtstate);
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (nh->nh_lwtstate) {
                        lwtunnel_state_get(nh->nh_lwtstate);
                        rt->rt_lwtstate = nh->nh_lwtstate;
                } else {
                        rt->rt_lwtstate = NULL;
                }
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}
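
/* Every rtable starts life with ->obsolete == DST_OBSOLETE_FORCE_CHK so
 * that ipv4_dst_check() is consulted on each use (see the comment there);
 * routes we do not intend to cache additionally carry DST_HOST and
 * DST_NOCACHE, which keeps them out of the nexthop cache slots above.
 */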

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (!in_dev)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        rth->rt_lwtstate = NULL;
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation: if the source is martian,
                 *      the only hint is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}
1549
1550 /* called in rcu_read_lock() section */
1551 static int __mkroute_input(struct sk_buff *skb,
1552                            const struct fib_result *res,
1553                            struct in_device *in_dev,
1554                            __be32 daddr, __be32 saddr, u32 tos)
1555 {
1556         struct fib_nh_exception *fnhe;
1557         struct rtable *rth;
1558         int err;
1559         struct in_device *out_dev;
1560         bool do_cache;
1561         u32 itag = 0;
1562
1563         /* get a working reference to the output device */
1564         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1565         if (!out_dev) {
1566                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1567                 return -EINVAL;
1568         }
1569
1570         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1571                                   in_dev->dev, in_dev, &itag);
1572         if (err < 0) {
1573                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1574                                          saddr);
1575
1576                 goto cleanup;
1577         }
1578
1579         do_cache = res->fi && !itag;
1580         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1581             skb->protocol == htons(ETH_P_IP) &&
1582             (IN_DEV_SHARED_MEDIA(out_dev) ||
1583              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1584                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1585
1586         if (skb->protocol != htons(ETH_P_IP)) {
1587                 /* Not IP (i.e. ARP). Do not create a route if it is
1588                  * invalid for proxy ARP. DNAT routes are always valid.
1589                  *
1590                  * The proxy ARP feature has been extended to allow ARP
1591                  * replies back on the same interface, to support
1592                  * Private VLAN switch technologies. See arp.c.
1593                  */
1594                 if (out_dev == in_dev &&
1595                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1596                         err = -EINVAL;
1597                         goto cleanup;
1598                 }
1599         }
1600
1601         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1602         if (do_cache) {
1603                 if (fnhe)
1604                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1605                 else
1606                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1607
1608                 if (rt_cache_valid(rth)) {
1609                         skb_dst_set_noref(skb, &rth->dst);
1610                         goto out;
1611                 }
1612         }
1613
1614         rth = rt_dst_alloc(out_dev->dev,
1615                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1616                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1617         if (!rth) {
1618                 err = -ENOBUFS;
1619                 goto cleanup;
1620         }
1621
1622         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1623         rth->rt_flags = 0;
1624         rth->rt_type = res->type;
1625         rth->rt_is_input = 1;
1626         rth->rt_iif     = 0;
1627         rth->rt_pmtu    = 0;
1628         rth->rt_gateway = 0;
1629         rth->rt_uses_gateway = 0;
1630         INIT_LIST_HEAD(&rth->rt_uncached);
1631         rth->rt_lwtstate = NULL;
1632         RT_CACHE_STAT_INC(in_slow_tot);
1633
1634         rth->dst.input = ip_forward;
1635         rth->dst.output = ip_output;
1636
1637         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1638         if (lwtunnel_output_redirect(rth->rt_lwtstate))
1639                 rth->dst.output = lwtunnel_output;
1640         skb_dst_set(skb, &rth->dst);
1641 out:
1642         err = 0;
1643  cleanup:
1644         return err;
1645 }
1646
1647 static int ip_mkroute_input(struct sk_buff *skb,
1648                             struct fib_result *res,
1649                             const struct flowi4 *fl4,
1650                             struct in_device *in_dev,
1651                             __be32 daddr, __be32 saddr, u32 tos)
1652 {
1653 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1654         if (res->fi && res->fi->fib_nhs > 1)
1655                 fib_select_multipath(res);
1656 #endif
1657
1658         /* create a routing cache entry */
1659         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1660 }
1661
1662 /*
1663  *      NOTE. We drop all packets that have a local source
1664  *      address, because every properly looped-back packet
1665  *      must already have the correct destination attached by the output routine.
1666  *
1667  *      This approach solves two big problems:
1668  *      1. Non-simplex devices are handled properly.
1669  *      2. IP spoofing attempts are filtered out with a 100% guarantee.
1670  *      Called with rcu_read_lock().
1671  */
1672
1673 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1674                                u8 tos, struct net_device *dev)
1675 {
1676         struct fib_result res;
1677         struct in_device *in_dev = __in_dev_get_rcu(dev);
1678         struct ip_tunnel_info *tun_info;
1679         struct flowi4   fl4;
1680         unsigned int    flags = 0;
1681         u32             itag = 0;
1682         struct rtable   *rth;
1683         int             err = -EINVAL;
1684         struct net    *net = dev_net(dev);
1685         bool do_cache;
1686
1687         /* IP on this device is disabled. */
1688
1689         if (!in_dev)
1690                 goto out;
1691
1692         /* Check for the weirdest martians, which cannot be detected
1693            by fib_lookup.
1694          */
1695
1696         tun_info = skb_tunnel_info(skb, AF_INET);
1697         if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
1698                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1699         else
1700                 fl4.flowi4_tun_key.tun_id = 0;
1701         skb_dst_drop(skb);
1702
1703         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1704                 goto martian_source;
1705
1706         res.fi = NULL;
1707         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1708                 goto brd_input;
1709
1710         /* Accept zero addresses only for limited broadcast;
1711          * I do not even know whether to fix this or not. Waiting for complaints :-)
1712          */
1713         if (ipv4_is_zeronet(saddr))
1714                 goto martian_source;
1715
1716         if (ipv4_is_zeronet(daddr))
1717                 goto martian_destination;
1718
1719         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1720          * and calls it at most once, when daddr and/or saddr is a loopback address
1721          */
1722         if (ipv4_is_loopback(daddr)) {
1723                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1724                         goto martian_destination;
1725         } else if (ipv4_is_loopback(saddr)) {
1726                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1727                         goto martian_source;
1728         }
1729
1730         /*
1731          *      Now we are ready to route the packet.
1732          */
1733         fl4.flowi4_oif = 0;
1734         fl4.flowi4_iif = dev->ifindex;
1735         fl4.flowi4_mark = skb->mark;
1736         fl4.flowi4_tos = tos;
1737         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1738         fl4.daddr = daddr;
1739         fl4.saddr = saddr;
1740         err = fib_lookup(net, &fl4, &res, 0);
1741         if (err != 0) {
1742                 if (!IN_DEV_FORWARD(in_dev))
1743                         err = -EHOSTUNREACH;
1744                 goto no_route;
1745         }
1746
1747         if (res.type == RTN_BROADCAST)
1748                 goto brd_input;
1749
1750         if (res.type == RTN_LOCAL) {
1751                 err = fib_validate_source(skb, saddr, daddr, tos,
1752                                           0, dev, in_dev, &itag);
1753                 if (err < 0)
1754                         goto martian_source_keep_err;
1755                 goto local_input;
1756         }
1757
1758         if (!IN_DEV_FORWARD(in_dev)) {
1759                 err = -EHOSTUNREACH;
1760                 goto no_route;
1761         }
1762         if (res.type != RTN_UNICAST)
1763                 goto martian_destination;
1764
1765         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1766 out:    return err;
1767
1768 brd_input:
1769         if (skb->protocol != htons(ETH_P_IP))
1770                 goto e_inval;
1771
1772         if (!ipv4_is_zeronet(saddr)) {
1773                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1774                                           in_dev, &itag);
1775                 if (err < 0)
1776                         goto martian_source_keep_err;
1777         }
1778         flags |= RTCF_BROADCAST;
1779         res.type = RTN_BROADCAST;
1780         RT_CACHE_STAT_INC(in_brd);
1781
1782 local_input:
1783         do_cache = false;
1784         if (res.fi) {
1785                 if (!itag) {
1786                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1787                         if (rt_cache_valid(rth)) {
1788                                 skb_dst_set_noref(skb, &rth->dst);
1789                                 err = 0;
1790                                 goto out;
1791                         }
1792                         do_cache = true;
1793                 }
1794         }
1795
1796         rth = rt_dst_alloc(net->loopback_dev,
1797                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1798         if (!rth)
1799                 goto e_nobufs;
1800
1801         rth->dst.input = ip_local_deliver;
1802         rth->dst.output = ip_rt_bug;
1803 #ifdef CONFIG_IP_ROUTE_CLASSID
1804         rth->dst.tclassid = itag;
1805 #endif
1806
1807         rth->rt_genid = rt_genid_ipv4(net);
1808         rth->rt_flags   = flags|RTCF_LOCAL;
1809         rth->rt_type    = res.type;
1810         rth->rt_is_input = 1;
1811         rth->rt_iif     = 0;
1812         rth->rt_pmtu    = 0;
1813         rth->rt_gateway = 0;
1814         rth->rt_uses_gateway = 0;
1815         INIT_LIST_HEAD(&rth->rt_uncached);
1816         rth->rt_lwtstate = NULL;
1817
1818         RT_CACHE_STAT_INC(in_slow_tot);
1819         if (res.type == RTN_UNREACHABLE) {
1820                 rth->dst.input = ip_error;
1821                 rth->dst.error = -err;
1822                 rth->rt_flags   &= ~RTCF_LOCAL;
1823         }
1824         if (do_cache) {
1825                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1826                         rth->dst.flags |= DST_NOCACHE;
1827                         rt_add_uncached_list(rth);
1828                 }
1829         }
1830         skb_dst_set(skb, &rth->dst);
1831         err = 0;
1832         goto out;
1833
1834 no_route:
1835         RT_CACHE_STAT_INC(in_no_route);
1836         res.type = RTN_UNREACHABLE;
1837         res.fi = NULL;
1838         goto local_input;
1839
1840         /*
1841          *      Do not cache martian addresses: they should be logged (RFC1812)
1842          */
1843 martian_destination:
1844         RT_CACHE_STAT_INC(in_martian_dst);
1845 #ifdef CONFIG_IP_ROUTE_VERBOSE
1846         if (IN_DEV_LOG_MARTIANS(in_dev))
1847                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1848                                      &daddr, &saddr, dev->name);
1849 #endif
1850
1851 e_inval:
1852         err = -EINVAL;
1853         goto out;
1854
1855 e_nobufs:
1856         err = -ENOBUFS;
1857         goto out;
1858
1859 martian_source:
1860         err = -EINVAL;
1861 martian_source_keep_err:
1862         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1863         goto out;
1864 }
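/* Summary of the slow input path above: drop martian sources, short-circuit
 * limited broadcast, look the flow up with fib_lookup(), then dispatch on
 * res.type: RTN_BROADCAST -> brd_input, RTN_LOCAL -> local_input (deliver
 * via the loopback device), RTN_UNICAST -> ip_mkroute_input() to set up
 * forwarding; anything else is treated as a martian destination.
 */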
1865
1866 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1867                          u8 tos, struct net_device *dev)
1868 {
1869         int res;
1870
1871         rcu_read_lock();
1872
1873         /* Multicast recognition logic was moved from the route cache to here.
1874            The problem was that too many Ethernet cards have broken/missing
1875            hardware multicast filters :-( As a result, a host on a multicast
1876            network acquired a lot of useless route cache entries, e.g. for
1877            SDR messages from all over the world. Now we try to get rid of them.
1878            Really, provided the software IP multicast filter is organized
1879            reasonably (at least, hashed), this does not result in a slowdown
1880            compared with route cache reject entries.
1881            Note that multicast routers are not affected, because a
1882            route cache entry is created eventually.
1883          */
1884         if (ipv4_is_multicast(daddr)) {
1885                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1886
1887                 if (in_dev) {
1888                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1889                                                   ip_hdr(skb)->protocol);
1890                         if (our
1891 #ifdef CONFIG_IP_MROUTE
1892                                 ||
1893                             (!ipv4_is_local_multicast(daddr) &&
1894                              IN_DEV_MFORWARD(in_dev))
1895 #endif
1896                            ) {
1897                                 int res = ip_route_input_mc(skb, daddr, saddr,
1898                                                             tos, dev, our);
1899                                 rcu_read_unlock();
1900                                 return res;
1901                         }
1902                 }
1903                 rcu_read_unlock();
1904                 return -EINVAL;
1905         }
1906         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1907         rcu_read_unlock();
1908         return res;
1909 }
1910 EXPORT_SYMBOL(ip_route_input_noref);
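/* A minimal usage sketch (illustrative only, not part of this file): a
 * caller on the receive path, such as ip_rcv_finish(), resolves the dst of
 * a freshly received skb roughly like this.  The function name below is a
 * placeholder; only ip_route_input_noref(), ip_hdr(), kfree_skb(),
 * dst_input() and the iphdr fields are real kernel API.
 */
#if 0
static int example_route_incoming_skb(struct sk_buff *skb,
				      struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	/* Sets skb_dst(skb) without taking an extra dst reference. */
	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				   iph->tos, dev);
	if (err) {
		kfree_skb(skb);		/* martian or no route: drop */
		return err;
	}
	return dst_input(skb);	/* invoke dst->input(), e.g. ip_local_deliver */
}
#endif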
1911
1912 /* called with rcu_read_lock() */
1913 static struct rtable *__mkroute_output(const struct fib_result *res,
1914                                        const struct flowi4 *fl4, int orig_oif,
1915                                        struct net_device *dev_out,
1916                                        unsigned int flags)
1917 {
1918         struct fib_info *fi = res->fi;
1919         struct fib_nh_exception *fnhe;
1920         struct in_device *in_dev;
1921         u16 type = res->type;
1922         struct rtable *rth;
1923         bool do_cache;
1924
1925         in_dev = __in_dev_get_rcu(dev_out);
1926         if (!in_dev)
1927                 return ERR_PTR(-EINVAL);
1928
1929         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1930                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1931                         return ERR_PTR(-EINVAL);
1932
1933         if (ipv4_is_lbcast(fl4->daddr))
1934                 type = RTN_BROADCAST;
1935         else if (ipv4_is_multicast(fl4->daddr))
1936                 type = RTN_MULTICAST;
1937         else if (ipv4_is_zeronet(fl4->daddr))
1938                 return ERR_PTR(-EINVAL);
1939
1940         if (dev_out->flags & IFF_LOOPBACK)
1941                 flags |= RTCF_LOCAL;
1942
1943         do_cache = true;
1944         if (type == RTN_BROADCAST) {
1945                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1946                 fi = NULL;
1947         } else if (type == RTN_MULTICAST) {
1948                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1949                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1950                                      fl4->flowi4_proto))
1951                         flags &= ~RTCF_LOCAL;
1952                 else
1953                         do_cache = false;
1954                 /* If a multicast route does not exist, use
1955                  * the default one, but do not use a gateway in this case.
1956                  * Yes, it is a hack.
1957                  */
1958                 if (fi && res->prefixlen < 4)
1959                         fi = NULL;
1960         }
1961
1962         fnhe = NULL;
1963         do_cache &= fi != NULL;
1964         if (do_cache) {
1965                 struct rtable __rcu **prth;
1966                 struct fib_nh *nh = &FIB_RES_NH(*res);
1967
1968                 fnhe = find_exception(nh, fl4->daddr);
1969                 if (fnhe)
1970                         prth = &fnhe->fnhe_rth_output;
1971                 else {
1972                         if (unlikely(fl4->flowi4_flags &
1973                                      FLOWI_FLAG_KNOWN_NH &&
1974                                      !(nh->nh_gw &&
1975                                        nh->nh_scope == RT_SCOPE_LINK))) {
1976                                 do_cache = false;
1977                                 goto add;
1978                         }
1979                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
1980                 }
1981                 rth = rcu_dereference(*prth);
1982                 if (rt_cache_valid(rth)) {
1983                         dst_hold(&rth->dst);
1984                         return rth;
1985                 }
1986         }
1987
1988 add:
1989         rth = rt_dst_alloc(dev_out,
1990                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1991                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1992                            do_cache);
1993         if (!rth)
1994                 return ERR_PTR(-ENOBUFS);
1995
1996         rth->dst.output = ip_output;
1997
1998         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1999         rth->rt_flags   = flags;
2000         rth->rt_type    = type;
2001         rth->rt_is_input = 0;
2002         rth->rt_iif     = orig_oif ? : 0;
2003         rth->rt_pmtu    = 0;
2004         rth->rt_gateway = 0;
2005         rth->rt_uses_gateway = 0;
2006         INIT_LIST_HEAD(&rth->rt_uncached);
2007         rth->rt_lwtstate = NULL;
2008         RT_CACHE_STAT_INC(out_slow_tot);
2009
2010         if (flags & RTCF_LOCAL)
2011                 rth->dst.input = ip_local_deliver;
2012         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2013                 if (flags & RTCF_LOCAL &&
2014                     !(dev_out->flags & IFF_LOOPBACK)) {
2015                         rth->dst.output = ip_mc_output;
2016                         RT_CACHE_STAT_INC(out_slow_mc);
2017                 }
2018 #ifdef CONFIG_IP_MROUTE
2019                 if (type == RTN_MULTICAST) {
2020                         if (IN_DEV_MFORWARD(in_dev) &&
2021                             !ipv4_is_local_multicast(fl4->daddr)) {
2022                                 rth->dst.input = ip_mr_input;
2023                                 rth->dst.output = ip_mc_output;
2024                         }
2025                 }
2026 #endif
2027         }
2028
2029         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2030
2031         return rth;
2032 }
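/* Note on the caching logic above: a cached rtable is first looked up in
 * the nexthop exception (fnhe_rth_output) when a PMTU/redirect exception
 * exists for this destination, otherwise in the per-cpu nh_pcpu_rth_output
 * slot; only if neither holds a valid entry is a fresh rtable allocated
 * and, when do_cache is set, cached via rt_set_nexthop().
 */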
2033
2034 /*
2035  * Major route resolver routine.
2036  */
2037
2038 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2039 {
2040         struct net_device *dev_out = NULL;
2041         __u8 tos = RT_FL_TOS(fl4);
2042         unsigned int flags = 0;
2043         struct fib_result res;
2044         struct rtable *rth;
2045         int orig_oif;
2046
2047         res.tclassid    = 0;
2048         res.fi          = NULL;
2049         res.table       = NULL;
2050
2051         orig_oif = fl4->flowi4_oif;
2052
2053         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2054         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2055         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2056                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2057
2058         rcu_read_lock();
2059         if (fl4->saddr) {
2060                 rth = ERR_PTR(-EINVAL);
2061                 if (ipv4_is_multicast(fl4->saddr) ||
2062                     ipv4_is_lbcast(fl4->saddr) ||
2063                     ipv4_is_zeronet(fl4->saddr))
2064                         goto out;
2065
2066                 /* I removed the check for oif == dev_out->oif here.
2067                    It was wrong for two reasons:
2068                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2069                       is assigned to multiple interfaces.
2070                    2. Moreover, we are allowed to send packets with the saddr
2071                       of another iface. --ANK
2072                  */
2073
2074                 if (fl4->flowi4_oif == 0 &&
2075                     (ipv4_is_multicast(fl4->daddr) ||
2076                      ipv4_is_lbcast(fl4->daddr))) {
2077                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2078                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2079                         if (!dev_out)
2080                                 goto out;
2081
2082                         /* Special hack: the user can direct multicasts
2083                            and limited broadcast via the necessary interface
2084                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2085                            This hack is not just for fun; it allows
2086                            vic, vat and friends to work.
2087                            They bind a socket to loopback, set ttl to zero
2088                            and expect that it will work.
2089                            From the viewpoint of the routing cache they are broken,
2090                            because we are not allowed to build a multicast path
2091                            with a loopback source address (the routing cache
2092                            cannot know that ttl is zero, so the packet
2093                            will not leave this host and the route looks valid).
2094                            Luckily, this hack is a good workaround.
2095                          */
2096
2097                         fl4->flowi4_oif = dev_out->ifindex;
2098                         goto make_route;
2099                 }
2100
2101                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2102                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2103                         if (!__ip_dev_find(net, fl4->saddr, false))
2104                                 goto out;
2105                 }
2106         }
2107
2108
2109         if (fl4->flowi4_oif) {
2110                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2111                 rth = ERR_PTR(-ENODEV);
2112                 if (!dev_out)
2113                         goto out;
2114
2115                 /* RACE: Check return value of inet_select_addr instead. */
2116                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2117                         rth = ERR_PTR(-ENETUNREACH);
2118                         goto out;
2119                 }
2120                 if (ipv4_is_local_multicast(fl4->daddr) ||
2121                     ipv4_is_lbcast(fl4->daddr) ||
2122                     fl4->flowi4_proto == IPPROTO_IGMP) {
2123                         if (!fl4->saddr)
2124                                 fl4->saddr = inet_select_addr(dev_out, 0,
2125                                                               RT_SCOPE_LINK);
2126                         goto make_route;
2127                 }
2128                 if (!fl4->saddr) {
2129                         if (ipv4_is_multicast(fl4->daddr))
2130                                 fl4->saddr = inet_select_addr(dev_out, 0,
2131                                                               fl4->flowi4_scope);
2132                         else if (!fl4->daddr)
2133                                 fl4->saddr = inet_select_addr(dev_out, 0,
2134                                                               RT_SCOPE_HOST);
2135                 }
2136         }
2137
2138         if (!fl4->daddr) {
2139                 fl4->daddr = fl4->saddr;
2140                 if (!fl4->daddr)
2141                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2142                 dev_out = net->loopback_dev;
2143                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2144                 res.type = RTN_LOCAL;
2145                 flags |= RTCF_LOCAL;
2146                 goto make_route;
2147         }
2148
2149         if (fib_lookup(net, fl4, &res, 0)) {
2150                 res.fi = NULL;
2151                 res.table = NULL;
2152                 if (fl4->flowi4_oif) {
2153                         /* Apparently, the routing tables are wrong. Assume
2154                            that the destination is on-link.
2155
2156                            WHY? DW.
2157                            Because we are allowed to send to an iface
2158                            even if it has NO routes and NO assigned
2159                            addresses. When oif is specified, the routing
2160                            tables are looked up with only one purpose:
2161                            to catch whether the destination is gatewayed rather
2162                            than direct. Moreover, if MSG_DONTROUTE is set,
2163                            we send the packet, ignoring both the routing tables
2164                            and the ifaddr state. --ANK
2165
2166
2167                            We could do this even when oif is unknown,
2168                            as IPv6 likely does, but we do not.
2169                          */
2170
2171                         if (fl4->saddr == 0)
2172                                 fl4->saddr = inet_select_addr(dev_out, 0,
2173                                                               RT_SCOPE_LINK);
2174                         res.type = RTN_UNICAST;
2175                         goto make_route;
2176                 }
2177                 rth = ERR_PTR(-ENETUNREACH);
2178                 goto out;
2179         }
2180
2181         if (res.type == RTN_LOCAL) {
2182                 if (!fl4->saddr) {
2183                         if (res.fi->fib_prefsrc)
2184                                 fl4->saddr = res.fi->fib_prefsrc;
2185                         else
2186                                 fl4->saddr = fl4->daddr;
2187                 }
2188                 dev_out = net->loopback_dev;
2189                 fl4->flowi4_oif = dev_out->ifindex;
2190                 flags |= RTCF_LOCAL;
2191                 goto make_route;
2192         }
2193
2194 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2195         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2196                 fib_select_multipath(&res);
2197         else
2198 #endif
2199         if (!res.prefixlen &&
2200             res.table->tb_num_default > 1 &&
2201             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2202                 fib_select_default(&res);
2203
2204         if (!fl4->saddr)
2205                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2206
2207         dev_out = FIB_RES_DEV(res);
2208         fl4->flowi4_oif = dev_out->ifindex;
2209
2210
2211 make_route:
2212         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2213
2214 out:
2215         rcu_read_unlock();
2216         return rth;
2217 }
2218 EXPORT_SYMBOL_GPL(__ip_route_output_key);
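/* A minimal usage sketch (illustrative only, not part of this file): an
 * output lookup keyed only by destination.  The function name and the UDP
 * protocol choice are assumptions made for the example.
 */
#if 0
static struct rtable *example_output_lookup(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.flowi4_proto	= IPPROTO_UDP,
	};

	/* On success the lookup also fills in fl4.saddr and fl4.flowi4_oif;
	 * on failure an ERR_PTR such as ERR_PTR(-ENETUNREACH) is returned.
	 */
	return __ip_route_output_key(net, &fl4);
}
#endif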
2219
2220 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2221 {
2222         return NULL;
2223 }
2224
2225 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2226 {
2227         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2228
2229         return mtu ? : dst->dev->mtu;
2230 }
2231
2232 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2233                                           struct sk_buff *skb, u32 mtu)
2234 {
2235 }
2236
2237 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2238                                        struct sk_buff *skb)
2239 {
2240 }
2241
2242 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2243                                           unsigned long old)
2244 {
2245         return NULL;
2246 }
2247
2248 static struct dst_ops ipv4_dst_blackhole_ops = {
2249         .family                 =       AF_INET,
2250         .check                  =       ipv4_blackhole_dst_check,
2251         .mtu                    =       ipv4_blackhole_mtu,
2252         .default_advmss         =       ipv4_default_advmss,
2253         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2254         .redirect               =       ipv4_rt_blackhole_redirect,
2255         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2256         .neigh_lookup           =       ipv4_neigh_lookup,
2257 };
2258
2259 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2260 {
2261         struct rtable *ort = (struct rtable *) dst_orig;
2262         struct rtable *rt;
2263
2264         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2265         if (rt) {
2266                 struct dst_entry *new = &rt->dst;
2267
2268                 new->__use = 1;
2269                 new->input = dst_discard;
2270                 new->output = dst_discard_sk;
2271
2272                 new->dev = ort->dst.dev;
2273                 if (new->dev)
2274                         dev_hold(new->dev);
2275
2276                 rt->rt_is_input = ort->rt_is_input;
2277                 rt->rt_iif = ort->rt_iif;
2278                 rt->rt_pmtu = ort->rt_pmtu;
2279
2280                 rt->rt_genid = rt_genid_ipv4(net);
2281                 rt->rt_flags = ort->rt_flags;
2282                 rt->rt_type = ort->rt_type;
2283                 rt->rt_gateway = ort->rt_gateway;
2284                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2285
2286                 INIT_LIST_HEAD(&rt->rt_uncached);
2287                 rt->rt_lwtstate = NULL;
2288                 dst_free(new);
2289         }
2290
2291         dst_release(dst_orig);
2292
2293         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2294 }
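/* The dst built above silently discards anything sent through it
 * (dst_discard input/output) while still looking like a valid route to its
 * user; xfrm uses it, e.g. while an IPsec state is still being resolved, so
 * that callers do not see transient errors.
 */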
2295
2296 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2297                                     struct sock *sk)
2298 {
2299         struct rtable *rt = __ip_route_output_key(net, flp4);
2300
2301         if (IS_ERR(rt))
2302                 return rt;
2303
2304         if (flp4->flowi4_proto)
2305                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2306                                                         flowi4_to_flowi(flp4),
2307                                                         sk, 0);
2308
2309         return rt;
2310 }
2311 EXPORT_SYMBOL_GPL(ip_route_output_flow);
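/* A minimal usage sketch (illustrative only, not part of this file): the
 * socket-aware variant also passes the flow through xfrm_lookup_route(), so
 * IPsec policy is honoured.  Everything except ip_route_output_flow(),
 * ip_rt_put() and the flowi4 fields is an assumption of the example.
 */
#if 0
static int example_connect_route(struct net *net, struct sock *sk,
				 __be32 daddr, __be16 dport)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.flowi4_proto	= IPPROTO_UDP,
		.fl4_dport	= dport,
	};
	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);

	if (IS_ERR(rt))
		return PTR_ERR(rt);
	/* ... use rt->dst, e.g. set it on a socket dst cache ... */
	ip_rt_put(rt);		/* release the reference taken by the lookup */
	return 0;
}
#endif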
2312
2313 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2314                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2315                         u32 seq, int event, int nowait, unsigned int flags)
2316 {
2317         struct rtable *rt = skb_rtable(skb);
2318         struct rtmsg *r;
2319         struct nlmsghdr *nlh;
2320         unsigned long expires = 0;
2321         u32 error;
2322         u32 metrics[RTAX_MAX];
2323
2324         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2325         if (!nlh)
2326                 return -EMSGSIZE;
2327
2328         r = nlmsg_data(nlh);
2329         r->rtm_family    = AF_INET;
2330         r->rtm_dst_len  = 32;
2331         r->rtm_src_len  = 0;
2332         r->rtm_tos      = fl4->flowi4_tos;
2333         r->rtm_table    = RT_TABLE_MAIN;
2334         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2335                 goto nla_put_failure;
2336         r->rtm_type     = rt->rt_type;
2337         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2338         r->rtm_protocol = RTPROT_UNSPEC;
2339         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2340         if (rt->rt_flags & RTCF_NOTIFY)
2341                 r->rtm_flags |= RTM_F_NOTIFY;
2342         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2343                 r->rtm_flags |= RTCF_DOREDIRECT;
2344
2345         if (nla_put_in_addr(skb, RTA_DST, dst))
2346                 goto nla_put_failure;
2347         if (src) {
2348                 r->rtm_src_len = 32;
2349                 if (nla_put_in_addr(skb, RTA_SRC, src))
2350                         goto nla_put_failure;
2351         }
2352         if (rt->dst.dev &&
2353             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2354                 goto nla_put_failure;
2355 #ifdef CONFIG_IP_ROUTE_CLASSID
2356         if (rt->dst.tclassid &&
2357             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2358                 goto nla_put_failure;
2359 #endif
2360         if (!rt_is_input_route(rt) &&
2361             fl4->saddr != src) {
2362                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2363                         goto nla_put_failure;
2364         }
2365         if (rt->rt_uses_gateway &&
2366             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2367                 goto nla_put_failure;
2368
2369         expires = rt->dst.expires;
2370         if (expires) {
2371                 unsigned long now = jiffies;
2372
2373                 if (time_before(now, expires))
2374                         expires -= now;
2375                 else
2376                         expires = 0;
2377         }
2378
2379         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2380         if (rt->rt_pmtu && expires)
2381                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2382         if (rtnetlink_put_metrics(skb, metrics) < 0)
2383                 goto nla_put_failure;
2384
2385         if (fl4->flowi4_mark &&
2386             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2387                 goto nla_put_failure;
2388
2389         error = rt->dst.error;
2390
2391         if (rt_is_input_route(rt)) {
2392 #ifdef CONFIG_IP_MROUTE
2393                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2394                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2395                         int err = ipmr_get_route(net, skb,
2396                                                  fl4->saddr, fl4->daddr,
2397                                                  r, nowait);
2398                         if (err <= 0) {
2399                                 if (!nowait) {
2400                                         if (err == 0)
2401                                                 return 0;
2402                                         goto nla_put_failure;
2403                                 } else {
2404                                         if (err == -EMSGSIZE)
2405                                                 goto nla_put_failure;
2406                                         error = err;
2407                                 }
2408                         }
2409                 } else
2410 #endif
2411                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2412                                 goto nla_put_failure;
2413         }
2414
2415         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2416                 goto nla_put_failure;
2417
2418         nlmsg_end(skb, nlh);
2419         return 0;
2420
2421 nla_put_failure:
2422         nlmsg_cancel(skb, nlh);
2423         return -EMSGSIZE;
2424 }
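/* The message built above is a struct rtmsg header followed by RTA_*
 * attributes (RTA_DST, RTA_SRC, RTA_OIF, RTA_GATEWAY, RTA_MARK, metrics,
 * ...) and ends with the cache info (expiry and error), i.e. the wire
 * format consumed by "ip route get" and other rtnetlink clients.
 */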
2425
2426 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2427 {
2428         struct net *net = sock_net(in_skb->sk);
2429         struct rtmsg *rtm;
2430         struct nlattr *tb[RTA_MAX+1];
2431         struct rtable *rt = NULL;
2432         struct flowi4 fl4;
2433         __be32 dst = 0;
2434         __be32 src = 0;
2435         u32 iif;
2436         int err;
2437         int mark;
2438         struct sk_buff *skb;
2439
2440         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2441         if (err < 0)
2442                 goto errout;
2443
2444         rtm = nlmsg_data(nlh);
2445
2446         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2447         if (!skb) {
2448                 err = -ENOBUFS;
2449                 goto errout;
2450         }
2451
2452         /* Reserve room for dummy headers; this skb can pass
2453            through a good chunk of the routing engine.
2454          */
2455         skb_reset_mac_header(skb);
2456         skb_reset_network_header(skb);
2457
2458         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2459         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2460         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2461
2462         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2463         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2464         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2465         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2466
2467         memset(&fl4, 0, sizeof(fl4));
2468         fl4.daddr = dst;
2469         fl4.saddr = src;
2470         fl4.flowi4_tos = rtm->rtm_tos;
2471         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2472         fl4.flowi4_mark = mark;
2473
2474         if (iif) {
2475                 struct net_device *dev;
2476
2477                 dev = __dev_get_by_index(net, iif);
2478                 if (!dev) {
2479                         err = -ENODEV;
2480                         goto errout_free;
2481                 }
2482
2483                 skb->protocol   = htons(ETH_P_IP);
2484                 skb->dev        = dev;
2485                 skb->mark       = mark;
2486                 local_bh_disable();
2487                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2488                 local_bh_enable();
2489
2490                 rt = skb_rtable(skb);
2491                 if (err == 0 && rt->dst.error)
2492                         err = -rt->dst.error;
2493         } else {
2494                 rt = ip_route_output_key(net, &fl4);
2495
2496                 err = 0;
2497                 if (IS_ERR(rt))
2498                         err = PTR_ERR(rt);
2499         }
2500
2501         if (err)
2502                 goto errout_free;
2503
2504         skb_dst_set(skb, &rt->dst);
2505         if (rtm->rtm_flags & RTM_F_NOTIFY)
2506                 rt->rt_flags |= RTCF_NOTIFY;
2507
2508         err = rt_fill_info(net, dst, src, &fl4, skb,
2509                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2510                            RTM_NEWROUTE, 0, 0);
2511         if (err < 0)
2512                 goto errout_free;
2513
2514         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2515 errout:
2516         return err;
2517
2518 errout_free:
2519         kfree_skb(skb);
2520         goto errout;
2521 }
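/* The handler above serves RTM_GETROUTE in two modes: with RTA_IIF set it
 * injects a dummy skb into the input path via ip_route_input() to show what
 * would happen to a received packet; otherwise it performs a plain output
 * lookup.  This is what e.g. "ip route get" exercises from userspace.
 */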
2522
2523 void ip_rt_multicast_event(struct in_device *in_dev)
2524 {
2525         rt_cache_flush(dev_net(in_dev->dev));
2526 }
2527
2528 #ifdef CONFIG_SYSCTL
2529 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2530 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2531 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2532 static int ip_rt_gc_elasticity __read_mostly    = 8;
2533
2534 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2535                                         void __user *buffer,
2536                                         size_t *lenp, loff_t *ppos)
2537 {
2538         struct net *net = (struct net *)__ctl->extra1;
2539
2540         if (write) {
2541                 rt_cache_flush(net);
2542                 fnhe_genid_bump(net);
2543                 return 0;
2544         }
2545
2546         return -EINVAL;
2547 }
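/* A minimal usage sketch (illustrative only): the handler above is
 * write-only; userspace reaches it by writing anything to the flush file
 * (path follows from the "net/ipv4/route" registration below):
 *
 *	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "1\n", 2);
 *		close(fd);
 *	}
 *
 * A write flushes the cache and bumps the fnhe genid; reads fail with
 * -EINVAL.
 */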
2548
2549 static struct ctl_table ipv4_route_table[] = {
2550         {
2551                 .procname       = "gc_thresh",
2552                 .data           = &ipv4_dst_ops.gc_thresh,
2553                 .maxlen         = sizeof(int),
2554                 .mode           = 0644,
2555                 .proc_handler   = proc_dointvec,
2556         },
2557         {
2558                 .procname       = "max_size",
2559                 .data           = &ip_rt_max_size,
2560                 .maxlen         = sizeof(int),
2561                 .mode           = 0644,
2562                 .proc_handler   = proc_dointvec,
2563         },
2564         {
2565                 /*  Deprecated. Use gc_min_interval_ms */
2566
2567                 .procname       = "gc_min_interval",
2568                 .data           = &ip_rt_gc_min_interval,
2569                 .maxlen         = sizeof(int),
2570                 .mode           = 0644,
2571                 .proc_handler   = proc_dointvec_jiffies,
2572         },
2573         {
2574                 .procname       = "gc_min_interval_ms",
2575                 .data           = &ip_rt_gc_min_interval,
2576                 .maxlen         = sizeof(int),
2577                 .mode           = 0644,
2578                 .proc_handler   = proc_dointvec_ms_jiffies,
2579         },
2580         {
2581                 .procname       = "gc_timeout",
2582                 .data           = &ip_rt_gc_timeout,
2583                 .maxlen         = sizeof(int),
2584                 .mode           = 0644,
2585                 .proc_handler   = proc_dointvec_jiffies,
2586         },
2587         {
2588                 .procname       = "gc_interval",
2589                 .data           = &ip_rt_gc_interval,
2590                 .maxlen         = sizeof(int),
2591                 .mode           = 0644,
2592                 .proc_handler   = proc_dointvec_jiffies,
2593         },
2594         {
2595                 .procname       = "redirect_load",
2596                 .data           = &ip_rt_redirect_load,
2597                 .maxlen         = sizeof(int),
2598                 .mode           = 0644,
2599                 .proc_handler   = proc_dointvec,
2600         },
2601         {
2602                 .procname       = "redirect_number",
2603                 .data           = &ip_rt_redirect_number,
2604                 .maxlen         = sizeof(int),
2605                 .mode           = 0644,
2606                 .proc_handler   = proc_dointvec,
2607         },
2608         {
2609                 .procname       = "redirect_silence",
2610                 .data           = &ip_rt_redirect_silence,
2611                 .maxlen         = sizeof(int),
2612                 .mode           = 0644,
2613                 .proc_handler   = proc_dointvec,
2614         },
2615         {
2616                 .procname       = "error_cost",
2617                 .data           = &ip_rt_error_cost,
2618                 .maxlen         = sizeof(int),
2619                 .mode           = 0644,
2620                 .proc_handler   = proc_dointvec,
2621         },
2622         {
2623                 .procname       = "error_burst",
2624                 .data           = &ip_rt_error_burst,
2625                 .maxlen         = sizeof(int),
2626                 .mode           = 0644,
2627                 .proc_handler   = proc_dointvec,
2628         },
2629         {
2630                 .procname       = "gc_elasticity",
2631                 .data           = &ip_rt_gc_elasticity,
2632                 .maxlen         = sizeof(int),
2633                 .mode           = 0644,
2634                 .proc_handler   = proc_dointvec,
2635         },
2636         {
2637                 .procname       = "mtu_expires",
2638                 .data           = &ip_rt_mtu_expires,
2639                 .maxlen         = sizeof(int),
2640                 .mode           = 0644,
2641                 .proc_handler   = proc_dointvec_jiffies,
2642         },
2643         {
2644                 .procname       = "min_pmtu",
2645                 .data           = &ip_rt_min_pmtu,
2646                 .maxlen         = sizeof(int),
2647                 .mode           = 0644,
2648                 .proc_handler   = proc_dointvec,
2649         },
2650         {
2651                 .procname       = "min_adv_mss",
2652                 .data           = &ip_rt_min_advmss,
2653                 .maxlen         = sizeof(int),
2654                 .mode           = 0644,
2655                 .proc_handler   = proc_dointvec,
2656         },
2657         { }
2658 };
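/* The knobs above are registered under "net/ipv4/route" (see
 * ip_static_sysctl_init() at the end of this file), so e.g. ip_rt_gc_timeout
 * is tunable as net.ipv4.route.gc_timeout, i.e.
 * /proc/sys/net/ipv4/route/gc_timeout.
 */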
2659
2660 static struct ctl_table ipv4_route_flush_table[] = {
2661         {
2662                 .procname       = "flush",
2663                 .maxlen         = sizeof(int),
2664                 .mode           = 0200,
2665                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2666         },
2667         { },
2668 };
2669
2670 static __net_init int sysctl_route_net_init(struct net *net)
2671 {
2672         struct ctl_table *tbl;
2673
2674         tbl = ipv4_route_flush_table;
2675         if (!net_eq(net, &init_net)) {
2676                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2677                 if (!tbl)
2678                         goto err_dup;
2679
2680                 /* Don't export sysctls to unprivileged users */
2681                 if (net->user_ns != &init_user_ns)
2682                         tbl[0].procname = NULL;
2683         }
2684         tbl[0].extra1 = net;
2685
2686         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2687         if (!net->ipv4.route_hdr)
2688                 goto err_reg;
2689         return 0;
2690
2691 err_reg:
2692         if (tbl != ipv4_route_flush_table)
2693                 kfree(tbl);
2694 err_dup:
2695         return -ENOMEM;
2696 }
2697
2698 static __net_exit void sysctl_route_net_exit(struct net *net)
2699 {
2700         struct ctl_table *tbl;
2701
2702         tbl = net->ipv4.route_hdr->ctl_table_arg;
2703         unregister_net_sysctl_table(net->ipv4.route_hdr);
2704         BUG_ON(tbl == ipv4_route_flush_table);
2705         kfree(tbl);
2706 }
2707
2708 static __net_initdata struct pernet_operations sysctl_route_ops = {
2709         .init = sysctl_route_net_init,
2710         .exit = sysctl_route_net_exit,
2711 };
2712 #endif
2713
2714 static __net_init int rt_genid_init(struct net *net)
2715 {
2716         atomic_set(&net->ipv4.rt_genid, 0);
2717         atomic_set(&net->fnhe_genid, 0);
2718         get_random_bytes(&net->ipv4.dev_addr_genid,
2719                          sizeof(net->ipv4.dev_addr_genid));
2720         return 0;
2721 }
2722
2723 static __net_initdata struct pernet_operations rt_genid_ops = {
2724         .init = rt_genid_init,
2725 };
2726
2727 static int __net_init ipv4_inetpeer_init(struct net *net)
2728 {
2729         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2730
2731         if (!bp)
2732                 return -ENOMEM;
2733         inet_peer_base_init(bp);
2734         net->ipv4.peers = bp;
2735         return 0;
2736 }
2737
2738 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2739 {
2740         struct inet_peer_base *bp = net->ipv4.peers;
2741
2742         net->ipv4.peers = NULL;
2743         inetpeer_invalidate_tree(bp);
2744         kfree(bp);
2745 }
2746
2747 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2748         .init   =       ipv4_inetpeer_init,
2749         .exit   =       ipv4_inetpeer_exit,
2750 };
2751
2752 #ifdef CONFIG_IP_ROUTE_CLASSID
2753 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2754 #endif /* CONFIG_IP_ROUTE_CLASSID */
2755
2756 int __init ip_rt_init(void)
2757 {
2758         int rc = 0;
2759         int cpu;
2760
2761         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2762         if (!ip_idents)
2763                 panic("IP: failed to allocate ip_idents\n");
2764
2765         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2766
2767         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2768         if (!ip_tstamps)
2769                 panic("IP: failed to allocate ip_tstamps\n");
2770
2771         for_each_possible_cpu(cpu) {
2772                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2773
2774                 INIT_LIST_HEAD(&ul->head);
2775                 spin_lock_init(&ul->lock);
2776         }
2777 #ifdef CONFIG_IP_ROUTE_CLASSID
2778         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2779         if (!ip_rt_acct)
2780                 panic("IP: failed to allocate ip_rt_acct\n");
2781 #endif
2782
2783         ipv4_dst_ops.kmem_cachep =
2784                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2785                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2786
2787         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2788
2789         if (dst_entries_init(&ipv4_dst_ops) < 0)
2790                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2791
2792         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2793                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2794
2795         ipv4_dst_ops.gc_thresh = ~0;
2796         ip_rt_max_size = INT_MAX;
2797
2798         devinet_init();
2799         ip_fib_init();
2800
2801         if (ip_rt_proc_init())
2802                 pr_err("Unable to create route proc files\n");
2803 #ifdef CONFIG_XFRM
2804         xfrm_init();
2805         xfrm4_init();
2806 #endif
2807         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2808
2809 #ifdef CONFIG_SYSCTL
2810         register_pernet_subsys(&sysctl_route_ops);
2811 #endif
2812         register_pernet_subsys(&rt_genid_ops);
2813         register_pernet_subsys(&ipv4_inetpeer_ops);
2814         return rc;
2815 }
2816
2817 #ifdef CONFIG_SYSCTL
2818 /*
2819  * We really need to sanitize the damn ipv4 init order, then all
2820  * this nonsense will go away.
2821  */
2822 void __init ip_static_sysctl_init(void)
2823 {
2824         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2825 }
2826 #endif