ipmr: RCU protection for mfc_cache_array
[cascardo/linux.git] / net / ipv4 / ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Fixes:
13  *      Michael Chastain        :       Incorrect size of copying.
14  *      Alan Cox                :       Added the cache manager code
15  *      Alan Cox                :       Fixed the clone/copy bug and device race.
16  *      Mike McLagan            :       Routing by source
17  *      Malcolm Beattie         :       Buffer handling fixes.
18  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
19  *      SVR Anand               :       Fixed several multicast bugs and problems.
20  *      Alexey Kuznetsov        :       Status, optimisations and more.
21  *      Brad Parker             :       Better behaviour on mrouted upcall
22  *                                      overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
25  *                                      Relax this requirement to work with older peers.
26  *
27  */
28
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <linux/slab.h>
51 #include <net/net_namespace.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/route.h>
56 #include <net/sock.h>
57 #include <net/icmp.h>
58 #include <net/udp.h>
59 #include <net/raw.h>
60 #include <linux/notifier.h>
61 #include <linux/if_arp.h>
62 #include <linux/netfilter_ipv4.h>
63 #include <net/ipip.h>
64 #include <net/checksum.h>
65 #include <net/netlink.h>
66 #include <net/fib_rules.h>
67
68 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
69 #define CONFIG_IP_PIMSM 1
70 #endif
71
72 struct mr_table {
73         struct list_head        list;
74 #ifdef CONFIG_NET_NS
75         struct net              *net;
76 #endif
77         u32                     id;
78         struct sock __rcu       *mroute_sk;
79         struct timer_list       ipmr_expire_timer;
80         struct list_head        mfc_unres_queue;
81         struct list_head        mfc_cache_array[MFC_LINES];
82         struct vif_device       vif_table[MAXVIFS];
83         int                     maxvif;
84         atomic_t                cache_resolve_queue_len;
85         int                     mroute_do_assert;
86         int                     mroute_do_pim;
87 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
88         int                     mroute_reg_vif_num;
89 #endif
90 };
91
92 struct ipmr_rule {
93         struct fib_rule         common;
94 };
95
96 struct ipmr_result {
97         struct mr_table         *mrt;
98 };
99
100 /* Big lock, protecting the vif table, mrt cache and mroute socket state.
101    Note that changes are serialized via rtnl_lock.
102  */
103
104 static DEFINE_RWLOCK(mrt_lock);
105
106 /*
107  *      Multicast router control variables
108  */
109
110 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
111
112 /* Special spinlock for queue of unresolved entries */
113 static DEFINE_SPINLOCK(mfc_unres_lock);
114
115 /* We return to Alan's original scheme. The hash table of resolved
116    entries is changed only in process context, protected with the weak
117    lock mrt_lock, and walked under RCU on the data path. The queue of
118    unresolved entries is protected with the strong spinlock
119    mfc_unres_lock.
120
121    In this case the data path is entirely free of exclusive locks. */
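
The pattern described above is the standard RCU-protected list idiom that this patch applies to mfc_cache_array: data-path lookups walk a hash bucket under rcu_read_lock(), while control-path updates unlink entries with list_del_rcu() and free them only after a grace period via call_rcu(). Below is a simplified, illustrative sketch of that idiom (the names are hypothetical, not part of ipmr.c); ipmr_cache_find(), ipmr_mfc_delete() and ipmr_cache_free_rcu() later in this file follow the same shape.

/* Simplified sketch of the RCU-protected hash-bucket pattern (illustrative only) */
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_entry {
	struct list_head	list;
	__be32			origin;
	__be32			mcastgrp;
	struct rcu_head		rcu;
};

static struct list_head demo_hash[256];

/* Data path: no exclusive lock, caller holds rcu_read_lock() */
static struct demo_entry *demo_lookup(unsigned int h, __be32 origin, __be32 grp)
{
	struct demo_entry *e;

	list_for_each_entry_rcu(e, &demo_hash[h], list) {
		if (e->origin == origin && e->mcastgrp == grp)
			return e;
	}
	return NULL;
}

static void demo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_entry, rcu));
}

/* Control path (process context): unlink now, free after a grace period */
static void demo_delete(struct demo_entry *e)
{
	list_del_rcu(&e->list);
	call_rcu(&e->rcu, demo_free_rcu);
}
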
122
123 static struct kmem_cache *mrt_cachep __read_mostly;
124
125 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
126 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
127                          struct sk_buff *skb, struct mfc_cache *cache,
128                          int local);
129 static int ipmr_cache_report(struct mr_table *mrt,
130                              struct sk_buff *pkt, vifi_t vifi, int assert);
131 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
132                               struct mfc_cache *c, struct rtmsg *rtm);
133 static void ipmr_expire_process(unsigned long arg);
134
135 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
136 #define ipmr_for_each_table(mrt, net) \
137         list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
138
139 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
140 {
141         struct mr_table *mrt;
142
143         ipmr_for_each_table(mrt, net) {
144                 if (mrt->id == id)
145                         return mrt;
146         }
147         return NULL;
148 }
149
150 static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
151                            struct mr_table **mrt)
152 {
153         struct ipmr_result res;
154         struct fib_lookup_arg arg = { .result = &res, };
155         int err;
156
157         err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
158         if (err < 0)
159                 return err;
160         *mrt = res.mrt;
161         return 0;
162 }
163
164 static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
165                             int flags, struct fib_lookup_arg *arg)
166 {
167         struct ipmr_result *res = arg->result;
168         struct mr_table *mrt;
169
170         switch (rule->action) {
171         case FR_ACT_TO_TBL:
172                 break;
173         case FR_ACT_UNREACHABLE:
174                 return -ENETUNREACH;
175         case FR_ACT_PROHIBIT:
176                 return -EACCES;
177         case FR_ACT_BLACKHOLE:
178         default:
179                 return -EINVAL;
180         }
181
182         mrt = ipmr_get_table(rule->fr_net, rule->table);
183         if (mrt == NULL)
184                 return -EAGAIN;
185         res->mrt = mrt;
186         return 0;
187 }
188
189 static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
190 {
191         return 1;
192 }
193
194 static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
195         FRA_GENERIC_POLICY,
196 };
197
198 static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
199                                struct fib_rule_hdr *frh, struct nlattr **tb)
200 {
201         return 0;
202 }
203
204 static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
205                              struct nlattr **tb)
206 {
207         return 1;
208 }
209
210 static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
211                           struct fib_rule_hdr *frh)
212 {
213         frh->dst_len = 0;
214         frh->src_len = 0;
215         frh->tos     = 0;
216         return 0;
217 }
218
219 static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
220         .family         = RTNL_FAMILY_IPMR,
221         .rule_size      = sizeof(struct ipmr_rule),
222         .addr_size      = sizeof(u32),
223         .action         = ipmr_rule_action,
224         .match          = ipmr_rule_match,
225         .configure      = ipmr_rule_configure,
226         .compare        = ipmr_rule_compare,
227         .default_pref   = fib_default_rule_pref,
228         .fill           = ipmr_rule_fill,
229         .nlgroup        = RTNLGRP_IPV4_RULE,
230         .policy         = ipmr_rule_policy,
231         .owner          = THIS_MODULE,
232 };
233
234 static int __net_init ipmr_rules_init(struct net *net)
235 {
236         struct fib_rules_ops *ops;
237         struct mr_table *mrt;
238         int err;
239
240         ops = fib_rules_register(&ipmr_rules_ops_template, net);
241         if (IS_ERR(ops))
242                 return PTR_ERR(ops);
243
244         INIT_LIST_HEAD(&net->ipv4.mr_tables);
245
246         mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
247         if (mrt == NULL) {
248                 err = -ENOMEM;
249                 goto err1;
250         }
251
252         err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
253         if (err < 0)
254                 goto err2;
255
256         net->ipv4.mr_rules_ops = ops;
257         return 0;
258
259 err2:
260         kfree(mrt);
261 err1:
262         fib_rules_unregister(ops);
263         return err;
264 }
265
266 static void __net_exit ipmr_rules_exit(struct net *net)
267 {
268         struct mr_table *mrt, *next;
269
270         list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
271                 list_del(&mrt->list);
272                 kfree(mrt);
273         }
274         fib_rules_unregister(net->ipv4.mr_rules_ops);
275 }
276 #else
277 #define ipmr_for_each_table(mrt, net) \
278         for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
279
280 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
281 {
282         return net->ipv4.mrt;
283 }
284
285 static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
286                            struct mr_table **mrt)
287 {
288         *mrt = net->ipv4.mrt;
289         return 0;
290 }
291
292 static int __net_init ipmr_rules_init(struct net *net)
293 {
294         net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
295         return net->ipv4.mrt ? 0 : -ENOMEM;
296 }
297
298 static void __net_exit ipmr_rules_exit(struct net *net)
299 {
300         kfree(net->ipv4.mrt);
301 }
302 #endif
303
304 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
305 {
306         struct mr_table *mrt;
307         unsigned int i;
308
309         mrt = ipmr_get_table(net, id);
310         if (mrt != NULL)
311                 return mrt;
312
313         mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
314         if (mrt == NULL)
315                 return NULL;
316         write_pnet(&mrt->net, net);
317         mrt->id = id;
318
319         /* Forwarding cache */
320         for (i = 0; i < MFC_LINES; i++)
321                 INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
322
323         INIT_LIST_HEAD(&mrt->mfc_unres_queue);
324
325         setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
326                     (unsigned long)mrt);
327
328 #ifdef CONFIG_IP_PIMSM
329         mrt->mroute_reg_vif_num = -1;
330 #endif
331 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
332         list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
333 #endif
334         return mrt;
335 }
336
337 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
338
339 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
340 {
341         struct net *net = dev_net(dev);
342
343         dev_close(dev);
344
345         dev = __dev_get_by_name(net, "tunl0");
346         if (dev) {
347                 const struct net_device_ops *ops = dev->netdev_ops;
348                 struct ifreq ifr;
349                 struct ip_tunnel_parm p;
350
351                 memset(&p, 0, sizeof(p));
352                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
353                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
354                 p.iph.version = 4;
355                 p.iph.ihl = 5;
356                 p.iph.protocol = IPPROTO_IPIP;
357                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
358                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
359
360                 if (ops->ndo_do_ioctl) {
361                         mm_segment_t oldfs = get_fs();
362
363                         set_fs(KERNEL_DS);
364                         ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
365                         set_fs(oldfs);
366                 }
367         }
368 }
369
370 static
371 struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
372 {
373         struct net_device  *dev;
374
375         dev = __dev_get_by_name(net, "tunl0");
376
377         if (dev) {
378                 const struct net_device_ops *ops = dev->netdev_ops;
379                 int err;
380                 struct ifreq ifr;
381                 struct ip_tunnel_parm p;
382                 struct in_device  *in_dev;
383
384                 memset(&p, 0, sizeof(p));
385                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
386                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
387                 p.iph.version = 4;
388                 p.iph.ihl = 5;
389                 p.iph.protocol = IPPROTO_IPIP;
390                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
391                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
392
393                 if (ops->ndo_do_ioctl) {
394                         mm_segment_t oldfs = get_fs();
395
396                         set_fs(KERNEL_DS);
397                         err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
398                         set_fs(oldfs);
399                 } else
400                         err = -EOPNOTSUPP;
401
402                 dev = NULL;
403
404                 if (err == 0 &&
405                     (dev = __dev_get_by_name(net, p.name)) != NULL) {
406                         dev->flags |= IFF_MULTICAST;
407
408                         in_dev = __in_dev_get_rtnl(dev);
409                         if (in_dev == NULL)
410                                 goto failure;
411
412                         ipv4_devconf_setall(in_dev);
413                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
414
415                         if (dev_open(dev))
416                                 goto failure;
417                         dev_hold(dev);
418                 }
419         }
420         return dev;
421
422 failure:
423         /* allow the registration to be completed before unregistering. */
424         rtnl_unlock();
425         rtnl_lock();
426
427         unregister_netdevice(dev);
428         return NULL;
429 }
430
431 #ifdef CONFIG_IP_PIMSM
432
433 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
434 {
435         struct net *net = dev_net(dev);
436         struct mr_table *mrt;
437         struct flowi fl = {
438                 .oif            = dev->ifindex,
439                 .iif            = skb->skb_iif,
440                 .mark           = skb->mark,
441         };
442         int err;
443
444         err = ipmr_fib_lookup(net, &fl, &mrt);
445         if (err < 0) {
446                 kfree_skb(skb);
447                 return err;
448         }
449
450         read_lock(&mrt_lock);
451         dev->stats.tx_bytes += skb->len;
452         dev->stats.tx_packets++;
453         ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
454         read_unlock(&mrt_lock);
455         kfree_skb(skb);
456         return NETDEV_TX_OK;
457 }
458
459 static const struct net_device_ops reg_vif_netdev_ops = {
460         .ndo_start_xmit = reg_vif_xmit,
461 };
462
463 static void reg_vif_setup(struct net_device *dev)
464 {
465         dev->type               = ARPHRD_PIMREG;
466         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
467         dev->flags              = IFF_NOARP;
468         dev->netdev_ops         = &reg_vif_netdev_ops;
469         dev->destructor         = free_netdev;
470         dev->features           |= NETIF_F_NETNS_LOCAL;
471 }
472
473 static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
474 {
475         struct net_device *dev;
476         struct in_device *in_dev;
477         char name[IFNAMSIZ];
478
479         if (mrt->id == RT_TABLE_DEFAULT)
480                 sprintf(name, "pimreg");
481         else
482                 sprintf(name, "pimreg%u", mrt->id);
483
484         dev = alloc_netdev(0, name, reg_vif_setup);
485
486         if (dev == NULL)
487                 return NULL;
488
489         dev_net_set(dev, net);
490
491         if (register_netdevice(dev)) {
492                 free_netdev(dev);
493                 return NULL;
494         }
495         dev->iflink = 0;
496
497         rcu_read_lock();
498         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
499                 rcu_read_unlock();
500                 goto failure;
501         }
502
503         ipv4_devconf_setall(in_dev);
504         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
505         rcu_read_unlock();
506
507         if (dev_open(dev))
508                 goto failure;
509
510         dev_hold(dev);
511
512         return dev;
513
514 failure:
515         /* allow the registration to be completed before unregistering. */
516         rtnl_unlock();
517         rtnl_lock();
518
519         unregister_netdevice(dev);
520         return NULL;
521 }
522 #endif
523
524 /*
525  *      Delete a VIF entry
526  *      @notify: Set to 1 if the caller is a notifier_call
527  */
528
529 static int vif_delete(struct mr_table *mrt, int vifi, int notify,
530                       struct list_head *head)
531 {
532         struct vif_device *v;
533         struct net_device *dev;
534         struct in_device *in_dev;
535
536         if (vifi < 0 || vifi >= mrt->maxvif)
537                 return -EADDRNOTAVAIL;
538
539         v = &mrt->vif_table[vifi];
540
541         write_lock_bh(&mrt_lock);
542         dev = v->dev;
543         v->dev = NULL;
544
545         if (!dev) {
546                 write_unlock_bh(&mrt_lock);
547                 return -EADDRNOTAVAIL;
548         }
549
550 #ifdef CONFIG_IP_PIMSM
551         if (vifi == mrt->mroute_reg_vif_num)
552                 mrt->mroute_reg_vif_num = -1;
553 #endif
554
555         if (vifi+1 == mrt->maxvif) {
556                 int tmp;
557                 for (tmp=vifi-1; tmp>=0; tmp--) {
558                         if (VIF_EXISTS(mrt, tmp))
559                                 break;
560                 }
561                 mrt->maxvif = tmp+1;
562         }
563
564         write_unlock_bh(&mrt_lock);
565
566         dev_set_allmulti(dev, -1);
567
568         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
569                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
570                 ip_rt_multicast_event(in_dev);
571         }
572
573         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
574                 unregister_netdevice_queue(dev, head);
575
576         dev_put(dev);
577         return 0;
578 }
579
580 static void ipmr_cache_free_rcu(struct rcu_head *head)
581 {
582         struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
583
584         kmem_cache_free(mrt_cachep, c);
585 }
586
587 static inline void ipmr_cache_free(struct mfc_cache *c)
588 {
589         call_rcu(&c->rcu, ipmr_cache_free_rcu);
590 }
591
592 /* Destroy an unresolved cache entry, killing queued skbs
593    and reporting error to netlink readers.
594  */
595
596 static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
597 {
598         struct net *net = read_pnet(&mrt->net);
599         struct sk_buff *skb;
600         struct nlmsgerr *e;
601
602         atomic_dec(&mrt->cache_resolve_queue_len);
603
604         while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
605                 if (ip_hdr(skb)->version == 0) {
606                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
607                         nlh->nlmsg_type = NLMSG_ERROR;
608                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
609                         skb_trim(skb, nlh->nlmsg_len);
610                         e = NLMSG_DATA(nlh);
611                         e->error = -ETIMEDOUT;
612                         memset(&e->msg, 0, sizeof(e->msg));
613
614                         rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
615                 } else
616                         kfree_skb(skb);
617         }
618
619         ipmr_cache_free(c);
620 }
621
622
623 /* Timer process for the unresolved queue. */
624
625 static void ipmr_expire_process(unsigned long arg)
626 {
627         struct mr_table *mrt = (struct mr_table *)arg;
628         unsigned long now;
629         unsigned long expires;
630         struct mfc_cache *c, *next;
631
632         if (!spin_trylock(&mfc_unres_lock)) {
633                 mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
634                 return;
635         }
636
637         if (list_empty(&mrt->mfc_unres_queue))
638                 goto out;
639
640         now = jiffies;
641         expires = 10*HZ;
642
643         list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
644                 if (time_after(c->mfc_un.unres.expires, now)) {
645                         unsigned long interval = c->mfc_un.unres.expires - now;
646                         if (interval < expires)
647                                 expires = interval;
648                         continue;
649                 }
650
651                 list_del(&c->list);
652                 ipmr_destroy_unres(mrt, c);
653         }
654
655         if (!list_empty(&mrt->mfc_unres_queue))
656                 mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
657
658 out:
659         spin_unlock(&mfc_unres_lock);
660 }
661
662 /* Fill oifs list. It is called under write locked mrt_lock. */
663
664 static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
665                                    unsigned char *ttls)
666 {
667         int vifi;
668
669         cache->mfc_un.res.minvif = MAXVIFS;
670         cache->mfc_un.res.maxvif = 0;
671         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
672
673         for (vifi = 0; vifi < mrt->maxvif; vifi++) {
674                 if (VIF_EXISTS(mrt, vifi) &&
675                     ttls[vifi] && ttls[vifi] < 255) {
676                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
677                         if (cache->mfc_un.res.minvif > vifi)
678                                 cache->mfc_un.res.minvif = vifi;
679                         if (cache->mfc_un.res.maxvif <= vifi)
680                                 cache->mfc_un.res.maxvif = vifi + 1;
681                 }
682         }
683 }
684
685 static int vif_add(struct net *net, struct mr_table *mrt,
686                    struct vifctl *vifc, int mrtsock)
687 {
688         int vifi = vifc->vifc_vifi;
689         struct vif_device *v = &mrt->vif_table[vifi];
690         struct net_device *dev;
691         struct in_device *in_dev;
692         int err;
693
694         /* Is vif busy ? */
695         if (VIF_EXISTS(mrt, vifi))
696                 return -EADDRINUSE;
697
698         switch (vifc->vifc_flags) {
699 #ifdef CONFIG_IP_PIMSM
700         case VIFF_REGISTER:
701                 /*
702                  * Special Purpose VIF in PIM
703                  * All the packets will be sent to the daemon
704                  */
705                 if (mrt->mroute_reg_vif_num >= 0)
706                         return -EADDRINUSE;
707                 dev = ipmr_reg_vif(net, mrt);
708                 if (!dev)
709                         return -ENOBUFS;
710                 err = dev_set_allmulti(dev, 1);
711                 if (err) {
712                         unregister_netdevice(dev);
713                         dev_put(dev);
714                         return err;
715                 }
716                 break;
717 #endif
718         case VIFF_TUNNEL:
719                 dev = ipmr_new_tunnel(net, vifc);
720                 if (!dev)
721                         return -ENOBUFS;
722                 err = dev_set_allmulti(dev, 1);
723                 if (err) {
724                         ipmr_del_tunnel(dev, vifc);
725                         dev_put(dev);
726                         return err;
727                 }
728                 break;
729
730         case VIFF_USE_IFINDEX:
731         case 0:
732                 if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
733                         dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
734                         if (dev && __in_dev_get_rtnl(dev) == NULL) {
735                                 dev_put(dev);
736                                 return -EADDRNOTAVAIL;
737                         }
738                 } else
739                         dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
740
741                 if (!dev)
742                         return -EADDRNOTAVAIL;
743                 err = dev_set_allmulti(dev, 1);
744                 if (err) {
745                         dev_put(dev);
746                         return err;
747                 }
748                 break;
749         default:
750                 return -EINVAL;
751         }
752
753         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
754                 dev_put(dev);
755                 return -EADDRNOTAVAIL;
756         }
757         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
758         ip_rt_multicast_event(in_dev);
759
760         /*
761          *      Fill in the VIF structures
762          */
763         v->rate_limit = vifc->vifc_rate_limit;
764         v->local = vifc->vifc_lcl_addr.s_addr;
765         v->remote = vifc->vifc_rmt_addr.s_addr;
766         v->flags = vifc->vifc_flags;
767         if (!mrtsock)
768                 v->flags |= VIFF_STATIC;
769         v->threshold = vifc->vifc_threshold;
770         v->bytes_in = 0;
771         v->bytes_out = 0;
772         v->pkt_in = 0;
773         v->pkt_out = 0;
774         v->link = dev->ifindex;
775         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
776                 v->link = dev->iflink;
777
778         /* And finish the update by writing the critical data */
779         write_lock_bh(&mrt_lock);
780         v->dev = dev;
781 #ifdef CONFIG_IP_PIMSM
782         if (v->flags&VIFF_REGISTER)
783                 mrt->mroute_reg_vif_num = vifi;
784 #endif
785         if (vifi+1 > mrt->maxvif)
786                 mrt->maxvif = vifi+1;
787         write_unlock_bh(&mrt_lock);
788         return 0;
789 }
790
791 /* called with rcu_read_lock() */
792 static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
793                                          __be32 origin,
794                                          __be32 mcastgrp)
795 {
796         int line = MFC_HASH(mcastgrp, origin);
797         struct mfc_cache *c;
798
799         list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
800                 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
801                         return c;
802         }
803         return NULL;
804 }
805
806 /*
807  *      Allocate a multicast cache entry
808  */
809 static struct mfc_cache *ipmr_cache_alloc(void)
810 {
811         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
812
813         if (c)
814                 c->mfc_un.res.minvif = MAXVIFS;
815         return c;
816 }
817
818 static struct mfc_cache *ipmr_cache_alloc_unres(void)
819 {
820         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
821
822         if (c) {
823                 skb_queue_head_init(&c->mfc_un.unres.unresolved);
824                 c->mfc_un.unres.expires = jiffies + 10*HZ;
825         }
826         return c;
827 }
828
829 /*
830  *      A cache entry has gone from the unresolved queue into a resolved state
831  */
832
833 static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
834                                struct mfc_cache *uc, struct mfc_cache *c)
835 {
836         struct sk_buff *skb;
837         struct nlmsgerr *e;
838
839         /*
840          *      Play the pending entries through our router
841          */
842
843         while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
844                 if (ip_hdr(skb)->version == 0) {
845                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
846
847                         if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
848                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
849                                                   (u8 *)nlh);
850                         } else {
851                                 nlh->nlmsg_type = NLMSG_ERROR;
852                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
853                                 skb_trim(skb, nlh->nlmsg_len);
854                                 e = NLMSG_DATA(nlh);
855                                 e->error = -EMSGSIZE;
856                                 memset(&e->msg, 0, sizeof(e->msg));
857                         }
858
859                         rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
860                 } else
861                         ip_mr_forward(net, mrt, skb, c, 0);
862         }
863 }
864
865 /*
866  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
867  *      expects the following bizarre scheme.
868  *
869  *      Called under mrt_lock.
870  */
871
872 static int ipmr_cache_report(struct mr_table *mrt,
873                              struct sk_buff *pkt, vifi_t vifi, int assert)
874 {
875         struct sk_buff *skb;
876         const int ihl = ip_hdrlen(pkt);
877         struct igmphdr *igmp;
878         struct igmpmsg *msg;
879         struct sock *mroute_sk;
880         int ret;
881
882 #ifdef CONFIG_IP_PIMSM
883         if (assert == IGMPMSG_WHOLEPKT)
884                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
885         else
886 #endif
887                 skb = alloc_skb(128, GFP_ATOMIC);
888
889         if (!skb)
890                 return -ENOBUFS;
891
892 #ifdef CONFIG_IP_PIMSM
893         if (assert == IGMPMSG_WHOLEPKT) {
894                 /* Ugly, but we have no choice with this interface.
895                    Duplicate old header, fix ihl, length etc.
896                    And all this only to mangle msg->im_msgtype and
897                    to set msg->im_mbz to "mbz" :-)
898                  */
899                 skb_push(skb, sizeof(struct iphdr));
900                 skb_reset_network_header(skb);
901                 skb_reset_transport_header(skb);
902                 msg = (struct igmpmsg *)skb_network_header(skb);
903                 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
904                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
905                 msg->im_mbz = 0;
906                 msg->im_vif = mrt->mroute_reg_vif_num;
907                 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
908                 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
909                                              sizeof(struct iphdr));
910         } else
911 #endif
912         {
913
914         /*
915          *      Copy the IP header
916          */
917
918         skb->network_header = skb->tail;
919         skb_put(skb, ihl);
920         skb_copy_to_linear_data(skb, pkt->data, ihl);
921         ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
922         msg = (struct igmpmsg *)skb_network_header(skb);
923         msg->im_vif = vifi;
924         skb_dst_set(skb, dst_clone(skb_dst(pkt)));
925
926         /*
927          *      Add our header
928          */
929
930         igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
931         igmp->type      =
932         msg->im_msgtype = assert;
933         igmp->code      =       0;
934         ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
935         skb->transport_header = skb->network_header;
936         }
937
938         rcu_read_lock();
939         mroute_sk = rcu_dereference(mrt->mroute_sk);
940         if (mroute_sk == NULL) {
941                 rcu_read_unlock();
942                 kfree_skb(skb);
943                 return -EINVAL;
944         }
945
946         /*
947          *      Deliver to mrouted
948          */
949         ret = sock_queue_rcv_skb(mroute_sk, skb);
950         rcu_read_unlock();
951         if (ret < 0) {
952                 if (net_ratelimit())
953                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
954                 kfree_skb(skb);
955         }
956
957         return ret;
958 }
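
For context, what this function queues is exactly what the userspace daemon later reads from its raw IGMP control socket: a datagram whose leading bytes the daemon interprets as a struct igmpmsg (the kernel marks upcalls by zeroing the IP protocol byte, which is im_mbz in that overlay). A hedged, illustrative sketch of the receiving side (not taken from mrouted or pimd) could look like this:

/* Illustrative userspace receiver for mroute upcalls; assumes the socket was
 * set up with MRT_INIT (see ip_mroute_setsockopt() below). */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute.h>

static void handle_upcalls(int mroute_fd)
{
	unsigned char buf[2048];
	char src[INET_ADDRSTRLEN], grp[INET_ADDRSTRLEN];

	for (;;) {
		ssize_t len = recv(mroute_fd, buf, sizeof(buf), 0);
		struct igmpmsg msg;

		if (len < (ssize_t)sizeof(msg))
			break;
		memcpy(&msg, buf, sizeof(msg));

		if (msg.im_mbz != 0)	/* ordinary IGMP packet, not an upcall */
			continue;

		inet_ntop(AF_INET, &msg.im_src, src, sizeof(src));
		inet_ntop(AF_INET, &msg.im_dst, grp, sizeof(grp));

		if (msg.im_msgtype == IGMPMSG_NOCACHE)
			printf("no MFC entry for (%s,%s), arrived on vif %u\n",
			       src, grp, (unsigned)msg.im_vif);
		else if (msg.im_msgtype == IGMPMSG_WRONGVIF)
			printf("(%s,%s) arrived on wrong vif %u\n",
			       src, grp, (unsigned)msg.im_vif);
	}
}
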
959
960 /*
961  *      Queue a packet for resolution on an unresolved cache entry (protected by mfc_unres_lock)
962  */
963
964 static int
965 ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
966 {
967         bool found = false;
968         int err;
969         struct mfc_cache *c;
970         const struct iphdr *iph = ip_hdr(skb);
971
972         spin_lock_bh(&mfc_unres_lock);
973         list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
974                 if (c->mfc_mcastgrp == iph->daddr &&
975                     c->mfc_origin == iph->saddr) {
976                         found = true;
977                         break;
978                 }
979         }
980
981         if (!found) {
982                 /*
983                  *      Create a new entry if allowable
984                  */
985
986                 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
987                     (c = ipmr_cache_alloc_unres()) == NULL) {
988                         spin_unlock_bh(&mfc_unres_lock);
989
990                         kfree_skb(skb);
991                         return -ENOBUFS;
992                 }
993
994                 /*
995                  *      Fill in the new cache entry
996                  */
997                 c->mfc_parent   = -1;
998                 c->mfc_origin   = iph->saddr;
999                 c->mfc_mcastgrp = iph->daddr;
1000
1001                 /*
1002                  *      Reflect first query at mrouted.
1003                  */
1004                 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
1005                 if (err < 0) {
1006                         /* If the report failed throw the cache entry
1007                            out - Brad Parker
1008                          */
1009                         spin_unlock_bh(&mfc_unres_lock);
1010
1011                         ipmr_cache_free(c);
1012                         kfree_skb(skb);
1013                         return err;
1014                 }
1015
1016                 atomic_inc(&mrt->cache_resolve_queue_len);
1017                 list_add(&c->list, &mrt->mfc_unres_queue);
1018
1019                 if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
1020                         mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1021         }
1022
1023         /*
1024          *      See if we can append the packet
1025          */
1026         if (c->mfc_un.unres.unresolved.qlen>3) {
1027                 kfree_skb(skb);
1028                 err = -ENOBUFS;
1029         } else {
1030                 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
1031                 err = 0;
1032         }
1033
1034         spin_unlock_bh(&mfc_unres_lock);
1035         return err;
1036 }
1037
1038 /*
1039  *      MFC cache manipulation by user space mroute daemon
1040  */
1041
1042 static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1043 {
1044         int line;
1045         struct mfc_cache *c, *next;
1046
1047         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1048
1049         list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1050                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1051                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1052                         list_del_rcu(&c->list);
1053
1054                         ipmr_cache_free(c);
1055                         return 0;
1056                 }
1057         }
1058         return -ENOENT;
1059 }
1060
1061 static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1062                         struct mfcctl *mfc, int mrtsock)
1063 {
1064         bool found = false;
1065         int line;
1066         struct mfc_cache *uc, *c;
1067
1068         if (mfc->mfcc_parent >= MAXVIFS)
1069                 return -ENFILE;
1070
1071         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
1072
1073         list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
1074                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1075                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1076                         found = true;
1077                         break;
1078                 }
1079         }
1080
1081         if (found) {
1082                 write_lock_bh(&mrt_lock);
1083                 c->mfc_parent = mfc->mfcc_parent;
1084                 ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1085                 if (!mrtsock)
1086                         c->mfc_flags |= MFC_STATIC;
1087                 write_unlock_bh(&mrt_lock);
1088                 return 0;
1089         }
1090
1091         if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
1092                 return -EINVAL;
1093
1094         c = ipmr_cache_alloc();
1095         if (c == NULL)
1096                 return -ENOMEM;
1097
1098         c->mfc_origin = mfc->mfcc_origin.s_addr;
1099         c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
1100         c->mfc_parent = mfc->mfcc_parent;
1101         ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
1102         if (!mrtsock)
1103                 c->mfc_flags |= MFC_STATIC;
1104
1105         list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
1106
1107         /*
1108          *      Check to see if this entry resolves a queued unresolved
1109          *      entry. If so we need to send on the frames and tidy up.
1110          */
1111         found = false;
1112         spin_lock_bh(&mfc_unres_lock);
1113         list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
1114                 if (uc->mfc_origin == c->mfc_origin &&
1115                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
1116                         list_del(&uc->list);
1117                         atomic_dec(&mrt->cache_resolve_queue_len);
1118                         found = true;
1119                         break;
1120                 }
1121         }
1122         if (list_empty(&mrt->mfc_unres_queue))
1123                 del_timer(&mrt->ipmr_expire_timer);
1124         spin_unlock_bh(&mfc_unres_lock);
1125
1126         if (found) {
1127                 ipmr_cache_resolve(net, mrt, uc, c);
1128                 ipmr_cache_free(uc);
1129         }
1130         return 0;
1131 }
1132
1133 /*
1134  *      Close the multicast socket, and clear the vif tables etc
1135  */
1136
1137 static void mroute_clean_tables(struct mr_table *mrt)
1138 {
1139         int i;
1140         LIST_HEAD(list);
1141         struct mfc_cache *c, *next;
1142
1143         /*
1144          *      Shut down all active vif entries
1145          */
1146         for (i = 0; i < mrt->maxvif; i++) {
1147                 if (!(mrt->vif_table[i].flags&VIFF_STATIC))
1148                         vif_delete(mrt, i, 0, &list);
1149         }
1150         unregister_netdevice_many(&list);
1151
1152         /*
1153          *      Wipe the cache
1154          */
1155         for (i = 0; i < MFC_LINES; i++) {
1156                 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1157                         if (c->mfc_flags & MFC_STATIC)
1158                                 continue;
1159                         list_del_rcu(&c->list);
1160                         ipmr_cache_free(c);
1161                 }
1162         }
1163
1164         if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
1165                 spin_lock_bh(&mfc_unres_lock);
1166                 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
1167                         list_del(&c->list);
1168                         ipmr_destroy_unres(mrt, c);
1169                 }
1170                 spin_unlock_bh(&mfc_unres_lock);
1171         }
1172 }
1173
1174 /* Called from ip_ra_control(), before an RCU grace period;
1175  * we don't need to call synchronize_rcu() here.
1176  */
1177 static void mrtsock_destruct(struct sock *sk)
1178 {
1179         struct net *net = sock_net(sk);
1180         struct mr_table *mrt;
1181
1182         rtnl_lock();
1183         ipmr_for_each_table(mrt, net) {
1184                 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1185                         IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1186                         rcu_assign_pointer(mrt->mroute_sk, NULL);
1187                         mroute_clean_tables(mrt);
1188                 }
1189         }
1190         rtnl_unlock();
1191 }
1192
1193 /*
1194  *      Socket options and virtual interface manipulation. The whole
1195  *      virtual interface system is a complete heap, but unfortunately
1196  *      that's how BSD mrouted happens to think. Maybe one day with a proper
1197  *      MOSPF/PIM router set up we can clean this up.
1198  */
1199
1200 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
1201 {
1202         int ret;
1203         struct vifctl vif;
1204         struct mfcctl mfc;
1205         struct net *net = sock_net(sk);
1206         struct mr_table *mrt;
1207
1208         mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1209         if (mrt == NULL)
1210                 return -ENOENT;
1211
1212         if (optname != MRT_INIT) {
1213                 if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
1214                     !capable(CAP_NET_ADMIN))
1215                         return -EACCES;
1216         }
1217
1218         switch (optname) {
1219         case MRT_INIT:
1220                 if (sk->sk_type != SOCK_RAW ||
1221                     inet_sk(sk)->inet_num != IPPROTO_IGMP)
1222                         return -EOPNOTSUPP;
1223                 if (optlen != sizeof(int))
1224                         return -ENOPROTOOPT;
1225
1226                 rtnl_lock();
1227                 if (rtnl_dereference(mrt->mroute_sk)) {
1228                         rtnl_unlock();
1229                         return -EADDRINUSE;
1230                 }
1231
1232                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
1233                 if (ret == 0) {
1234                         rcu_assign_pointer(mrt->mroute_sk, sk);
1235                         IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1236                 }
1237                 rtnl_unlock();
1238                 return ret;
1239         case MRT_DONE:
1240                 if (sk != rcu_dereference_raw(mrt->mroute_sk))
1241                         return -EACCES;
1242                 return ip_ra_control(sk, 0, NULL);
1243         case MRT_ADD_VIF:
1244         case MRT_DEL_VIF:
1245                 if (optlen != sizeof(vif))
1246                         return -EINVAL;
1247                 if (copy_from_user(&vif, optval, sizeof(vif)))
1248                         return -EFAULT;
1249                 if (vif.vifc_vifi >= MAXVIFS)
1250                         return -ENFILE;
1251                 rtnl_lock();
1252                 if (optname == MRT_ADD_VIF) {
1253                         ret = vif_add(net, mrt, &vif,
1254                                       sk == rtnl_dereference(mrt->mroute_sk));
1255                 } else {
1256                         ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1257                 }
1258                 rtnl_unlock();
1259                 return ret;
1260
1261                 /*
1262                  *      Manipulate the forwarding caches. These live
1263                  *      in a sort of kernel/user symbiosis.
1264                  */
1265         case MRT_ADD_MFC:
1266         case MRT_DEL_MFC:
1267                 if (optlen != sizeof(mfc))
1268                         return -EINVAL;
1269                 if (copy_from_user(&mfc, optval, sizeof(mfc)))
1270                         return -EFAULT;
1271                 rtnl_lock();
1272                 if (optname == MRT_DEL_MFC)
1273                         ret = ipmr_mfc_delete(mrt, &mfc);
1274                 else
1275                         ret = ipmr_mfc_add(net, mrt, &mfc,
1276                                            sk == rtnl_dereference(mrt->mroute_sk));
1277                 rtnl_unlock();
1278                 return ret;
1279                 /*
1280                  *      Control PIM assert.
1281                  */
1282         case MRT_ASSERT:
1283         {
1284                 int v;
1285                 if (get_user(v,(int __user *)optval))
1286                         return -EFAULT;
1287                 mrt->mroute_do_assert = (v) ? 1 : 0;
1288                 return 0;
1289         }
1290 #ifdef CONFIG_IP_PIMSM
1291         case MRT_PIM:
1292         {
1293                 int v;
1294
1295                 if (get_user(v,(int __user *)optval))
1296                         return -EFAULT;
1297                 v = (v) ? 1 : 0;
1298
1299                 rtnl_lock();
1300                 ret = 0;
1301                 if (v != mrt->mroute_do_pim) {
1302                         mrt->mroute_do_pim = v;
1303                         mrt->mroute_do_assert = v;
1304                 }
1305                 rtnl_unlock();
1306                 return ret;
1307         }
1308 #endif
1309 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
1310         case MRT_TABLE:
1311         {
1312                 u32 v;
1313
1314                 if (optlen != sizeof(u32))
1315                         return -EINVAL;
1316                 if (get_user(v, (u32 __user *)optval))
1317                         return -EFAULT;
1318
1319                 rtnl_lock();
1320                 ret = 0;
1321                 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1322                         ret = -EBUSY;
1323                 } else {
1324                         if (!ipmr_new_table(net, v))
1325                                 ret = -ENOMEM;
1326                         raw_sk(sk)->ipmr_table = v;
1327                 }
1328                 rtnl_unlock();
1329                 return ret;
1330         }
1331 #endif
1332         /*
1333          *      Spurious command, or MRT_VERSION which you cannot
1334          *      set.
1335          */
1336         default:
1337                 return -ENOPROTOOPT;
1338         }
1339 }
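
To make the kernel/user symbiosis mentioned above concrete, a minimal userspace sequence driving this setsockopt interface might look like the sketch below. It is illustrative only: it must run with CAP_NET_ADMIN, the addresses and vif numbers are placeholders, error handling is abbreviated, and a real daemon (mrouted, pimd) does considerably more.

/* Hedged sketch of the userspace side of ip_mroute_setsockopt(). */
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute.h>

int main(void)
{
	/* The control socket must be a raw IGMP socket (checked under MRT_INIT). */
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	int one = 1;
	struct vifctl vif;
	struct mfcctl mfc;

	if (fd < 0 || setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0)
		return 1;

	/* Add vif 0 on a placeholder local address. */
	memset(&vif, 0, sizeof(vif));
	vif.vifc_vifi = 0;
	vif.vifc_threshold = 1;
	inet_pton(AF_INET, "192.0.2.1", &vif.vifc_lcl_addr);
	if (setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vif, sizeof(vif)) < 0)
		return 1;

	/* Add an (S,G) entry: packets from 198.51.100.7 to 239.1.2.3 arriving
	 * on vif 0 are forwarded out vif 1 (assumed to have been added the
	 * same way) when their TTL exceeds 1. */
	memset(&mfc, 0, sizeof(mfc));
	inet_pton(AF_INET, "198.51.100.7", &mfc.mfcc_origin);
	inet_pton(AF_INET, "239.1.2.3", &mfc.mfcc_mcastgrp);
	mfc.mfcc_parent = 0;
	mfc.mfcc_ttls[1] = 1;
	if (setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC, &mfc, sizeof(mfc)) < 0)
		return 1;

	/* ... run the routing daemon's main loop, then tear down. */
	setsockopt(fd, IPPROTO_IP, MRT_DONE, NULL, 0);
	close(fd);
	return 0;
}
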
1340
1341 /*
1342  *      Getsockopt support for the multicast routing system.
1343  */
1344
1345 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1346 {
1347         int olr;
1348         int val;
1349         struct net *net = sock_net(sk);
1350         struct mr_table *mrt;
1351
1352         mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1353         if (mrt == NULL)
1354                 return -ENOENT;
1355
1356         if (optname != MRT_VERSION &&
1357 #ifdef CONFIG_IP_PIMSM
1358            optname!=MRT_PIM &&
1359 #endif
1360            optname!=MRT_ASSERT)
1361                 return -ENOPROTOOPT;
1362
1363         if (get_user(olr, optlen))
1364                 return -EFAULT;
1365
1366         if (olr < 0)
1367                 return -EINVAL;
1368         olr = min_t(unsigned int, olr, sizeof(int));
1369
1370         if (put_user(olr, optlen))
1371                 return -EFAULT;
1372         if (optname == MRT_VERSION)
1373                 val = 0x0305;
1374 #ifdef CONFIG_IP_PIMSM
1375         else if (optname == MRT_PIM)
1376                 val = mrt->mroute_do_pim;
1377 #endif
1378         else
1379                 val = mrt->mroute_do_assert;
1380         if (copy_to_user(optval, &val, olr))
1381                 return -EFAULT;
1382         return 0;
1383 }
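
A matching query from userspace, again on the MRT_INIT control socket, reads back one of the three readable options; a small illustrative example for MRT_VERSION (which the code above reports as 0x0305):

/* Hedged sketch: read the mroute API version via getsockopt(). */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute.h>

static void print_mrt_version(int mroute_fd)
{
	int ver = 0;
	socklen_t len = sizeof(ver);

	if (getsockopt(mroute_fd, IPPROTO_IP, MRT_VERSION, &ver, &len) == 0)
		printf("mroute API version 0x%04x\n", ver);
}
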
1384
1385 /*
1386  *      The IP multicast ioctl support routines.
1387  */
1388
1389 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1390 {
1391         struct sioc_sg_req sr;
1392         struct sioc_vif_req vr;
1393         struct vif_device *vif;
1394         struct mfc_cache *c;
1395         struct net *net = sock_net(sk);
1396         struct mr_table *mrt;
1397
1398         mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1399         if (mrt == NULL)
1400                 return -ENOENT;
1401
1402         switch (cmd) {
1403         case SIOCGETVIFCNT:
1404                 if (copy_from_user(&vr, arg, sizeof(vr)))
1405                         return -EFAULT;
1406                 if (vr.vifi >= mrt->maxvif)
1407                         return -EINVAL;
1408                 read_lock(&mrt_lock);
1409                 vif = &mrt->vif_table[vr.vifi];
1410                 if (VIF_EXISTS(mrt, vr.vifi)) {
1411                         vr.icount = vif->pkt_in;
1412                         vr.ocount = vif->pkt_out;
1413                         vr.ibytes = vif->bytes_in;
1414                         vr.obytes = vif->bytes_out;
1415                         read_unlock(&mrt_lock);
1416
1417                         if (copy_to_user(arg, &vr, sizeof(vr)))
1418                                 return -EFAULT;
1419                         return 0;
1420                 }
1421                 read_unlock(&mrt_lock);
1422                 return -EADDRNOTAVAIL;
1423         case SIOCGETSGCNT:
1424                 if (copy_from_user(&sr, arg, sizeof(sr)))
1425                         return -EFAULT;
1426
1427                 rcu_read_lock();
1428                 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1429                 if (c) {
1430                         sr.pktcnt = c->mfc_un.res.pkt;
1431                         sr.bytecnt = c->mfc_un.res.bytes;
1432                         sr.wrong_if = c->mfc_un.res.wrong_if;
1433                         rcu_read_unlock();
1434
1435                         if (copy_to_user(arg, &sr, sizeof(sr)))
1436                                 return -EFAULT;
1437                         return 0;
1438                 }
1439                 rcu_read_unlock();
1440                 return -EADDRNOTAVAIL;
1441         default:
1442                 return -ENOIOCTLCMD;
1443         }
1444 }
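
The per-(S,G) counters kept in mfc_un.res and returned by the SIOCGETSGCNT branch above can be queried from userspace as in this hedged, illustrative sketch:

/* Illustrative sketch: read packet/byte counters for one (S,G) flow. */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/ioctl.h>
#include <netinet/in.h>
#include <linux/mroute.h>

static void print_sg_counters(int mroute_fd, const char *src, const char *grp)
{
	struct sioc_sg_req sr;

	memset(&sr, 0, sizeof(sr));
	inet_pton(AF_INET, src, &sr.src);
	inet_pton(AF_INET, grp, &sr.grp);

	if (ioctl(mroute_fd, SIOCGETSGCNT, &sr) == 0)
		printf("(%s,%s): %lu pkts, %lu bytes, %lu arrived on wrong vif\n",
		       src, grp, sr.pktcnt, sr.bytecnt, sr.wrong_if);
}
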
1445
1446
1447 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1448 {
1449         struct net_device *dev = ptr;
1450         struct net *net = dev_net(dev);
1451         struct mr_table *mrt;
1452         struct vif_device *v;
1453         int ct;
1454         LIST_HEAD(list);
1455
1456         if (event != NETDEV_UNREGISTER)
1457                 return NOTIFY_DONE;
1458
1459         ipmr_for_each_table(mrt, net) {
1460                 v = &mrt->vif_table[0];
1461                 for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1462                         if (v->dev == dev)
1463                                 vif_delete(mrt, ct, 1, &list);
1464                 }
1465         }
1466         unregister_netdevice_many(&list);
1467         return NOTIFY_DONE;
1468 }
1469
1470
1471 static struct notifier_block ip_mr_notifier = {
1472         .notifier_call = ipmr_device_event,
1473 };
1474
1475 /*
1476  *      Encapsulate a packet by attaching a valid IPIP header to it.
1477  *      This avoids tunnel drivers and other mess and gives us the speed so
1478  *      important for multicast video.
1479  */
1480
1481 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1482 {
1483         struct iphdr *iph;
1484         struct iphdr *old_iph = ip_hdr(skb);
1485
1486         skb_push(skb, sizeof(struct iphdr));
1487         skb->transport_header = skb->network_header;
1488         skb_reset_network_header(skb);
1489         iph = ip_hdr(skb);
1490
1491         iph->version    =       4;
1492         iph->tos        =       old_iph->tos;
1493         iph->ttl        =       old_iph->ttl;
1494         iph->frag_off   =       0;
1495         iph->daddr      =       daddr;
1496         iph->saddr      =       saddr;
1497         iph->protocol   =       IPPROTO_IPIP;
1498         iph->ihl        =       5;
1499         iph->tot_len    =       htons(skb->len);
1500         ip_select_ident(iph, skb_dst(skb), NULL);
1501         ip_send_check(iph);
1502
1503         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1504         nf_reset(skb);
1505 }
1506
1507 static inline int ipmr_forward_finish(struct sk_buff *skb)
1508 {
1509         struct ip_options * opt = &(IPCB(skb)->opt);
1510
1511         IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1512
1513         if (unlikely(opt->optlen))
1514                 ip_forward_options(skb);
1515
1516         return dst_output(skb);
1517 }
1518
1519 /*
1520  *      Processing handlers for ipmr_forward
1521  */
1522
1523 static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1524                             struct sk_buff *skb, struct mfc_cache *c, int vifi)
1525 {
1526         const struct iphdr *iph = ip_hdr(skb);
1527         struct vif_device *vif = &mrt->vif_table[vifi];
1528         struct net_device *dev;
1529         struct rtable *rt;
1530         int    encap = 0;
1531
1532         if (vif->dev == NULL)
1533                 goto out_free;
1534
1535 #ifdef CONFIG_IP_PIMSM
1536         if (vif->flags & VIFF_REGISTER) {
1537                 vif->pkt_out++;
1538                 vif->bytes_out += skb->len;
1539                 vif->dev->stats.tx_bytes += skb->len;
1540                 vif->dev->stats.tx_packets++;
1541                 ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1542                 goto out_free;
1543         }
1544 #endif
1545
1546         if (vif->flags&VIFF_TUNNEL) {
1547                 struct flowi fl = { .oif = vif->link,
1548                                     .nl_u = { .ip4_u =
1549                                               { .daddr = vif->remote,
1550                                                 .saddr = vif->local,
1551                                                 .tos = RT_TOS(iph->tos) } },
1552                                     .proto = IPPROTO_IPIP };
1553                 if (ip_route_output_key(net, &rt, &fl))
1554                         goto out_free;
1555                 encap = sizeof(struct iphdr);
1556         } else {
1557                 struct flowi fl = { .oif = vif->link,
1558                                     .nl_u = { .ip4_u =
1559                                               { .daddr = iph->daddr,
1560                                                 .tos = RT_TOS(iph->tos) } },
1561                                     .proto = IPPROTO_IPIP };
1562                 if (ip_route_output_key(net, &rt, &fl))
1563                         goto out_free;
1564         }
1565
1566         dev = rt->dst.dev;
1567
1568         if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1569                 /* Do not fragment multicasts. Alas, IPv4 does not
1570                    allow us to send ICMP here, so such packets simply
1571                    disappear into a black hole.
1572                  */
1573
1574                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1575                 ip_rt_put(rt);
1576                 goto out_free;
1577         }
1578
1579         encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
1580
1581         if (skb_cow(skb, encap)) {
1582                 ip_rt_put(rt);
1583                 goto out_free;
1584         }
1585
1586         vif->pkt_out++;
1587         vif->bytes_out += skb->len;
1588
1589         skb_dst_drop(skb);
1590         skb_dst_set(skb, &rt->dst);
1591         ip_decrease_ttl(ip_hdr(skb));
1592
1593         /* FIXME: forward and output firewalls used to be called here.
1594          * What do we do with netfilter? -- RR */
1595         if (vif->flags & VIFF_TUNNEL) {
1596                 ip_encap(skb, vif->local, vif->remote);
1597                 /* FIXME: extra output firewall step used to be here. --RR */
1598                 vif->dev->stats.tx_packets++;
1599                 vif->dev->stats.tx_bytes += skb->len;
1600         }
1601
1602         IPCB(skb)->flags |= IPSKB_FORWARDED;
1603
1604         /*
1605          * RFC1584 teaches that a DVMRP/PIM router must deliver packets locally
1606          * not only before forwarding, but also after forwarding on all output
1607          * interfaces. Clearly, if the mrouter runs a multicasting
1608          * program, that program should receive packets regardless of which
1609          * interface it joined on.
1610          * If we do not do this, the program would have to join on all
1611          * interfaces. On the other hand, a multihoming host (or router, but
1612          * not an mrouter) cannot join on more than one interface - it would
1613          * result in receiving multiple copies of each packet.
1614          */
1615         NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
1616                 ipmr_forward_finish);
1617         return;
1618
1619 out_free:
1620         kfree_skb(skb);
1621 }
1622
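/* Find the vif index that owns @dev, or -1 if the device is not a vif
 * of this table.  The callers in this file hold mrt_lock while using
 * the returned index.
 */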
1623 static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1624 {
1625         int ct;
1626
1627         for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1628                 if (mrt->vif_table[ct].dev == dev)
1629                         break;
1630         }
1631         return ct;
1632 }
1633
1634 /* "local" means that we should preserve one skb (for local delivery) */
1635
1636 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1637                          struct sk_buff *skb, struct mfc_cache *cache,
1638                          int local)
1639 {
1640         int psend = -1;
1641         int vif, ct;
1642
1643         vif = cache->mfc_parent;
1644         cache->mfc_un.res.pkt++;
1645         cache->mfc_un.res.bytes += skb->len;
1646
1647         /*
1648          * Wrong interface: drop packet and (maybe) send PIM assert.
1649          */
1650         if (mrt->vif_table[vif].dev != skb->dev) {
1651                 int true_vifi;
1652
1653                 if (skb_rtable(skb)->fl.iif == 0) {
1654                         /* It is our own packet, looped back.
1655                            Very complicated situation...
1656
1657                            The best workaround, until the routing daemons are
1658                            fixed, is not to redistribute a packet if it was
1659                            sent through the wrong interface. It means that
1660                            multicast applications WILL NOT work for
1661                            (S,G) entries whose default multicast route points
1662                            to the wrong oif. In any case, it is not a good
1663                            idea to run multicasting applications on a router.
1664                          */
1665                         goto dont_forward;
1666                 }
1667
1668                 cache->mfc_un.res.wrong_if++;
1669                 true_vifi = ipmr_find_vif(mrt, skb->dev);
1670
1671                 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1672                     /* PIM-SM uses asserts when switching from the RPT to the SPT,
1673                        so we cannot check that the packet arrived on an oif.
1674                        That is bad, but otherwise we would need to move a pretty
1675                        large chunk of pimd into the kernel. Ough... --ANK
1676                      */
1677                     (mrt->mroute_do_pim ||
1678                      cache->mfc_un.res.ttls[true_vifi] < 255) &&
1679                     time_after(jiffies,
1680                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1681                         cache->mfc_un.res.last_assert = jiffies;
1682                         ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1683                 }
1684                 goto dont_forward;
1685         }
1686
1687         mrt->vif_table[vif].pkt_in++;
1688         mrt->vif_table[vif].bytes_in += skb->len;
1689
1690         /*
1691          *      Forward the frame
1692          */
1693         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1694                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1695                         if (psend != -1) {
1696                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1697                                 if (skb2)
1698                                         ipmr_queue_xmit(net, mrt, skb2, cache,
1699                                                         psend);
1700                         }
1701                         psend = ct;
1702                 }
1703         }
1704         if (psend != -1) {
1705                 if (local) {
1706                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1707                         if (skb2)
1708                                 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1709                 } else {
1710                         ipmr_queue_xmit(net, mrt, skb, cache, psend);
1711                         return 0;
1712                 }
1713         }
1714
1715 dont_forward:
1716         if (!local)
1717                 kfree_skb(skb);
1718         return 0;
1719 }
1720
1721
1722 /*
1723  *      Multicast packets for forwarding arrive here
1724  *      Called with rcu_read_lock();
1725  */
1726
1727 int ip_mr_input(struct sk_buff *skb)
1728 {
1729         struct mfc_cache *cache;
1730         struct net *net = dev_net(skb->dev);
1731         int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1732         struct mr_table *mrt;
1733         int err;
1734
1735         /* The packet was looped back after forwarding; it must not be
1736            forwarded a second time, but it can still be delivered locally.
1737          */
1738         if (IPCB(skb)->flags & IPSKB_FORWARDED)
1739                 goto dont_forward;
1740
1741         err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1742         if (err < 0) {
1743                 kfree_skb(skb);
1744                 return err;
1745         }
1746
1747         if (!local) {
1748                 if (IPCB(skb)->opt.router_alert) {
1749                         if (ip_call_ra_chain(skb))
1750                                 return 0;
1751                 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1752                         /* IGMPv1 (and broken IGMPv2 implementations such as
1753                          * Cisco IOS <= 11.2(8)) do not put the router alert
1754                          * option in IGMP packets destined to routable
1755                          * groups. That is very bad, because it means
1756                          * that we can forward NO IGMP messages.
1757                          */
1758                         struct sock *mroute_sk;
1759
1760                         mroute_sk = rcu_dereference(mrt->mroute_sk);
1761                         if (mroute_sk) {
1762                                 nf_reset(skb);
1763                                 raw_rcv(mroute_sk, skb);
1764                                 return 0;
1765                         }
1766                 }
1767         }
1768
1769         /* already under rcu_read_lock() */
1770         cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1771
1772         /*
1773          *      No usable cache entry
1774          */
1775         if (cache == NULL) {
1776                 int vif;
1777
1778                 if (local) {
1779                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1780                         ip_local_deliver(skb);
1781                         if (skb2 == NULL)
1782                                 return -ENOBUFS;
1783                         skb = skb2;
1784                 }
1785
1786                 read_lock(&mrt_lock);
1787                 vif = ipmr_find_vif(mrt, skb->dev);
1788                 if (vif >= 0) {
1789                         int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1790                         read_unlock(&mrt_lock);
1791
1792                         return err2;
1793                 }
1794                 read_unlock(&mrt_lock);
1795                 kfree_skb(skb);
1796                 return -ENODEV;
1797         }
1798
1799         read_lock(&mrt_lock);
1800         ip_mr_forward(net, mrt, skb, cache, local);
1801         read_unlock(&mrt_lock);
1802
1803         if (local)
1804                 return ip_local_deliver(skb);
1805
1806         return 0;
1807
1808 dont_forward:
1809         if (local)
1810                 return ip_local_deliver(skb);
1811         kfree_skb(skb);
1812         return 0;
1813 }
1814
1815 #ifdef CONFIG_IP_PIMSM
1816 /* called with rcu_read_lock() */
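/* Common PIM REGISTER decapsulation: validate the inner IP header, then
 * re-inject the encapsulated multicast packet on the register vif via
 * netif_rx().  Returns non-zero if the caller should free the skb.
 */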
1817 static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1818                      unsigned int pimlen)
1819 {
1820         struct net_device *reg_dev = NULL;
1821         struct iphdr *encap;
1822
1823         encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1824         /*
1825            Check that:
1826            a. packet is really destined to a multicast group
1827            b. packet is not a NULL-REGISTER
1828            c. packet is not truncated
1829          */
1830         if (!ipv4_is_multicast(encap->daddr) ||
1831             encap->tot_len == 0 ||
1832             ntohs(encap->tot_len) + pimlen > skb->len)
1833                 return 1;
1834
1835         read_lock(&mrt_lock);
1836         if (mrt->mroute_reg_vif_num >= 0)
1837                 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1838         read_unlock(&mrt_lock);
1839
1840         if (reg_dev == NULL)
1841                 return 1;
1842
1843         skb->mac_header = skb->network_header;
1844         skb_pull(skb, (u8 *)encap - skb->data);
1845         skb_reset_network_header(skb);
1846         skb->protocol = htons(ETH_P_IP);
1847         skb->ip_summed = CHECKSUM_NONE;
1848         skb->pkt_type = PACKET_HOST;
1849
1850         skb_tunnel_rx(skb, reg_dev);
1851
1852         netif_rx(skb);
1853
1854         return NET_RX_SUCCESS;
1855 }
1856 #endif
1857
1858 #ifdef CONFIG_IP_PIMSM_V1
1859 /*
1860  * Handle IGMP messages of PIMv1
1861  */
1862
1863 int pim_rcv_v1(struct sk_buff *skb)
1864 {
1865         struct igmphdr *pim;
1866         struct net *net = dev_net(skb->dev);
1867         struct mr_table *mrt;
1868
1869         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1870                 goto drop;
1871
1872         pim = igmp_hdr(skb);
1873
1874         if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1875                 goto drop;
1876
1877         if (!mrt->mroute_do_pim ||
1878             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1879                 goto drop;
1880
1881         if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1882 drop:
1883                 kfree_skb(skb);
1884         }
1885         return 0;
1886 }
1887 #endif
1888
1889 #ifdef CONFIG_IP_PIMSM_V2
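/* PIMv2 REGISTER handler: check the pimreghdr type, flags and checksum,
 * look up the multicast routing table and decapsulate via __pim_rcv().
 */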
1890 static int pim_rcv(struct sk_buff *skb)
1891 {
1892         struct pimreghdr *pim;
1893         struct net *net = dev_net(skb->dev);
1894         struct mr_table *mrt;
1895
1896         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1897                 goto drop;
1898
1899         pim = (struct pimreghdr *)skb_transport_header(skb);
1900         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1901             (pim->flags&PIM_NULL_REGISTER) ||
1902             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1903              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1904                 goto drop;
1905
1906         if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1907                 goto drop;
1908
1909         if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1910 drop:
1911                 kfree_skb(skb);
1912         }
1913         return 0;
1914 }
1915 #endif
1916
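/* Fill the RTA_IIF and RTA_MULTIPATH attributes of an rtnetlink message
 * from a resolved cache entry.  Returns -ENOENT for unresolved entries.
 */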
1917 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1918                               struct mfc_cache *c, struct rtmsg *rtm)
1919 {
1920         int ct;
1921         struct rtnexthop *nhp;
1922         u8 *b = skb_tail_pointer(skb);
1923         struct rtattr *mp_head;
1924
1925         /* If cache is unresolved, don't try to parse IIF and OIF */
1926         if (c->mfc_parent >= MAXVIFS)
1927                 return -ENOENT;
1928
1929         if (VIF_EXISTS(mrt, c->mfc_parent))
1930                 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1931
1932         mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1933
1934         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1935                 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1936                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1937                                 goto rtattr_failure;
1938                         nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1939                         nhp->rtnh_flags = 0;
1940                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1941                         nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1942                         nhp->rtnh_len = sizeof(*nhp);
1943                 }
1944         }
1945         mp_head->rta_type = RTA_MULTIPATH;
1946         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1947         rtm->rtm_type = RTN_MULTICAST;
1948         return 1;
1949
1950 rtattr_failure:
1951         nlmsg_trim(skb, b);
1952         return -EMSGSIZE;
1953 }
1954
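/* Resolve multicast routing information for an RTM_GETROUTE request.
 * If no cache entry exists and @nowait is zero, a copy of the packet is
 * queued as unresolved so that the routing daemon can create one.
 */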
1955 int ipmr_get_route(struct net *net,
1956                    struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1957 {
1958         int err;
1959         struct mr_table *mrt;
1960         struct mfc_cache *cache;
1961         struct rtable *rt = skb_rtable(skb);
1962
1963         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1964         if (mrt == NULL)
1965                 return -ENOENT;
1966
1967         rcu_read_lock();
1968         cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1969
1970         if (cache == NULL) {
1971                 struct sk_buff *skb2;
1972                 struct iphdr *iph;
1973                 struct net_device *dev;
1974                 int vif;
1975
1976                 if (nowait) {
1977                         rcu_read_unlock();
1978                         return -EAGAIN;
1979                 }
1980
1981                 dev = skb->dev;
1982                 read_lock(&mrt_lock);
1983                 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
1984                         read_unlock(&mrt_lock);
1985                         rcu_read_unlock();
1986                         return -ENODEV;
1987                 }
1988                 skb2 = skb_clone(skb, GFP_ATOMIC);
1989                 if (!skb2) {
1990                         read_unlock(&mrt_lock);
1991                         rcu_read_unlock();
1992                         return -ENOMEM;
1993                 }
1994
1995                 skb_push(skb2, sizeof(struct iphdr));
1996                 skb_reset_network_header(skb2);
1997                 iph = ip_hdr(skb2);
1998                 iph->ihl = sizeof(struct iphdr) >> 2;
1999                 iph->saddr = rt->rt_src;
2000                 iph->daddr = rt->rt_dst;
2001                 iph->version = 0;
2002                 err = ipmr_cache_unresolved(mrt, vif, skb2);
2003                 read_unlock(&mrt_lock);
2004                 rcu_read_unlock();
2005                 return err;
2006         }
2007
2008         read_lock(&mrt_lock);
2009         if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
2010                 cache->mfc_flags |= MFC_NOTIFY;
2011         err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
2012         read_unlock(&mrt_lock);
2013         rcu_read_unlock();
2014         return err;
2015 }
2016
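/* Build one RTM_NEWROUTE netlink message for a cache entry; used by the
 * RTM_GETROUTE dump callback below.
 */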
2017 static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2018                             u32 pid, u32 seq, struct mfc_cache *c)
2019 {
2020         struct nlmsghdr *nlh;
2021         struct rtmsg *rtm;
2022
2023         nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2024         if (nlh == NULL)
2025                 return -EMSGSIZE;
2026
2027         rtm = nlmsg_data(nlh);
2028         rtm->rtm_family   = RTNL_FAMILY_IPMR;
2029         rtm->rtm_dst_len  = 32;
2030         rtm->rtm_src_len  = 32;
2031         rtm->rtm_tos      = 0;
2032         rtm->rtm_table    = mrt->id;
2033         NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2034         rtm->rtm_type     = RTN_MULTICAST;
2035         rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
2036         rtm->rtm_protocol = RTPROT_UNSPEC;
2037         rtm->rtm_flags    = 0;
2038
2039         NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2040         NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2041
2042         if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2043                 goto nla_put_failure;
2044
2045         return nlmsg_end(skb, nlh);
2046
2047 nla_put_failure:
2048         nlmsg_cancel(skb, nlh);
2049         return -EMSGSIZE;
2050 }
2051
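/* Dump all resolved cache entries of all tables to netlink.  The current
 * table, hash line and entry are kept in cb->args[] so that a partial
 * dump can be resumed.
 */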
2052 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2053 {
2054         struct net *net = sock_net(skb->sk);
2055         struct mr_table *mrt;
2056         struct mfc_cache *mfc;
2057         unsigned int t = 0, s_t;
2058         unsigned int h = 0, s_h;
2059         unsigned int e = 0, s_e;
2060
2061         s_t = cb->args[0];
2062         s_h = cb->args[1];
2063         s_e = cb->args[2];
2064
2065         rcu_read_lock();
2066         ipmr_for_each_table(mrt, net) {
2067                 if (t < s_t)
2068                         goto next_table;
2069                 if (t > s_t)
2070                         s_h = 0;
2071                 for (h = s_h; h < MFC_LINES; h++) {
2072                         list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
2073                                 if (e < s_e)
2074                                         goto next_entry;
2075                                 if (ipmr_fill_mroute(mrt, skb,
2076                                                      NETLINK_CB(cb->skb).pid,
2077                                                      cb->nlh->nlmsg_seq,
2078                                                      mfc) < 0)
2079                                         goto done;
2080 next_entry:
2081                                 e++;
2082                         }
2083                         e = s_e = 0;
2084                 }
2085                 s_h = 0;
2086 next_table:
2087                 t++;
2088         }
2089 done:
2090         rcu_read_unlock();
2091
2092         cb->args[2] = e;
2093         cb->args[1] = h;
2094         cb->args[0] = t;
2095
2096         return skb->len;
2097 }
2098
2099 #ifdef CONFIG_PROC_FS
2100 /*
2101  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
2102  */
2103 struct ipmr_vif_iter {
2104         struct seq_net_private p;
2105         struct mr_table *mrt;
2106         int ct;
2107 };
2108
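/* /proc/net/ip_mr_vif iterator: walks the vif_table of the default table
 * under mrt_lock.
 */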
2109 static struct vif_device *ipmr_vif_seq_idx(struct net *net,
2110                                            struct ipmr_vif_iter *iter,
2111                                            loff_t pos)
2112 {
2113         struct mr_table *mrt = iter->mrt;
2114
2115         for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2116                 if (!VIF_EXISTS(mrt, iter->ct))
2117                         continue;
2118                 if (pos-- == 0)
2119                         return &mrt->vif_table[iter->ct];
2120         }
2121         return NULL;
2122 }
2123
2124 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2125         __acquires(mrt_lock)
2126 {
2127         struct ipmr_vif_iter *iter = seq->private;
2128         struct net *net = seq_file_net(seq);
2129         struct mr_table *mrt;
2130
2131         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2132         if (mrt == NULL)
2133                 return ERR_PTR(-ENOENT);
2134
2135         iter->mrt = mrt;
2136
2137         read_lock(&mrt_lock);
2138         return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
2139                 : SEQ_START_TOKEN;
2140 }
2141
2142 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2143 {
2144         struct ipmr_vif_iter *iter = seq->private;
2145         struct net *net = seq_file_net(seq);
2146         struct mr_table *mrt = iter->mrt;
2147
2148         ++*pos;
2149         if (v == SEQ_START_TOKEN)
2150                 return ipmr_vif_seq_idx(net, iter, 0);
2151
2152         while (++iter->ct < mrt->maxvif) {
2153                 if (!VIF_EXISTS(mrt, iter->ct))
2154                         continue;
2155                 return &mrt->vif_table[iter->ct];
2156         }
2157         return NULL;
2158 }
2159
2160 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
2161         __releases(mrt_lock)
2162 {
2163         read_unlock(&mrt_lock);
2164 }
2165
2166 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2167 {
2168         struct ipmr_vif_iter *iter = seq->private;
2169         struct mr_table *mrt = iter->mrt;
2170
2171         if (v == SEQ_START_TOKEN) {
2172                 seq_puts(seq,
2173                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
2174         } else {
2175                 const struct vif_device *vif = v;
2176                 const char *name =  vif->dev ? vif->dev->name : "none";
2177
2178                 seq_printf(seq,
2179                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2180                            vif - mrt->vif_table,
2181                            name, vif->bytes_in, vif->pkt_in,
2182                            vif->bytes_out, vif->pkt_out,
2183                            vif->flags, vif->local, vif->remote);
2184         }
2185         return 0;
2186 }
2187
2188 static const struct seq_operations ipmr_vif_seq_ops = {
2189         .start = ipmr_vif_seq_start,
2190         .next  = ipmr_vif_seq_next,
2191         .stop  = ipmr_vif_seq_stop,
2192         .show  = ipmr_vif_seq_show,
2193 };
2194
2195 static int ipmr_vif_open(struct inode *inode, struct file *file)
2196 {
2197         return seq_open_net(inode, file, &ipmr_vif_seq_ops,
2198                             sizeof(struct ipmr_vif_iter));
2199 }
2200
2201 static const struct file_operations ipmr_vif_fops = {
2202         .owner   = THIS_MODULE,
2203         .open    = ipmr_vif_open,
2204         .read    = seq_read,
2205         .llseek  = seq_lseek,
2206         .release = seq_release_net,
2207 };
2208
2209 struct ipmr_mfc_iter {
2210         struct seq_net_private p;
2211         struct mr_table *mrt;
2212         struct list_head *cache;
2213         int ct;
2214 };
2215
2216
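/* /proc/net/ip_mr_cache iterator: resolved entries are walked under
 * rcu_read_lock(), the unresolved queue under mfc_unres_lock.
 */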
2217 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2218                                           struct ipmr_mfc_iter *it, loff_t pos)
2219 {
2220         struct mr_table *mrt = it->mrt;
2221         struct mfc_cache *mfc;
2222
2223         rcu_read_lock();
2224         for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2225                 it->cache = &mrt->mfc_cache_array[it->ct];
2226                 list_for_each_entry_rcu(mfc, it->cache, list)
2227                         if (pos-- == 0)
2228                                 return mfc;
2229         }
2230         rcu_read_unlock();
2231
2232         spin_lock_bh(&mfc_unres_lock);
2233         it->cache = &mrt->mfc_unres_queue;
2234         list_for_each_entry(mfc, it->cache, list)
2235                 if (pos-- == 0)
2236                         return mfc;
2237         spin_unlock_bh(&mfc_unres_lock);
2238
2239         it->cache = NULL;
2240         return NULL;
2241 }
2242
2243
2244 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2245 {
2246         struct ipmr_mfc_iter *it = seq->private;
2247         struct net *net = seq_file_net(seq);
2248         struct mr_table *mrt;
2249
2250         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2251         if (mrt == NULL)
2252                 return ERR_PTR(-ENOENT);
2253
2254         it->mrt = mrt;
2255         it->cache = NULL;
2256         it->ct = 0;
2257         return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
2258                 : SEQ_START_TOKEN;
2259 }
2260
2261 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2262 {
2263         struct mfc_cache *mfc = v;
2264         struct ipmr_mfc_iter *it = seq->private;
2265         struct net *net = seq_file_net(seq);
2266         struct mr_table *mrt = it->mrt;
2267
2268         ++*pos;
2269
2270         if (v == SEQ_START_TOKEN)
2271                 return ipmr_mfc_seq_idx(net, seq->private, 0);
2272
2273         if (mfc->list.next != it->cache)
2274                 return list_entry(mfc->list.next, struct mfc_cache, list);
2275
2276         if (it->cache == &mrt->mfc_unres_queue)
2277                 goto end_of_list;
2278
2279         BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2280
2281         while (++it->ct < MFC_LINES) {
2282                 it->cache = &mrt->mfc_cache_array[it->ct];
2283                 if (list_empty(it->cache))
2284                         continue;
2285                 return list_first_entry(it->cache, struct mfc_cache, list);
2286         }
2287
2288         /* exhausted cache_array, show unresolved */
2289         rcu_read_unlock();
2290         it->cache = &mrt->mfc_unres_queue;
2291         it->ct = 0;
2292
2293         spin_lock_bh(&mfc_unres_lock);
2294         if (!list_empty(it->cache))
2295                 return list_first_entry(it->cache, struct mfc_cache, list);
2296
2297  end_of_list:
2298         spin_unlock_bh(&mfc_unres_lock);
2299         it->cache = NULL;
2300
2301         return NULL;
2302 }
2303
2304 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2305 {
2306         struct ipmr_mfc_iter *it = seq->private;
2307         struct mr_table *mrt = it->mrt;
2308
2309         if (it->cache == &mrt->mfc_unres_queue)
2310                 spin_unlock_bh(&mfc_unres_lock);
2311         else if (it->cache == &mrt->mfc_cache_array[it->ct])
2312                 rcu_read_unlock();
2313 }
2314
2315 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2316 {
2317         int n;
2318
2319         if (v == SEQ_START_TOKEN) {
2320                 seq_puts(seq,
2321                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
2322         } else {
2323                 const struct mfc_cache *mfc = v;
2324                 const struct ipmr_mfc_iter *it = seq->private;
2325                 const struct mr_table *mrt = it->mrt;
2326
2327                 seq_printf(seq, "%08X %08X %-3hd",
2328                            (__force u32) mfc->mfc_mcastgrp,
2329                            (__force u32) mfc->mfc_origin,
2330                            mfc->mfc_parent);
2331
2332                 if (it->cache != &mrt->mfc_unres_queue) {
2333                         seq_printf(seq, " %8lu %8lu %8lu",
2334                                    mfc->mfc_un.res.pkt,
2335                                    mfc->mfc_un.res.bytes,
2336                                    mfc->mfc_un.res.wrong_if);
2337                         for (n = mfc->mfc_un.res.minvif;
2338                              n < mfc->mfc_un.res.maxvif; n++) {
2339                                 if (VIF_EXISTS(mrt, n) &&
2340                                     mfc->mfc_un.res.ttls[n] < 255)
2341                                         seq_printf(seq,
2342                                            " %2d:%-3d",
2343                                            n, mfc->mfc_un.res.ttls[n]);
2344                         }
2345                 } else {
2346                         /* unresolved mfc_caches don't contain
2347                          * pkt, bytes and wrong_if values
2348                          */
2349                         seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
2350                 }
2351                 seq_putc(seq, '\n');
2352         }
2353         return 0;
2354 }
2355
2356 static const struct seq_operations ipmr_mfc_seq_ops = {
2357         .start = ipmr_mfc_seq_start,
2358         .next  = ipmr_mfc_seq_next,
2359         .stop  = ipmr_mfc_seq_stop,
2360         .show  = ipmr_mfc_seq_show,
2361 };
2362
2363 static int ipmr_mfc_open(struct inode *inode, struct file *file)
2364 {
2365         return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
2366                             sizeof(struct ipmr_mfc_iter));
2367 }
2368
2369 static const struct file_operations ipmr_mfc_fops = {
2370         .owner   = THIS_MODULE,
2371         .open    = ipmr_mfc_open,
2372         .read    = seq_read,
2373         .llseek  = seq_lseek,
2374         .release = seq_release_net,
2375 };
2376 #endif
2377
2378 #ifdef CONFIG_IP_PIMSM_V2
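/* IPPROTO_PIM handler, registered by ip_mr_init() below. */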
2379 static const struct net_protocol pim_protocol = {
2380         .handler        =       pim_rcv,
2381         .netns_ok       =       1,
2382 };
2383 #endif
2384
2385
2386 /*
2387  *      Setup for IP multicast routing
2388  */
2389 static int __net_init ipmr_net_init(struct net *net)
2390 {
2391         int err;
2392
2393         err = ipmr_rules_init(net);
2394         if (err < 0)
2395                 goto fail;
2396
2397 #ifdef CONFIG_PROC_FS
2398         err = -ENOMEM;
2399         if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
2400                 goto proc_vif_fail;
2401         if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
2402                 goto proc_cache_fail;
2403 #endif
2404         return 0;
2405
2406 #ifdef CONFIG_PROC_FS
2407 proc_cache_fail:
2408         proc_net_remove(net, "ip_mr_vif");
2409 proc_vif_fail:
2410         ipmr_rules_exit(net);
2411 #endif
2412 fail:
2413         return err;
2414 }
2415
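/* Per-namespace cleanup: remove the /proc entries and free the tables and rules. */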
2416 static void __net_exit ipmr_net_exit(struct net *net)
2417 {
2418 #ifdef CONFIG_PROC_FS
2419         proc_net_remove(net, "ip_mr_cache");
2420         proc_net_remove(net, "ip_mr_vif");
2421 #endif
2422         ipmr_rules_exit(net);
2423 }
2424
2425 static struct pernet_operations ipmr_net_ops = {
2426         .init = ipmr_net_init,
2427         .exit = ipmr_net_exit,
2428 };
2429
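/* Module initialisation: create the mfc_cache slab, register the pernet
 * operations, the netdevice notifier, the PIM protocol handler (if
 * configured) and the RTM_GETROUTE dump hook.
 */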
2430 int __init ip_mr_init(void)
2431 {
2432         int err;
2433
2434         mrt_cachep = kmem_cache_create("ip_mrt_cache",
2435                                        sizeof(struct mfc_cache),
2436                                        0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
2437                                        NULL);
2438         if (!mrt_cachep)
2439                 return -ENOMEM;
2440
2441         err = register_pernet_subsys(&ipmr_net_ops);
2442         if (err)
2443                 goto reg_pernet_fail;
2444
2445         err = register_netdevice_notifier(&ip_mr_notifier);
2446         if (err)
2447                 goto reg_notif_fail;
2448 #ifdef CONFIG_IP_PIMSM_V2
2449         if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2450                 printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2451                 err = -EAGAIN;
2452                 goto add_proto_fail;
2453         }
2454 #endif
2455         rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2456         return 0;
2457
2458 #ifdef CONFIG_IP_PIMSM_V2
2459 add_proto_fail:
2460         unregister_netdevice_notifier(&ip_mr_notifier);
2461 #endif
2462 reg_notif_fail:
2463         unregister_pernet_subsys(&ipmr_net_ops);
2464 reg_pernet_fail:
2465         kmem_cache_destroy(mrt_cachep);
2466         return err;
2467 }