net/ipv4/ip_output.c

/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case the packet is not accepted
 *                                      by output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after a year in coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
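
/*
 * For reference, a sketch of what ip_fast_csum() computes: the RFC 1071
 * 16-bit one's-complement sum over the ihl 32-bit words of the header,
 * folded and inverted.  (Illustrative only; the real helper is
 * arch-optimized assembly.)
 *
 *      u32 sum = 0;
 *      int i;
 *
 *      for (i = 0; i < iph->ihl * 2; i++)
 *              sum += ((const u16 *)iph)[i];
 *      while (sum >> 16)
 *              sum = (sum & 0xffff) + (sum >> 16);
 *      iph->check = (__sum16)~sum;
 */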

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);
        netif_rx(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}

/*
 *              Add an IP header to an skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        iph->tot_len  = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, sk);

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
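
/*
 * A hedged usage sketch (not from this file): connection-oriented callers
 * use this for control packets; e.g. tcp_v4_send_synack() in this kernel
 * era builds the SYN-ACK skb, attaches a routed dst, and then roughly does:
 *
 *      err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 *                                  ireq->rmt_addr, ireq->opt);
 *      err = net_xmit_eval(err);
 *
 * Field names above are illustrative for this era's inet_request_sock.
 */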

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);

        if (rt->rt_type == RTN_MULTICAST)
                IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        else if (rt->rt_type == RTN_BROADCAST)
                IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
               skb->dst->dev->mtu : dst_mtu(skb->dst);
}

static inline int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb->dst->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable*)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that came back after forwarding; ip_mr_input will drop
                   them anyway.  Note that local frames are looped back
                   to be delivered to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (ip_hdr(skb)->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = (struct rtable *) skb->dst;
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                __be32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };

                        /* If this fails, the transport layer's retransmit
                         * mechanism will keep trying until a route appears
                         * or the connection times itself out.
                         */
                        security_sk_classify_flow(sk, &fl);
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        /* The transport layer has already set skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->u.dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);

no_route:
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
        skb_copy_secmark(to, from);
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each one an IP header plus a block of the data
 *      of the original datagram) so that each fragment fits into a single
 *      device frame, and queue such frames for sending.
 */
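
/*
 * A worked example (illustrative numbers, not from the code): with a
 * 1500-byte MTU and a 20-byte header, the per-fragment data space is
 * 1480 bytes, already a multiple of 8, so a 4000-byte datagram splits
 * into fragments carrying 1480 + 1480 + 1020 bytes of payload at
 * 8-byte-unit offsets 0, 185 and 370 (iph->frag_off stores offset >> 3,
 * with IP_MF set on all but the last fragment).
 */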

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs, pad;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = (struct rtable*)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = ip_hdr(skb);

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(ip_skb_dst_mtu(skb)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When frag_list is given, use it. First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited. In that case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first bad
         * fragment.
         */
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                            goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), iph, hlen);
                                iph = ip_hdr(frag);
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (!err)
                                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

        /* For bridged IP traffic encapsulated inside e.g. a vlan header,
         * we need to make room for the encapsulating header.
         */
        pad = nf_bridge_pad(skb);
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
        mtu -= pad;

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb_reset_network_header(skb2);
                skb2->transport_header = skb2->network_header + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = ip_hdr(skb2);
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC : If we are fragmenting a fragment that's not the
                 *                 last fragment then keep the MF bit set on each piece.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
        }
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

EXPORT_SYMBOL(ip_fragment);

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                __wsum csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        __wsum csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP fragmentation offload, so
         * create a single skb containing the complete UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                               (length - transhdrlen));
        if (!err) {
                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UFO here,
         * so follow the normal path.
         */
        kfree_skb(skb);
        return err;
}

/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data. Each piece will be held on the socket
 *      until ip_push_pending_frames() is called. Each piece can be a page
 *      or non-page data.
 *
 *      Transport protocols other than UDP - e.g. raw sockets - can
 *      potentially use this interface as well.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
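
/*
 * A hedged usage sketch (simplified from how udp_sendmsg() drives this
 * API in this era; error handling and corking details elided):
 *
 *      lock_sock(sk);
 *      err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen,
 *                           sizeof(struct udphdr), &ipc, rt,
 *                           corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
 *      if (err)
 *              ip_flush_pending_frames(sk);
 *      else if (!corkreq)
 *              err = ip_push_pending_frames(sk);
 *      release_sock(sk);
 */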
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable *rt,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                dst_hold(&rt->u.dst);
                inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                                            rt->u.dst.dev->mtu :
                                            dst_mtu(rt->u.dst.path);
                inet->cork.rt = rt;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = inet->cork.rt;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment, and we
         * want to avoid having it fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;

        inet->cork.length += length;
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
                        (rt->u.dst.dev->features & NETIF_F_UFO)) {

                err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
                                         fragheaderlen, transhdrlen, mtu,
                                         flags);
                if (err)
                        goto error;
                return 0;
        }

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chain of
         * skbs; each one is an IP fragment ready to be sent to the
         * network once the appropriate IP header has been added.
         */

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        data += fragheaderlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL)  {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;
        if ((sk->sk_protocol == IPPROTO_UDP) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }

        while (size > 0) {
                int i;

                if (skb_is_gso(skb))
                        len = size;
                else {

                        /* Check if the remaining data fits into current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        skb_put(skb, fragheaderlen + fraggap);
                        skb_reset_network_header(skb);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(skb_prev,
                                                                   maxfraglen,
                                                    skb_transport_header(skb),
                                                                   fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        __wsum csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push it out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        __be16 df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to the IP header, past any ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO),
         * we allow the frame generated here to be fragmented. No matter
         * how transforms change the size of the packet, it will go out.
         */
        if (inet->pmtudisc < IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow this frame to be
         * fragmented locally. */
        if (inet->pmtudisc >= IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->u.dst) &&
             ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
        ip_select_ident(iph, &rt->u.dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);

        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);

        /* Netfilter gets the whole, not yet fragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
        return err;

error:
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
}


/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        __wsum csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as a reply to another packet.
 *      Used to send TCP resets so far. ICMP should use this function too.
 *
 *      Should run single-threaded per socket because it uses the sock
 *      structure to pass arguments.
 *
 *      LATER: switch from ip_build_xmit to ip_append_*
 */
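
/*
 * A hedged sketch of the one in-tree caller in this era: tcp_v4_send_reset()
 * fills a struct ip_reply_arg with the RST TCP header and its checksum
 * state, then replies on a dedicated socket, roughly (rth is the reply
 * TCP header, named here for illustration):
 *
 *      struct ip_reply_arg arg;
 *
 *      memset(&arg, 0, sizeof(arg));
 *      arg.iov[0].iov_base = (unsigned char *)&rth;
 *      arg.iov[0].iov_len  = sizeof(rth);
 *      arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr,
 *                                    sizeof(rth), IPPROTO_TCP, 0);
 *      arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 *      ip_send_reply(tcp_socket->sk, skb, &arg, sizeof(rth));
 */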
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        __be32 daddr;
        struct rtable *rt = (struct rtable*)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .oif = arg->bound_dev_if,
                                    .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(ip_hdr(skb)->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = tcp_hdr(skb)->dest,
                                                 .dport = tcp_hdr(skb)->source } },
                                    .proto = sk->sk_protocol };
                security_skb_classify_flow(skb, &fl);
                if (ip_route_output_key(&rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the spinlock.
           Note that it relies on the fact that this function is called
           with BHs locally disabled and that sk cannot already be locked.
         */
        bh_lock_sock(sk);
        inet->tos = ip_hdr(skb)->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(skb) +
                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
                                                                arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);