net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
   16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58 #include <net/l3mdev.h>
  59 #include <net/lwtunnel.h>
  60
  61 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  62 {
  63         struct dst_entry *dst = skb_dst(skb);
  64         struct net_device *dev = dst->dev;
  65         struct neighbour *neigh;
  66         struct in6_addr *nexthop;
  67         int ret;
  68
  69         skb->protocol = htons(ETH_P_IPV6);
  70         skb->dev = dev;
  71
  72         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  73                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  74
  75                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  76                     ((mroute6_socket(net, skb) &&
  77                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  78                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  79                                          &ipv6_hdr(skb)->saddr))) {
  80                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  81
  82                         /* Do not check for IFF_ALLMULTI; multicast routing
  83                            is not supported in any case.
  84                          */
  85                         if (newskb)
  86                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  87                                         net, sk, newskb, NULL, newskb->dev,
  88                                         dev_loopback_xmit);
  89
  90                         if (ipv6_hdr(skb)->hop_limit == 0) {
  91                                 IP6_INC_STATS(net, idev,
  92                                               IPSTATS_MIB_OUTDISCARDS);
  93                                 kfree_skb(skb);
  94                                 return 0;
  95                         }
  96                 }
  97
  98                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  99
 100                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 101                     IPV6_ADDR_SCOPE_NODELOCAL &&
 102                     !(dev->flags & IFF_LOOPBACK)) {
 103                         kfree_skb(skb);
 104                         return 0;
 105                 }
 106         }
 107
 108         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 109                 int res = lwtunnel_xmit(skb);
 110
 111                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 112                         return res;
 113         }
 114
 115         rcu_read_lock_bh();
 116         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 117         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 118         if (unlikely(!neigh))
 119                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 120         if (!IS_ERR(neigh)) {
 121                 ret = dst_neigh_output(dst, neigh, skb);
 122                 rcu_read_unlock_bh();
 123                 return ret;
 124         }
 125         rcu_read_unlock_bh();
 126
 127         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 128         kfree_skb(skb);
 129         return -EINVAL;
 130 }
 131
 132 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 133 {
 134         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 135             dst_allfrag(skb_dst(skb)) ||
 136             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 137                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
 138         else
 139                 return ip6_finish_output2(net, sk, skb);
 140 }
 141
 142 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 143 {
 144         struct net_device *dev = skb_dst(skb)->dev;
 145         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 146
 147         if (unlikely(idev->cnf.disable_ipv6)) {
 148                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 149                 kfree_skb(skb);
 150                 return 0;
 151         }
 152
 153         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 154                             net, sk, skb, NULL, dev,
 155                             ip6_finish_output,
 156                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 157 }
 158
 159 /*
 160  * xmit an sk_buff (used by TCP, SCTP and DCCP)
 161  * Note : socket lock is not held for SYNACK packets, but might be modified
 162  * by calls to skb_set_owner_w() and ipv6_local_error(),
 163  * which are using proper atomic operations or spinlocks.
 164  */
 165 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 166              struct ipv6_txoptions *opt, int tclass)
 167 {
 168         struct net *net = sock_net(sk);
 169         const struct ipv6_pinfo *np = inet6_sk(sk);
 170         struct in6_addr *first_hop = &fl6->daddr;
 171         struct dst_entry *dst = skb_dst(skb);
 172         struct ipv6hdr *hdr;
 173         u8  proto = fl6->flowi6_proto;
 174         int seg_len = skb->len;
 175         int hlimit = -1;
 176         u32 mtu;
 177
 178         if (opt) {
 179                 unsigned int head_room;
 180
 181                 /* First: exthdrs may take lots of space (~8K for now)
 182                    MAX_HEADER is not enough.
 183                  */
 184                 head_room = opt->opt_nflen + opt->opt_flen;
 185                 seg_len += head_room;
 186                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 187
 188                 if (skb_headroom(skb) < head_room) {
 189                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 190                         if (!skb2) {
 191                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 192                                               IPSTATS_MIB_OUTDISCARDS);
 193                                 kfree_skb(skb);
 194                                 return -ENOBUFS;
 195                         }
 196                         consume_skb(skb);
 197                         skb = skb2;
 198                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 199                          * it is safe to call in our context (socket lock not held)
 200                          */
 201                         skb_set_owner_w(skb, (struct sock *)sk);
 202                 }
 203                 if (opt->opt_flen)
 204                         ipv6_push_frag_opts(skb, opt, &proto);
 205                 if (opt->opt_nflen)
 206                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 207         }
 208
 209         skb_push(skb, sizeof(struct ipv6hdr));
 210         skb_reset_network_header(skb);
 211         hdr = ipv6_hdr(skb);
 212
 213         /*
 214          *      Fill in the IPv6 header
 215          */
 216         if (np)
 217                 hlimit = np->hop_limit;
 218         if (hlimit < 0)
 219                 hlimit = ip6_dst_hoplimit(dst);
 220
 221         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 222                                                      np->autoflowlabel, fl6));
 223
 224         hdr->payload_len = htons(seg_len);
 225         hdr->nexthdr = proto;
 226         hdr->hop_limit = hlimit;
 227
 228         hdr->saddr = fl6->saddr;
 229         hdr->daddr = *first_hop;
 230
 231         skb->protocol = htons(ETH_P_IPV6);
 232         skb->priority = sk->sk_priority;
 233         skb->mark = sk->sk_mark;
 234
 235         mtu = dst_mtu(dst);
 236         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 237                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 238                               IPSTATS_MIB_OUT, skb->len);
 239                 /* hooks should never assume socket lock is held.
 240                  * we promote our socket to non const
 241                  */
 242                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 243                                net, (struct sock *)sk, skb, NULL, dst->dev,
 244                                dst_output);
 245         }
 246
 247         skb->dev = dst->dev;
 248         /* ipv6_local_error() does not require socket lock,
 249          * we promote our socket to non const
 250          */
 251         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 252
 253         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 254         kfree_skb(skb);
 255         return -EMSGSIZE;
 256 }
 257 EXPORT_SYMBOL(ip6_xmit);
 258
 259 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 260 {
 261         struct ip6_ra_chain *ra;
 262         struct sock *last = NULL;
 263
 264         read_lock(&ip6_ra_lock);
 265         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 266                 struct sock *sk = ra->sk;
 267                 if (sk && ra->sel == sel &&
 268                     (!sk->sk_bound_dev_if ||
 269                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 270                         if (last) {
 271                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 272                                 if (skb2)
 273                                         rawv6_rcv(last, skb2);
 274                         }
 275                         last = sk;
 276                 }
 277         }
 278
 279         if (last) {
 280                 rawv6_rcv(last, skb);
 281                 read_unlock(&ip6_ra_lock);
 282                 return 1;
 283         }
 284         read_unlock(&ip6_ra_lock);
 285         return 0;
 286 }
 287
 288 static int ip6_forward_proxy_check(struct sk_buff *skb)
 289 {
 290         struct ipv6hdr *hdr = ipv6_hdr(skb);
 291         u8 nexthdr = hdr->nexthdr;
 292         __be16 frag_off;
 293         int offset;
 294
 295         if (ipv6_ext_hdr(nexthdr)) {
 296                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 297                 if (offset < 0)
 298                         return 0;
 299         } else
 300                 offset = sizeof(struct ipv6hdr);
 301
 302         if (nexthdr == IPPROTO_ICMPV6) {
 303                 struct icmp6hdr *icmp6;
 304
 305                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 306                                          offset + 1 - skb->data)))
 307                         return 0;
 308
 309                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 310
 311                 switch (icmp6->icmp6_type) {
 312                 case NDISC_ROUTER_SOLICITATION:
 313                 case NDISC_ROUTER_ADVERTISEMENT:
 314                 case NDISC_NEIGHBOUR_SOLICITATION:
 315                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 316                 case NDISC_REDIRECT:
 317                         /* For reaction involving unicast neighbor discovery
 318                          * message destined to the proxied address, pass it to
 319                          * input function.
 320                          */
 321                         return 1;
 322                 default:
 323                         break;
 324                 }
 325         }
 326
 327         /*
 328          * The proxying router can't forward traffic sent to a link-local
 329          * address, so signal the sender and discard the packet. This
 330          * behavior is clarified by the MIPv6 specification.
 331          */
 332         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 333                 dst_link_failure(skb);
 334                 return -1;
 335         }
 336
 337         return 0;
 338 }
 339
 340 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 341                                      struct sk_buff *skb)
 342 {
 343         return dst_output(net, sk, skb);
 344 }
 345
 346 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 347 {
 348         unsigned int mtu;
 349         struct inet6_dev *idev;
 350
 351         if (dst_metric_locked(dst, RTAX_MTU)) {
 352                 mtu = dst_metric_raw(dst, RTAX_MTU);
 353                 if (mtu)
 354                         return mtu;
 355         }
 356
 357         mtu = IPV6_MIN_MTU;
 358         rcu_read_lock();
 359         idev = __in6_dev_get(dst->dev);
 360         if (idev)
 361                 mtu = idev->cnf.mtu6;
 362         rcu_read_unlock();
 363
 364         return mtu;
 365 }
 366
 367 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 368 {
 369         if (skb->len <= mtu)
 370                 return false;
 371
 372         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 373         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 374                 return true;
 375
 376         if (skb->ignore_df)
 377                 return false;
 378
 379         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
 380                 return false;
 381
 382         return true;
 383 }
 384
 385 int ip6_forward(struct sk_buff *skb)
 386 {
 387         struct dst_entry *dst = skb_dst(skb);
 388         struct ipv6hdr *hdr = ipv6_hdr(skb);
 389         struct inet6_skb_parm *opt = IP6CB(skb);
 390         struct net *net = dev_net(dst->dev);
 391         u32 mtu;
 392
 393         if (net->ipv6.devconf_all->forwarding == 0)
 394                 goto error;
 395
 396         if (skb->pkt_type != PACKET_HOST)
 397                 goto drop;
 398
 399         if (unlikely(skb->sk))
 400                 goto drop;
 401
 402         if (skb_warn_if_lro(skb))
 403                 goto drop;
 404
 405         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 406                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 407                                 IPSTATS_MIB_INDISCARDS);
 408                 goto drop;
 409         }
 410
 411         skb_forward_csum(skb);
 412
 413         /*
 414          *      We DO NOT make any processing on
 415          *      RA packets, pushing them to user level AS IS
 416          *      without ane WARRANTY that application will be able
 417          *      to interpret them. The reason is that we
 418          *      cannot make anything clever here.
 419          *
 420          *      We are not end-node, so that if packet contains
 421          *      AH/ESP, we cannot make anything.
 422          *      Defragmentation also would be mistake, RA packets
 423          *      cannot be fragmented, because there is no warranty
 424          *      that different fragments will go along one path. --ANK
 425          */
 426         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 427                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 428                         return 0;
 429         }
 430
 431         /*
 432          *      check and decrement ttl
 433          */
 434         if (hdr->hop_limit <= 1) {
 435                 /* Force OUTPUT device used as source address */
 436                 skb->dev = dst->dev;
 437                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 438                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 439                                 IPSTATS_MIB_INHDRERRORS);
 440
 441                 kfree_skb(skb);
 442                 return -ETIMEDOUT;
 443         }
 444
 445         /* XXX: idev->cnf.proxy_ndp? */
 446         if (net->ipv6.devconf_all->proxy_ndp &&
 447             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 448                 int proxied = ip6_forward_proxy_check(skb);
 449                 if (proxied > 0)
 450                         return ip6_input(skb);
 451                 else if (proxied < 0) {
 452                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
 453                                         IPSTATS_MIB_INDISCARDS);
 454                         goto drop;
 455                 }
 456         }
 457
 458         if (!xfrm6_route_forward(skb)) {
 459                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 460                                 IPSTATS_MIB_INDISCARDS);
 461                 goto drop;
 462         }
 463         dst = skb_dst(skb);
 464
 465         /* IPv6 specs say nothing about it, but it is clear that we cannot
 466            send redirects to source routed frames.
 467            We don't send redirects to frames decapsulated from IPsec.
 468          */
 469         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 470                 struct in6_addr *target = NULL;
 471                 struct inet_peer *peer;
 472                 struct rt6_info *rt;
 473
 474                 /*
 475                  *      incoming and outgoing devices are the same
 476                  *      send a redirect.
 477                  */
 478
 479                 rt = (struct rt6_info *) dst;
 480                 if (rt->rt6i_flags & RTF_GATEWAY)
 481                         target = &rt->rt6i_gateway;
 482                 else
 483                         target = &hdr->daddr;
 484
 485                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 486
 487                 /* Limit redirects both by destination (here)
 488                    and by source (inside ndisc_send_redirect)
 489                  */
 490                 if (inet_peer_xrlim_allow(peer, 1*HZ))
 491                         ndisc_send_redirect(skb, target);
 492                 if (peer)
 493                         inet_putpeer(peer);
 494         } else {
 495                 int addrtype = ipv6_addr_type(&hdr->saddr);
 496
 497                 /* This check is security critical. */
 498                 if (addrtype == IPV6_ADDR_ANY ||
 499                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 500                         goto error;
 501                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 502                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 503                                     ICMPV6_NOT_NEIGHBOUR, 0);
 504                         goto error;
 505                 }
 506         }
 507
 508         mtu = ip6_dst_mtu_forward(dst);
 509         if (mtu < IPV6_MIN_MTU)
 510                 mtu = IPV6_MIN_MTU;
 511
 512         if (ip6_pkt_too_big(skb, mtu)) {
 513                 /* Again, force OUTPUT device used as source address */
 514                 skb->dev = dst->dev;
 515                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 516                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 517                                 IPSTATS_MIB_INTOOBIGERRORS);
 518                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 519                                 IPSTATS_MIB_FRAGFAILS);
 520                 kfree_skb(skb);
 521                 return -EMSGSIZE;
 522         }
 523
 524         if (skb_cow(skb, dst->dev->hard_header_len)) {
 525                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
 526                                 IPSTATS_MIB_OUTDISCARDS);
 527                 goto drop;
 528         }
 529
 530         hdr = ipv6_hdr(skb);
 531
 532         /* Mangling hops number delayed to point after skb COW */
 533
 534         hdr->hop_limit--;
 535
 536         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 537         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 538         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 539                        net, NULL, skb, skb->dev, dst->dev,
 540                        ip6_forward_finish);
 541
 542 error:
 543         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 544 drop:
 545         kfree_skb(skb);
 546         return -EINVAL;
 547 }
 548
 549 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 550 {
 551         to->pkt_type = from->pkt_type;
 552         to->priority = from->priority;
 553         to->protocol = from->protocol;
 554         skb_dst_drop(to);
 555         skb_dst_set(to, dst_clone(skb_dst(from)));
 556         to->dev = from->dev;
 557         to->mark = from->mark;
 558
 559 #ifdef CONFIG_NET_SCHED
 560         to->tc_index = from->tc_index;
 561 #endif
 562         nf_copy(to, from);
 563         skb_copy_secmark(to, from);
 564 }
 565
 566 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 567                  int (*output)(struct net *, struct sock *, struct sk_buff *))
 568 {
 569         struct sk_buff *frag;
 570         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 571         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 572                                 inet6_sk(skb->sk) : NULL;
 573         struct ipv6hdr *tmp_hdr;
 574         struct frag_hdr *fh;
 575         unsigned int mtu, hlen, left, len;
 576         int hroom, troom;
 577         __be32 frag_id;
 578         int ptr, offset = 0, err = 0;
 579         u8 *prevhdr, nexthdr = 0;
 580
 581         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 582         nexthdr = *prevhdr;
 583
 584         mtu = ip6_skb_dst_mtu(skb);
 585
 586         /* We must not fragment if the socket is set to force MTU discovery
  587          * or if the skb is not generated by a local socket.
 588          */
 589         if (unlikely(!skb->ignore_df && skb->len > mtu))
 590                 goto fail_toobig;
 591
 592         if (IP6CB(skb)->frag_max_size) {
 593                 if (IP6CB(skb)->frag_max_size > mtu)
 594                         goto fail_toobig;
 595
 596                 /* don't send fragments larger than what we received */
 597                 mtu = IP6CB(skb)->frag_max_size;
 598                 if (mtu < IPV6_MIN_MTU)
 599                         mtu = IPV6_MIN_MTU;
 600         }
 601
 602         if (np && np->frag_size < mtu) {
 603                 if (np->frag_size)
 604                         mtu = np->frag_size;
 605         }
 606         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 607                 goto fail_toobig;
 608         mtu -= hlen + sizeof(struct frag_hdr);
 609
 610         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 611                                     &ipv6_hdr(skb)->saddr);
 612
 613         if (skb->ip_summed == CHECKSUM_PARTIAL &&
 614             (err = skb_checksum_help(skb)))
 615                 goto fail;
 616
 617         hroom = LL_RESERVED_SPACE(rt->dst.dev);
 618         if (skb_has_frag_list(skb)) {
 619                 int first_len = skb_pagelen(skb);
 620                 struct sk_buff *frag2;
 621
 622                 if (first_len - hlen > mtu ||
 623                     ((first_len - hlen) & 7) ||
 624                     skb_cloned(skb) ||
 625                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 626                         goto slow_path;
 627
 628                 skb_walk_frags(skb, frag) {
 629                         /* Correct geometry. */
 630                         if (frag->len > mtu ||
 631                             ((frag->len & 7) && frag->next) ||
 632                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 633                                 goto slow_path_clean;
 634
 635                         /* Partially cloned skb? */
 636                         if (skb_shared(frag))
 637                                 goto slow_path_clean;
 638
 639                         BUG_ON(frag->sk);
 640                         if (skb->sk) {
 641                                 frag->sk = skb->sk;
 642                                 frag->destructor = sock_wfree;
 643                         }
 644                         skb->truesize -= frag->truesize;
 645                 }
 646
 647                 err = 0;
 648                 offset = 0;
 649                 /* BUILD HEADER */
 650
 651                 *prevhdr = NEXTHDR_FRAGMENT;
 652                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 653                 if (!tmp_hdr) {
 654                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 655                                       IPSTATS_MIB_FRAGFAILS);
 656                         err = -ENOMEM;
 657                         goto fail;
 658                 }
 659                 frag = skb_shinfo(skb)->frag_list;
 660                 skb_frag_list_init(skb);
 661
 662                 __skb_pull(skb, hlen);
 663                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
 664                 __skb_push(skb, hlen);
 665                 skb_reset_network_header(skb);
 666                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 667
 668                 fh->nexthdr = nexthdr;
 669                 fh->reserved = 0;
 670                 fh->frag_off = htons(IP6_MF);
 671                 fh->identification = frag_id;
 672
 673                 first_len = skb_pagelen(skb);
 674                 skb->data_len = first_len - skb_headlen(skb);
 675                 skb->len = first_len;
 676                 ipv6_hdr(skb)->payload_len = htons(first_len -
 677                                                    sizeof(struct ipv6hdr));
 678
 679                 dst_hold(&rt->dst);
 680
 681                 for (;;) {
 682                         /* Prepare header of the next frame,
 683                          * before previous one went down. */
 684                         if (frag) {
 685                                 frag->ip_summed = CHECKSUM_NONE;
 686                                 skb_reset_transport_header(frag);
 687                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
 688                                 __skb_push(frag, hlen);
 689                                 skb_reset_network_header(frag);
 690                                 memcpy(skb_network_header(frag), tmp_hdr,
 691                                        hlen);
 692                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 693                                 fh->nexthdr = nexthdr;
 694                                 fh->reserved = 0;
 695                                 fh->frag_off = htons(offset);
 696                                 if (frag->next)
 697                                         fh->frag_off |= htons(IP6_MF);
 698                                 fh->identification = frag_id;
 699                                 ipv6_hdr(frag)->payload_len =
 700                                                 htons(frag->len -
 701                                                       sizeof(struct ipv6hdr));
 702                                 ip6_copy_metadata(frag, skb);
 703                         }
 704
 705                         err = output(net, sk, skb);
 706                         if (!err)
 707                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 708                                               IPSTATS_MIB_FRAGCREATES);
 709
 710                         if (err || !frag)
 711                                 break;
 712
 713                         skb = frag;
 714                         frag = skb->next;
 715                         skb->next = NULL;
 716                 }
 717
 718                 kfree(tmp_hdr);
 719
 720                 if (err == 0) {
 721                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 722                                       IPSTATS_MIB_FRAGOKS);
 723                         ip6_rt_put(rt);
 724                         return 0;
 725                 }
 726
 727                 kfree_skb_list(frag);
 728
 729                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 730                               IPSTATS_MIB_FRAGFAILS);
 731                 ip6_rt_put(rt);
 732                 return err;
 733
 734 slow_path_clean:
 735                 skb_walk_frags(skb, frag2) {
 736                         if (frag2 == frag)
 737                                 break;
 738                         frag2->sk = NULL;
 739                         frag2->destructor = NULL;
 740                         skb->truesize += frag2->truesize;
 741                 }
 742         }
 743
 744 slow_path:
 745         left = skb->len - hlen;         /* Space per frame */
 746         ptr = hlen;                     /* Where to start from */
 747
 748         /*
 749          *      Fragment the datagram.
 750          */
 751
 752         *prevhdr = NEXTHDR_FRAGMENT;
 753         troom = rt->dst.dev->needed_tailroom;
 754
 755         /*
 756          *      Keep copying data until we run out.
 757          */
 758         while (left > 0)        {
 759                 len = left;
 760                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 761                 if (len > mtu)
 762                         len = mtu;
 763                 /* IF: we are not sending up to and including the packet end
 764                    then align the next start on an eight byte boundary */
 765                 if (len < left) {
 766                         len &= ~7;
 767                 }
 768
 769                 /* Allocate buffer */
 770                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 771                                  hroom + troom, GFP_ATOMIC);
 772                 if (!frag) {
 773                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 774                                       IPSTATS_MIB_FRAGFAILS);
 775                         err = -ENOMEM;
 776                         goto fail;
 777                 }
 778
 779                 /*
 780                  *      Set up data on packet
 781                  */
 782
 783                 ip6_copy_metadata(frag, skb);
 784                 skb_reserve(frag, hroom);
 785                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 786                 skb_reset_network_header(frag);
 787                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 788                 frag->transport_header = (frag->network_header + hlen +
 789                                           sizeof(struct frag_hdr));
 790
 791                 /*
 792                  *      Charge the memory for the fragment to any owner
 793                  *      it might possess
 794                  */
 795                 if (skb->sk)
 796                         skb_set_owner_w(frag, skb->sk);
 797
 798                 /*
 799                  *      Copy the packet header into the new buffer.
 800                  */
 801                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 802
 803                 /*
 804                  *      Build fragment header.
 805                  */
 806                 fh->nexthdr = nexthdr;
 807                 fh->reserved = 0;
 808                 fh->identification = frag_id;
 809
 810                 /*
 811                  *      Copy a block of the IP datagram.
 812                  */
 813                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 814                                      len));
 815                 left -= len;
 816
 817                 fh->frag_off = htons(offset);
 818                 if (left > 0)
 819                         fh->frag_off |= htons(IP6_MF);
 820                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 821                                                     sizeof(struct ipv6hdr));
 822
 823                 ptr += len;
 824                 offset += len;
 825
 826                 /*
 827                  *      Put this fragment into the sending queue.
 828                  */
 829                 err = output(net, sk, frag);
 830                 if (err)
 831                         goto fail;
 832
 833                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 834                               IPSTATS_MIB_FRAGCREATES);
 835         }
 836         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 837                       IPSTATS_MIB_FRAGOKS);
 838         consume_skb(skb);
 839         return err;
 840
 841 fail_toobig:
 842         if (skb->sk && dst_allfrag(skb_dst(skb)))
 843                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 844
 845         skb->dev = skb_dst(skb)->dev;
 846         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 847         err = -EMSGSIZE;
 848
 849 fail:
 850         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 851                       IPSTATS_MIB_FRAGFAILS);
 852         kfree_skb(skb);
 853         return err;
 854 }
 855
 856 static inline int ip6_rt_check(const struct rt6key *rt_key,
 857                                const struct in6_addr *fl_addr,
 858                                const struct in6_addr *addr_cache)
 859 {
 860         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 861                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 862 }
 863
 864 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 865                                           struct dst_entry *dst,
 866                                           const struct flowi6 *fl6)
 867 {
 868         struct ipv6_pinfo *np = inet6_sk(sk);
 869         struct rt6_info *rt;
 870
 871         if (!dst)
 872                 goto out;
 873
 874         if (dst->ops->family != AF_INET6) {
 875                 dst_release(dst);
 876                 return NULL;
 877         }
 878
 879         rt = (struct rt6_info *)dst;
 880         /* Yes, checking route validity in not connected
 881          * case is not very simple. Take into account,
 882          * that we do not support routing by source, TOS,
 883          * and MSG_DONTROUTE            --ANK (980726)
 884          *
 885          * 1. ip6_rt_check(): If route was host route,
 886          *    check that cached destination is current.
 887          *    If it is network route, we still may
 888          *    check its validity using saved pointer
 889          *    to the last used address: daddr_cache.
 890          *    We do not want to save whole address now,
 891          *    (because main consumer of this service
 892          *    is tcp, which has not this problem),
 893          *    so that the last trick works only on connected
 894          *    sockets.
 895          * 2. oif also should be the same.
 896          */
 897         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 898 #ifdef CONFIG_IPV6_SUBTREES
 899             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 900 #endif
 901            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 902               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 903                 dst_release(dst);
 904                 dst = NULL;
 905         }
 906
 907 out:
 908         return dst;
 909 }
 910
 911 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 912                                struct dst_entry **dst, struct flowi6 *fl6)
 913 {
 914 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 915         struct neighbour *n;
 916         struct rt6_info *rt;
 917 #endif
 918         int err;
 919         int flags = 0;
 920
 921         if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
 922             (!*dst || !(*dst)->error)) {
 923                 err = l3mdev_get_saddr6(net, sk, fl6);
 924                 if (err)
 925                         goto out_err;
 926         }
 927
 928         /* The correct way to handle this would be to do
 929          * ip6_route_get_saddr, and then ip6_route_output; however,
 930          * the route-specific preferred source forces the
 931          * ip6_route_output call _before_ ip6_route_get_saddr.
 932          *
 933          * In source specific routing (no src=any default route),
 934          * ip6_route_output will fail given src=any saddr, though, so
 935          * that's why we try it again later.
 936          */
 937         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
 938                 struct rt6_info *rt;
 939                 bool had_dst = *dst != NULL;
 940
 941                 if (!had_dst)
 942                         *dst = ip6_route_output(net, sk, fl6);
 943                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 944                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 945                                           sk ? inet6_sk(sk)->srcprefs : 0,
 946                                           &fl6->saddr);
 947                 if (err)
 948                         goto out_err_release;
 949
 950                 /* If we had an erroneous initial result, pretend it
 951                  * never existed and let the SA-enabled version take
 952                  * over.
 953                  */
 954                 if (!had_dst && (*dst)->error) {
 955                         dst_release(*dst);
 956                         *dst = NULL;
 957                 }
 958
 959                 if (fl6->flowi6_oif)
 960                         flags |= RT6_LOOKUP_F_IFACE;
 961         }
 962
 963         if (!*dst)
 964                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
 965
 966         err = (*dst)->error;
 967         if (err)
 968                 goto out_err_release;
 969
 970 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 971         /*
 972          * Here if the dst entry we've looked up
 973          * has a neighbour entry that is in the INCOMPLETE
 974          * state and the src address from the flow is
 975          * marked as OPTIMISTIC, we release the found
 976          * dst entry and replace it instead with the
 977          * dst entry of the nexthop router
 978          */
 979         rt = (struct rt6_info *) *dst;
 980         rcu_read_lock_bh();
 981         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 982                                       rt6_nexthop(rt, &fl6->daddr));
 983         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 984         rcu_read_unlock_bh();
 985
 986         if (err) {
 987                 struct inet6_ifaddr *ifp;
 988                 struct flowi6 fl_gw6;
 989                 int redirect;
 990
 991                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 992                                       (*dst)->dev, 1);
 993
 994                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 995                 if (ifp)
 996                         in6_ifa_put(ifp);
 997
 998                 if (redirect) {
 999                         /*
1000                          * We need to get the dst entry for the
1001                          * default router instead
1002                          */
1003                         dst_release(*dst);
1004                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1005                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1006                         *dst = ip6_route_output(net, sk, &fl_gw6);
1007                         err = (*dst)->error;
1008                         if (err)
1009                                 goto out_err_release;
1010                 }
1011         }
1012 #endif
1013
1014         return 0;
1015
1016 out_err_release:
1017         dst_release(*dst);
1018         *dst = NULL;
1019 out_err:
1020         if (err == -ENETUNREACH)
1021                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1022         return err;
1023 }
1024
1025 /**
1026  *      ip6_dst_lookup - perform route lookup on flow
1027  *      @sk: socket which provides route info
1028  *      @dst: pointer to dst_entry * for result
1029  *      @fl6: flow to lookup
1030  *
1031  *      This function performs a route lookup on the given flow.
1032  *
1033  *      It returns zero on success, or a standard errno code on error.
1034  */
1035 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1036                    struct flowi6 *fl6)
1037 {
1038         *dst = NULL;
1039         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1040 }
1041 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1042
1043 /**
1044  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1045  *      @sk: socket which provides route info
1046  *      @fl6: flow to lookup
1047  *      @final_dst: final destination address for ipsec lookup
1048  *
1049  *      This function performs a route lookup on the given flow.
1050  *
1051  *      It returns a valid dst pointer on success, or a pointer encoded
1052  *      error code.
1053  */
1054 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1055                                       const struct in6_addr *final_dst)
1056 {
1057         struct dst_entry *dst = NULL;
1058         int err;
1059
1060         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1061         if (err)
1062                 return ERR_PTR(err);
1063         if (final_dst)
1064                 fl6->daddr = *final_dst;
1065         if (!fl6->flowi6_oif)
1066                 fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
1067
1068         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1069 }
1070 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1071
1072 /**
1073  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1074  *      @sk: socket which provides the dst cache and route info
1075  *      @fl6: flow to lookup
1076  *      @final_dst: final destination address for ipsec lookup
1077  *
1078  *      This function performs a route lookup on the given flow with the
1079  *      possibility of using the cached route in the socket if it is valid.
1080  *      It will take the socket dst lock when operating on the dst cache.
1081  *      As a result, this function can only be used in process context.
1082  *
1083  *      It returns a valid dst pointer on success, or a pointer encoded
1084  *      error code.
1085  */
1086 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1087                                          const struct in6_addr *final_dst)
1088 {
1089         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1090
1091         dst = ip6_sk_dst_check(sk, dst, fl6);
1092         if (!dst)
1093                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1094
1095         return dst;
1096 }
1097 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1098
1099 static inline int ip6_ufo_append_data(struct sock *sk,
1100                         struct sk_buff_head *queue,
1101                         int getfrag(void *from, char *to, int offset, int len,
1102                         int odd, struct sk_buff *skb),
1103                         void *from, int length, int hh_len, int fragheaderlen,
1104                         int exthdrlen, int transhdrlen, int mtu,
1105                         unsigned int flags, const struct flowi6 *fl6)
1106
1107 {
1108         struct sk_buff *skb;
1109         int err;
1110
1111         /* There is support for UDP large send offload by network
1112          * device, so create one single skb packet containing complete
1113          * udp datagram
1114          */
1115         skb = skb_peek_tail(queue);
1116         if (!skb) {
1117                 skb = sock_alloc_send_skb(sk,
1118                         hh_len + fragheaderlen + transhdrlen + 20,
1119                         (flags & MSG_DONTWAIT), &err);
1120                 if (!skb)
1121                         return err;
1122
1123                 /* reserve space for Hardware header */
1124                 skb_reserve(skb, hh_len);
1125
1126                 /* create space for UDP/IP header */
1127                 skb_put(skb, fragheaderlen + transhdrlen);
1128
1129                 /* initialize network header pointer */
1130                 skb_set_network_header(skb, exthdrlen);
1131
1132                 /* initialize protocol header pointer */
1133                 skb->transport_header = skb->network_header + fragheaderlen;
1134
1135                 skb->protocol = htons(ETH_P_IPV6);
1136                 skb->csum = 0;
1137
1138                 __skb_queue_tail(queue, skb);
1139         } else if (skb_is_gso(skb)) {
1140                 goto append;
1141         }
1142
1143         skb->ip_summed = CHECKSUM_PARTIAL;
1144         /* Specify the length of each IPv6 datagram fragment.
1145          * It has to be a multiple of 8.
1146          */
1147         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1148                                      sizeof(struct frag_hdr)) & ~7;
1149         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1150         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1151                                                          &fl6->daddr,
1152                                                          &fl6->saddr);
1153
1154 append:
1155         return skb_append_datato_frags(sk, skb, getfrag, from,
1156                                        (length - transhdrlen));
1157 }
1158
1159 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1160                                                gfp_t gfp)
1161 {
1162         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1163 }
1164
1165 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1166                                                 gfp_t gfp)
1167 {
1168         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1169 }
1170
1171 static void ip6_append_data_mtu(unsigned int *mtu,
1172                                 int *maxfraglen,
1173                                 unsigned int fragheaderlen,
1174                                 struct sk_buff *skb,
1175                                 struct rt6_info *rt,
1176                                 unsigned int orig_mtu)
1177 {
1178         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1179                 if (!skb) {
1180                         /* first fragment, reserve header_len */
1181                         *mtu = orig_mtu - rt->dst.header_len;
1182
1183                 } else {
1184                         /*
1185                          * this fragment is not first, the headers
1186                          * space is regarded as data space.
1187                          */
1188                         *mtu = orig_mtu;
1189                 }
1190                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1191                               + fragheaderlen - sizeof(struct frag_hdr);
1192         }
1193 }
1194
1195 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1196                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1197                           struct rt6_info *rt, struct flowi6 *fl6)
1198 {
1199         struct ipv6_pinfo *np = inet6_sk(sk);
1200         unsigned int mtu;
1201         struct ipv6_txoptions *opt = ipc6->opt;
1202
1203         /*
1204          * setup for corking
1205          */
1206         if (opt) {
1207                 if (WARN_ON(v6_cork->opt))
1208                         return -EINVAL;
1209
1210                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1211                 if (unlikely(!v6_cork->opt))
1212                         return -ENOBUFS;
1213
1214                 v6_cork->opt->tot_len = opt->tot_len;
1215                 v6_cork->opt->opt_flen = opt->opt_flen;
1216                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1217
1218                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1219                                                     sk->sk_allocation);
1220                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1221                         return -ENOBUFS;
1222
1223                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1224                                                     sk->sk_allocation);
1225                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1226                         return -ENOBUFS;
1227
1228                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1229                                                    sk->sk_allocation);
1230                 if (opt->hopopt && !v6_cork->opt->hopopt)
1231                         return -ENOBUFS;
1232
1233                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1234                                                     sk->sk_allocation);
1235                 if (opt->srcrt && !v6_cork->opt->srcrt)
1236                         return -ENOBUFS;
1237
1238                 /* need source address above miyazawa*/
1239         }
1240         dst_hold(&rt->dst);
1241         cork->base.dst = &rt->dst;
1242         cork->fl.u.ip6 = *fl6;
1243         v6_cork->hop_limit = ipc6->hlimit;
1244         v6_cork->tclass = ipc6->tclass;
1245         if (rt->dst.flags & DST_XFRM_TUNNEL)
1246                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1247                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1248         else
1249                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1250                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1251         if (np->frag_size < mtu) {
1252                 if (np->frag_size)
1253                         mtu = np->frag_size;
1254         }
1255         cork->base.fragsize = mtu;
1256         if (dst_allfrag(rt->dst.path))
1257                 cork->base.flags |= IPCORK_ALLFRAG;
1258         cork->base.length = 0;
1259
1260         return 0;
1261 }
1262
1263 static int __ip6_append_data(struct sock *sk,
1264                              struct flowi6 *fl6,
1265                              struct sk_buff_head *queue,
1266                              struct inet_cork *cork,
1267                              struct inet6_cork *v6_cork,
1268                              struct page_frag *pfrag,
1269                              int getfrag(void *from, char *to, int offset,
1270                                          int len, int odd, struct sk_buff *skb),
1271                              void *from, int length, int transhdrlen,
1272                              unsigned int flags, struct ipcm6_cookie *ipc6,
1273                              const struct sockcm_cookie *sockc)
1274 {
1275         struct sk_buff *skb, *skb_prev = NULL;
1276         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1277         int exthdrlen = 0;
1278         int dst_exthdrlen = 0;
1279         int hh_len;
1280         int copy;
1281         int err;
1282         int offset = 0;
1283         __u8 tx_flags = 0;
1284         u32 tskey = 0;
1285         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1286         struct ipv6_txoptions *opt = v6_cork->opt;
1287         int csummode = CHECKSUM_NONE;
1288         unsigned int maxnonfragsize, headersize;
1289
1290         skb = skb_peek_tail(queue);
1291         if (!skb) {
1292                 exthdrlen = opt ? opt->opt_flen : 0;
1293                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1294         }
1295
1296         mtu = cork->fragsize;
1297         orig_mtu = mtu;
1298
1299         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1300
1301         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1302                         (opt ? opt->opt_nflen : 0);
1303         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1304                      sizeof(struct frag_hdr);
1305
1306         headersize = sizeof(struct ipv6hdr) +
1307                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1308                      (dst_allfrag(&rt->dst) ?
1309                       sizeof(struct frag_hdr) : 0) +
1310                      rt->rt6i_nfheader_len;
1311
1312         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1313             (sk->sk_protocol == IPPROTO_UDP ||
1314              sk->sk_protocol == IPPROTO_RAW)) {
1315                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1316                                 sizeof(struct ipv6hdr));
1317                 goto emsgsize;
1318         }
1319
1320         if (ip6_sk_ignore_df(sk))
1321                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1322         else
1323                 maxnonfragsize = mtu;
1324
1325         if (cork->length + length > maxnonfragsize - headersize) {
1326 emsgsize:
1327                 ipv6_local_error(sk, EMSGSIZE, fl6,
1328                                  mtu - headersize +
1329                                  sizeof(struct ipv6hdr));
1330                 return -EMSGSIZE;
1331         }
1332
1333         /* CHECKSUM_PARTIAL only with no extension headers and when
1334          * we are not going to fragment
1335          */
1336         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1337             headersize == sizeof(struct ipv6hdr) &&
1338             length < mtu - headersize &&
1339             !(flags & MSG_MORE) &&
1340             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1341                 csummode = CHECKSUM_PARTIAL;
1342
1343         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1344                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1345                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1346                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1347                         tskey = sk->sk_tskey++;
1348         }
1349
1350         /*
1351          * Let's try using as much space as possible.
1352          * Use MTU if total length of the message fits into the MTU.
1353          * Otherwise, we need to reserve fragment header and
1354          * fragment alignment (= 8-15 octets, in total).
1355          *
1356          * Note that we may need to "move" the data from the tail of
1357          * the buffer to the new fragment when we split
1358          * the message.
1359          *
1360          * FIXME: It may be fragmented into multiple chunks
1361          *        at once if non-fragmentable extension headers
1362          *        are too large.
1363          * --yoshfuji
1364          */
1365
1366         cork->length += length;
1367         if (((length > mtu) ||
1368              (skb && skb_is_gso(skb))) &&
1369             (sk->sk_protocol == IPPROTO_UDP) &&
1370             (rt->dst.dev->features & NETIF_F_UFO) &&
1371             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1372                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1373                                           hh_len, fragheaderlen, exthdrlen,
1374                                           transhdrlen, mtu, flags, fl6);
1375                 if (err)
1376                         goto error;
1377                 return 0;
1378         }
1379
1380         if (!skb)
1381                 goto alloc_new_skb;
1382
1383         while (length > 0) {
1384                 /* Check if the remaining data fits into current packet. */
1385                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1386                 if (copy < length)
1387                         copy = maxfraglen - skb->len;
1388
1389                 if (copy <= 0) {
1390                         char *data;
1391                         unsigned int datalen;
1392                         unsigned int fraglen;
1393                         unsigned int fraggap;
1394                         unsigned int alloclen;
1395 alloc_new_skb:
1396                         /* There's no room in the current skb */
1397                         if (skb)
1398                                 fraggap = skb->len - maxfraglen;
1399                         else
1400                                 fraggap = 0;
1401                         /* update mtu and maxfraglen if necessary */
1402                         if (!skb || !skb_prev)
1403                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1404                                                     fragheaderlen, skb, rt,
1405                                                     orig_mtu);
1406
1407                         skb_prev = skb;
1408
1409                         /*
1410                          * If remaining data exceeds the mtu,
1411                          * we know we need more fragment(s).
1412                          */
1413                         datalen = length + fraggap;
1414
1415                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1416                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1417                         if ((flags & MSG_MORE) &&
1418                             !(rt->dst.dev->features&NETIF_F_SG))
1419                                 alloclen = mtu;
1420                         else
1421                                 alloclen = datalen + fragheaderlen;
1422
1423                         alloclen += dst_exthdrlen;
1424
1425                         if (datalen != length + fraggap) {
1426                                 /*
1427                                  * this is not the last fragment, the trailer
1428                                  * space is regarded as data space.
1429                                  */
1430                                 datalen += rt->dst.trailer_len;
1431                         }
1432
1433                         alloclen += rt->dst.trailer_len;
1434                         fraglen = datalen + fragheaderlen;
1435
1436                         /*
1437                          * We just reserve space for fragment header.
1438                          * Note: this may be overallocation if the message
1439                          * (without MSG_MORE) fits into the MTU.
1440                          */
1441                         alloclen += sizeof(struct frag_hdr);
1442
1443                         if (transhdrlen) {
1444                                 skb = sock_alloc_send_skb(sk,
1445                                                 alloclen + hh_len,
1446                                                 (flags & MSG_DONTWAIT), &err);
1447                         } else {
1448                                 skb = NULL;
1449                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1450                                     2 * sk->sk_sndbuf)
1451                                         skb = sock_wmalloc(sk,
1452                                                            alloclen + hh_len, 1,
1453                                                            sk->sk_allocation);
1454                                 if (unlikely(!skb))
1455                                         err = -ENOBUFS;
1456                         }
1457                         if (!skb)
1458                                 goto error;
1459                         /*
1460                          *      Fill in the control structures
1461                          */
1462                         skb->protocol = htons(ETH_P_IPV6);
1463                         skb->ip_summed = csummode;
1464                         skb->csum = 0;
1465                         /* reserve for fragmentation and ipsec header */
1466                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1467                                     dst_exthdrlen);
1468
1469                         /* Only the initial fragment is time stamped */
1470                         skb_shinfo(skb)->tx_flags = tx_flags;
1471                         tx_flags = 0;
1472                         skb_shinfo(skb)->tskey = tskey;
1473                         tskey = 0;
1474
1475                         /*
1476                          *      Find where to start putting bytes
1477                          */
1478                         data = skb_put(skb, fraglen);
1479                         skb_set_network_header(skb, exthdrlen);
1480                         data += fragheaderlen;
1481                         skb->transport_header = (skb->network_header +
1482                                                  fragheaderlen);
1483                         if (fraggap) {
1484                                 skb->csum = skb_copy_and_csum_bits(
1485                                         skb_prev, maxfraglen,
1486                                         data + transhdrlen, fraggap, 0);
1487                                 skb_prev->csum = csum_sub(skb_prev->csum,
1488                                                           skb->csum);
1489                                 data += fraggap;
1490                                 pskb_trim_unique(skb_prev, maxfraglen);
1491                         }
1492                         copy = datalen - transhdrlen - fraggap;
1493
1494                         if (copy < 0) {
1495                                 err = -EINVAL;
1496                                 kfree_skb(skb);
1497                                 goto error;
1498                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1499                                 err = -EFAULT;
1500                                 kfree_skb(skb);
1501                                 goto error;
1502                         }
1503
1504                         offset += copy;
1505                         length -= datalen - fraggap;
1506                         transhdrlen = 0;
1507                         exthdrlen = 0;
1508                         dst_exthdrlen = 0;
1509
1510                         /*
1511                          * Put the packet on the pending queue
1512                          */
1513                         __skb_queue_tail(queue, skb);
1514                         continue;
1515                 }
1516
1517                 if (copy > length)
1518                         copy = length;
1519
1520                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1521                         unsigned int off;
1522
1523                         off = skb->len;
1524                         if (getfrag(from, skb_put(skb, copy),
1525                                                 offset, copy, off, skb) < 0) {
1526                                 __skb_trim(skb, off);
1527                                 err = -EFAULT;
1528                                 goto error;
1529                         }
1530                 } else {
1531                         int i = skb_shinfo(skb)->nr_frags;
1532
1533                         err = -ENOMEM;
1534                         if (!sk_page_frag_refill(sk, pfrag))
1535                                 goto error;
1536
1537                         if (!skb_can_coalesce(skb, i, pfrag->page,
1538                                               pfrag->offset)) {
1539                                 err = -EMSGSIZE;
1540                                 if (i == MAX_SKB_FRAGS)
1541                                         goto error;
1542
1543                                 __skb_fill_page_desc(skb, i, pfrag->page,
1544                                                      pfrag->offset, 0);
1545                                 skb_shinfo(skb)->nr_frags = ++i;
1546                                 get_page(pfrag->page);
1547                         }
1548                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1549                         if (getfrag(from,
1550                                     page_address(pfrag->page) + pfrag->offset,
1551                                     offset, copy, skb->len, skb) < 0)
1552                                 goto error_efault;
1553
1554                         pfrag->offset += copy;
1555                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1556                         skb->len += copy;
1557                         skb->data_len += copy;
1558                         skb->truesize += copy;
1559                         atomic_add(copy, &sk->sk_wmem_alloc);
1560                 }
1561                 offset += copy;
1562                 length -= copy;
1563         }
1564
1565         return 0;
1566
1567 error_efault:
1568         err = -EFAULT;
1569 error:
1570         cork->length -= length;
1571         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1572         return err;
1573 }
1574
1575 int ip6_append_data(struct sock *sk,
1576                     int getfrag(void *from, char *to, int offset, int len,
1577                                 int odd, struct sk_buff *skb),
1578                     void *from, int length, int transhdrlen,
1579                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1580                     struct rt6_info *rt, unsigned int flags,
1581                     const struct sockcm_cookie *sockc)
1582 {
1583         struct inet_sock *inet = inet_sk(sk);
1584         struct ipv6_pinfo *np = inet6_sk(sk);
1585         int exthdrlen;
1586         int err;
1587
1588         if (flags&MSG_PROBE)
1589                 return 0;
1590         if (skb_queue_empty(&sk->sk_write_queue)) {
1591                 /*
1592                  * setup for corking
1593                  */
1594                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1595                                      ipc6, rt, fl6);
1596                 if (err)
1597                         return err;
1598
1599                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1600                 length += exthdrlen;
1601                 transhdrlen += exthdrlen;
1602         } else {
1603                 fl6 = &inet->cork.fl.u.ip6;
1604                 transhdrlen = 0;
1605         }
1606
1607         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1608                                  &np->cork, sk_page_frag(sk), getfrag,
1609                                  from, length, transhdrlen, flags, ipc6, sockc);
1610 }
1611 EXPORT_SYMBOL_GPL(ip6_append_data);
1612
1613 static void ip6_cork_release(struct inet_cork_full *cork,
1614                              struct inet6_cork *v6_cork)
1615 {
1616         if (v6_cork->opt) {
1617                 kfree(v6_cork->opt->dst0opt);
1618                 kfree(v6_cork->opt->dst1opt);
1619                 kfree(v6_cork->opt->hopopt);
1620                 kfree(v6_cork->opt->srcrt);
1621                 kfree(v6_cork->opt);
1622                 v6_cork->opt = NULL;
1623         }
1624
1625         if (cork->base.dst) {
1626                 dst_release(cork->base.dst);
1627                 cork->base.dst = NULL;
1628                 cork->base.flags &= ~IPCORK_ALLFRAG;
1629         }
1630         memset(&cork->fl, 0, sizeof(cork->fl));
1631 }
1632
1633 struct sk_buff *__ip6_make_skb(struct sock *sk,
1634                                struct sk_buff_head *queue,
1635                                struct inet_cork_full *cork,
1636                                struct inet6_cork *v6_cork)
1637 {
1638         struct sk_buff *skb, *tmp_skb;
1639         struct sk_buff **tail_skb;
1640         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1641         struct ipv6_pinfo *np = inet6_sk(sk);
1642         struct net *net = sock_net(sk);
1643         struct ipv6hdr *hdr;
1644         struct ipv6_txoptions *opt = v6_cork->opt;
1645         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1646         struct flowi6 *fl6 = &cork->fl.u.ip6;
1647         unsigned char proto = fl6->flowi6_proto;
1648
1649         skb = __skb_dequeue(queue);
1650         if (!skb)
1651                 goto out;
1652         tail_skb = &(skb_shinfo(skb)->frag_list);
1653
1654         /* move skb->data to ip header from ext header */
1655         if (skb->data < skb_network_header(skb))
1656                 __skb_pull(skb, skb_network_offset(skb));
1657         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1658                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1659                 *tail_skb = tmp_skb;
1660                 tail_skb = &(tmp_skb->next);
1661                 skb->len += tmp_skb->len;
1662                 skb->data_len += tmp_skb->len;
1663                 skb->truesize += tmp_skb->truesize;
1664                 tmp_skb->destructor = NULL;
1665                 tmp_skb->sk = NULL;
1666         }
1667
1668         /* Allow local fragmentation. */
1669         skb->ignore_df = ip6_sk_ignore_df(sk);
1670
1671         *final_dst = fl6->daddr;
1672         __skb_pull(skb, skb_network_header_len(skb));
1673         if (opt && opt->opt_flen)
1674                 ipv6_push_frag_opts(skb, opt, &proto);
1675         if (opt && opt->opt_nflen)
1676                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1677
1678         skb_push(skb, sizeof(struct ipv6hdr));
1679         skb_reset_network_header(skb);
1680         hdr = ipv6_hdr(skb);
1681
1682         ip6_flow_hdr(hdr, v6_cork->tclass,
1683                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1684                                         np->autoflowlabel, fl6));
1685         hdr->hop_limit = v6_cork->hop_limit;
1686         hdr->nexthdr = proto;
1687         hdr->saddr = fl6->saddr;
1688         hdr->daddr = *final_dst;
1689
1690         skb->priority = sk->sk_priority;
1691         skb->mark = sk->sk_mark;
1692
1693         skb_dst_set(skb, dst_clone(&rt->dst));
1694         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1695         if (proto == IPPROTO_ICMPV6) {
1696                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1697
1698                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1699                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1700         }
1701
1702         ip6_cork_release(cork, v6_cork);
1703 out:
1704         return skb;
1705 }
1706
1707 int ip6_send_skb(struct sk_buff *skb)
1708 {
1709         struct net *net = sock_net(skb->sk);
1710         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1711         int err;
1712
1713         err = ip6_local_out(net, skb->sk, skb);
1714         if (err) {
1715                 if (err > 0)
1716                         err = net_xmit_errno(err);
1717                 if (err)
1718                         IP6_INC_STATS(net, rt->rt6i_idev,
1719                                       IPSTATS_MIB_OUTDISCARDS);
1720         }
1721
1722         return err;
1723 }
1724
/*
 * ip6_push_pending_frames - build a packet from the socket's pending
 * data with ip6_finish_skb() and send it.
 *
 * Returns 0 if ip6_finish_skb() produced no packet, otherwise the
 * result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pkt = ip6_finish_skb(sk);

	return pkt ? ip6_send_skb(pkt) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1736
1737 static void __ip6_flush_pending_frames(struct sock *sk,
1738                                        struct sk_buff_head *queue,
1739                                        struct inet_cork_full *cork,
1740                                        struct inet6_cork *v6_cork)
1741 {
1742         struct sk_buff *skb;
1743
1744         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1745                 if (skb_dst(skb))
1746                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1747                                       IPSTATS_MIB_OUTDISCARDS);
1748                 kfree_skb(skb);
1749         }
1750
1751         ip6_cork_release(cork, v6_cork);
1752 }
1753
1754 void ip6_flush_pending_frames(struct sock *sk)
1755 {
1756         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1757                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1758 }
1759 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1760
1761 struct sk_buff *ip6_make_skb(struct sock *sk,
1762                              int getfrag(void *from, char *to, int offset,
1763                                          int len, int odd, struct sk_buff *skb),
1764                              void *from, int length, int transhdrlen,
1765                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1766                              struct rt6_info *rt, unsigned int flags,
1767                              const struct sockcm_cookie *sockc)
1768 {
1769         struct inet_cork_full cork;
1770         struct inet6_cork v6_cork;
1771         struct sk_buff_head queue;
1772         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1773         int err;
1774
1775         if (flags & MSG_PROBE)
1776                 return NULL;
1777
1778         __skb_queue_head_init(&queue);
1779
1780         cork.base.flags = 0;
1781         cork.base.addr = 0;
1782         cork.base.opt = NULL;
1783         v6_cork.opt = NULL;
1784         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1785         if (err)
1786                 return ERR_PTR(err);
1787
1788         if (ipc6->dontfrag < 0)
1789                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1790
1791         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1792                                 &current->task_frag, getfrag, from,
1793                                 length + exthdrlen, transhdrlen + exthdrlen,
1794                                 flags, ipc6, sockc);
1795         if (err) {
1796                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1797                 return ERR_PTR(err);
1798         }
1799
1800         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1801 }