sctp: Add GSO support
author Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Thu, 2 Jun 2016 18:05:43 +0000 (15:05 -0300)
committer David S. Miller <davem@davemloft.net>
Fri, 3 Jun 2016 23:37:21 +0000 (19:37 -0400)
SCTP has this peculiarity that its packets cannot be just segmented to
(P)MTU. Its chunks must be contained in IP segments, padding respected.
So we can't just generate a big skb, set gso_size to the fragmentation
point and deliver it to IP layer.

This patch takes a different approach. SCTP will now build a skb as it
would be if it was received using GRO. That is, there will be a cover
skb with protocol headers and children ones containing the actual
segments, already segmented in a way that respects SCTP RFCs.

With that, we can tell skb_segment() to just split based on frag_list,
trusting its sizes are already in accordance.

This way SCTP can benefit from GSO and instead of passing several
packets through the stack, it can pass a single large packet.

v2:
- Added support for receiving GSO frames, as requested by Dave Miller.
- Clear skb->cb if packet is GSO (otherwise it's not used by SCTP)
- Added heuristics similar to what we have in TCP for not generating
  single GSO packets that fill cwnd.
v3:
- consider sctphdr size in skb_gso_transport_seglen()
- rebased due to 5c7cdf339af5 ("gso: Remove arbitrary checks for
  unsupported GSO")

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Tested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
14 files changed:
include/linux/netdev_features.h
include/linux/netdevice.h
include/linux/skbuff.h
include/net/sctp/sctp.h
include/net/sctp/structs.h
net/core/ethtool.c
net/core/skbuff.c
net/sctp/Makefile
net/sctp/input.c
net/sctp/inqueue.c
net/sctp/offload.c [new file with mode: 0644]
net/sctp/output.c
net/sctp/protocol.c
net/sctp/socket.c

index aa7b240..9c6c8ef 100644 (file)
@@ -53,8 +53,9 @@ enum {
                                         *     headers in software.
                                         */
        NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
+       NETIF_F_GSO_SCTP_BIT,           /* ... SCTP fragmentation */
        /**/NETIF_F_GSO_LAST =          /* last bit, see GSO_MASK */
-               NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
+               NETIF_F_GSO_SCTP_BIT,
 
        NETIF_F_FCOE_CRC_BIT,           /* FCoE CRC32 */
        NETIF_F_SCTP_CRC_BIT,           /* SCTP checksum offload */
@@ -128,6 +129,7 @@ enum {
 #define NETIF_F_TSO_MANGLEID   __NETIF_F(TSO_MANGLEID)
 #define NETIF_F_GSO_PARTIAL     __NETIF_F(GSO_PARTIAL)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
+#define NETIF_F_GSO_SCTP       __NETIF_F(GSO_SCTP)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX        __NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX        __NETIF_F(HW_VLAN_STAG_TX)
@@ -166,7 +168,8 @@ enum {
                                 NETIF_F_FSO)
 
 /* List of features with software fallbacks. */
-#define NETIF_F_GSO_SOFTWARE   (NETIF_F_ALL_TSO | NETIF_F_UFO)
+#define NETIF_F_GSO_SOFTWARE   (NETIF_F_ALL_TSO | NETIF_F_UFO | \
+                                NETIF_F_GSO_SCTP)
 
 /*
  * If one device supports one of these features, then enable them
index f45929c..fa6df26 100644 (file)
@@ -4012,6 +4012,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
        BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
+       BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
 
        return (features & feature) == feature;
 }
index aa3f9d7..dc0fca7 100644 (file)
@@ -487,6 +487,8 @@ enum {
        SKB_GSO_PARTIAL = 1 << 13,
 
        SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
+
+       SKB_GSO_SCTP = 1 << 15,
 };
 
 #if BITS_PER_LONG > 32
index b392ac8..632e205 100644 (file)
@@ -186,6 +186,10 @@ void sctp_assocs_proc_exit(struct net *net);
 int sctp_remaddr_proc_init(struct net *net);
 void sctp_remaddr_proc_exit(struct net *net);
 
+/*
+ * sctp/offload.c
+ */
+int sctp_offload_init(void);
 
 /*
  * Module global variables
index 16b013a..83c5ec5 100644 (file)
@@ -566,6 +566,9 @@ struct sctp_chunk {
        /* This points to the sk_buff containing the actual data.  */
        struct sk_buff *skb;
 
+       /* In case of GSO packets, this will store the head one */
+       struct sk_buff *head_skb;
+
        /* These are the SCTP headers by reverse order in a packet.
         * Note that some of these may happen more than once.  In that
         * case, we point at the "current" one, whatever that means
@@ -696,6 +699,8 @@ struct sctp_packet {
        size_t overhead;
        /* This is the total size of all chunks INCLUDING padding.  */
        size_t size;
+       /* This is the maximum size this packet may have */
+       size_t max_size;
 
        /* The packet is destined for this transport address.
         * The function we finally use to pass down to the next lower
index f403481..9774898 100644 (file)
@@ -89,6 +89,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
        [NETIF_F_GSO_UDP_TUNNEL_BIT] =   "tx-udp_tnl-segmentation",
        [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
        [NETIF_F_GSO_PARTIAL_BIT] =      "tx-gso-partial",
+       [NETIF_F_GSO_SCTP_BIT] =         "tx-sctp-segmentation",
 
        [NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",
        [NETIF_F_SCTP_CRC_BIT] =        "tx-checksum-sctp",
index 5ca562b..b6e0f95 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/slab.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
+#include <linux/sctp.h>
 #include <linux/netdevice.h>
 #ifdef CONFIG_NET_CLS_ACT
 #include <net/pkt_sched.h>
@@ -4383,6 +4384,8 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
                        thlen += inner_tcp_hdrlen(skb);
        } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
                thlen = tcp_hdrlen(skb);
+       } else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) {
+               thlen = sizeof(struct sctphdr);
        }
        /* UFO sets gso_size to the size of the fragmentation
         * payload, i.e. the size of the L4 (UDP) header is already
index 0fca582..6c4f749 100644 (file)
@@ -11,7 +11,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
          transport.o chunk.o sm_make_chunk.o ulpevent.o \
          inqueue.o outqueue.o ulpqueue.o \
          tsnmap.o bind_addr.o socket.o primitive.o \
-         output.o input.o debug.o ssnmap.o auth.o
+         output.o input.o debug.o ssnmap.o auth.o \
+         offload.o
 
 sctp_probe-y := probe.o
 
index 5cff254..6f8e676 100644 (file)
@@ -139,7 +139,9 @@ int sctp_rcv(struct sk_buff *skb)
        skb->csum_valid = 0; /* Previous value not applicable */
        if (skb_csum_unnecessary(skb))
                __skb_decr_checksum_unnecessary(skb);
-       else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0)
+       else if (!sctp_checksum_disable &&
+                !(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) &&
+                sctp_rcv_checksum(net, skb) < 0)
                goto discard_it;
        skb->csum_valid = 1;
 
@@ -1175,6 +1177,14 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
 {
        sctp_chunkhdr_t *ch;
 
+       /* We do not allow GSO frames here as we need to linearize and
+        * then cannot guarantee frame boundaries. This shouldn't be an
+        * issue as packets hitting this are mostly INIT or INIT-ACK and
+        * those cannot be GSO-style anyway.
+        */
+       if ((skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP)
+               return NULL;
+
        if (skb_linearize(skb))
                return NULL;
 
index 5ba08ce..edabbbd 100644 (file)
@@ -138,6 +138,17 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
                if (chunk->singleton ||
                    chunk->end_of_packet ||
                    chunk->pdiscard) {
+                       if (chunk->head_skb == chunk->skb) {
+                               chunk->skb = skb_shinfo(chunk->skb)->frag_list;
+                               goto new_skb;
+                       }
+                       if (chunk->skb->next) {
+                               chunk->skb = chunk->skb->next;
+                               goto new_skb;
+                       }
+
+                       if (chunk->head_skb)
+                               chunk->skb = chunk->head_skb;
                        sctp_chunk_free(chunk);
                        chunk = queue->in_progress = NULL;
                } else {
@@ -155,15 +166,15 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
 
 next_chunk:
                /* Is the queue empty?  */
-               if (list_empty(&queue->in_chunk_list))
+               entry = sctp_list_dequeue(&queue->in_chunk_list);
+               if (!entry)
                        return NULL;
 
-               entry = queue->in_chunk_list.next;
                chunk = list_entry(entry, struct sctp_chunk, list);
-               list_del_init(entry);
 
                /* Linearize if it's not GSO */
-               if (skb_is_nonlinear(chunk->skb)) {
+               if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) != SKB_GSO_SCTP &&
+                   skb_is_nonlinear(chunk->skb)) {
                        if (skb_linearize(chunk->skb)) {
                                __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS);
                                sctp_chunk_free(chunk);
@@ -174,15 +185,39 @@ next_chunk:
                        chunk->sctp_hdr = sctp_hdr(chunk->skb);
                }
 
+               if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) {
+                       /* GSO-marked skbs but without frags, handle
+                        * them normally
+                        */
+                       if (skb_shinfo(chunk->skb)->frag_list)
+                               chunk->head_skb = chunk->skb;
+
+                       /* skbs with "cover letter" */
+                       if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len)
+                               chunk->skb = skb_shinfo(chunk->skb)->frag_list;
+
+                       if (WARN_ON(!chunk->skb)) {
+                               __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS);
+                               sctp_chunk_free(chunk);
+                               goto next_chunk;
+                       }
+               }
+
+               if (chunk->asoc)
+                       sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb);
+
                queue->in_progress = chunk;
 
+new_skb:
                /* This is the first chunk in the packet.  */
-               chunk->singleton = 1;
                ch = (sctp_chunkhdr_t *) chunk->skb->data;
+               chunk->singleton = 1;
                chunk->data_accepted = 0;
-
-               if (chunk->asoc)
-                       sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb);
+               chunk->pdiscard = 0;
+               chunk->auth = 0;
+               chunk->has_asconf = 0;
+               chunk->end_of_packet = 0;
+               chunk->ecn_ce_done = 0;
        }
 
        chunk->chunk_hdr = ch;
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
new file mode 100644 (file)
index 0000000..a37887b
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ * sctp_offload - GRO/GSO Offloading for SCTP
+ *
+ * Copyright (C) 2015, Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/sctp.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/time.h>
+#include <net/net_namespace.h>
+
+#include <linux/skbuff.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/checksum.h>
+#include <net/protocol.h>
+
+static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
+{
+       skb->ip_summed = CHECKSUM_NONE;
+       return sctp_compute_cksum(skb, skb_transport_offset(skb));
+}
+
+static struct sk_buff *sctp_gso_segment(struct sk_buff *skb,
+                                       netdev_features_t features)
+{
+       struct sk_buff *segs = ERR_PTR(-EINVAL);
+       struct sctphdr *sh;
+
+       sh = sctp_hdr(skb);
+       if (!pskb_may_pull(skb, sizeof(*sh)))
+               goto out;
+
+       __skb_pull(skb, sizeof(*sh));
+
+       if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+               /* Packet is from an untrusted source, reset gso_segs. */
+               struct skb_shared_info *pinfo = skb_shinfo(skb);
+               struct sk_buff *frag_iter;
+
+               pinfo->gso_segs = 0;
+               if (skb->len != skb->data_len) {
+                       /* Means we have chunks in here too */
+                       pinfo->gso_segs++;
+               }
+
+               skb_walk_frags(skb, frag_iter)
+                       pinfo->gso_segs++;
+
+               segs = NULL;
+               goto out;
+       }
+
+       segs = skb_segment(skb, features | NETIF_F_HW_CSUM);
+       if (IS_ERR(segs))
+               goto out;
+
+       /* All that is left is to update the SCTP CRC if necessary */
+       if (!(features & NETIF_F_SCTP_CRC)) {
+               for (skb = segs; skb; skb = skb->next) {
+                       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+                               sh = sctp_hdr(skb);
+                               sh->checksum = sctp_gso_make_checksum(skb);
+                       }
+               }
+       }
+
+out:
+       return segs;
+}
+
+static const struct net_offload sctp_offload = {
+       .callbacks = {
+               .gso_segment = sctp_gso_segment,
+       },
+};
+
+int __init sctp_offload_init(void)
+{
+       return inet_add_offload(&sctp_offload, IPPROTO_SCTP);
+}
index 9844fe5..60499a6 100644 (file)
@@ -84,18 +84,42 @@ static void sctp_packet_reset(struct sctp_packet *packet)
 struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
                                       __u32 vtag, int ecn_capable)
 {
-       struct sctp_chunk *chunk = NULL;
+       struct sctp_transport *tp = packet->transport;
+       struct sctp_association *asoc = tp->asoc;
 
        pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
 
        packet->vtag = vtag;
 
+       if (asoc && tp->dst) {
+               struct sock *sk = asoc->base.sk;
+
+               rcu_read_lock();
+               if (__sk_dst_get(sk) != tp->dst) {
+                       dst_hold(tp->dst);
+                       sk_setup_caps(sk, tp->dst);
+               }
+
+               if (sk_can_gso(sk)) {
+                       struct net_device *dev = tp->dst->dev;
+
+                       packet->max_size = dev->gso_max_size;
+               } else {
+                       packet->max_size = asoc->pathmtu;
+               }
+               rcu_read_unlock();
+
+       } else {
+               packet->max_size = tp->pathmtu;
+       }
+
        if (ecn_capable && sctp_packet_empty(packet)) {
-               chunk = sctp_get_ecne_prepend(packet->transport->asoc);
+               struct sctp_chunk *chunk;
 
                /* If there a is a prepend chunk stick it on the list before
                 * any other chunks get appended.
                 */
+               chunk = sctp_get_ecne_prepend(asoc);
                if (chunk)
                        sctp_packet_append_chunk(packet, chunk);
        }
@@ -381,12 +405,15 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
        struct sctp_transport *tp = packet->transport;
        struct sctp_association *asoc = tp->asoc;
        struct sctphdr *sh;
-       struct sk_buff *nskb;
+       struct sk_buff *nskb = NULL, *head = NULL;
        struct sctp_chunk *chunk, *tmp;
        struct sock *sk;
        int err = 0;
        int padding;            /* How much padding do we need?  */
+       int pkt_size;
        __u8 has_data = 0;
+       int gso = 0;
+       int pktcount = 0;
        struct dst_entry *dst;
        unsigned char *auth = NULL;     /* pointer to auth in skb data */
 
@@ -400,18 +427,37 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
        chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
        sk = chunk->skb->sk;
 
-       /* Allocate the new skb.  */
-       nskb = alloc_skb(packet->size + MAX_HEADER, gfp);
-       if (!nskb)
+       /* Allocate the head skb, or main one if not in GSO */
+       if (packet->size > tp->pathmtu && !packet->ipfragok) {
+               if (sk_can_gso(sk)) {
+                       gso = 1;
+                       pkt_size = packet->overhead;
+               } else {
+                       /* If this happens, we trash this packet and try
+                        * to build a new one, hopefully correct this
+                        * time. Application may notice this error.
+                        */
+                       pr_err_once("Trying to GSO but underlying device doesn't support it.");
+                       goto nomem;
+               }
+       } else {
+               pkt_size = packet->size;
+       }
+       head = alloc_skb(pkt_size + MAX_HEADER, gfp);
+       if (!head)
                goto nomem;
+       if (gso) {
+               NAPI_GRO_CB(head)->last = head;
+               skb_shinfo(head)->gso_type = sk->sk_gso_type;
+       }
 
        /* Make sure the outbound skb has enough header room reserved. */
-       skb_reserve(nskb, packet->overhead + MAX_HEADER);
+       skb_reserve(head, packet->overhead + MAX_HEADER);
 
        /* Set the owning socket so that we know where to get the
         * destination IP address.
         */
-       sctp_packet_set_owner_w(nskb, sk);
+       sctp_packet_set_owner_w(head, sk);
 
        if (!sctp_transport_dst_check(tp)) {
                sctp_transport_route(tp, NULL, sctp_sk(sk));
@@ -422,11 +468,11 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
        dst = dst_clone(tp->dst);
        if (!dst)
                goto no_route;
-       skb_dst_set(nskb, dst);
+       skb_dst_set(head, dst);
 
        /* Build the SCTP header.  */
-       sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr));
-       skb_reset_transport_header(nskb);
+       sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr));
+       skb_reset_transport_header(head);
        sh->source = htons(packet->source_port);
        sh->dest   = htons(packet->destination_port);
 
@@ -441,90 +487,133 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
        sh->vtag     = htonl(packet->vtag);
        sh->checksum = 0;
 
-       /**
-        * 6.10 Bundling
-        *
-        *    An endpoint bundles chunks by simply including multiple
-        *    chunks in one outbound SCTP packet.  ...
-        */
-
-       /**
-        * 3.2  Chunk Field Descriptions
-        *
-        * The total length of a chunk (including Type, Length and
-        * Value fields) MUST be a multiple of 4 bytes.  If the length
-        * of the chunk is not a multiple of 4 bytes, the sender MUST
-        * pad the chunk with all zero bytes and this padding is not
-        * included in the chunk length field.  The sender should
-        * never pad with more than 3 bytes.
-        *
-        * [This whole comment explains WORD_ROUND() below.]
-        */
-
        pr_debug("***sctp_transmit_packet***\n");
 
-       list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
-               list_del_init(&chunk->list);
-               if (sctp_chunk_is_data(chunk)) {
-                       /* 6.3.1 C4) When data is in flight and when allowed
-                        * by rule C5, a new RTT measurement MUST be made each
-                        * round trip.  Furthermore, new RTT measurements
-                        * SHOULD be made no more than once per round-trip
-                        * for a given destination transport address.
-                        */
+       do {
+               /* Set up convenience variables... */
+               chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
+               pktcount++;
 
-                       if (!chunk->resent && !tp->rto_pending) {
-                               chunk->rtt_in_progress = 1;
-                               tp->rto_pending = 1;
+               /* Calculate packet size, so it fits in PMTU. Leave
+                * other chunks for the next packets.
+                */
+               if (gso) {
+                       pkt_size = packet->overhead;
+                       list_for_each_entry(chunk, &packet->chunk_list, list) {
+                               int padded = WORD_ROUND(chunk->skb->len);
+
+                               if (pkt_size + padded > tp->pathmtu)
+                                       break;
+                               pkt_size += padded;
                        }
 
-                       has_data = 1;
-               }
+                       /* Allocate a new skb. */
+                       nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
+                       if (!nskb)
+                               goto nomem;
 
-               padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
-               if (padding)
-                       memset(skb_put(chunk->skb, padding), 0, padding);
+                       /* Make sure the outbound skb has enough header
+                        * room reserved.
+                        */
+                       skb_reserve(nskb, packet->overhead + MAX_HEADER);
+               } else {
+                       nskb = head;
+               }
 
-               /* if this is the auth chunk that we are adding,
-                * store pointer where it will be added and put
-                * the auth into the packet.
+               /**
+                * 3.2  Chunk Field Descriptions
+                *
+                * The total length of a chunk (including Type, Length and
+                * Value fields) MUST be a multiple of 4 bytes.  If the length
+                * of the chunk is not a multiple of 4 bytes, the sender MUST
+                * pad the chunk with all zero bytes and this padding is not
+                * included in the chunk length field.  The sender should
+                * never pad with more than 3 bytes.
+                *
+                * [This whole comment explains WORD_ROUND() below.]
                 */
-               if (chunk == packet->auth)
-                       auth = skb_tail_pointer(nskb);
 
-               memcpy(skb_put(nskb, chunk->skb->len),
+               pkt_size -= packet->overhead;
+               list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+                       list_del_init(&chunk->list);
+                       if (sctp_chunk_is_data(chunk)) {
+                               /* 6.3.1 C4) When data is in flight and when allowed
+                                * by rule C5, a new RTT measurement MUST be made each
+                                * round trip.  Furthermore, new RTT measurements
+                                * SHOULD be made no more than once per round-trip
+                                * for a given destination transport address.
+                                */
+
+                               if (!chunk->resent && !tp->rto_pending) {
+                                       chunk->rtt_in_progress = 1;
+                                       tp->rto_pending = 1;
+                               }
+
+                               has_data = 1;
+                       }
+
+                       padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
+                       if (padding)
+                               memset(skb_put(chunk->skb, padding), 0, padding);
+
+                       /* if this is the auth chunk that we are adding,
+                        * store pointer where it will be added and put
+                        * the auth into the packet.
+                        */
+                       if (chunk == packet->auth)
+                               auth = skb_tail_pointer(nskb);
+
+                       memcpy(skb_put(nskb, chunk->skb->len),
                               chunk->skb->data, chunk->skb->len);
 
-               pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, "
-                        "rtt_in_progress:%d\n", chunk,
-                        sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
-                        chunk->has_tsn ? "TSN" : "No TSN",
-                        chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0,
-                        ntohs(chunk->chunk_hdr->length), chunk->skb->len,
-                        chunk->rtt_in_progress);
-
-               /*
-                * If this is a control chunk, this is our last
-                * reference. Free data chunks after they've been
-                * acknowledged or have failed.
-                */
-               if (!sctp_chunk_is_data(chunk))
-                       sctp_chunk_free(chunk);
-       }
+                       pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n",
+                                chunk,
+                                sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
+                                chunk->has_tsn ? "TSN" : "No TSN",
+                                chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0,
+                                ntohs(chunk->chunk_hdr->length), chunk->skb->len,
+                                chunk->rtt_in_progress);
+
+                       /* If this is a control chunk, this is our last
+                        * reference. Free data chunks after they've been
+                        * acknowledged or have failed.
+                        * Re-queue auth chunks if needed.
+                        */
+                       pkt_size -= WORD_ROUND(chunk->skb->len);
 
-       /* SCTP-AUTH, Section 6.2
-        *    The sender MUST calculate the MAC as described in RFC2104 [2]
-        *    using the hash function H as described by the MAC Identifier and
-        *    the shared association key K based on the endpoint pair shared key
-        *    described by the shared key identifier.  The 'data' used for the
-        *    computation of the AUTH-chunk is given by the AUTH chunk with its
-        *    HMAC field set to zero (as shown in Figure 6) followed by all
-        *    chunks that are placed after the AUTH chunk in the SCTP packet.
-        */
-       if (auth)
-               sctp_auth_calculate_hmac(asoc, nskb,
-                                        (struct sctp_auth_chunk *)auth,
-                                        gfp);
+                       if (chunk == packet->auth && !list_empty(&packet->chunk_list))
+                               list_add(&chunk->list, &packet->chunk_list);
+                       else if (!sctp_chunk_is_data(chunk))
+                               sctp_chunk_free(chunk);
+
+                       if (!pkt_size)
+                               break;
+               }
+
+               /* SCTP-AUTH, Section 6.2
+                *    The sender MUST calculate the MAC as described in RFC2104 [2]
+                *    using the hash function H as described by the MAC Identifier and
+                *    the shared association key K based on the endpoint pair shared key
+                *    described by the shared key identifier.  The 'data' used for the
+                *    computation of the AUTH-chunk is given by the AUTH chunk with its
+                *    HMAC field set to zero (as shown in Figure 6) followed by all
+                *    chunks that are placed after the AUTH chunk in the SCTP packet.
+                */
+               if (auth)
+                       sctp_auth_calculate_hmac(asoc, nskb,
+                                                (struct sctp_auth_chunk *)auth,
+                                                gfp);
+
+               if (!gso)
+                       break;
+
+               if (skb_gro_receive(&head, nskb))
+                       goto nomem;
+               nskb = NULL;
+               if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
+                                sk->sk_gso_max_segs))
+                       goto nomem;
+       } while (!list_empty(&packet->chunk_list));
 
        /* 2) Calculate the Adler-32 checksum of the whole packet,
         *    including the SCTP common header and all the
@@ -532,16 +621,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
         *
         * Note: Adler-32 is no longer applicable, as has been replaced
         * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
+        *
+        * If it's a GSO packet, it's postponed to sctp_gso_segment().
         */
-       if (!sctp_checksum_disable) {
-               if (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
-                   (dst_xfrm(dst) != NULL) || packet->ipfragok) {
-                       sh->checksum = sctp_compute_cksum(nskb, 0);
+       if (!sctp_checksum_disable || gso) {
+               if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
+                            dst_xfrm(dst) || packet->ipfragok)) {
+                       sh->checksum = sctp_compute_cksum(head, 0);
                } else {
                        /* no need to seed pseudo checksum for SCTP */
-                       nskb->ip_summed = CHECKSUM_PARTIAL;
-                       nskb->csum_start = skb_transport_header(nskb) - nskb->head;
-                       nskb->csum_offset = offsetof(struct sctphdr, checksum);
+                       head->ip_summed = CHECKSUM_PARTIAL;
+                       head->csum_start = skb_transport_header(head) - head->head;
+                       head->csum_offset = offsetof(struct sctphdr, checksum);
                }
        }
 
@@ -557,7 +648,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
         * Note: The works for IPv6 layer checks this bit too later
         * in transmission.  See IP6_ECN_flow_xmit().
         */
-       tp->af_specific->ecn_capable(nskb->sk);
+       tp->af_specific->ecn_capable(sk);
 
        /* Set up the IP options.  */
        /* BUG: not implemented
@@ -566,7 +657,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 
        /* Dump that on IP!  */
        if (asoc) {
-               asoc->stats.opackets++;
+               asoc->stats.opackets += pktcount;
                if (asoc->peer.last_sent_to != tp)
                        /* Considering the multiple CPU scenario, this is a
                         * "correcter" place for last_sent_to.  --xguo
@@ -589,16 +680,36 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
                }
        }
 
-       pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len);
+       pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len);
+
+       if (gso) {
+               /* Cleanup our debris for IP stacks */
+               memset(head->cb, 0, max(sizeof(struct inet_skb_parm),
+                                       sizeof(struct inet6_skb_parm)));
 
-       nskb->ignore_df = packet->ipfragok;
-       tp->af_specific->sctp_xmit(nskb, tp);
+               skb_shinfo(head)->gso_segs = pktcount;
+               skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
+
+               /* We have to refresh this in case we are xmiting to
+                * more than one transport at a time
+                */
+               rcu_read_lock();
+               if (__sk_dst_get(sk) != tp->dst) {
+                       dst_hold(tp->dst);
+                       sk_setup_caps(sk, tp->dst);
+               }
+               rcu_read_unlock();
+       }
+       head->ignore_df = packet->ipfragok;
+       tp->af_specific->sctp_xmit(head, tp);
 
 out:
        sctp_packet_reset(packet);
        return err;
 no_route:
-       kfree_skb(nskb);
+       kfree_skb(head);
+       if (nskb != head)
+               kfree_skb(nskb);
 
        if (asoc)
                IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES);
@@ -751,39 +862,63 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
                                        struct sctp_chunk *chunk,
                                        u16 chunk_len)
 {
-       size_t psize;
-       size_t pmtu;
-       int too_big;
+       size_t psize, pmtu;
        sctp_xmit_t retval = SCTP_XMIT_OK;
 
        psize = packet->size;
-       pmtu  = ((packet->transport->asoc) ?
-               (packet->transport->asoc->pathmtu) :
-               (packet->transport->pathmtu));
-
-       too_big = (psize + chunk_len > pmtu);
+       if (packet->transport->asoc)
+               pmtu = packet->transport->asoc->pathmtu;
+       else
+               pmtu = packet->transport->pathmtu;
 
        /* Decide if we need to fragment or resubmit later. */
-       if (too_big) {
-               /* It's OK to fragmet at IP level if any one of the following
+       if (psize + chunk_len > pmtu) {
+               /* It's OK to fragment at IP level if any one of the following
                 * is true:
-                *      1. The packet is empty (meaning this chunk is greater
-                *         the MTU)
-                *      2. The chunk we are adding is a control chunk
-                *      3. The packet doesn't have any data in it yet and data
-                *      requires authentication.
+                *      1. The packet is empty (meaning this chunk is greater
+                *         than the MTU)
+                *      2. The packet doesn't have any data in it yet and data
+                *         requires authentication.
                 */
-               if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk) ||
+               if (sctp_packet_empty(packet) ||
                    (!packet->has_data && chunk->auth)) {
                        /* We no longer do re-fragmentation.
                         * Just fragment at the IP layer, if we
                         * actually hit this condition
                         */
                        packet->ipfragok = 1;
-               } else {
-                       retval = SCTP_XMIT_PMTU_FULL;
+                       goto out;
                }
+
+               /* It is also okay to fragment if the chunk we are
+                * adding is a control chunk, but only if current packet
+                * is not a GSO one otherwise it causes fragmentation of
+                * a large frame. So in this case we allow the
+                * fragmentation by forcing it to be in a new packet.
+                */
+               if (!sctp_chunk_is_data(chunk) && packet->has_data)
+                       retval = SCTP_XMIT_PMTU_FULL;
+
+               if (psize + chunk_len > packet->max_size)
+                       /* Hit GSO/PMTU limit, gotta flush */
+                       retval = SCTP_XMIT_PMTU_FULL;
+
+               if (!packet->transport->burst_limited &&
+                   psize + chunk_len > (packet->transport->cwnd >> 1))
+                       /* Do not allow a single GSO packet to use more
+                        * than half of cwnd.
+                        */
+                       retval = SCTP_XMIT_PMTU_FULL;
+
+               if (packet->transport->burst_limited &&
+                   psize + chunk_len > (packet->transport->burst_limited >> 1))
+                       /* Do not allow a single GSO packet to use more
+                        * than half of original cwnd.
+                        */
+                       retval = SCTP_XMIT_PMTU_FULL;
+               /* Otherwise it will fit in the GSO packet */
        }
 
+out:
        return retval;
 }
index d3d50da..40022ee 100644 (file)
@@ -1516,6 +1516,9 @@ static __init int sctp_init(void)
        if (status)
                goto err_v6_add_protocol;
 
+       if (sctp_offload_init() < 0)
+               pr_crit("%s: Cannot add SCTP protocol offload\n", __func__);
+
 out:
        return status;
 err_v6_add_protocol:
index 67154b8..712fb23 100644 (file)
@@ -4003,6 +4003,8 @@ static int sctp_init_sock(struct sock *sk)
                return -ESOCKTNOSUPPORT;
        }
 
+       sk->sk_gso_type = SKB_GSO_SCTP;
+
        /* Initialize default send parameters. These parameters can be
         * modified with the SCTP_DEFAULT_SEND_PARAM socket option.
         */