Merge branch 'perf/urgent' into perf/core, to pick up fixes before merging new changes

[cascardo/linux.git] / net / ipv4 / tcp_output.c
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 79a03b8..e00e972 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -364,7 +364,7 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
   * be sent.
   */
  static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
-                               int tcp_header_len)
+                        struct tcphdr *th, int tcp_header_len)
  {
         struct tcp_sock *tp = tcp_sk(sk);
  
@@ -375,7 +375,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
                         INET_ECN_xmit(sk);
                         if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
                                 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
-                               tcp_hdr(skb)->cwr = 1;
+                               th->cwr = 1;
                                 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
                         }
                 } else if (!tcp_ca_needs_ecn(sk)) {
@@ -383,7 +383,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
                         INET_ECN_dontxmit(sk);
                 }
                 if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
-                       tcp_hdr(skb)->ece = 1;
+                       th->ece = 1;
         }
  }
  
@@ -949,12 +949,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  
         skb_orphan(skb);
         skb->sk = sk;
-       skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
+       skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
         skb_set_hash_from_sk(skb, sk);
         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
  
         /* Build TCP header and checksum it. */
-       th = tcp_hdr(skb);
+       th = (struct tcphdr *)skb->data;
         th->source              = inet->inet_sport;
         th->dest                = inet->inet_dport;
         th->seq                 = htonl(tcb->seq);
@@ -962,14 +962,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
         *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
                                         tcb->tcp_flags);
  
-       if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
-               /* RFC1323: The window in SYN & SYN/ACK segments
-                * is never scaled.
-                */
-               th->window      = htons(min(tp->rcv_wnd, 65535U));
-       } else {
-               th->window      = htons(tcp_select_window(sk));
-       }
         th->check               = 0;
         th->urg_ptr             = 0;
  
@@ -986,9 +978,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  
         tcp_options_write((__be32 *)(th + 1), tp, &opts);
         skb_shinfo(skb)->gso_type = sk->sk_gso_type;
-       if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
-               tcp_ecn_send(sk, skb, tcp_header_size);
-
+       if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
+               th->window      = htons(tcp_select_window(sk));
+               tcp_ecn_send(sk, skb, th, tcp_header_size);
+       } else {
+               /* RFC1323: The window in SYN & SYN/ACK segments
+                * is never scaled.
+                */
+               th->window      = htons(min(tp->rcv_wnd, 65535U));
+       }
  #ifdef CONFIG_TCP_MD5SIG
         /* Calculate the MD5 hash, as we have all we need now */
         if (md5) {
@@ -1111,11 +1109,17 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
         tcp_verify_left_out(tp);
  }
  
+static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
+{
+       return TCP_SKB_CB(skb)->txstamp_ack ||
+               (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
+}
+
  static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
  {
         struct skb_shared_info *shinfo = skb_shinfo(skb);
  
-       if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+       if (unlikely(tcp_has_tx_tstamp(skb)) &&
             !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
                 struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
                 u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
@@ -1123,9 +1127,17 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
                 shinfo->tx_flags &= ~tsflags;
                 shinfo2->tx_flags |= tsflags;
                 swap(shinfo->tskey, shinfo2->tskey);
+               TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
+               TCP_SKB_CB(skb)->txstamp_ack = 0;
         }
  }
  
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+       TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
+       TCP_SKB_CB(skb)->eor = 0;
+}
+
  /* Function to create two new TCP segments.  Shrinks the given segment
   * to the specified size and appends a new segment with the rest of the
   * packet to the list.  This won't be called frequently, I hope.
@@ -1171,6 +1183,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
         TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
         TCP_SKB_CB(buff)->tcp_flags = flags;
         TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+       tcp_skb_fragment_eor(skb, buff);
  
         if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
                 /* Copy and checksum data tail into the new buffer. */
@@ -1731,6 +1744,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
         /* This packet was never sent out yet, so no SACK bits. */
         TCP_SKB_CB(buff)->sacked = 0;
  
+       tcp_skb_fragment_eor(skb, buff);
+
         buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
         skb_split(skb, buff, len);
         tcp_fragment_tstamp(skb, buff);
@@ -2204,14 +2219,13 @@ bool tcp_schedule_loss_probe(struct sock *sk)
  /* Thanks to skb fast clones, we can detect if a prior transmit of
   * a packet is still in a qdisc or driver queue.
   * In this case, there is very little point doing a retransmit !
- * Note: This is called from BH context only.
   */
  static bool skb_still_in_host_queue(const struct sock *sk,
                                     const struct sk_buff *skb)
  {
         if (unlikely(skb_fclone_busy(sk, skb))) {
-               NET_INC_STATS_BH(sock_net(sk),
-                                LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+               NET_INC_STATS(sock_net(sk),
+                             LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
                 return true;
         }
         return false;
@@ -2266,14 +2280,14 @@ void tcp_send_loss_probe(struct sock *sk)
         if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
                 goto rearm_timer;
  
-       if (__tcp_retransmit_skb(sk, skb))
+       if (__tcp_retransmit_skb(sk, skb, 1))
                 goto rearm_timer;
  
         /* Record snd_nxt for loss detection. */
         tp->tlp_high_seq = tp->snd_nxt;
  
  probe_sent:
-       NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+       NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
         /* Reset s.t. tcp_rearm_rto will restart timer from now */
         inet_csk(sk)->icsk_pending = 0;
  rearm_timer:
@@ -2444,14 +2458,15 @@ u32 __tcp_select_window(struct sock *sk)
  void tcp_skb_collapse_tstamp(struct sk_buff *skb,
                              const struct sk_buff *next_skb)
  {
-       const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
-       u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
-
-       if (unlikely(tsflags)) {
+       if (unlikely(tcp_has_tx_tstamp(next_skb))) {
+               const struct skb_shared_info *next_shinfo =
+                       skb_shinfo(next_skb);
                 struct skb_shared_info *shinfo = skb_shinfo(skb);
  
-               shinfo->tx_flags |= tsflags;
+               shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
                 shinfo->tskey = next_shinfo->tskey;
+               TCP_SKB_CB(skb)->txstamp_ack |=
+                       TCP_SKB_CB(next_skb)->txstamp_ack;
         }
  }
  
@@ -2490,6 +2505,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
          * packet counting does not break.
          */
         TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+       TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
  
         /* changed transmit queue under us so clear hints */
         tcp_clear_retrans_hints_partial(tp);
@@ -2541,6 +2557,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
                 if (!tcp_can_collapse(sk, skb))
                         break;
  
+               if (!tcp_skb_can_collapse_to(to))
+                       break;
+
                 space -= skb->len;
  
                 if (first) {
@@ -2567,17 +2586,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
   * state updates are done by the caller.  Returns non-zero if an
   * error occurred which prevented the send.
   */
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
  {
-       struct tcp_sock *tp = tcp_sk(sk);
         struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
         unsigned int cur_mss;
-       int err;
+       int diff, len, err;
  
-       /* Inconslusive MTU probe */
-       if (icsk->icsk_mtup.probe_size) {
+
+       /* Inconclusive MTU probe */
+       if (icsk->icsk_mtup.probe_size)
                 icsk->icsk_mtup.probe_size = 0;
-       }
  
         /* Do not sent more than we queued. 1/4 is reserved for possible
          * copying overhead: fragmentation, tunneling, mangling etc.
@@ -2610,30 +2629,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
             TCP_SKB_CB(skb)->seq != tp->snd_una)
                 return -EAGAIN;
  
-       if (skb->len > cur_mss) {
-               if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+       len = cur_mss * segs;
+       if (skb->len > len) {
+               if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
                         return -ENOMEM; /* We'll try again later. */
         } else {
-               int oldpcount = tcp_skb_pcount(skb);
+               if (skb_unclone(skb, GFP_ATOMIC))
+                       return -ENOMEM;
  
-               if (unlikely(oldpcount > 1)) {
-                       if (skb_unclone(skb, GFP_ATOMIC))
-                               return -ENOMEM;
-                       tcp_init_tso_segs(skb, cur_mss);
-                       tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
-               }
+               diff = tcp_skb_pcount(skb);
+               tcp_set_skb_tso_segs(skb, cur_mss);
+               diff -= tcp_skb_pcount(skb);
+               if (diff)
+                       tcp_adjust_pcount(sk, skb, diff);
+               if (skb->len < cur_mss)
+                       tcp_retrans_try_collapse(sk, skb, cur_mss);
         }
  
         /* RFC3168, section 6.1.1.1. ECN fallback */
         if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
                 tcp_ecn_clear_syn(sk, skb);
  
-       tcp_retrans_try_collapse(sk, skb, cur_mss);
-
-       /* Make a copy, if the first transmission SKB clone we made
-        * is still in somebody's hands, else make a clone.
-        */
-
         /* make sure skb->data is aligned on arches that require it
          * and check if ack-trimming & collapsing extended the headroom
          * beyond what csum_start can cover.
@@ -2651,20 +2667,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
         }
  
         if (likely(!err)) {
+               segs = tcp_skb_pcount(skb);
+
                 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
                 /* Update global TCP statistics. */
-               TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+               TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
                 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
-                       NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-               tp->total_retrans++;
+                       __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+               tp->total_retrans += segs;
         }
         return err;
  }
  
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
  {
         struct tcp_sock *tp = tcp_sk(sk);
-       int err = __tcp_retransmit_skb(sk, skb);
+       int err = __tcp_retransmit_skb(sk, skb, segs);
  
         if (err == 0) {
  #if FASTRETRANS_DEBUG > 0
@@ -2680,7 +2698,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                         tp->retrans_stamp = tcp_skb_timestamp(skb);
  
         } else if (err != -EBUSY) {
-               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
         }
  
         if (tp->undo_retrans < 0)
@@ -2733,7 +2751,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *skb;
         struct sk_buff *hole = NULL;
-       u32 last_lost;
+       u32 max_segs, last_lost;
         int mib_idx;
         int fwd_rexmitting = 0;
  
@@ -2753,8 +2771,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                 last_lost = tp->snd_una;
         }
  
+       max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
         tcp_for_write_queue_from(skb, sk) {
                 __u8 sacked = TCP_SKB_CB(skb)->sacked;
+               int segs;
  
                 if (skb == tcp_send_head(sk))
                         break;
@@ -2762,15 +2782,13 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                 if (!hole)
                         tp->retransmit_skb_hint = skb;
  
-               /* Assume this retransmit will generate
-                * only one packet for congestion window
-                * calculation purposes.  This works because
-                * tcp_retransmit_skb() will chop up the
-                * packet to be MSS sized and all the
-                * packet counting works out.
-                */
-               if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+               segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+               if (segs <= 0)
                         return;
+               /* In case tcp_shift_skb_data() has aggregated large skbs,
+                * we need to make sure we do not send too big TSO packets
+                */
+               segs = min_t(int, segs, max_segs);
  
                 if (fwd_rexmitting) {
  begin_fwd:
@@ -2806,10 +2824,10 @@ begin_fwd:
                 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
                         continue;
  
-               if (tcp_retransmit_skb(sk, skb))
+               if (tcp_retransmit_skb(sk, skb, segs))
                         return;
  
-               NET_INC_STATS_BH(sock_net(sk), mib_idx);
+               NET_INC_STATS(sock_net(sk), mib_idx);
  
                 if (tcp_in_cwnd_reduction(sk))
                         tp->prr_out += tcp_skb_pcount(skb);
@@ -2962,7 +2980,7 @@ int tcp_send_synack(struct sock *sk)
  struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
                                 struct request_sock *req,
                                 struct tcp_fastopen_cookie *foc,
-                               bool attach_req)
+                               enum tcp_synack_type synack_type)
  {
         struct inet_request_sock *ireq = inet_rsk(req);
         const struct tcp_sock *tp = tcp_sk(sk);
@@ -2982,14 +3000,22 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
         /* Reserve space for headers. */
         skb_reserve(skb, MAX_TCP_HEADER);
  
-       if (attach_req) {
+       switch (synack_type) {
+       case TCP_SYNACK_NORMAL:
                 skb_set_owner_w(skb, req_to_sk(req));
-       } else {
+               break;
+       case TCP_SYNACK_COOKIE:
+               /* Under synflood, we do not attach skb to a socket,
+                * to avoid false sharing.
+                */
+               break;
+       case TCP_SYNACK_FASTOPEN:
                 /* sk is a const pointer, because we want to express multiple
                  * cpu might call us concurrently.
                  * sk->sk_wmem_alloc in an atomic, we can promote to rw.
                  */
                 skb_set_owner_w(skb, (struct sock *)sk);
+               break;
         }
         skb_dst_set(skb, dst);
  
@@ -3017,7 +3043,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
         skb_push(skb, tcp_header_size);
         skb_reset_transport_header(skb);
  
-       th = tcp_hdr(skb);
+       th = (struct tcphdr *)skb->data;
         memset(th, 0, sizeof(struct tcphdr));
         th->syn = 1;
         th->ack = 1;
@@ -3038,7 +3064,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
         th->window = htons(min(req->rsk_rcv_wnd, 65535U));
         tcp_options_write((__be32 *)(th + 1), NULL, &opts);
         th->doff = (tcp_header_size >> 2);
-       TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
+       __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
  
  #ifdef CONFIG_TCP_MD5SIG
         /* Okay, we have all we need - do the md5 hash if needed */
@@ -3534,10 +3560,10 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
         int res;
  
         tcp_rsk(req)->txhash = net_tx_rndhash();
-       res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
+       res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
         if (!res) {
-               TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+               __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+               __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
         }
         return res;
  }