Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
net/ipv4/tcp_input.c
index 068c8fb..63095b2 100644
@@ -267,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
  * 1. Tuning sk->sk_sndbuf, when connection enters established state.
  */
 
-static void tcp_fixup_sndbuf(struct sock *sk)
+static void tcp_sndbuf_expand(struct sock *sk)
 {
-       int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
+       const struct tcp_sock *tp = tcp_sk(sk);
+       int sndmem, per_mss;
+       u32 nr_segs;
+
+       /* Worst case is non-GSO/TSO: each frame consumes one skb,
+        * and skb->head is kmalloc()ed from a power-of-two area of memory.
+        */
+       per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+                 MAX_TCP_HEADER +
+                 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+       per_mss = roundup_pow_of_two(per_mss) +
+                 SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+       nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+       nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+       /* Fast Recovery (RFC 5681 3.2):
+        * Cubic needs a 1.7 factor, rounded up to 2 to include an
+        * extra cushion (the application might react slowly to POLLOUT).
+        */
+       sndmem = 2 * nr_segs * per_mss;
 
-       sndmem *= TCP_INIT_CWND;
        if (sk->sk_sndbuf < sndmem)
                sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
 }
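The effect of the new sizing can be checked with ordinary arithmetic. Below is a standalone sketch in plain C; the three EX_* constants are assumptions standing in for MAX_TCP_HEADER, SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) and SKB_DATA_ALIGN(sizeof(struct sk_buff)), whose real values depend on the build, and the MSS is taken directly instead of max(mss_clamp, mss_cache).

#include <stdint.h>
#include <stdio.h>

/* Illustrative constants only -- the kernel substitutes the real values. */
#define EX_MAX_TCP_HEADER   224u   /* assumed worst-case header room */
#define EX_SKB_SHARED_INFO  320u   /* assumed aligned sizeof(struct skb_shared_info) */
#define EX_SKB_OVERHEAD     256u   /* assumed aligned sizeof(struct sk_buff) */

static uint32_t ex_roundup_pow_of_two(uint32_t x)
{
        uint32_t p = 1;

        while (p < x)
                p <<= 1;
        return p;
}

int main(void)
{
        uint32_t mss = 1460, snd_cwnd = 10, reordering = 3;
        uint32_t per_mss, nr_segs;

        /* Worst case: no GSO/TSO, one skb per MSS, head from a power-of-two slab */
        per_mss = ex_roundup_pow_of_two(mss + EX_MAX_TCP_HEADER + EX_SKB_SHARED_INFO) +
                  EX_SKB_OVERHEAD;

        nr_segs = snd_cwnd > 10 ? snd_cwnd : 10;        /* TCP_INIT_CWND is 10 */
        if (nr_segs < reordering + 1)
                nr_segs = reordering + 1;

        /* 2x cushion: fast recovery plus an application slow to react to POLLOUT */
        printf("per_mss=%u nr_segs=%u sndmem=%u bytes\n",
               per_mss, nr_segs, 2 * nr_segs * per_mss);
        return 0;
}

With these assumed figures the request comes out around 45 KB for a fresh connection, still clamped by sysctl_tcp_wmem[2] in the real code.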
@@ -355,6 +375,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
        rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
                 tcp_default_init_rwnd(mss);
 
+       /* Dynamic Right Sizing (DRS) has 2 to 3 RTT of latency.
+        * Allow enough cushion so that the sender is not limited by our window.
+        */
+       if (sysctl_tcp_moderate_rcvbuf)
+               rcvmem <<= 2;
+
        if (sk->sk_rcvbuf < rcvmem)
                sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
 }
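For scale, a rough sketch of what tcp_fixup_rcvbuf() now requests, under the assumption that SKB_TRUESIZE(1460 + MAX_TCP_HEADER) is about 2304 bytes and tcp_default_init_rwnd() returns 10 segments:

#include <stdio.h>

int main(void)
{
        /* Assumed figures: truesize ~2304 bytes per MSS-sized skb,
         * initial receive window of 10 segments, moderate_rcvbuf enabled.
         */
        int rcvmem = 2 * 2304 * 10;     /* ~46 KB: one initial window, pre-DRS */

        rcvmem <<= 2;                   /* ~184 KB: cushion for the 2-3 RTT DRS lag */
        printf("initial rcvbuf request: %d bytes\n", rcvmem);
        return 0;
}

The result is still capped by sysctl_tcp_rmem[2], so the cushion only raises the request, not the hard limit.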
@@ -370,9 +396,11 @@ void tcp_init_buffer_space(struct sock *sk)
        if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
                tcp_fixup_rcvbuf(sk);
        if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
-               tcp_fixup_sndbuf(sk);
+               tcp_sndbuf_expand(sk);
 
        tp->rcvq_space.space = tp->rcv_wnd;
+       tp->rcvq_space.time = tcp_time_stamp;
+       tp->rcvq_space.seq = tp->copied_seq;
 
        maxwin = tcp_full_space(sk);
 
@@ -512,48 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        int time;
-       int space;
-
-       if (tp->rcvq_space.time == 0)
-               goto new_measure;
+       int copied;
 
        time = tcp_time_stamp - tp->rcvq_space.time;
        if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
                return;
 
-       space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+       /* Number of bytes copied to user in last RTT */
+       copied = tp->copied_seq - tp->rcvq_space.seq;
+       if (copied <= tp->rcvq_space.space)
+               goto new_measure;
 
-       space = max(tp->rcvq_space.space, space);
+       /* A bit of theory:
+        * copied = bytes received in the previous RTT, our base window.
+        * To cope with packet losses, we need a 2x factor.
+        * To cope with slow start, with the sender growing its cwnd by 100%
+        * every RTT, we need a 4x factor, because the ACK we are sending
+        * now is for the next RTT, not the current one:
+        * <prev RTT . ><current RTT .. ><next RTT .... >
+        */
 
-       if (tp->rcvq_space.space != space) {
-               int rcvmem;
+       if (sysctl_tcp_moderate_rcvbuf &&
+           !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+               int rcvwin, rcvmem, rcvbuf;
 
-               tp->rcvq_space.space = space;
+               /* minimal window to cope with packet losses, assuming
+                * steady state. Add some cushion because of small variations.
+                */
+               rcvwin = (copied << 1) + 16 * tp->advmss;
 
-               if (sysctl_tcp_moderate_rcvbuf &&
-                   !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-                       int new_clamp = space;
+               /* If the rate increased by at least 25%,
+                *      assume slow start: rcvwin = 3 * copied
+                * If the rate increased by at least 50%,
+                *      assume sender can use 2x growth: rcvwin = 4 * copied
+                */
+               if (copied >=
+                   tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+                       if (copied >=
+                           tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+                               rcvwin <<= 1;
+                       else
+                               rcvwin += (rcvwin >> 1);
+               }
 
-                       /* Receive space grows, normalize in order to
-                        * take into account packet headers and sk_buff
-                        * structure overhead.
-                        */
-                       space /= tp->advmss;
-                       if (!space)
-                               space = 1;
-                       rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-                       while (tcp_win_from_space(rcvmem) < tp->advmss)
-                               rcvmem += 128;
-                       space *= rcvmem;
-                       space = min(space, sysctl_tcp_rmem[2]);
-                       if (space > sk->sk_rcvbuf) {
-                               sk->sk_rcvbuf = space;
-
-                               /* Make the window clamp follow along.  */
-                               tp->window_clamp = new_clamp;
-                       }
+               rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+               while (tcp_win_from_space(rcvmem) < tp->advmss)
+                       rcvmem += 128;
+
+               rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+               if (rcvbuf > sk->sk_rcvbuf) {
+                       sk->sk_rcvbuf = rcvbuf;
+
+                       /* Make the window clamp follow along.  */
+                       tp->window_clamp = rcvwin;
                }
        }
+       tp->rcvq_space.space = copied;
 
 new_measure:
        tp->rcvq_space.seq = tp->copied_seq;
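The window-growth heuristic above reduces to a small pure function. The sketch below restates it in standalone C; the function name drs_rcvwin and the example figures are illustrative, not kernel identifiers:

#include <stdio.h>

/* rcvwin chosen from bytes copied in the last RTT vs. the previous measurement */
static unsigned int drs_rcvwin(unsigned int copied, unsigned int prev_space,
                               unsigned int advmss)
{
        /* Steady state: 2x the delivered bytes, plus a 16-segment cushion */
        unsigned int rcvwin = (copied << 1) + 16 * advmss;

        if (copied >= prev_space + (prev_space >> 2)) {         /* grew >= 25% */
                if (copied >= prev_space + (prev_space >> 1))   /* grew >= 50% */
                        rcvwin <<= 1;                           /* ~4x: sender in slow start */
                else
                        rcvwin += rcvwin >> 1;                  /* ~3x */
        }
        return rcvwin;
}

int main(void)
{
        /* e.g. 100 KB copied in the last RTT after 60 KB the RTT before, MSS 1448 */
        printf("rcvwin = %u bytes\n", drs_rcvwin(100000, 60000, 1448));
        return 0;
}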
@@ -713,7 +755,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
        if (tp->srtt > 8 + 2)
                do_div(rate, tp->srtt);
 
-       sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+       /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+        * without any lock. We want to make sure the compiler won't store
+        * intermediate values in this location.
+        */
+       ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+                                               sk->sk_max_pacing_rate);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
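The comment describes a lockless publish/consume pair: TCP writes sk_pacing_rate under the socket lock, while sch_fq reads it with no lock at all. A hedged, self-contained sketch of that pattern (in kernels of this era ACCESS_ONCE() boils down to a volatile cast, assumed here; sock_stub stands in for struct sock):

#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct sock_stub {
        unsigned long sk_pacing_rate;   /* stands in for the real struct sock field */
};

/* Writer side (TCP, under the socket lock): one store, so the compiler
 * cannot leave torn or intermediate values visible to the reader.
 */
static void publish_rate(struct sock_stub *sk, unsigned long rate)
{
        ACCESS_ONCE(sk->sk_pacing_rate) = rate;
}

/* Reader side (packet scheduler, no socket lock held): sample exactly once. */
static unsigned long sample_rate(const struct sock_stub *sk)
{
        return ACCESS_ONCE(sk->sk_pacing_rate);
}

int main(void)
{
        struct sock_stub s = { 0 };

        publish_rate(&s, 1250000);      /* ~10 Mbit/s expressed in bytes per second */
        printf("rate = %lu bytes/s\n", sample_rate(&s));
        return 0;
}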
@@ -2979,7 +3026,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct sk_buff *skb;
        u32 now = tcp_time_stamp;
-       int fully_acked = true;
+       bool fully_acked = true;
        int flag = 0;
        u32 pkts_acked = 0;
        u32 reord = tp->packets_out;
@@ -4717,15 +4764,7 @@ static void tcp_new_space(struct sock *sk)
        struct tcp_sock *tp = tcp_sk(sk);
 
        if (tcp_should_expand_sndbuf(sk)) {
-               int sndmem = SKB_TRUESIZE(max_t(u32,
-                                               tp->rx_opt.mss_clamp,
-                                               tp->mss_cache) +
-                                         MAX_TCP_HEADER);
-               int demanded = max_t(unsigned int, tp->snd_cwnd,
-                                    tp->reordering + 1);
-               sndmem *= 2 * demanded;
-               if (sndmem > sk->sk_sndbuf)
-                       sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+               tcp_sndbuf_expand(sk);
                tp->snd_cwnd_stamp = tcp_time_stamp;
        }
 
@@ -5693,8 +5732,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                        tcp_init_congestion_control(sk);
 
                        tcp_mtup_init(sk);
-                       tcp_init_buffer_space(sk);
                        tp->copied_seq = tp->rcv_nxt;
+                       tcp_init_buffer_space(sk);
                }
                smp_mb();
                tcp_set_state(sk, TCP_ESTABLISHED);