X-Git-Url: http://git.cascardo.info/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp_input.c;h=722c9cbb91e3e913c95cbf6a1e2b9a7677ab5b4f;hb=3ab224be6d69de912ee21302745ea45a99274dbc;hp=b9e429d2d1de5f035137a55174324169b64a9e70;hpb=2a1292b36ba106b9b7f030d3fa130f5f634fd8f0;p=cascardo%2Flinux.git diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b9e429d2d1de..722c9cbb91e3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -289,8 +289,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; + int truesize = tcp_win_from_space(skb->truesize) >> 1; + int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -591,7 +591,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); } } icsk->icsk_ack.lrcvtime = now; @@ -863,6 +863,9 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) */ static void tcp_disable_fack(struct tcp_sock *tp) { + /* RFC3517 uses different metric in lost marker => reset on change */ + if (tcp_is_fack(tp)) + tp->lost_skb_hint = NULL; tp->rx_opt.sack_ok &= ~2; } @@ -923,7 +926,7 @@ static void tcp_init_metrics(struct sock *sk) } if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { tp->mdev = dst_metric(dst, RTAX_RTTVAR); - tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); tcp_bound_rto(sk); @@ -1112,16 +1115,22 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, * * Search retransmitted skbs from write_queue that were sent when snd_nxt was * less than what is now known to be received by the other end (derived from - * SACK blocks by the caller). Also calculate the lowest snd_nxt among the - * remaining retransmitted skbs to avoid some costly processing per ACKs. + * highest SACK block). Also calculate the lowest snd_nxt among the remaining + * retransmitted skbs to avoid some costly processing per ACKs. */ -static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) +static void tcp_mark_lost_retrans(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int flag = 0; int cnt = 0; u32 new_low_seq = tp->snd_nxt; + u32 received_upto = tcp_highest_sack_seq(tp); + + if (!tcp_is_fack(tp) || !tp->retrans_out || + !after(received_upto, tp->lost_retrans_low) || + icsk->icsk_ca_state != TCP_CA_Recovery) + return; tcp_for_write_queue(skb, sk) { u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; @@ -1149,9 +1158,8 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - flag |= FLAG_DATA_SACKED; - NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } + NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) new_low_seq = ack_seq; @@ -1161,8 +1169,6 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) if (tp->retrans_out) tp->lost_retrans_low = new_low_seq; - - return flag; } static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, @@ -1230,6 +1236,175 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } +static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, + int *reord, int dup_sack, int fack_count) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 sacked = TCP_SKB_CB(skb)->sacked; + int flag = 0; + + /* Account D-SACK for retransmitted packet. */ + if (dup_sack && (sacked & TCPCB_RETRANS)) { + if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + tp->undo_retrans--; + if (sacked & TCPCB_SACKED_ACKED) + *reord = min(fack_count, *reord); + } + + /* Nothing to do; acked frame is about to be dropped (was ACKed). */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + return flag; + + if (!(sacked & TCPCB_SACKED_ACKED)) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, + * we do not clear RETRANS, believing + * that retransmission is still in flight. + */ + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= + ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out -= tcp_skb_pcount(skb); + tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + } + } else { + if (!(sacked & TCPCB_RETRANS)) { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. + */ + if (before(TCP_SKB_CB(skb)->seq, + tcp_highest_sack_seq(tp))) + *reord = min(fack_count, *reord); + + /* SACK enhanced F-RTO (RFC4138; Appendix B) */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + flag |= FLAG_ONLY_ORIG_SACKED; + } + + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + } + } + + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + flag |= FLAG_DATA_SACKED; + tp->sacked_out += tcp_skb_pcount(skb); + + fack_count += tcp_skb_pcount(skb); + + /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ + if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->lost_skb_hint)->seq)) + tp->lost_cnt_hint += tcp_skb_pcount(skb); + + if (fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + tcp_advance_highest_sack(sk, skb); + } + + /* D-SACK. We can detect redundant retransmission in S|R and plain R + * frames and clear it. undo_retrans is decreased above, L|R frames + * are accounted above as well. + */ + if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; + } + + return flag; +} + +static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, + struct tcp_sack_block *next_dup, + u32 start_seq, u32 end_seq, + int dup_sack_in, int *fack_count, + int *reord, int *flag) +{ + tcp_for_write_queue_from(skb, sk) { + int in_sack = 0; + int dup_sack = dup_sack_in; + + if (skb == tcp_send_head(sk)) + break; + + /* queue is in-order => we can short-circuit the walk early */ + if (!before(TCP_SKB_CB(skb)->seq, end_seq)) + break; + + if ((next_dup != NULL) && + before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { + in_sack = tcp_match_skb_to_sack(sk, skb, + next_dup->start_seq, + next_dup->end_seq); + if (in_sack > 0) + dup_sack = 1; + } + + if (in_sack <= 0) + in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); + if (unlikely(in_sack < 0)) + break; + + if (in_sack) + *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, *fack_count); + + *fack_count += tcp_skb_pcount(skb); + } + return skb; +} + +/* Avoid all extra work that is being done by sacktag while walking in + * a normal way + */ +static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, + u32 skip_to_seq) +{ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + + if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) + break; + } + return skb; +} + +static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, + struct sock *sk, + struct tcp_sack_block *next_dup, + u32 skip_to_seq, + int *fack_count, int *reord, + int *flag) +{ + if (next_dup == NULL) + return skb; + + if (before(next_dup->start_seq, skip_to_seq)) { + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); + tcp_sacktag_walk(skb, sk, NULL, + next_dup->start_seq, next_dup->end_seq, + 1, fack_count, reord, flag); + } + + return skb; +} + +static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) +{ + return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); +} + static int tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) { @@ -1237,27 +1412,26 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ struct tcp_sock *tp = tcp_sk(sk); unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); - struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); - struct sk_buff *cached_skb; + struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); + struct tcp_sack_block sp[4]; + struct tcp_sack_block *cache; + struct sk_buff *skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; + int used_sacks; int reord = tp->packets_out; - int prior_fackets; - u32 highest_sack_end_seq = tp->lost_retrans_low; int flag = 0; int found_dup_sack = 0; - int cached_fack_count; - int i; + int fack_count; + int i, j; int first_sack_index; - int force_one_sack; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) tp->fackets_out = 0; - tp->highest_sack = tp->snd_una; + tcp_highest_sack_reset(sk); } - prior_fackets = tp->fackets_out; - found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, + found_dup_sack = tcp_check_dsack(tp, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) flag |= FLAG_DSACKING_ACK; @@ -1272,78 +1446,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (!tp->packets_out) goto out; - /* SACK fastpath: - * if the only SACK change is the increase of the end_seq of - * the first block then only apply that SACK block - * and use retrans queue hinting otherwise slowpath */ - force_one_sack = 1; - for (i = 0; i < num_sacks; i++) { - __be32 start_seq = sp[i].start_seq; - __be32 end_seq = sp[i].end_seq; - - if (i == 0) { - if (tp->recv_sack_cache[i].start_seq != start_seq) - force_one_sack = 0; - } else { - if ((tp->recv_sack_cache[i].start_seq != start_seq) || - (tp->recv_sack_cache[i].end_seq != end_seq)) - force_one_sack = 0; - } - tp->recv_sack_cache[i].start_seq = start_seq; - tp->recv_sack_cache[i].end_seq = end_seq; - } - /* Clear the rest of the cache sack blocks so they won't match mistakenly. */ - for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) { - tp->recv_sack_cache[i].start_seq = 0; - tp->recv_sack_cache[i].end_seq = 0; - } - + used_sacks = 0; first_sack_index = 0; - if (force_one_sack) - num_sacks = 1; - else { - int j; - tp->fastpath_skb_hint = NULL; - - /* order SACK blocks to allow in order walk of the retrans queue */ - for (i = num_sacks-1; i > 0; i--) { - for (j = 0; j < i; j++){ - if (after(ntohl(sp[j].start_seq), - ntohl(sp[j+1].start_seq))){ - struct tcp_sack_block_wire tmp; - - tmp = sp[j]; - sp[j] = sp[j+1]; - sp[j+1] = tmp; - - /* Track where the first SACK block goes to */ - if (j == first_sack_index) - first_sack_index = j+1; - } - - } - } - } - - /* Use SACK fastpath hint if valid */ - cached_skb = tp->fastpath_skb_hint; - cached_fack_count = tp->fastpath_cnt_hint; - if (!cached_skb) { - cached_skb = tcp_write_queue_head(sk); - cached_fack_count = 0; - } - for (i = 0; i < num_sacks; i++) { - struct sk_buff *skb; - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count; - int dup_sack = (found_dup_sack && (i == first_sack_index)); - int next_dup = (found_dup_sack && (i+1 == first_sack_index)); + int dup_sack = !i && found_dup_sack; - sp++; + sp[used_sacks].start_seq = ntohl(get_unaligned(&sp_wire[i].start_seq)); + sp[used_sacks].end_seq = ntohl(get_unaligned(&sp_wire[i].end_seq)); - if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (!tcp_is_sackblock_valid(tp, dup_sack, + sp[used_sacks].start_seq, + sp[used_sacks].end_seq)) { if (dup_sack) { if (!tp->undo_marker) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); @@ -1352,169 +1465,144 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && - !after(end_seq, tp->snd_una)) + !after(sp[used_sacks].end_seq, tp->snd_una)) continue; NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); } + if (i == 0) + first_sack_index = -1; continue; } - skb = cached_skb; - fack_count = cached_fack_count; - - /* Event "B" in the comment above. */ - if (after(end_seq, tp->high_seq)) - flag |= FLAG_DATA_LOST; - - tcp_for_write_queue_from(skb, sk) { - int in_sack = 0; - u8 sacked; - - if (skb == tcp_send_head(sk)) - break; - - cached_skb = skb; - cached_fack_count = fack_count; - if (i == first_sack_index) { - tp->fastpath_skb_hint = skb; - tp->fastpath_cnt_hint = fack_count; - } + /* Ignore very old stuff early */ + if (!after(sp[used_sacks].end_seq, prior_snd_una)) + continue; - /* The retransmission queue is always in order, so - * we can short-circuit the walk early. - */ - if (!before(TCP_SKB_CB(skb)->seq, end_seq)) - break; + used_sacks++; + } - dup_sack = (found_dup_sack && (i == first_sack_index)); + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = used_sacks - 1; i > 0; i--) { + for (j = 0; j < i; j++){ + if (after(sp[j].start_seq, sp[j+1].start_seq)) { + struct tcp_sack_block tmp; - /* Due to sorting DSACK may reside within this SACK block! */ - if (next_dup) { - u32 dup_start = ntohl(sp->start_seq); - u32 dup_end = ntohl(sp->end_seq); + tmp = sp[j]; + sp[j] = sp[j+1]; + sp[j+1] = tmp; - if (before(TCP_SKB_CB(skb)->seq, dup_end)) { - in_sack = tcp_match_skb_to_sack(sk, skb, dup_start, dup_end); - if (in_sack > 0) - dup_sack = 1; - } + /* Track where the first SACK block goes to */ + if (j == first_sack_index) + first_sack_index = j+1; } + } + } - /* DSACK info lost if out-of-mem, try SACK still */ - if (in_sack <= 0) - in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); - if (unlikely(in_sack < 0)) - break; + skb = tcp_write_queue_head(sk); + fack_count = 0; + i = 0; - sacked = TCP_SKB_CB(skb)->sacked; + if (!tp->sacked_out) { + /* It's already past, so skip checking against it */ + cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); + } else { + cache = tp->recv_sack_cache; + /* Skip empty blocks in at head of the cache */ + while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && + !cache->end_seq) + cache++; + } - /* Account D-SACK for retransmitted packet. */ - if ((dup_sack && in_sack) && - (sacked & TCPCB_RETRANS) && - after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) - tp->undo_retrans--; + while (i < used_sacks) { + u32 start_seq = sp[i].start_seq; + u32 end_seq = sp[i].end_seq; + int dup_sack = (found_dup_sack && (i == first_sack_index)); + struct tcp_sack_block *next_dup = NULL; - /* The frame is ACKed. */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) { - if (sacked&TCPCB_RETRANS) { - if ((dup_sack && in_sack) && - (sacked&TCPCB_SACKED_ACKED)) - reord = min(fack_count, reord); - } + if (found_dup_sack && ((i + 1) == first_sack_index)) + next_dup = &sp[i + 1]; - /* Nothing to do; acked frame is about to be dropped. */ - fack_count += tcp_skb_pcount(skb); - continue; - } + /* Event "B" in the comment above. */ + if (after(end_seq, tp->high_seq)) + flag |= FLAG_DATA_LOST; - if (!in_sack) { - fack_count += tcp_skb_pcount(skb); - continue; + /* Skip too early cached blocks */ + while (tcp_sack_cache_ok(tp, cache) && + !before(start_seq, cache->end_seq)) + cache++; + + /* Can skip some work by looking recv_sack_cache? */ + if (tcp_sack_cache_ok(tp, cache) && !dup_sack && + after(end_seq, cache->start_seq)) { + + /* Head todo? */ + if (before(start_seq, cache->start_seq)) { + skb = tcp_sacktag_skip(skb, sk, start_seq); + skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, + cache->start_seq, dup_sack, + &fack_count, &reord, &flag); } - if (!(sacked&TCPCB_SACKED_ACKED)) { - if (sacked & TCPCB_SACKED_RETRANS) { - /* If the segment is not tagged as lost, - * we do not clear RETRANS, believing - * that retransmission is still in flight. - */ - if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); - tp->lost_out -= tcp_skb_pcount(skb); - tp->retrans_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - } - } else { - if (!(sacked & TCPCB_RETRANS)) { - /* New sack for not retransmitted frame, - * which was in hole. It is reordering. - */ - if (fack_count < prior_fackets) - reord = min(fack_count, reord); - - /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) - flag |= FLAG_ONLY_ORIG_SACKED; - } - - if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - tp->lost_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - } - } - - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; - flag |= FLAG_DATA_SACKED; - tp->sacked_out += tcp_skb_pcount(skb); - - fack_count += tcp_skb_pcount(skb); - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; + /* Rest of the block already fully processed? */ + if (!after(end_seq, cache->end_seq)) + goto advance_sp; - if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) { - tp->highest_sack = TCP_SKB_CB(skb)->seq; - highest_sack_end_seq = TCP_SKB_CB(skb)->end_seq; - } - } else { - if (dup_sack && (sacked&TCPCB_RETRANS)) - reord = min(fack_count, reord); + skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq, + &fack_count, &reord, &flag); - fack_count += tcp_skb_pcount(skb); + /* ...tail remains todo... */ + if (tcp_highest_sack_seq(tp) == cache->end_seq) { + /* ...but better entrypoint exists! */ + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + fack_count = tp->fackets_out; + cache++; + goto walk; } - /* D-SACK. We can detect redundant retransmission - * in S|R and plain R frames and clear it. - * undo_retrans is decreased above, L|R frames - * are accounted above as well. - */ - if (dup_sack && - (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - tp->retransmit_skb_hint = NULL; - } + skb = tcp_sacktag_skip(skb, sk, cache->end_seq); + /* Check overlap against next cached too (past this one already) */ + cache++; + continue; } + if (!before(start_seq, tcp_highest_sack_seq(tp))) { + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + fack_count = tp->fackets_out; + } + skb = tcp_sacktag_skip(skb, sk, start_seq); + +walk: + skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq, + dup_sack, &fack_count, &reord, &flag); + +advance_sp: /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct * due to in-order walk */ if (after(end_seq, tp->frto_highmark)) flag &= ~FLAG_ONLY_ORIG_SACKED; + + i++; } - if (tp->retrans_out && - after(highest_sack_end_seq, tp->lost_retrans_low) && - icsk->icsk_ca_state == TCP_CA_Recovery) - flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq); + /* Clear the head of the cache sack blocks so we can skip it next time */ + for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { + tp->recv_sack_cache[i].start_seq = 0; + tp->recv_sack_cache[i].end_seq = 0; + } + for (j = 0; j < used_sacks; j++) + tp->recv_sack_cache[i++] = sp[j]; + + tcp_mark_lost_retrans(sk); tcp_verify_left_out(tp); - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && + if ((reord < tp->fackets_out) && + ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, tp->fackets_out - reord, 0); @@ -1860,6 +1948,26 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out; } +/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs + * counter when SACK is enabled (without SACK, sacked_out is used for + * that purpose). + * + * Instead, with FACK TCP uses fackets_out that includes both SACKed + * segments up to the highest received SACK block so far and holes in + * between them. + * + * With reordering, holes may still be in flight, so RFC3517 recovery + * uses pure sacked_out (total number of SACKed segments) even though + * it violates the RFC that uses duplicate ACKs, often these are equal + * but when e.g. out-of-window ACKs or packet duplication occurs, + * they differ. Since neither occurs due to loss, TCP should really + * ignore them. + */ +static inline int tcp_dupack_heurestics(struct tcp_sock *tp) +{ + return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; +} + static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); @@ -1980,13 +2088,13 @@ static int tcp_time_to_recover(struct sock *sk) return 1; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_fackets_out(tp) > tp->reordering) + if (tcp_dupack_heurestics(tp) > tp->reordering) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ - if (tcp_head_timedout(sk)) + if (tcp_is_fack(tp) && tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay @@ -2019,8 +2127,10 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, tp->retransmit_skb_hint = NULL; } -/* Mark head of queue up as lost. */ -static void tcp_mark_head_lost(struct sock *sk, int packets) +/* Mark head of queue up as lost. With RFC3517 SACK, the packets is + * is against sacked "cnt", otherwise it's against facked "cnt" + */ +static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -2042,8 +2152,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt; - cnt += tcp_skb_pcount(skb); - if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) + + if (tcp_is_fack(tp) || + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + cnt += tcp_skb_pcount(skb); + + if (((!fast_rexmit || (tp->lost_out > 0)) && (cnt > packets)) || + after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -2056,17 +2171,22 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) /* Account newly detected lost packet(s) */ -static void tcp_update_scoreboard(struct sock *sk) +static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_fack(tp)) { + if (tcp_is_reno(tp)) { + tcp_mark_head_lost(sk, 1, fast_rexmit); + } else if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, lost); + tcp_mark_head_lost(sk, lost, fast_rexmit); } else { - tcp_mark_head_lost(sk, 1); + int sacked_upto = tp->sacked_out - tp->reordering; + if (sacked_upto < 0) + sacked_upto = 0; + tcp_mark_head_lost(sk, sacked_upto, fast_rexmit); } /* New heuristics: it is possible only after we switched @@ -2074,7 +2194,7 @@ static void tcp_update_scoreboard(struct sock *sk) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) { + if (tcp_is_fack(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -2247,7 +2367,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) { struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering; + int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed @@ -2381,7 +2501,8 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) struct tcp_sock *tp = tcp_sk(sk); int is_dupack = !(flag&(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP)); int do_lost = is_dupack || ((flag&FLAG_DATA_SACKED) && - (tp->fackets_out > tp->reordering)); + (tcp_fackets_out(tp) > tp->reordering)); + int fast_rexmit = 0; /* Some technical things: * 1. Reno does not count dupacks (sacked_out) automatically. */ @@ -2401,11 +2522,11 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) return; /* C. Process data loss notification, provided it is valid. */ - if ((flag&FLAG_DATA_LOST) && + if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); + tcp_mark_head_lost(sk, tp->fackets_out-tp->reordering, 0); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } @@ -2524,10 +2645,11 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); + fast_rexmit = 1; } - if (do_lost || tcp_head_timedout(sk)) - tcp_update_scoreboard(sk); + if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) + tcp_update_scoreboard(sk, fast_rexmit); tcp_cwnd_down(sk, flag); tcp_xmit_retransmit_queue(sk); } @@ -2591,11 +2713,10 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int good) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight, good); + icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -2638,8 +2759,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, - int prior_fackets) +static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2647,31 +2767,32 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, u32 now = tcp_time_stamp; int fully_acked = 1; int flag = 0; - int prior_packets = tp->packets_out; - u32 cnt = 0; + u32 pkts_acked = 0; u32 reord = tp->packets_out; s32 seq_rtt = -1; + s32 ca_seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u32 end_seq; - u32 packets_acked; + u32 acked_pcount; u8 sacked = scb->sacked; + /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || !after(tp->snd_una, scb->seq)) break; - packets_acked = tcp_tso_acked(sk, skb); - if (!packets_acked) + acked_pcount = tcp_tso_acked(sk, skb); + if (!acked_pcount) break; fully_acked = 0; end_seq = tp->snd_una; } else { - packets_acked = tcp_skb_pcount(skb); + acked_pcount = tcp_skb_pcount(skb); end_seq = scb->end_seq; } @@ -2681,43 +2802,36 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, tcp_mtup_probe_success(sk, skb); } - if (sacked) { - if (sacked & TCPCB_RETRANS) { - if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= packets_acked; - flag |= FLAG_RETRANS_DATA_ACKED; - seq_rtt = -1; - if ((flag & FLAG_DATA_ACKED) || - (packets_acked > 1)) - flag |= FLAG_NONHEAD_RETRANS_ACKED; - } else { - if (seq_rtt < 0) { - seq_rtt = now - scb->when; - if (fully_acked) - last_ackt = skb->tstamp; - } - if (!(sacked & TCPCB_SACKED_ACKED)) - reord = min(cnt, reord); - } - - if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= packets_acked; - if (sacked & TCPCB_LOST) - tp->lost_out -= packets_acked; - - if ((sacked & TCPCB_URG) && tp->urg_mode && - !before(end_seq, tp->snd_up)) - tp->urg_mode = 0; + if (sacked & TCPCB_RETRANS) { + if (sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= acked_pcount; + flag |= FLAG_RETRANS_DATA_ACKED; + ca_seq_rtt = -1; + seq_rtt = -1; + if ((flag & FLAG_DATA_ACKED) || + (acked_pcount > 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; } else { + ca_seq_rtt = now - scb->when; + last_ackt = skb->tstamp; if (seq_rtt < 0) { - seq_rtt = now - scb->when; - if (fully_acked) - last_ackt = skb->tstamp; + seq_rtt = ca_seq_rtt; } - reord = min(cnt, reord); + if (!(sacked & TCPCB_SACKED_ACKED)) + reord = min(pkts_acked, reord); } - tp->packets_out -= packets_acked; - cnt += packets_acked; + + if (sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= acked_pcount; + if (sacked & TCPCB_LOST) + tp->lost_out -= acked_pcount; + + if (unlikely((sacked & TCPCB_URG) && tp->urg_mode && + !before(end_seq, tp->snd_up))) + tp->urg_mode = 0; + + tp->packets_out -= acked_pcount; + pkts_acked += acked_pcount; /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not @@ -2737,12 +2851,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, break; tcp_unlink_write_queue(skb, sk); - sk_stream_free_skb(sk, skb); + sk_wmem_free_skb(sk, skb); tcp_clear_all_retrans_hints(tp); } if (flag & FLAG_ACKED) { - u32 pkts_acked = prior_packets - tp->packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; @@ -2758,9 +2871,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, } tp->fackets_out -= min(pkts_acked, tp->fackets_out); - /* hint's skb might be NULL but we don't need to care */ - tp->fastpath_cnt_hint -= min_t(u32, pkts_acked, - tp->fastpath_cnt_hint); + if (ca_ops->pkts_acked) { s32 rtt_us = -1; @@ -2772,8 +2883,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, net_invalid_timestamp())) rtt_us = ktime_us_delta(ktime_get_real(), last_ackt); - else if (seq_rtt > 0) - rtt_us = jiffies_to_usecs(seq_rtt); + else if (ca_seq_rtt > 0) + rtt_us = jiffies_to_usecs(ca_seq_rtt); } ca_ops->pkts_acked(sk, pkts_acked, rtt_us); @@ -2803,7 +2914,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, } } #endif - *seq_rtt_p = seq_rtt; return flag; } @@ -3040,7 +3150,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; u32 prior_fackets; - s32 seq_rtt; int prior_packets; int frto_cwnd = 0; @@ -3106,7 +3215,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, prior_fackets); + flag |= tcp_clean_rtx_queue(sk, prior_fackets); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); @@ -3118,11 +3227,11 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, prior_in_flight, 0); + tcp_cong_avoid(sk, ack, prior_in_flight); tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) - tcp_cong_avoid(sk, ack, prior_in_flight, 1); + tcp_cong_avoid(sk, ack, prior_in_flight); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -3458,7 +3567,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) __skb_queue_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -3466,9 +3575,9 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) /* Do not send POLL_HUP for half duplex close. */ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, 1, POLL_HUP); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else - sk_wake_async(sk, 1, POLL_IN); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } } @@ -3741,12 +3850,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) queue_and_out: if (eaten < 0 && (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_stream_rmem_schedule(sk, skb))) { + !sk_rmem_schedule(sk, skb->truesize))) { if (tcp_prune_queue(sk) < 0 || - !sk_stream_rmem_schedule(sk, skb)) + !sk_rmem_schedule(sk, skb->truesize)) goto drop; } - sk_stream_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; @@ -3815,9 +3924,9 @@ drop: TCP_ECN_check_ce(tp, skb); if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_stream_rmem_schedule(sk, skb)) { + !sk_rmem_schedule(sk, skb->truesize)) { if (tcp_prune_queue(sk) < 0 || - !sk_stream_rmem_schedule(sk, skb)) + !sk_rmem_schedule(sk, skb->truesize)) goto drop; } @@ -3828,7 +3937,7 @@ drop: SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - sk_stream_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); if (!skb_peek(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ @@ -3970,7 +4079,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_insert(nskb, skb->prev, skb, list); - sk_stream_set_owner_r(nskb, sk); + skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { @@ -4068,7 +4177,7 @@ static int tcp_prune_queue(struct sock *sk) sk->sk_receive_queue.next, (struct sk_buff*)&sk->sk_receive_queue, tp->copied_seq, tp->rcv_nxt); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@ -4088,7 +4197,7 @@ static int tcp_prune_queue(struct sock *sk) */ if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); } if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) @@ -4590,7 +4699,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Bulk data transfer: receiver */ __skb_pull(skb,tcp_header_len); __skb_queue_tail(&sk->sk_receive_queue, skb); - sk_stream_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; } @@ -4827,7 +4936,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); - sk_wake_async(sk, 0, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); } if (sk->sk_write_pending || @@ -5057,9 +5166,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * are not waked up, because sk->sk_sleep == * NULL and sk->sk_socket == NULL. */ - if (sk->sk_socket) { - sk_wake_async(sk,0,POLL_OUT); - } + if (sk->sk_socket) + sk_wake_async(sk, + SOCK_WAKE_IO, POLL_OUT); tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) <<