summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_recovery.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_recovery.c')
-rw-r--r--net/ipv4/tcp_recovery.c126
1 files changed, 76 insertions, 50 deletions
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index be8ef1e5dfef..3a81720ac0c4 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -2,8 +2,6 @@
#include <linux/tcp.h>
#include <net/tcp.h>
-int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
-
static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -46,7 +44,8 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
+ u32 min_rtt = tcp_min_rtt(tp);
+ struct sk_buff *skb, *n;
u32 reo_wnd;
*reo_timeout = 0;
@@ -56,48 +55,37 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
* to queuing or delayed ACKs.
*/
reo_wnd = 1000;
- if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
- reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+ if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
+ min_rtt != ~0U) {
+ reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
+ reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
+ }
- tcp_for_write_queue(skb, sk) {
+ list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
+ tcp_tsorted_anchor) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ s32 remaining;
- if (skb == tcp_send_head(sk))
- break;
-
- /* Skip ones already (s)acked */
- if (!after(scb->end_seq, tp->snd_una) ||
- scb->sacked & TCPCB_SACKED_ACKED)
+ /* Skip ones marked lost but not yet retransmitted */
+ if ((scb->sacked & TCPCB_LOST) &&
+ !(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
- tp->rack.end_seq, scb->end_seq)) {
- /* Step 3 in draft-cheng-tcpm-rack-00.txt:
- * A packet is lost if its elapsed time is beyond
- * the recent RTT plus the reordering window.
- */
- u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
- skb->skb_mstamp);
- s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
-
- if (remaining < 0) {
- tcp_rack_mark_skb_lost(sk, skb);
- continue;
- }
-
- /* Skip ones marked lost but not yet retransmitted */
- if ((scb->sacked & TCPCB_LOST) &&
- !(scb->sacked & TCPCB_SACKED_RETRANS))
- continue;
-
- /* Record maximum wait time (+1 to avoid 0) */
- *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
-
- } else if (!(scb->sacked & TCPCB_RETRANS)) {
- /* Original data are sent sequentially so stop early
- * b/c the rest are all sent after rack_sent
- */
+ if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+ tp->rack.end_seq, scb->end_seq))
break;
+
+ /* A packet is lost if it has not been s/acked beyond
+ * the recent RTT plus the reordering window.
+ */
+ remaining = tp->rack.rtt_us + reo_wnd -
+ tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+ if (remaining <= 0) {
+ tcp_rack_mark_skb_lost(sk, skb);
+ list_del_init(&skb->tcp_tsorted_anchor);
+ } else {
+ /* Record maximum wait time */
+ *reo_timeout = max_t(u32, *reo_timeout, remaining);
}
}
}
@@ -129,13 +117,8 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
{
u32 rtt_us;
- if (tp->rack.mstamp &&
- !tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
- end_seq, tp->rack.end_seq))
- return;
-
rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
- if (sacked & TCPCB_RETRANS) {
+ if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior
* retransmission) was sacked.
@@ -146,13 +129,15 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
* so it's at least one RTT (i.e., retransmission is at least
* an RTT later).
*/
- if (rtt_us < tcp_min_rtt(tp))
- return;
+ return;
}
- tp->rack.rtt_us = rtt_us;
- tp->rack.mstamp = xmit_time;
- tp->rack.end_seq = end_seq;
tp->rack.advanced = 1;
+ tp->rack.rtt_us = rtt_us;
+ if (tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
+ end_seq, tp->rack.end_seq)) {
+ tp->rack.mstamp = xmit_time;
+ tp->rack.end_seq = end_seq;
+ }
}
/* We have waited long enough to accommodate reordering. Mark the expired
@@ -176,3 +161,44 @@ void tcp_rack_reo_timeout(struct sock *sk)
if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
tcp_rearm_rto(sk);
}
+
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is possibility that spurious retransmission was
+ * due to reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * no. of successful recoveries (accounts for full DSACK-based loss
+ * recovery undo). After that, reset it to default (min_rtt/4).
+ *
+ * At max, reo_wnd is incremented only once per rtt. So that the new
+ * DSACK on which we are reacting, is due to the spurious retx (approx)
+ * after the reo_wnd has been updated last time.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND ||
+ !rs->prior_delivered)
+ return;
+
+ /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+ if (before(rs->prior_delivered, tp->rack.last_delivered))
+ tp->rack.dsack_seen = 0;
+
+ /* Adjust the reo_wnd if update is pending */
+ if (tp->rack.dsack_seen) {
+ tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+ tp->rack.reo_wnd_steps + 1);
+ tp->rack.dsack_seen = 0;
+ tp->rack.last_delivered = tp->delivered;
+ tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+ } else if (!tp->rack.reo_wnd_persist) {
+ tp->rack.reo_wnd_steps = 1;
+ }
+}