summary refs log tree commit diff stats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- net/ipv4/tcp_input.c 256
1 files changed, 249 insertions, 7 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3c8e297e2c39..97d57676b8ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
* aligned portion of it that matches. Therefore we might need to fragment
* which may fail and creates some hassle (caller must handle error case
* returns).
+ *
+ * FIXME: this could be merged into the shift decision code
*/
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
if (fack_count > tp->fackets_out)
tp->fackets_out = fack_count;
-
- if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
- tcp_advance_highest_sack(sk, skb);
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
return flag;
}
+/* Bookkeeping after payload has been shifted from the head of @skb to the
+ * tail of @prev (the callers invoke this right after a successful
+ * skb_shift(prev, skb, shifted)).
+ *
+ * @pcount:     number of segments that moved from @skb to @prev
+ * @shifted:    number of bytes that moved
+ * @fack_count: passed through to tcp_sacktag_one()
+ * @reord:      passed through to tcp_sacktag_one()
+ * @flag:       result bits from tcp_sacktag_one() are OR-ed into *@flag
+ * @mss:        gso_size to give @prev if it does not have one yet
+ *
+ * Returns 0 if @skb still holds data afterwards. Returns 1 if @skb became
+ * empty; it has then been unlinked from the write queue and freed, so the
+ * caller must not touch it any more.
+ */
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+			   struct sk_buff *skb, unsigned int pcount,
+			   int shifted, int fack_count, int *reord,
+			   int *flag, int mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u8 dummy_sacked = TCP_SKB_CB(skb)->sacked;	/* We discard results */
+
+	BUG_ON(!pcount);
+
+	/* Move the boundary between the two skbs in sequence space */
+	TCP_SKB_CB(prev)->end_seq += shifted;
+	TCP_SKB_CB(skb)->seq += shifted;
+
+	/* ...and move the segment accounting along with it */
+	skb_shinfo(prev)->gso_segs += pcount;
+	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+	skb_shinfo(skb)->gso_segs -= pcount;
+
+	/* When we're adding to gso_segs == 1, gso_size will be zero,
+	 * in theory this shouldn't be necessary but as long as DSACK
+	 * code can come after this skb later on it's better to keep
+	 * setting gso_size to something.
+	 */
+	if (!skb_shinfo(prev)->gso_size) {
+		skb_shinfo(prev)->gso_size = mss;
+		skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+	}
+
+	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
+	if (skb_shinfo(skb)->gso_segs <= 1) {
+		skb_shinfo(skb)->gso_size = 0;
+		skb_shinfo(skb)->gso_type = 0;
+	}
+
+	/* Tag the shifted segments; the updated sacked bits land in the
+	 * local copy only, skb's own ->sacked is left untouched.
+	 */
+	*flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
+				 pcount);
+
+	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
+	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+	tcp_clear_all_retrans_hints(tp);
+
+	if (skb->len > 0) {
+		BUG_ON(!tcp_skb_pcount(skb));
+		return 0;
+	}
+
+	/* Whole SKB was eaten :-) */
+
+	/* skb is freed just below, so its flags have to be carried over to
+	 * prev, which now covers skb's sequence range. OR-ing them into skb
+	 * instead would only modify the buffer that is being discarded and
+	 * lose them.
+	 */
+	TCP_SKB_CB(prev)->flags |= TCP_SKB_CB(skb)->flags;
+	if (skb == tcp_highest_sack(sk))
+		tcp_advance_highest_sack(sk, skb);
+
+	tcp_unlink_write_queue(skb, sk);
+	sk_wmem_free_skb(sk, skb);
+
+	return 1;
+}
+
+/* gso_size is initialized to something-or-zero, which complicates things:
+ * when tcp_skb_mss() reports zero, use the skb's own length as the
+ * segment size instead.
+ */
+static int tcp_shift_mss(struct sk_buff *skb)
+{
+	int seg_size = tcp_skb_mss(skb);
+
+	if (seg_size)
+		return seg_size;
+
+	return skb->len;
+}
+
+/* Shifting pages past head area doesn't work, so an skb qualifies only
+ * when skb_headlen() is zero and skb_is_nonlinear() holds.
+ * Returns 1 if the skb qualifies, 0 otherwise.
+ */
+static int skb_can_shift(struct sk_buff *skb)
+{
+	if (skb_headlen(skb))
+		return 0;
+
+	return skb_is_nonlinear(skb) ? 1 : 0;
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ *
+ * Tries to shift the part of skb covered by the SACK block
+ * start_seq..end_seq into the skb preceding it on the write queue, which
+ * is only attempted when that previous skb's tag bits equal
+ * TCPCB_SACKED_ACKED. If skb got emptied by that, the skb following prev
+ * is merged into prev as well when it is fully SACKed too.
+ *
+ * The return value tells the caller which of three things happened:
+ *  - NULL (fallback): shifting was not possible or not attempted, the
+ *    caller has to handle skb without shifting;
+ *  - skb itself (noop): nothing was shifted and nothing needs to be done
+ *    for this skb by this function;
+ *  - the previous skb: data was shifted into it, *fack_count has been
+ *    advanced by the number of shifted segments, and skb (and possibly
+ *    the skb after it) may have been unlinked and freed.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+ u32 start_seq, u32 end_seq,
+ int dup_sack, int *fack_count,
+ int *reord, int *flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *prev;
+ int mss;
+ int pcount = 0;
+ int len;
+ int in_sack;
+
+ if (!sk_can_gso(sk))
+ goto fallback;
+
+ /* Normally R but no L won't result in plain S, so an skb whose tag
+ * bits are exactly SACKED_RETRANS is left to the fallback path unless
+ * this is a D-SACK.
+ */
+ if (!dup_sack &&
+ (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+ goto fallback;
+ if (!skb_can_shift(skb))
+ goto fallback;
+ /* This frame is about to be dropped (was ACKed). */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ goto fallback;
+
+ /* Can only happen with delayed DSACK + discard craziness */
+ if (unlikely(skb == tcp_write_queue_head(sk)))
+ goto fallback;
+ prev = tcp_write_queue_prev(sk, skb);
+
+ if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+ goto fallback;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (in_sack) {
+ len = skb->len;
+ pcount = tcp_skb_pcount(skb);
+ mss = tcp_shift_mss(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_shift_mss(prev))
+ goto fallback;
+ } else {
+ if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+ goto noop;
+ /* CHECKME: This is non-MSS split case only?, this will
+ * cause skipped skbs due to advancing loop btw, original
+ * has that feature too
+ */
+ if (tcp_skb_pcount(skb) <= 1)
+ goto noop;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+ if (!in_sack) {
+ /* TODO: head merge to next could be attempted here
+ * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+ * though it might not be worth of the additional hassle
+ *
+ * ...we can probably just fallback to what was done
+ * previously. We could try merging non-SACKed ones
+ * as well but it probably isn't going to buy off
+ * because later SACKs might again split them, and
+ * it would make skb timestamp tracking considerably
+ * harder problem.
+ */
+ goto fallback;
+ }
+
+ len = end_seq - TCP_SKB_CB(skb)->seq;
+ BUG_ON(len < 0);
+ BUG_ON(len > skb->len);
+
+ /* MSS boundaries should be honoured or else pcount will
+ * severely break even though it makes things bit trickier.
+ * Optimize common case to avoid most of the divides
+ *
+ * Below, len is rounded down to a whole number of mss-sized
+ * segments; less than one full segment means nothing is
+ * shifted (noop).
+ */
+ mss = tcp_skb_mss(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_shift_mss(prev))
+ goto fallback;
+
+ if (len == mss) {
+ pcount = 1;
+ } else if (len < mss) {
+ goto noop;
+ } else {
+ pcount = len / mss;
+ len = pcount * mss;
+ }
+ }
+
+ if (!skb_shift(prev, skb, len))
+ goto fallback;
+ if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+ flag, mss))
+ goto out;
+
+ /* Hole filled allows collapsing with the next as well, this is very
+ * useful when hole on every nth skb pattern happens
+ *
+ * Getting here means tcp_shifted_skb() returned nonzero, i.e. skb was
+ * unlinked and freed, so the next candidate is looked up via prev.
+ */
+ if (prev == tcp_write_queue_tail(sk))
+ goto out;
+ skb = tcp_write_queue_next(sk, prev);
+
+ if (!skb_can_shift(skb))
+ goto out;
+ if (skb == tcp_send_head(sk))
+ goto out;
+ if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+ goto out;
+
+ len = skb->len;
+ if (skb_shift(prev, skb, len)) {
+ pcount += tcp_skb_pcount(skb);
+ tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+ *fack_count, reord, flag, mss);
+ }
+
+out:
+ *fack_count += pcount;
+ return prev;
+
+noop:
+ return skb;
+
+fallback:
+ return NULL;
+}
+
static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
struct tcp_sack_block *next_dup,
u32 start_seq, u32 end_seq,
int dup_sack_in, int *fack_count,
int *reord, int *flag)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *tmp;
+
tcp_for_write_queue_from(skb, sk) {
int in_sack = 0;
int dup_sack = dup_sack_in;
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
dup_sack = 1;
}
- if (in_sack <= 0)
- in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
- end_seq);
+ /* skb reference here is a bit tricky to get right, since
+ * shifting can eat and free both this skb and the next,
+ * so not even _safe variant of the loop is enough.
+ */
+ if (in_sack <= 0) {
+ tmp = tcp_shift_skb_data(sk, skb, start_seq,
+ end_seq, dup_sack,
+ fack_count, reord, flag);
+ if (tmp != NULL) {
+ if (tmp != skb) {
+ skb = tmp;
+ continue;
+ }
+
+ in_sack = 0;
+ } else {
+ in_sack = tcp_match_skb_to_sack(sk, skb,
+ start_seq,
+ end_seq);
+ }
+ }
+
if (unlikely(in_sack < 0))
break;
- if (in_sack)
+ if (in_sack) {
*flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
*fack_count,
&(TCP_SKB_CB(skb)->sacked),
tcp_skb_pcount(skb));
+ if (!before(TCP_SKB_CB(skb)->seq,
+ tcp_highest_sack_seq(tp)))
+ tcp_advance_highest_sack(sk, skb);
+ }
+
*fack_count += tcp_skb_pcount(skb);
}
return skb;