author     Michael Brown    2015-03-11 18:53:29 +0100
committer  Michael Brown    2015-03-12 00:14:39 +0100
commit     e0fc8fe781b81083298dcddb08744c4c1ddf41ff (patch)
tree       bc59126d610fed247365b466eb57451bcae0360d /src/net/tcp.c
parent     [legal] Relicense files under GPL2_OR_LATER_OR_UBDL (diff)
[tcp] Implement support for TCP Selective Acknowledgements (SACK)
The TCP Selective Acknowledgement option (specified in RFC 2018) provides a mechanism for the receiver to indicate packets that have been received out of order (e.g. due to earlier dropped packets).

iPXE often operates in environments in which there is a high probability of packet loss. For example, the legacy USB keyboard emulation in some BIOSes involves polling the USB bus from within a system management interrupt: this introduces an invisible delay of around 500us which is long enough for around 40 full-length packets to be dropped. Similarly, almost all 1Gbps USB2 devices will eventually end up dropping packets because the USB2 bus does not provide enough bandwidth to sustain a 1Gbps stream, and most devices will not provide enough internal buffering to hold a full TCP window's worth of received packets.

Add support for sending TCP Selective Acknowledgements. This provides the sender with more detailed information about which packets have been lost, and so allows for a more efficient retransmission strategy.

We include a SACK-permitted option in our SYN packet, since experimentation shows that at least Linux peers will not include a SACK-permitted option in the SYN-ACK packet if one was not present in the initial SYN. (RFC 2018 does not seem to mandate this behaviour, but it is consistent with the approach taken in RFC 1323.)

We ignore any received SACK options; this is safe to do since SACK is only ever advisory and we never have to send non-trivial amounts of data.

Since our TCP receive queue is a candidate for cache discarding under low memory conditions, we may end up discarding data that has been reported as received via a SACK option. This is permitted by RFC 2018. We follow the stricture that SACK blocks must not report data which is no longer held by the receiver: previously-reported blocks are validated against the current receive queue before being included within the current SACK block list.

Experiments in a qemu VM using forced packet drops (by setting NETDEV_DISCARD_RATE to 32) show that implementing SACK improves throughput by around 400%. Experiments with a USB2 NIC (an SMSC7500) show that implementing SACK improves throughput by around 700%, increasing the download rate from 35Mbps up to 250Mbps (which is approximately the usable bandwidth limit for USB2).

Signed-off-by: Michael Brown <mcb30@ipxe.org>
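For context, RFC 2018 defines the two options used below: SACK-permitted (option kind 4, exchanged only in SYN segments) and SACK itself (option kind 5), whose payload is a list of (left edge, right edge) sequence-number pairs describing contiguous blocks of data held by the receiver. The following is a sketch of the definitions the diff relies on, using the names that appear in it; the field layouts, the NOP padding, and the TCP_SACK_MAX value are assumptions for illustration rather than copies of include/ipxe/tcp.h:

#include <stdint.h>

#define TCP_OPTION_NOP            1  /* Padding byte */
#define TCP_OPTION_SACK_PERMITTED 4  /* RFC 2018 SACK-permitted */
#define TCP_OPTION_SACK           5  /* RFC 2018 SACK */

/* One contiguous block of received data; kept in host-endian order
 * inside struct tcp_connection and converted via htonl() when sent.
 */
struct tcp_sack_block {
	uint32_t left;   /* First sequence number within the block */
	uint32_t right;  /* First sequence number beyond the block */
};

/* Assumed limit: with the 12-byte padded timestamp option and a
 * 4-byte padded SACK header, at most three 8-byte blocks fit within
 * the 40 bytes of available TCP option space.
 */
#define TCP_SACK_MAX 3

/* SACK-permitted option, NOP-padded to a 32-bit boundary */
struct tcp_sack_permitted_padded_option {
	uint8_t nop[2];         /* Two NOP bytes for alignment */
	struct {
		uint8_t kind;   /* TCP_OPTION_SACK_PERMITTED */
		uint8_t length; /* Always 2 */
	} spopt;
};

/* SACK option header, NOP-padded; the blocks follow immediately */
struct tcp_sack_padded_option {
	uint8_t nop[2];         /* Two NOP bytes for alignment */
	struct {
		uint8_t kind;   /* TCP_OPTION_SACK */
		uint8_t length; /* 2 + 8 * number of blocks */
	} sackopt;
};

On the wire, a receiver that has acknowledged everything up to sequence 1500 but also holds an out-of-order segment covering 2000-2499 would send ACK 1500 plus a single SACK block with left edge 2000 and right edge 2500.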
Diffstat (limited to 'src/net/tcp.c')
-rw-r--r--  src/net/tcp.c | 162
1 file changed, 158 insertions(+), 4 deletions(-)
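The new code orders 32-bit sequence numbers with tcp_cmp(), which must compare values modulo 2^32 so that comparisons stay correct across wraparound (e.g. when deciding whether a queued block's nxt value extends the current SACK range). tcp.c already defines this helper elsewhere; a minimal sketch of such a comparison, whose exact body is an assumption consistent with how the diff uses it:

#include <stdint.h>

/* Compare TCP sequence numbers modulo 2^32 (serial-number arithmetic
 * in the style of RFC 1982): negative if seq1 precedes seq2, zero if
 * equal, positive if seq1 follows seq2.
 */
static inline int32_t tcp_cmp ( uint32_t seq1, uint32_t seq2 ) {
	return ( ( int32_t ) ( seq1 - seq2 ) );
}

For example, tcp_cmp ( 0x00000010, 0xfffffff0 ) is positive: a sequence number just after wraparound correctly compares as later than one just before it.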
diff --git a/src/net/tcp.c b/src/net/tcp.c
index 719c9936..aeacfd79 100644
--- a/src/net/tcp.c
+++ b/src/net/tcp.c
@@ -104,6 +104,9 @@ struct tcp_connection {
/** Maximum receive window */
uint32_t max_rcv_win;
+ /** Selective acknowledgement list (in host-endian order) */
+ struct tcp_sack_block sack[TCP_SACK_MAX];
+
/** Transmit queue */
struct list_head tx_queue;
/** Receive queue */
@@ -129,6 +132,8 @@ enum tcp_flags {
TCP_TS_ENABLED = 0x0002,
/** TCP acknowledgement is pending */
TCP_ACK_PENDING = 0x0004,
+ /** TCP selective acknowledgement is enabled */
+ TCP_SACK_ENABLED = 0x0008,
};
/** TCP internal header
@@ -143,6 +148,8 @@ struct tcp_rx_queued_header {
* enqueued, and so excludes the SYN, if present.
*/
uint32_t seq;
+ /** Next SEQ value, in host-endian order */
+ uint32_t nxt;
/** Flags
*
* Only FIN is valid within this flags byte; all other flags
@@ -450,6 +457,94 @@ static size_t tcp_xfer_window ( struct tcp_connection *tcp ) {
}
/**
+ * Find selective acknowledgement block
+ *
+ * @v tcp TCP connection
+ * @v seq SEQ value in SACK block (in host-endian order)
+ * @v sack SACK block to fill in (in host-endian order)
+ * @ret len Length of SACK block
+ */
+static uint32_t tcp_sack_block ( struct tcp_connection *tcp, uint32_t seq,
+ struct tcp_sack_block *sack ) {
+ struct io_buffer *iobuf;
+ struct tcp_rx_queued_header *tcpqhdr;
+ uint32_t left = tcp->rcv_ack;
+ uint32_t right = left;
+
+ /* Find highest block which does not start after SEQ */
+ list_for_each_entry ( iobuf, &tcp->rx_queue, list ) {
+ tcpqhdr = iobuf->data;
+ if ( tcp_cmp ( tcpqhdr->seq, right ) > 0 ) {
+ if ( tcp_cmp ( tcpqhdr->seq, seq ) > 0 )
+ break;
+ left = tcpqhdr->seq;
+ }
+ if ( tcp_cmp ( tcpqhdr->nxt, right ) > 0 )
+ right = tcpqhdr->nxt;
+ }
+
+ /* Fail if this block does not contain SEQ */
+ if ( tcp_cmp ( right, seq ) < 0 )
+ return 0;
+
+ /* Populate SACK block */
+ sack->left = left;
+ sack->right = right;
+ return ( right - left );
+}
+
+/**
+ * Update TCP selective acknowledgement list
+ *
+ * @v tcp TCP connection
+ * @v seq SEQ value in first SACK block (in host-endian order)
+ * @ret count Number of SACK blocks
+ */
+static unsigned int tcp_sack ( struct tcp_connection *tcp, uint32_t seq ) {
+ struct tcp_sack_block sack[TCP_SACK_MAX];
+ unsigned int old = 0;
+ unsigned int new = 0;
+ unsigned int i;
+ uint32_t len;
+
+ /* Populate first new SACK block */
+ len = tcp_sack_block ( tcp, seq, &sack[0] );
+ if ( len )
+ new++;
+
+ /* Populate remaining new SACK blocks based on old SACK blocks */
+ for ( old = 0 ; old < TCP_SACK_MAX ; old++ ) {
+
+ /* Stop if we run out of space in the new list */
+ if ( new == TCP_SACK_MAX )
+ break;
+
+ /* Skip empty old SACK blocks */
+ if ( tcp->sack[old].left == tcp->sack[old].right )
+ continue;
+
+ /* Populate new SACK block */
+ len = tcp_sack_block ( tcp, tcp->sack[old].left, &sack[new] );
+ if ( len == 0 )
+ continue;
+
+ /* Eliminate duplicates */
+ for ( i = 0 ; i < new ; i++ ) {
+ if ( sack[i].left == sack[new].left ) {
+ new--;
+ break;
+ }
+ }
+ new++;
+ }
+
+ /* Update SACK list */
+ memset ( tcp->sack, 0, sizeof ( tcp->sack ) );
+ memcpy ( tcp->sack, sack, ( new * sizeof ( tcp->sack[0] ) ) );
+ return new;
+}
+
+/**
* Process TCP transmit queue
*
* @v tcp TCP connection
@@ -493,9 +588,10 @@ static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
}
/**
- * Transmit any outstanding data
+ * Transmit any outstanding data (with selective acknowledgement)
*
* @v tcp TCP connection
+ * @v sack_seq SEQ for first selective acknowledgement (if any)
*
* Transmits any outstanding data on the connection.
*
@@ -503,15 +599,21 @@ static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
* will have been started if necessary, and so the stack will
* eventually attempt to retransmit the failed packet.
*/
-static void tcp_xmit ( struct tcp_connection *tcp ) {
+static void tcp_xmit_sack ( struct tcp_connection *tcp, uint32_t sack_seq ) {
struct io_buffer *iobuf;
struct tcp_header *tcphdr;
struct tcp_mss_option *mssopt;
struct tcp_window_scale_padded_option *wsopt;
struct tcp_timestamp_padded_option *tsopt;
+ struct tcp_sack_permitted_padded_option *spopt;
+ struct tcp_sack_padded_option *sackopt;
+ struct tcp_sack_block *sack;
void *payload;
unsigned int flags;
+ unsigned int sack_count;
+ unsigned int i;
size_t len = 0;
+ size_t sack_len;
uint32_t seq_len;
uint32_t app_win;
uint32_t max_rcv_win;
@@ -590,6 +692,10 @@ static void tcp_xmit ( struct tcp_connection *tcp ) {
wsopt->wsopt.kind = TCP_OPTION_WS;
wsopt->wsopt.length = sizeof ( wsopt->wsopt );
wsopt->wsopt.scale = TCP_RX_WINDOW_SCALE;
+ spopt = iob_push ( iobuf, sizeof ( *spopt ) );
+ memset ( spopt->nop, TCP_OPTION_NOP, sizeof ( spopt->nop ) );
+ spopt->spopt.kind = TCP_OPTION_SACK_PERMITTED;
+ spopt->spopt.length = sizeof ( spopt->spopt );
}
if ( ( flags & TCP_SYN ) || ( tcp->flags & TCP_TS_ENABLED ) ) {
tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
@@ -599,6 +705,21 @@ static void tcp_xmit ( struct tcp_connection *tcp ) {
tsopt->tsopt.tsval = htonl ( currticks() );
tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
}
+ if ( ( tcp->flags & TCP_SACK_ENABLED ) &&
+ ( ! list_empty ( &tcp->rx_queue ) ) &&
+ ( ( sack_count = tcp_sack ( tcp, sack_seq ) ) != 0 ) ) {
+ sack_len = ( sack_count * sizeof ( *sack ) );
+ sackopt = iob_push ( iobuf, ( sizeof ( *sackopt ) + sack_len ));
+ memset ( sackopt->nop, TCP_OPTION_NOP, sizeof ( sackopt->nop ));
+ sackopt->sackopt.kind = TCP_OPTION_SACK;
+ sackopt->sackopt.length =
+ ( sizeof ( sackopt->sackopt ) + sack_len );
+ sack = ( ( ( void * ) sackopt ) + sizeof ( *sackopt ) );
+ for ( i = 0 ; i < sack_count ; i++, sack++ ) {
+ sack->left = htonl ( tcp->sack[i].left );
+ sack->right = htonl ( tcp->sack[i].right );
+ }
+ }
if ( len != 0 )
flags |= TCP_PSH;
tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
@@ -635,6 +756,17 @@ static void tcp_xmit ( struct tcp_connection *tcp ) {
profile_stop ( &tcp_tx_profiler );
}
+/**
+ * Transmit any outstanding data
+ *
+ * @v tcp TCP connection
+ */
+static void tcp_xmit ( struct tcp_connection *tcp ) {
+
+ /* Transmit without an explicit first SACK */
+ tcp_xmit_sack ( tcp, tcp->rcv_ack );
+}
+
/** TCP process descriptor */
static struct process_descriptor tcp_process_desc =
PROC_DESC_ONCE ( struct tcp_connection, process, tcp_xmit );
@@ -804,6 +936,12 @@ static void tcp_rx_opts ( struct tcp_connection *tcp, const void *data,
case TCP_OPTION_WS:
options->wsopt = data;
break;
+ case TCP_OPTION_SACK_PERMITTED:
+ options->spopt = data;
+ break;
+ case TCP_OPTION_SACK:
+ /* Ignore received SACKs */
+ break;
case TCP_OPTION_TS:
options->tsopt = data;
break;
@@ -823,6 +961,7 @@ static void tcp_rx_opts ( struct tcp_connection *tcp, const void *data,
* @v seq_len Sequence space length to consume
*/
static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
+ unsigned int sack;
/* Sanity check */
assert ( seq_len > 0 );
@@ -840,6 +979,16 @@ static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
/* Update timestamp */
tcp->ts_recent = tcp->ts_val;
+ /* Update SACK list */
+ for ( sack = 0 ; sack < TCP_SACK_MAX ; sack++ ) {
+ if ( tcp->sack[sack].left == tcp->sack[sack].right )
+ continue;
+ if ( tcp_cmp ( tcp->sack[sack].left, tcp->rcv_ack ) < 0 )
+ tcp->sack[sack].left = tcp->rcv_ack;
+ if ( tcp_cmp ( tcp->sack[sack].right, tcp->rcv_ack ) < 0 )
+ tcp->sack[sack].right = tcp->rcv_ack;
+ }
+
/* Mark ACK as pending */
tcp->flags |= TCP_ACK_PENDING;
}
@@ -860,6 +1009,8 @@ static int tcp_rx_syn ( struct tcp_connection *tcp, uint32_t seq,
tcp->rcv_ack = seq;
if ( options->tsopt )
tcp->flags |= TCP_TS_ENABLED;
+ if ( options->spopt )
+ tcp->flags |= TCP_SACK_ENABLED;
if ( options->wsopt ) {
tcp->snd_win_scale = options->wsopt->scale;
tcp->rcv_win_scale = TCP_RX_WINDOW_SCALE;
@@ -1070,6 +1221,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
struct io_buffer *queued;
size_t len;
uint32_t seq_len;
+ uint32_t nxt;
/* Calculate remaining flags and sequence length. Note that
* SYN, if present, has already been processed by this point.
@@ -1077,6 +1229,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
flags &= TCP_FIN;
len = iob_len ( iobuf );
seq_len = ( len + ( flags ? 1 : 0 ) );
+ nxt = ( seq + seq_len );
/* Discard immediately (to save memory) if:
*
@@ -1087,7 +1240,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
*/
if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
- ( tcp_cmp ( seq + seq_len, tcp->rcv_ack ) < 0 ) ||
+ ( tcp_cmp ( nxt, tcp->rcv_ack ) < 0 ) ||
( seq_len == 0 ) ) {
free_iob ( iobuf );
return;
@@ -1096,6 +1249,7 @@ static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
/* Add internal header */
tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
tcpqhdr->seq = seq;
+ tcpqhdr->nxt = nxt;
tcpqhdr->flags = flags;
/* Add to RX queue */
@@ -1289,7 +1443,7 @@ static int tcp_rx ( struct io_buffer *iobuf,
if ( list_empty ( &tcp->rx_queue ) ) {
process_add ( &tcp->process );
} else {
- tcp_xmit ( tcp );
+ tcp_xmit_sack ( tcp, seq );
}
/* If this packet was the last we expect to receive, set up