From 3b4929f65b0d8249f19a50245cd88ed1a2f78cff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 May 2019 17:17:22 -0700 Subject: tcp: limit payload size of sacked skbs Jonathan Looney reported that TCP can trigger the following crash in tcp_shifted_skb() : BUG_ON(tcp_skb_pcount(skb) < pcount); This can happen if the remote peer has advertized the smallest MSS that linux TCP accepts : 48 An skb can hold 17 fragments, and each fragment can hold 32KB on x86, or 64KB on PowerPC. This means that the 16bit witdh of TCP_SKB_CB(skb)->tcp_gso_segs can overflow. Note that tcp_sendmsg() builds skbs with less than 64KB of payload, so this problem needs SACK to be enabled. SACK blocks allow TCP to coalesce multiple skbs in the retransmit queue, thus filling the 17 fragments to maximal capacity. CVE-2019-11477 -- u16 overflow of TCP_SKB_CB(skb)->tcp_gso_segs Fixes: 832d11c5cd07 ("tcp: Try to restore large SKBs while SACK processing") Signed-off-by: Eric Dumazet Reported-by: Jonathan Looney Acked-by: Neal Cardwell Reviewed-by: Tyler Hicks Cc: Yuchung Cheng Cc: Bruce Curtis Cc: Jonathan Lemon Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f429e856e263..b8e3bbb85211 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1454,8 +1454,8 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - if (mss_now < 48) - mss_now = 48; + if (mss_now < TCP_MIN_SND_MSS) + mss_now = TCP_MIN_SND_MSS; return mss_now; } @@ -2747,7 +2747,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) if (next_skb_size <= skb_availroom(skb)) skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), next_skb_size); - else if (!skb_shift(skb, next_skb, next_skb_size)) + else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; } tcp_highest_sack_replace(sk, next_skb, skb); -- cgit v1.2.3-55-g7522 From f070ef2ac66716357066b683fb0baf55f8191a2e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 18 May 2019 05:12:05 -0700 Subject: tcp: tcp_fragment() should apply sane memory limits Jonathan Looney reported that a malicious peer can force a sender to fragment its retransmit queue into tiny skbs, inflating memory usage and/or overflow 32bit counters. TCP allows an application to queue up to sk_sndbuf bytes, so we need to give some allowance for non malicious splitting of retransmit queue. A new SNMP counter is added to monitor how many times TCP did not allow to split an skb if the allowance was exceeded. Note that this counter might increase in the case applications use SO_SNDBUF socket option to lower sk_sndbuf. CVE-2019-11478 : tcp_fragment, prevent fragmenting a packet when the socket is already using more than half the allowed space Signed-off-by: Eric Dumazet Reported-by: Jonathan Looney Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Reviewed-by: Tyler Hicks Cc: Bruce Curtis Cc: Jonathan Lemon Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_output.c | 5 +++++ 3 files changed, 7 insertions(+) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 86dc24a96c90..fd42c1316d3d 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -283,6 +283,7 @@ enum LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */ LINUX_MIB_TCPZEROWINDOWDROP, /* TCPZeroWindowDrop */ LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */ + LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4370f4246e86..073273b751f8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -287,6 +287,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), + SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b8e3bbb85211..1bb1c46b4aba 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1296,6 +1296,11 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (nsize < 0) nsize = 0; + if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); + return -ENOMEM; + } + if (skb_unclone(skb, gfp)) return -ENOMEM; -- cgit v1.2.3-55-g7522 From 5f3e2bf008c2221478101ee72f5cb4654b9fc363 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Jun 2019 09:15:31 -0700 Subject: tcp: add tcp_min_snd_mss sysctl Some TCP peers announce a very small MSS option in their SYN and/or SYN/ACK messages. This forces the stack to send packets with a very high network/cpu overhead. Linux has enforced a minimal value of 48. Since this value includes the size of TCP options, and that the options can consume up to 40 bytes, this means that each segment can include only 8 bytes of payload. In some cases, it can be useful to increase the minimal value to a saner value. We still let the default to 48 (TCP_MIN_SND_MSS), for compatibility reasons. Note that TCP_MAXSEG socket option enforces a minimal value of (TCP_MIN_MSS). David Miller increased this minimal value in commit c39508d6f118 ("tcp: Make TCP_MAXSEG minimum more correct.") from 64 to 88. We might in the future merge TCP_MIN_SND_MSS and TCP_MIN_MSS. CVE-2019-11479 -- tcp mss hardcoded to 48 Signed-off-by: Eric Dumazet Suggested-by: Jonathan Looney Acked-by: Neal Cardwell Cc: Yuchung Cheng Cc: Tyler Hicks Cc: Bruce Curtis Cc: Jonathan Lemon Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 8 ++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 11 +++++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 3 +-- 5 files changed, 22 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 288aa264ac26..22f6b8b1110a 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -255,6 +255,14 @@ tcp_base_mss - INTEGER Path MTU discovery (MTU probing). If MTU probing is enabled, this is the initial MSS used by the connection. +tcp_min_snd_mss - INTEGER + TCP SYN and SYNACK messages usually advertise an ADVMSS option, + as described in RFC 1122 and RFC 6691. + If this ADVMSS option is smaller than tcp_min_snd_mss, + it is silently capped to tcp_min_snd_mss. + + Default : 48 (at least 8 bytes of payload per segment) + tcp_congestion_control - STRING Set the congestion control algorithm to be used for new connections. The algorithm "reno" is always available, but diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7698460a3dd1..623cfbb7b8dc 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -117,6 +117,7 @@ struct netns_ipv4 { #endif int sysctl_tcp_mtu_probing; int sysctl_tcp_base_mss; + int sysctl_tcp_min_snd_mss; int sysctl_tcp_probe_threshold; u32 sysctl_tcp_probe_interval; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 08a428a7b274..9e5257251163 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -39,6 +39,8 @@ static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; +static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; +static int tcp_min_snd_mss_max = 65535; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; @@ -774,6 +776,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_min_snd_mss", + .data = &init_net.ipv4.sysctl_tcp_min_snd_mss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_min_snd_mss_min, + .extra2 = &tcp_min_snd_mss_max, + }, { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bc86f9735f45..cfa81190a1b1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2628,6 +2628,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; + net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1bb1c46b4aba..00c01a01b547 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1459,8 +1459,7 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - if (mss_now < TCP_MIN_SND_MSS) - mss_now = TCP_MIN_SND_MSS; + mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); return mss_now; } -- cgit v1.2.3-55-g7522