diff options
Diffstat (limited to 'net/ipv4')
43 files changed, 1813 insertions, 1161 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e31108e5ef79..ce4aa827be05 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; - int err; + int err, tcp_fastopen; lock_sock(sk); @@ -217,11 +217,12 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && - (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk, backlog); @@ -826,6 +827,7 @@ int inet_shutdown(struct socket *sock, int how) err = -ENOTCONN; /* Hack to wake up other listeners, who can poll for POLLHUP, even on eg. unconnected UDP sockets -- RR */ + /* fall through */ default: sk->sk_shutdown |= how; if (sk->sk_prot->shutdown) @@ -839,7 +841,7 @@ int inet_shutdown(struct socket *sock, int how) case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; - /* Fall through */ + /* fall through */ case TCP_SYN_SENT: err = sk->sk_prot->disconnect(sk, O_NONBLOCK); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 37db44f60718..4dd95cdd8070 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -240,7 +240,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) + if (err == -ENOSPC) err = NET_XMIT_DROP; goto out_free; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7c45b8896709..a8d7c5a9fb05 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1180,6 +1180,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; + /* fall through */ case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7adc0616599..a4573bccd6da 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa) */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { - if (ifa->ifa_local == addr) { - struct net_device *dev = ifa->ifa_dev->dev; - - if (!net_eq(dev_net(dev), net)) - continue; - result = dev; - break; - } - } - if (!result) { + ifa = inet_lookup_ifaddr_rcu(net, addr); + if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; @@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); + } else { + result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); @@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) } EXPORT_SYMBOL(__ip_dev_find); +/* called under RCU lock */ +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) +{ + u32 hash = inet_addr_hash(net, addr); + struct in_ifaddr *ifa; + + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) + if (ifa->ifa_local == addr && + net_eq(dev_net(ifa->ifa_dev->dev), net)) + return ifa; + + return NULL; +} + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); @@ -438,7 +444,7 @@ static void check_lifetime(struct work_struct *work); static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, - u32 portid) + u32 portid, struct netlink_ext_ack *extack) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -483,6 +489,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; + ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); @@ -515,7 +522,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, static int inet_insert_ifa(struct in_ifaddr *ifa) { - return __inet_insert_ifa(ifa, NULL, 0); + return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) @@ -896,7 +903,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, return ret; } } - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, + extack); } else { inet_free_ifa(ifa); @@ -1516,6 +1524,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ + /* fall through */ case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; @@ -1751,7 +1760,7 @@ static int inet_validate_link_af(const struct net_device *dev, struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; - if (dev && !__in_dev_get_rtnl(dev)) + if (dev && !__in_dev_get_rcu(dev)) return -EAFNOSUPPORT; err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL); @@ -1775,7 +1784,7 @@ static int inet_validate_link_af(const struct net_device *dev, static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b00e4a43b4dc..d57aa64fa7c7 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -432,7 +432,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * case -EINPROGRESS: goto error; - case -EBUSY: + case -ENOSPC: err = NET_XMIT_DROP; break; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 37819ab4cc74..f52d27a422c3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -73,6 +73,11 @@ fail: fib_free_table(main_table); return -ENOMEM; } + +static bool fib4_has_custom_rules(struct net *net) +{ + return false; +} #else struct fib_table *fib_new_table(struct net *net, u32 id) @@ -128,6 +133,11 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } return NULL; } + +static bool fib4_has_custom_rules(struct net *net) +{ + return net->ipv4.fib_has_custom_rules; +} #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, @@ -345,9 +355,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(net) && - (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) - goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -402,13 +409,28 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + struct net *net = dev_net(dev); - if (!r && !fib_num_tclassid_users(dev_net(dev)) && - IN_DEV_ACCEPT_LOCAL(idev) && + if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { + if (IN_DEV_ACCEPT_LOCAL(idev)) + goto ok; + /* with custom local routes in place, checking local addresses + * only will be too optimistic, with custom rules, checking + * local addresses only can be too strict, e.g. due to vrf + */ + if (net->ipv4.fib_has_custom_local_routes || + fib4_has_custom_rules(net)) + goto full_check; + if (inet_lookup_ifaddr_rcu(net, src)) + return -EINVAL; + +ok: *itag = 0; return 0; } + +full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } @@ -759,6 +781,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, } err = fib_table_insert(net, tb, &cfg, extack); + if (!err && cfg.fc_type == RTN_LOCAL) + net->ipv4.fib_has_custom_local_routes = true; errout: return err; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 01ed22139ac2..f04d944f8abe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); } - -static inline void fib_add_weight(struct fib_info *fi, - const struct fib_nh *nh) -{ - fi->fib_weight += nh->nh_weight; -} - #else /* CONFIG_IP_ROUTE_MULTIPATH */ #define fib_rebalance(fi) do { } while (0) -#define fib_add_weight(fi, nh) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -718,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { val = nla_get_u32(nla); } @@ -774,8 +766,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh, struct netlink_ext_ack *extack) +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -1038,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { @@ -1258,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh, extack); + err = fib_check_nh(cfg, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) @@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); - fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) fib_rebalance(fi); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c636650a6a70..5ddc4aefff12 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -87,32 +87,32 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa) { struct fib_entry_notifier_info info = { .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, net, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } @@ -1216,9 +1216,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, fi, - new_fa->fa_tos, cfg->fc_type, - tb->tb_id); + key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1273,8 +1271,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type, - tb->tb_id); + call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1574,8 +1571,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete->fa_info, tos, - fa_to_delete->fa_type, tb->tb_id); + fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1892,9 +1888,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb) call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); + KEYLENGTH - fa->fa_slen, fa, + NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -1932,8 +1927,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, continue; call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, - fa->fa_type, fa->tb_id); + KEYLENGTH - fa->fa_slen, fa); } } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 681e33998e03..1617604c9284 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -782,7 +782,7 @@ static bool icmp_tag_validation(int proto) } /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. */ @@ -810,7 +810,8 @@ static bool icmp_unreach(struct sk_buff *skb) if (iph->ihl < 5) /* Mangled header, drop. */ goto out_err; - if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->type) { + case ICMP_DEST_UNREACH: switch (icmph->code & 15) { case ICMP_NET_UNREACH: case ICMP_HOST_UNREACH: @@ -846,8 +847,16 @@ static bool icmp_unreach(struct sk_buff *skb) } if (icmph->code > NR_ICMP_UNREACH) goto out; - } else if (icmph->type == ICMP_PARAMETERPROB) + break; + case ICMP_PARAMETERPROB: info = ntohl(icmph->un.gateway) >> 24; + break; + case ICMP_TIME_EXCEEDED: + __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS); + if (icmph->code == ICMP_EXC_FRAGTIME) + goto out; + break; + } /* * Throw it at our lower layers @@ -959,8 +968,9 @@ static bool icmp_timestamp(struct sk_buff *skb) */ icmp_param.data.times[1] = inet_current_timestamp(); icmp_param.data.times[2] = icmp_param.data.times[1]; - if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) - BUG(); + + BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)); + icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; icmp_param.data.icmph.code = 0; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b47a59cb3573..4ca46dc08e63 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -39,11 +39,11 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ -static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, - const struct in6_addr *sk2_rcv_saddr6, - __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk1_ipv6only, bool sk2_ipv6only, - bool match_wildcard) +static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, + const struct in6_addr *sk2_rcv_saddr6, + __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk1_ipv6only, bool sk2_ipv6only, + bool match_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -52,29 +52,29 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; + return true; if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; + return true; if (addr_type == IPV6_ADDR_ANY && match_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; + return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) - return 1; + return true; - return 0; + return false; } #endif @@ -82,20 +82,20 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, * match_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk2_ipv6only, bool match_wildcard) +static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk2_ipv6only, bool match_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } -int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) +bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) @@ -495,17 +495,15 @@ EXPORT_SYMBOL(inet_csk_accept); * to optimize. */ void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)) + void (*retransmit_handler)(struct timer_list *t), + void (*delack_handler)(struct timer_list *t), + void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); - setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, - (unsigned long)sk); - setup_timer(&icsk->icsk_delack_timer, delack_handler, - (unsigned long)sk); - setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); + timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); + timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); + timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); @@ -676,9 +674,9 @@ void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); -static void reqsk_timer_handler(unsigned long data) +static void reqsk_timer_handler(struct timer_list *t) { - struct request_sock *req = (struct request_sock *)data; + struct request_sock *req = from_timer(req, t, rsk_timer); struct sock *sk_listener = req->rsk_listener; struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); @@ -749,8 +747,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, req->num_timeout = 0; req->sk = NULL; - setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler, - (unsigned long)req); + timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index af74d0433453..26a3d0315728 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -147,7 +147,7 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) spin_unlock(&hb->chain_lock); hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire((unsigned long) fq); + f->frag_expire(&fq->timer); return evicted; } @@ -164,7 +164,7 @@ static void inet_frag_worker(struct work_struct *work) local_bh_disable(); - for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { + for (i = READ_ONCE(f->next_bucket); budget; --budget) { evicted += inet_evict_bucket(f, &f->hash[i]); i = (i + 1) & (INETFRAGS_HASHSZ - 1); if (evicted > INETFRAGS_EVICT_MAX) @@ -366,7 +366,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, f->constructor(q, arg); add_frag_mem_limit(nf, f->qsize); - setup_timer(&q->timer, f->frag_expire, (unsigned long)q); + timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); refcount_set(&q->refcnt, 1); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b039159e67a..c690cd0d9b3f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -9,7 +9,6 @@ */ #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_hashtables.h> @@ -142,9 +141,9 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -static void tw_timer_handler(unsigned long data) +static void tw_timer_handler(struct timer_list *t) { - struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; + struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); if (tw->tw_kill) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); @@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, if (tw) { const struct inet_sock *inet = inet_sk(sk); - kmemcheck_annotate_bitfield(tw, flags); - tw->tw_dr = dr; /* Give us an identity. */ tw->tw_daddr = inet->inet_daddr; @@ -188,8 +185,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); - setup_pinned_timer(&tw->tw_timer, tw_timer_handler, - (unsigned long)tw); + timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index b20c8ac64081..914d56928578 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -284,14 +284,17 @@ EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *p, *n; + struct rb_node *p = rb_first(&base->rb_root); - rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { - inet_putpeer(p); + while (p) { + struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); + + p = rb_next(p); + rb_erase(&peer->rb_node, &base->rb_root); + inet_putpeer(peer); cond_resched(); } - base->rb_root = RB_ROOT; base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index df8fe0503de0..bbf1b94942c0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -191,12 +191,13 @@ static bool frag_expire_skip_icmp(u32 user) /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ -static void ip_expire(unsigned long arg) +static void ip_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct ipq *qp; struct net *net; - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); rcu_read_lock(); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 467e44d7587d..bb6239169b1a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -579,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, false)) goto err_free_rt; - if (skb->len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -731,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len - dev->hard_header_len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -773,20 +773,46 @@ free_skb: return NETDEV_TX_OK; } +static void ipgre_link_update(struct net_device *dev, bool set_mtu) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int len; + + len = tunnel->tun_hlen; + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); + len = tunnel->tun_hlen - len; + tunnel->hlen = tunnel->hlen + len; + + dev->needed_headroom = dev->needed_headroom + len; + if (set_mtu) + dev->mtu = max_t(int, dev->mtu - len, 68); + + if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { + if (!(tunnel->parms.o_flags & TUNNEL_CSUM) || + tunnel->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + dev->features |= NETIF_F_LLTX; + } +} + static int ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - int err; struct ip_tunnel_parm p; + int err; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) return -EFAULT; + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || - ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) || + ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } + p.i_flags = gre_flags_to_tnl_flags(p.i_flags); p.o_flags = gre_flags_to_tnl_flags(p.o_flags); @@ -794,11 +820,22 @@ static int ipgre_tunnel_ioctl(struct net_device *dev, if (err) return err; + if (cmd == SIOCCHGTUNNEL) { + struct ip_tunnel *t = netdev_priv(dev); + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; + + if (strcmp(dev->rtnl_link_ops->kind, "erspan")) + ipgre_link_update(dev, true); + } + p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) return -EFAULT; + return 0; } @@ -1011,15 +1048,14 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn, &ipgre_link_ops); + ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit = ipgre_exit_net, + .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1308,9 +1344,9 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; __u32 fwmark = t->fwmark; + struct ip_tunnel_parm p; int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { @@ -1323,7 +1359,18 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_changelink(dev, tb, &p, fwmark); + + err = ip_tunnel_changelink(dev, tb, &p, fwmark); + if (err < 0) + return err; + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; + + if (strcmp(dev->rtnl_link_ops->kind, "erspan")) + ipgre_link_update(dev, !tb[IFLA_MTU]); + + return 0; } static size_t ipgre_get_size(const struct net_device *dev) @@ -1542,15 +1589,14 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_net(struct net *net) +static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn, &ipgre_tap_ops); + ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit = ipgre_tap_exit_net, + .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1561,16 +1607,14 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_net(struct net *net) +static void __net_exit erspan_exit_batch_net(struct list_head *net_list) { - struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); - - ip_tunnel_delete_net(itn, &erspan_link_ops); + ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit = erspan_exit_net, + .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e9805ad664ac..fe6fee728ce4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1061,16 +1061,22 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, } } -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, + struct rtnl_link_ops *ops) { + struct ip_tunnel_net *itn; + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip_tunnel_destroy(itn, &list, ops); + list_for_each_entry(net, net_list, exit_list) { + itn = net_generic(net, id); + ip_tunnel_destroy(itn, &list, ops); + } unregister_netdevice_many(&list); rtnl_unlock(); } -EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 89453cf62158..949f432a5f04 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -198,15 +198,6 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { - tunnel->err_count--; - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } - mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); @@ -453,15 +444,14 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_net(struct net *net) +static void __net_exit vti_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - ip_tunnel_delete_net(itn, &vti_link_ops); + ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit = vti_exit_net, + .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index cdd627355ed1..c891235b4966 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -659,15 +659,14 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); - ip_tunnel_delete_net(itn, &ipip_link_ops); + ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit = ipip_exit_net, + .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9b3e6e069ae..40a43ad294cb 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,6 +67,7 @@ #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/nexthop.h> +#include <net/switchdev.h> struct ipmr_rule { struct fib_rule common; @@ -264,6 +265,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) fib_rules_unregister(net->ipv4.mr_rules_ops); rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; +} +EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -298,6 +315,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) net->ipv4.mrt = NULL; rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return 0; +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} +EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -587,6 +620,82 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_vif_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -594,6 +703,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { + struct net *net = read_pnet(&mrt->net); struct vif_device *v; struct net_device *dev; struct in_device *in_dev; @@ -603,6 +713,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; + if (VIF_EXISTS(mrt, vifi)) + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi, + mrt->id); + write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; @@ -652,10 +766,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, c); } -static inline void ipmr_cache_free(struct mfc_cache *c) +void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->rcu, ipmr_cache_free_rcu); } +EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. @@ -754,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -828,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* Fill in the VIF structures */ + attr.orig_dev = dev; + if (!switchdev_port_attr_get(dev, &attr)) { + memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); + v->dev_parent_id.id_len = attr.u.ppid.id_len; + } else { + v->dev_parent_id.id_len = 0; + } v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; @@ -851,6 +976,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, if (vifi+1 > mrt->maxvif) mrt->maxvif = vifi+1; write_unlock_bh(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); return 0; } @@ -949,6 +1075,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->mfc_un.res.refcount, 1); } return c; } @@ -1150,6 +1277,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ @@ -1161,8 +1289,9 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); return 0; } @@ -1189,6 +1318,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1238,6 +1369,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1245,6 +1377,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, bool all) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c, *tmp; LIST_HEAD(list); int i; @@ -1263,8 +1396,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { @@ -1393,6 +1528,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; + /* fall through */ case MRT_ADD_MFC_PROXY: case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) { @@ -1724,10 +1860,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } +#ifdef CONFIG_NET_SWITCHDEV +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + struct vif_device *out_vif = &mrt->vif_table[out_vifi]; + struct vif_device *in_vif = &mrt->vif_table[in_vifi]; + + if (!skb->offload_mr_fwd_mark) + return false; + if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) + return false; + return netdev_phys_item_id_same(&out_vif->dev_parent_id, + &in_vif->dev_parent_id); +} +#else +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + return false; +} +#endif + /* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *c, int vifi) + int in_vifi, struct sk_buff *skb, + struct mfc_cache *c, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -1748,6 +1907,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } + if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) + goto out_free; + if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, @@ -1925,8 +2087,8 @@ forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, - psend); + ipmr_queue_xmit(net, mrt, true_vifi, + skb2, cache, psend); } psend = ct; } @@ -1937,9 +2099,10 @@ last_forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb2, + cache, psend); } else { - ipmr_queue_xmit(net, mrt, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); return; } } @@ -2156,6 +2319,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) return -EMSGSIZE; @@ -3048,14 +3214,87 @@ static const struct net_protocol pim_protocol = { }; #endif +static unsigned int ipmr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); +} + +static int ipmr_dump(struct net *net, struct notifier_block *nb) +{ + struct mr_table *mrt; + int err; + + err = ipmr_rules_dump(net, nb); + if (err) + return err; + + ipmr_for_each_table(mrt, net) { + struct vif_device *v = &mrt->vif_table[0]; + struct mfc_cache *mfc; + int vifi; + + /* Notifiy on table VIF entries */ + read_lock(&mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(&mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + call_ipmr_mfc_entry_notifier(nb, net, + FIB_EVENT_ENTRY_ADD, mfc, + mrt->id); + } + + return 0; +} + +static const struct fib_notifier_ops ipmr_notifier_ops_template = { + .family = RTNL_FAMILY_IPMR, + .fib_seq_read = ipmr_seq_read, + .fib_dump = ipmr_dump, + .owner = THIS_MODULE, +}; + +static int __net_init ipmr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv4.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.ipmr_notifier_ops = ops; + + return 0; +} + +static void __net_exit ipmr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); + net->ipv4.ipmr_notifier_ops = NULL; +} + /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; + err = ipmr_notifier_init(net); + if (err) + goto ipmr_notifier_fail; + err = ipmr_rules_init(net); if (err < 0) - goto fail; + goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -3072,7 +3311,9 @@ proc_cache_fail: proc_vif_fail: ipmr_rules_exit(net); #endif -fail: +ipmr_rules_fail: + ipmr_notifier_exit(net); +ipmr_notifier_fail: return err; } @@ -3082,6 +3323,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_notifier_exit(net); ipmr_rules_exit(net); } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 9e2770fd00be..f88221aebc9d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -634,6 +634,25 @@ static void get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct arpt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; + } + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -910,8 +929,7 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 39286e543ee6..4cbe5e80f3bf 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -781,6 +781,26 @@ get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ipt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + const struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; /* macro does multi eval of i */ + } + + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -1070,8 +1090,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fe374da4bc13..89af9d88ca21 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -344,7 +344,7 @@ static void ipv4_hooks_unregister(struct net *net) mutex_unlock(®ister_ipv4_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index a046c298413a..1849fedd9b81 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -81,7 +81,6 @@ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first @@ -165,6 +164,12 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } +static void icmp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); +} + /* Small and modified version of icmp_rcv */ static int icmp_error(struct net *net, struct nf_conn *tmpl, @@ -177,18 +182,14 @@ icmp_error(struct net *net, struct nf_conn *tmpl, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, - NULL, "nf_ct_icmp: short packet "); + icmp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: bad HW ICMP checksum "); + icmp_error_log(skb, net, pf, "bad hw icmp checksum"); return -NF_ACCEPT; } @@ -199,9 +200,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, * discarded. */ if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: invalid ICMP type "); + icmp_error_log(skb, net, pf, "invalid icmp type"); return -NF_ACCEPT; } @@ -259,9 +258,14 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmp_nlattr_tuple_size(void) +static unsigned int icmp_nlattr_tuple_size(void) { - return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index a0f37b208268..0443ca4120b0 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -276,7 +276,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + /* Only ICMPs can be IP_CT_IS_REPLY: */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 127153f1ed8a..9f37c4727861 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -212,7 +212,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), - SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3d9f1c2f81c5..3b427757b1f8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -495,7 +495,7 @@ u32 ip_idents_reserve(u32 hash, int segs) { u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(*p_tstamp); + u32 old = READ_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 new, delta = 0; @@ -1250,7 +1250,7 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); - unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size, + unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, ip_rt_min_advmss); return min(advmss, IPV4_MAX_PMTU - header_size); @@ -3038,7 +3038,6 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { - int rc = 0; int cpu; ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); @@ -3095,7 +3094,7 @@ int __init ip_rt_init(void) #endif register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); - return rc; + return 0; } #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 77cf32a80952..fda37f2862c9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -385,7 +385,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0989e739d098..93e172118a94 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -26,6 +26,7 @@ #include <net/inet_frag.h> #include <net/ping.h> #include <net/protocol.h> +#include <net/netevent.h> static int zero; static int one = 1; @@ -200,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(ctl->data, struct net, + ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -207,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, }; int ret; - tcp_get_default_congestion_control(val); + tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = tcp_set_default_congestion_control(val); + ret = tcp_set_default_congestion_control(net, val); return ret; } @@ -252,10 +255,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; int ret; @@ -266,7 +271,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(tcp_fastopen_ctx); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else @@ -283,12 +288,8 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it. This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); - tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, NULL, user_key, + TCP_FASTOPEN_KEY_LENGTH); } bad_key: @@ -359,11 +360,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - tcp_fastopen_active_timeout_reset(); + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } @@ -386,15 +389,25 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH +static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_policy); + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + + return ret; +} +#endif + static struct ctl_table ipv4_table[] = { { - .procname = "tcp_retrans_collapse", - .data = &sysctl_tcp_retrans_collapse, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -402,48 +415,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_fastopen", - .data = &sysctl_tcp_fastopen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_fastopen_key", - .mode = 0600, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), - .proc_handler = proc_tcp_fastopen_key, - }, - { - .procname = "tcp_fastopen_blackhole_timeout_sec", - .data = &sysctl_tcp_fastopen_blackhole_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, - }, - { - .procname = "tcp_abort_on_overflow", - .data = &sysctl_tcp_abort_on_overflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_stdurg", - .data = &sysctl_tcp_stdurg, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_rfc1337", - .data = &sysctl_tcp_rfc1337, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -465,34 +436,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "tcp_fack", - .data = &sysctl_tcp_fack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_recovery", - .data = &sysctl_tcp_recovery, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_max_reordering", - .data = &sysctl_tcp_max_reordering, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_dsack", - .data = &sysctl_tcp_dsack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_mem", .maxlen = sizeof(sysctl_tcp_mem), .data = &sysctl_tcp_mem, @@ -500,113 +443,12 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_doulongvec_minmax, }, { - .procname = "tcp_wmem", - .data = &sysctl_tcp_wmem, - .maxlen = sizeof(sysctl_tcp_wmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { - .procname = "tcp_rmem", - .data = &sysctl_tcp_rmem, - .maxlen = sizeof(sysctl_tcp_rmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { - .procname = "tcp_app_win", - .data = &sysctl_tcp_app_win, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_adv_win_scale", - .data = &sysctl_tcp_adv_win_scale, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &tcp_adv_win_scale_min, - .extra2 = &tcp_adv_win_scale_max, - }, - { - .procname = "tcp_frto", - .data = &sysctl_tcp_frto, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_min_rtt_wlen", - .data = &sysctl_tcp_min_rtt_wlen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_no_metrics_save", - .data = &sysctl_tcp_nometrics_save, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_moderate_rcvbuf", - .data = &sysctl_tcp_moderate_rcvbuf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_tso_win_divisor", - .data = &sysctl_tcp_tso_win_divisor, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_congestion_control", - .mode = 0644, - .maxlen = TCP_CA_NAME_MAX, - .proc_handler = proc_tcp_congestion_control, - }, - { - .procname = "tcp_workaround_signed_windows", - .data = &sysctl_tcp_workaround_signed_windows, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_limit_output_bytes", - .data = &sysctl_tcp_limit_output_bytes, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_challenge_ack_limit", - .data = &sysctl_tcp_challenge_ack_limit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_slow_start_after_idle", - .data = &sysctl_tcp_slow_start_after_idle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -650,65 +492,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_allowed_congestion_control, }, { - .procname = "tcp_thin_linear_timeouts", - .data = &sysctl_tcp_thin_linear_timeouts, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_early_retrans", - .data = &sysctl_tcp_early_retrans, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &four, - }, - { - .procname = "tcp_min_tso_segs", - .data = &sysctl_tcp_min_tso_segs, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &gso_max_segs, - }, - { - .procname = "tcp_pacing_ss_ratio", - .data = &sysctl_tcp_pacing_ss_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, - { - .procname = "tcp_pacing_ca_ratio", - .data = &sysctl_tcp_pacing_ca_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, - { - .procname = "tcp_autocorking", - .data = &sysctl_tcp_autocorking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "tcp_invalid_ratelimit", - .data = &sysctl_tcp_invalid_ratelimit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, - { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, .mode = 0444, @@ -977,6 +760,13 @@ static struct ctl_table ipv4_net_table[] = { }, #endif { + .procname = "tcp_congestion_control", + .data = &init_net.ipv4.tcp_congestion_control, + .mode = 0644, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = proc_tcp_congestion_control, + }, + { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), @@ -1086,6 +876,28 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fastopen", + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", @@ -1101,7 +913,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, .extra2 = &one, }, @@ -1145,6 +957,216 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_early_retrans", + .data = &init_net.ipv4.sysctl_tcp_early_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &four, + }, + { + .procname = "tcp_recovery", + .data = &init_net.ipv4.sysctl_tcp_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_thin_linear_timeouts", + .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_slow_start_after_idle", + .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_retrans_collapse", + .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_stdurg", + .data = &init_net.ipv4.sysctl_tcp_stdurg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_rfc1337", + .data = &init_net.ipv4.sysctl_tcp_rfc1337, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_abort_on_overflow", + .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_fack", + .data = &init_net.ipv4.sysctl_tcp_fack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_max_reordering", + .data = &init_net.ipv4.sysctl_tcp_max_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_dsack", + .data = &init_net.ipv4.sysctl_tcp_dsack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_app_win", + .data = &init_net.ipv4.sysctl_tcp_app_win, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_adv_win_scale", + .data = &init_net.ipv4.sysctl_tcp_adv_win_scale, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, + }, + { + .procname = "tcp_frto", + .data = &init_net.ipv4.sysctl_tcp_frto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_no_metrics_save", + .data = &init_net.ipv4.sysctl_tcp_nometrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_moderate_rcvbuf", + .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_tso_win_divisor", + .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_workaround_signed_windows", + .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_limit_output_bytes", + .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_challenge_ack_limit", + .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_min_tso_segs", + .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &gso_max_segs, + }, + { + .procname = "tcp_min_rtt_wlen", + .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_autocorking", + .data = &init_net.ipv4.sysctl_tcp_autocorking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "tcp_invalid_ratelimit", + .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "tcp_pacing_ss_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { + .procname = "tcp_pacing_ca_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { + .procname = "tcp_wmem", + .data = &init_net.ipv4.sysctl_tcp_wmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, + { + .procname = "tcp_rmem", + .data = &init_net.ipv4.sysctl_tcp_rmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402720ab..bf97317e6c97 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include <linux/time.h> #include <linux/slab.h> #include <linux/errqueue.h> +#include <linux/static_key.h> #include <net/icmp.h> #include <net/inet_common.h> @@ -282,24 +283,22 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> -int sysctl_tcp_min_tso_segs __read_mostly = 2; - -int sysctl_tcp_autocorking __read_mostly = 1; +#include <trace/events/tcp.h> struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; -int sysctl_tcp_wmem[3] __read_mostly; -int sysctl_tcp_rmem[3] __read_mostly; - EXPORT_SYMBOL(sysctl_tcp_mem); -EXPORT_SYMBOL(sysctl_tcp_rmem); -EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +#if IS_ENABLED(CONFIG_SMC) +DEFINE_STATIC_KEY_FALSE(tcp_have_smc); +EXPORT_SYMBOL(tcp_have_smc); +#endif + /* * Current number of TCP sockets. */ @@ -413,8 +412,10 @@ void tcp_init_sock(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; + sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -441,6 +442,7 @@ void tcp_init_sock(struct sock *sk) tcp_assign_congestion_control(sk); tp->tsoffset = 0; + tp->rack.reo_wnd_steps = 1; sk->sk_state = TCP_CLOSE; @@ -449,15 +451,29 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) +void tcp_init_transfer(struct sock *sk, int bpf_op) { + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + tcp_call_bpf(sk, bpf_op); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -675,7 +691,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && - sysctl_tcp_autocorking && + sock_net(sk)->ipv4.sysctl_tcp_autocorking && skb != tcp_write_queue_head(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } @@ -686,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!tcp_send_head(sk)) - return; - skb = tcp_write_queue_tail(sk); + if (!skb) + return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); @@ -869,6 +884,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); @@ -948,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int copy, i; bool can_coalesce; - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || + if (!skb || (copy = size_goal - skb->len) <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - skb_queue_empty(&sk->sk_write_queue)); + tcp_rtx_and_write_queues_empty(sk)); if (!skb) goto wait_for_memory; @@ -1027,7 +1043,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sk->sk_tsflags); if (!(flags & MSG_SENDPAGE_NOTLAST)) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } @@ -1126,7 +1142,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -1183,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL; + skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; @@ -1259,7 +1275,7 @@ restart: int max = size_goal; skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk)) { + if (skb) { if (skb->ip_summed == CHECKSUM_NONE) max = mss_now; copy = max - skb->len; @@ -1279,7 +1295,7 @@ new_segment: process_backlog = false; goto restart; } - first_skb = skb_queue_empty(&sk->sk_write_queue); + first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, select_size(sk, sg, first_skb), sk->sk_allocation, @@ -1404,7 +1420,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: @@ -1505,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + return err; + copied += skb->len; + } + skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -2017,6 +2040,8 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + trace_tcp_set_state(sk, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) @@ -2304,6 +2329,37 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +static void tcp_rtx_queue_purge(struct sock *sk) +{ + struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + /* Since we are deleting whole queue, no need to + * list_del(&skb->tcp_tsorted_anchor) + */ + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); + } +} + +void tcp_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); + sk_wmem_free_skb(sk, skb); + } + tcp_rtx_queue_purge(sk); + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); + sk_mem_reclaim(sk); + tcp_clear_all_retrans_hints(tcp_sk(sk)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2362,7 +2418,6 @@ int tcp_disconnect(struct sock *sk, int flags) * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; - tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(sk->sk_rx_dst); @@ -2454,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if (sysctl_tcp_fack) - tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) @@ -2518,6 +2571,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + + if (optlen != sizeof(key)) + return -EINVAL; + + if (copy_from_user(key, optval, optlen)) + return -EFAULT; + + return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); + } default: /* fallthru */ break; @@ -2749,7 +2813,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { @@ -2759,7 +2823,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else @@ -2768,6 +2832,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EOPNOTSUPP; } break; + case TCP_FASTOPEN_NO_COOKIE: + if (val > 1 || val < 0) + err = -EINVAL; + else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + err = -EINVAL; + else + tp->fastopen_no_cookie = val; + break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; @@ -2905,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; - info->tcpi_fackets = tp->fackets_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); @@ -3104,6 +3175,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctx; + + if (get_user(len, optlen)) + return -EFAULT; + + rcu_read_lock(); + ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); + if (ctx) + memcpy(key, ctx->key, sizeof(key)); + else + len = 0; + rcu_read_unlock(); + + len = min_t(unsigned int, len, sizeof(key)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, key, len)) + return -EFAULT; + return 0; + } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; @@ -3166,6 +3259,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_connect; break; + case TCP_FASTOPEN_NO_COOKIE: + val = tp->fastopen_no_cookie; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; @@ -3531,13 +3628,13 @@ void __init tcp_init(void) max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_wshare); + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; + init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_rshare); + init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_rmem[1] = 87380; + init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 66ac69f7bd19..06fbe102a425 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -389,7 +389,7 @@ static void tcp_cdg_release(struct sock *sk) kfree(ca->gradients); } -struct tcp_congestion_ops tcp_cdg __read_mostly = { +static struct tcp_congestion_ops tcp_cdg __read_mostly = { .cong_avoid = tcp_cdg_cong_avoid, .cwnd_event = tcp_cdg_cwnd_event, .pkts_acked = tcp_cdg_acked, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2f26124fd160..bc6c02f16243 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) } /* Must be called with rcu lock held */ -static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, + const char *name) { - const struct tcp_congestion_ops *ca = tcp_ca_find(name); + struct tcp_congestion_ops *ca = tcp_ca_find(name); + #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); @@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (likely(try_module_get(ca->owner))) { - icsk->icsk_ca_ops = ca; - goto out; - } - /* Fallback to next available. The last really - * guaranteed fallback is Reno from this list. - */ - } -out: + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + if (unlikely(!try_module_get(ca->owner))) + ca = &tcp_reno; + icsk->icsk_ca_ops = ca; rcu_read_unlock(); - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else @@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk) } /* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) +int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; - int ret = -ENOENT; - - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_MODULES - if (!ca && capable(CAP_NET_ADMIN)) { - spin_unlock(&tcp_cong_list_lock); + const struct tcp_congestion_ops *prev; + int ret; - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif + rcu_read_lock(); + ca = tcp_ca_find_autoload(net, name); + if (!ca) { + ret = -ENOENT; + } else if (!try_module_get(ca->owner)) { + ret = -EBUSY; + } else { + prev = xchg(&net->ipv4.tcp_congestion_control, ca); + if (prev) + module_put(prev->owner); - if (ca) { - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ - list_move(&ca->list, &tcp_cong_list); + ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } - spin_unlock(&tcp_cong_list_lock); + rcu_read_unlock(); return ret; } @@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name) /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); + return tcp_set_default_congestion_control(&init_net, + CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); @@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) } /* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) +void tcp_get_default_congestion_control(struct net *net, char *name) { - struct tcp_congestion_ops *ca; - /* We will always have reno... */ - BUG_ON(list_empty(&tcp_cong_list)); + const struct tcp_congestion_ops *ca; rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo if (!load) ca = tcp_ca_find(name); else - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(sock_net(sk), name); + /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } + if (!ca) { err = -ENOENT; } else if (!load) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index fbbeda647774..78c192ee03a4 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -10,15 +10,18 @@ #include <net/inetpeer.h> #include <net/tcp.h> -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); - -void tcp_fastopen_init_key_once(bool publish) +void tcp_fastopen_init_key_once(struct net *net) { - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctxt; + + rcu_read_lock(); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctxt) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. @@ -26,8 +29,8 @@ void tcp_fastopen_init_key_once(bool publish) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ - if (net_get_random_once(key, sizeof(key)) && publish) - tcp_fastopen_reset_cipher(key, sizeof(key)); + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -38,10 +41,37 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } -int tcp_fastopen_reset_cipher(void *key, unsigned int len) +void tcp_fastopen_destroy_cipher(struct sock *sk) +{ + struct tcp_fastopen_context *ctx; + + ctx = rcu_dereference_protected( + inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1); + if (ctx) + call_rcu(&ctx->rcu, tcp_fastopen_ctx_free); +} + +void tcp_fastopen_ctx_destroy(struct net *net) +{ + struct tcp_fastopen_context *ctxt; + + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + + ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + + if (ctxt) + call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); +} + +int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, + void *key, unsigned int len) { - int err; struct tcp_fastopen_context *ctx, *octx; + struct fastopen_queue *q; + int err; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -62,26 +92,37 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(tcp_fastopen_ctx, - lockdep_is_held(&tcp_fastopen_ctx_lock)); - rcu_assign_pointer(tcp_fastopen_ctx, ctx); - spin_unlock(&tcp_fastopen_ctx_lock); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + if (sk) { + q = &inet_csk(sk)->icsk_accept_queue.fastopenq; + octx = rcu_dereference_protected(q->ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(q->ctx, ctx); + } else { + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + } + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(const void *path, +static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(tcp_fastopen_ctx); + + ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); + if (!ctx) + ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -97,7 +138,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. */ -static bool tcp_fastopen_cookie_gen(struct request_sock *req, +static bool tcp_fastopen_cookie_gen(struct sock *sk, + struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { @@ -105,7 +147,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(path, foc); + return __tcp_fastopen_cookie_gen(sk, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -113,13 +155,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(buf, foc); + return __tcp_fastopen_cookie_gen(sk, buf, foc); } } #endif @@ -217,12 +259,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_buffer_space(child); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; @@ -272,33 +309,45 @@ static bool tcp_fastopen_queue_check(struct sock *sk) return true; } +static bool tcp_fastopen_no_cookie(const struct sock *sk, + const struct dst_entry *dst, + int flag) +{ + return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) || + tcp_sk(sk)->fastopen_no_cookie || + (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); +} + /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst) { - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); - if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } - if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && + tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { @@ -331,6 +380,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { unsigned long last_syn_loss = 0; + const struct dst_entry *dst; int syn_loss = 0; tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); @@ -348,7 +398,9 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + dst = __sk_dst_get(sk); + + if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } @@ -402,25 +454,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * TFO connection with data exchanges. */ -/* Default to 1hr */ -unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; -static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); -static unsigned long tfo_active_disable_stamp __read_mostly; - /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { - atomic_inc(&tfo_active_disable_times); - tfo_active_disable_stamp = jiffies; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); -} + struct net *net = sock_net(sk); -/* Reset tfo_active_disable_times to 0 */ -void tcp_fastopen_active_timeout_reset(void) -{ - atomic_set(&tfo_active_disable_times, 0); + atomic_inc(&net->ipv4.tfo_active_disable_times); + net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable @@ -429,17 +472,18 @@ void tcp_fastopen_active_timeout_reset(void) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - int tfo_da_times = atomic_read(&tfo_active_disable_times); - int multiplier; + unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int multiplier; if (!tfo_da_times) return false; /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; - if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + timeout = multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) return true; /* Mark check bit so we can check for successful active TFO @@ -458,27 +502,25 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node *p; - struct sk_buff *skb; struct dst_entry *dst; + struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { - p = rb_first(&tp->out_of_order_queue); - if (p && !rb_next(p)) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); + if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; } } } else if (tp->syn_fastopen_ch && - atomic_read(&tfo_active_disable_times)) { + atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) - tcp_fastopen_active_timeout_reset(); + atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5a87a00641d3..f844c06c0676 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,25 +76,10 @@ #include <linux/ipsec.h> #include <asm/unaligned.h> #include <linux/errqueue.h> +#include <trace/events/tcp.h> +#include <linux/static_key.h> -int sysctl_tcp_fack __read_mostly; -int sysctl_tcp_max_reordering __read_mostly = 300; -int sysctl_tcp_dsack __read_mostly = 1; -int sysctl_tcp_app_win __read_mostly = 31; -int sysctl_tcp_adv_win_scale __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); - -/* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 1000; - -int sysctl_tcp_stdurg __read_mostly; -int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly = 2; -int sysctl_tcp_min_rtt_wlen __read_mostly = 300; -int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_early_retrans __read_mostly = 3; -int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -115,7 +100,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) -#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) @@ -335,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -368,8 +353,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(skb->truesize) >> 1; - int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; + int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; + int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -394,7 +379,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, skb->truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); @@ -420,11 +405,11 @@ static void tcp_fixup_rcvbuf(struct sock *sk) /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency * Allow enough cushion so that sender is not limited by our window */ - if (sysctl_tcp_moderate_rcvbuf) + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) - sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); + sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); } /* 4. Try to fixup all. It is made immediately after connection enters @@ -432,6 +417,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) */ void tcp_init_buffer_space(struct sock *sk) { + int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; struct tcp_sock *tp = tcp_sk(sk); int maxwin; @@ -450,14 +436,14 @@ void tcp_init_buffer_space(struct sock *sk) if (tp->window_clamp >= maxwin) { tp->window_clamp = maxwin; - if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) + if (tcp_app_win && maxwin > 4 * tp->advmss) tp->window_clamp = max(maxwin - - (maxwin >> sysctl_tcp_app_win), + (maxwin >> tcp_app_win), 4 * tp->advmss); } /* Force reservation of one segment. */ - if (sysctl_tcp_app_win && + if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); @@ -471,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); icsk->icsk_ack.quick = 0; - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + net->ipv4.sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -610,7 +597,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * <prev RTT . ><current RTT .. ><next RTT .... > */ - if (sysctl_tcp_moderate_rcvbuf && + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvwin, rcvmem, rcvbuf; @@ -634,10 +621,11 @@ void tcp_rcv_space_adjust(struct sock *sk) } rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < tp->advmss) + while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); + rcvbuf = min(rcvwin / tp->advmss * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; @@ -781,15 +769,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->srtt_us = max(1U, srtt); } -/* Set the sk_pacing_rate to allow proper sizing of TSO packets. - * Note: TCP stack does not yet implement pacing. - * FQ packet scheduler can be used to implement cheap but effective - * TCP pacing, to smooth the burst on large writes when packets - * in flight is significantly lower than cwnd (or rwin) - */ -int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; -int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; - static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -807,21 +786,21 @@ static void tcp_update_pacing_rate(struct sock *sk) * end of slow start and should slow down. */ if (tp->snd_cwnd < tp->snd_ssthresh / 2) - rate *= sysctl_tcp_pacing_ss_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; else - rate *= sysctl_tcp_pacing_ca_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; rate *= max(tp->snd_cwnd, tp->packets_out); if (likely(tp->srtt_us)) do_div(rate, tp->srtt_us); - /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate + /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store * intermediate values in this location. */ - ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, - sk->sk_max_pacing_rate); + WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, + sk->sk_max_pacing_rate)); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -863,60 +842,46 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -/* - * Packet counting of FACK is based on in-order assumptions, therefore TCP - * disables it when reordering is detected - */ -void tcp_disable_fack(struct tcp_sock *tp) -{ - /* RFC3517 uses different metric in lost marker => reset on change */ - if (tcp_is_fack(tp)) - tp->lost_skb_hint = NULL; - tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; -} - /* Take a notice that peer is sending D-SACKs */ static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; + tp->rack.dsack_seen = 1; } -static void tcp_update_reordering(struct sock *sk, const int metric, - const int ts) +/* It's reordering when higher sequence was delivered (i.e. sacked) before + * some lower never-retransmitted sequence ("low_seq"). The maximum reordering + * distance is approximated in full-mss packet distance ("reordering"). + */ +static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, + const int ts) { struct tcp_sock *tp = tcp_sk(sk); - int mib_idx; + const u32 mss = tp->mss_cache; + u32 fack, metric; - if (WARN_ON_ONCE(metric < 0)) + fack = tcp_highest_sack_seq(tp); + if (!before(low_seq, fack)) return; - if (metric > tp->reordering) { - tp->reordering = min(sysctl_tcp_max_reordering, metric); - + metric = fack - low_seq; + if ((metric > tp->reordering * mss) && mss) { #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, - tp->fackets_out, + 0, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - tcp_disable_fack(tp); + tp->reordering = min_t(u32, (metric + mss - 1) / mss, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } tp->rack.reord = 1; - /* This exciting event is worth to be remembered. 8) */ - if (ts) - mib_idx = LINUX_MIB_TCPTSREORDER; - else if (tcp_is_reno(tp)) - mib_idx = LINUX_MIB_TCPRENOREORDER; - else if (tcp_is_fack(tp)) - mib_idx = LINUX_MIB_TCPFACKREORDER; - else - mib_idx = LINUX_MIB_TCPSACKREORDER; - - NET_INC_STATS(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), + ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } /* This must be called before lost_out is incremented */ @@ -990,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. - * A''. Its FACK modification, head until snd.fack is lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. @@ -1133,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; + u32 reord; /* Timestamps for earliest and latest never-retransmitted segment * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. @@ -1143,6 +1106,7 @@ struct tcp_sacktag_state { u64 last_sackt; struct rate_sample *rate; int flag; + unsigned int mss_now; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1192,7 +1156,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, if (pkt_len >= skb->len && !in_sack) return 0; - err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); + err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1208,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); - int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; - if (sacked & TCPCB_SACKED_ACKED) - state->reord = min(fack_count, state->reord); + if ((sacked & TCPCB_SACKED_ACKED) && + before(start_seq, state->reord)) + state->reord = start_seq; } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ @@ -1242,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk, * which was in hole. It is reordering. */ if (before(start_seq, - tcp_highest_sack_seq(tp))) - state->reord = min(fack_count, - state->reord); + tcp_highest_sack_seq(tp)) && + before(start_seq, state->reord)) + state->reord = start_seq; + if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; if (state->first_sackt == 0) @@ -1263,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; tp->delivered += pcount; /* Out-of-order packets delivered */ - fack_count += pcount; - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ - if (!tcp_is_fack(tp) && tp->lost_skb_hint && + if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; - - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1289,13 +1250,13 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ -static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, +static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *prev = tcp_write_queue_prev(sk, skb); u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ @@ -1364,8 +1325,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); @@ -1415,9 +1375,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto fallback; /* Can only happen with delayed DSACK + discard craziness */ - if (unlikely(skb == tcp_write_queue_head(sk))) + prev = skb_rb_prev(skb); + if (!prev) goto fallback; - prev = tcp_write_queue_prev(sk, skb); if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; @@ -1496,18 +1456,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_shift(prev, skb, len)) goto fallback; - if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) + if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ - if (prev == tcp_write_queue_tail(sk)) + skb = skb_rb_next(prev); + if (!skb) goto out; - skb = tcp_write_queue_next(sk, prev); if (!skb_can_shift(skb) || - (skb == tcp_send_head(sk)) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; @@ -1515,11 +1474,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, len = skb->len; if (skb_shift(prev, skb, len)) { pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), + len, mss, 0); } out: - state->fack_count += pcount; return prev; noop: @@ -1539,13 +1498,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; - if (skb == tcp_send_head(sk)) - break; - /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; @@ -1594,34 +1550,48 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_skb_pcount(skb), skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } - - state->fack_count += tcp_skb_pcount(skb); } return skb; } -/* Avoid all extra work that is being done by sacktag while walking in - * a normal way - */ +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, + struct tcp_sacktag_state *state, + u32 seq) +{ + struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; + struct sk_buff *skb; + + while (*p) { + parent = *p; + skb = rb_to_skb(parent); + if (before(seq, TCP_SKB_CB(skb)->seq)) { + p = &parent->rb_left; + continue; + } + if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { + p = &parent->rb_right; + continue; + } + return skb; + } + return NULL; +} + static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, struct tcp_sacktag_state *state, u32 skip_to_seq) { - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) - break; + if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) + return skb; - state->fack_count += tcp_skb_pcount(skb); - } - return skb; + return tcp_sacktag_bsearch(sk, state, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, @@ -1666,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int first_sack_index; state->flag = 0; - state->reord = tp->packets_out; + state->reord = tp->snd_nxt; - if (!tp->sacked_out) { - if (WARN_ON(tp->fackets_out)) - tp->fackets_out = 0; + if (!tp->sacked_out) tcp_highest_sack_reset(sk); - } found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); @@ -1743,8 +1710,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } } - skb = tcp_write_queue_head(sk); - state->fack_count = 0; + state->mss_now = tcp_current_mss(sk); + skb = NULL; i = 0; if (!tp->sacked_out) { @@ -1801,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; cache++; goto walk; } @@ -1816,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; } skb = tcp_sacktag_skip(skb, sk, state, start_seq); @@ -1836,9 +1801,8 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state->reord < tp->fackets_out) && - ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker) + tcp_check_sack_reordering(sk, state->reord, 0); tcp_verify_left_out(tp); out: @@ -1876,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_limit_reno_sacked(tp)) - tcp_update_reordering(sk, tp->packets_out + addend, 0); + + if (!tcp_limit_reno_sacked(tp)) + return; + + tp->reordering = min_t(u32, tp->packets_out + addend, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ @@ -1923,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = -1; - tp->fackets_out = 0; tp->sacked_out = 0; } @@ -1968,19 +1936,15 @@ void tcp_enter_loss(struct sock *sk) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; - tp->fackets_out = 0; } tcp_clear_all_retrans_hints(tp); - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - + skb_rbtree_walk_from(skb) { mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || is_reneg); if (mark_lost) @@ -2014,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk) * falsely raise the receive window, which results in repeated * timeouts and stop-and-go behavior. */ - tp->frto = sysctl_tcp_frto && + tp->frto = net->ipv4.sysctl_tcp_frto && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } @@ -2043,19 +2007,10 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag) return false; } -static inline int tcp_fackets_out(const struct tcp_sock *tp) -{ - return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; -} - /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). * - * Instead, with FACK TCP uses fackets_out that includes both SACKed - * segments up to the highest received SACK block so far and holes in - * between them. - * * With reordering, holes may still be in flight, so RFC3517 recovery * uses pure sacked_out (total number of SACKed segments) even though * it violates the RFC that uses duplicate ACKs, often these are equal @@ -2065,10 +2020,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) */ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) { - return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; + return tp->sacked_out + 1; } -/* Linux NewReno/SACK/FACK/ECN state machine. +/* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * * "Open" Normal state, no dubious events, fast path. @@ -2133,16 +2088,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * dynamically measured and adjusted. This is implemented in * tcp_rack_mark_lost. * - * FACK (Disabled by default. Subsumbed by RACK): - * It is the simplest heuristics. As soon as we decided - * that something is lost, we decide that _all_ not SACKed - * packets until the most forward SACK are lost. I.e. - * lost_out = fackets_out - sacked_out and left_out = fackets_out. - * It is absolutely correct estimate, if network does not reorder - * packets. And it loses any connection to reality when reordering - * takes place. We use FACK by default until reordering - * is suspected on the path to this destination. - * * If the receiver does not support SACK: * * NewReno (RFC6582): in Recovery we assume that one segment @@ -2191,7 +2136,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * For non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. @@ -2206,20 +2151,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; WARN_ON(packets > tp->packets_out); - if (tp->lost_skb_hint) { - skb = tp->lost_skb_hint; - cnt = tp->lost_cnt_hint; + skb = tp->lost_skb_hint; + if (skb) { /* Head already handled? */ - if (mark_head && skb != tcp_write_queue_head(sk)) + if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) return; + cnt = tp->lost_cnt_hint; } else { - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); cnt = 0; } - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk_from(skb) { /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -2229,12 +2172,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; oldcnt = cnt; - if (tcp_is_fack(tp) || tcp_is_reno(tp) || + if (tcp_is_reno(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) cnt += tcp_skb_pcount(skb); if (cnt > packets) { - if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + if (tcp_is_sack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; @@ -2243,7 +2186,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) /* If needed, chop off the prefix to mark as lost. */ lost = (packets - oldcnt) * mss; if (lost < skb->len && - tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) + tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } @@ -2264,11 +2208,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) if (tcp_is_reno(tp)) { tcp_mark_head_lost(sk, 1, 1); - } else if (tcp_is_fack(tp)) { - int lost = tp->fackets_out - tp->reordering; - if (lost <= 0) - lost = 1; - tcp_mark_head_lost(sk, lost, 0); } else { int sacked_upto = tp->sacked_out - tp->reordering; if (sacked_upto >= 0) @@ -2327,16 +2266,16 @@ static bool tcp_any_retrans_done(const struct sock *sk) if (tp->retrans_out) return true; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) return true; return false; } -#if FASTRETRANS_DEBUG > 1 static void DBGUNDO(struct sock *sk, const char *msg) { +#if FASTRETRANS_DEBUG > 1 struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -2358,10 +2297,8 @@ static void DBGUNDO(struct sock *sk, const char *msg) tp->packets_out); } #endif -} -#else -#define DBGUNDO(x...) do { } while (0) #endif +} static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { @@ -2370,9 +2307,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (unmark_loss) { struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tp->lost_out = 0; @@ -2417,6 +2352,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS(sock_net(sk), mib_idx); + } else if (tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_persist--; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2436,6 +2373,8 @@ static bool tcp_try_undo_dsack(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { + tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, + tp->rack.reo_wnd_persist + 1); DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); @@ -2615,11 +2554,8 @@ void tcp_simple_retransmit(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk); - u32 prior_lost = tp->lost_out; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss && !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { @@ -2632,7 +2568,7 @@ void tcp_simple_retransmit(struct sock *sk) tcp_clear_retrans_hints_partial(tp); - if (prior_lost == tp->lost_out) + if (!tp->lost_out) return; if (tcp_is_reno(tp)) @@ -2713,7 +2649,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, * is updated in tcp_ack()). Otherwise fall back to * the conventional recovery. */ - if (tcp_send_head(sk) && + if (!tcp_write_queue_empty(sk) && after(tcp_wnd_end(tp), tp->snd_nxt)) { *rexmit = REXMIT_NEW; return; @@ -2740,15 +2676,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && tcp_packet_delayed(tp)) { /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. + * packet, rather than with a retransmit. Check reordering. */ - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + tcp_check_sack_reordering(sk, prior_snd_una, 1); /* We are getting evidence that the reordering degree is higher * than we realized. If there are no retransmits out then we @@ -2775,7 +2711,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) struct tcp_sock *tp = tcp_sk(sk); /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; tcp_rack_mark_lost(sk); @@ -2784,6 +2720,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) } } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2796,19 +2740,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void tcp_fastretrans_alert(struct sock *sk, const int acked, +static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - (tcp_fackets_out(tp) > tp->reordering)); + tcp_force_fast_retransmit(sk)); - if (WARN_ON(!tp->packets_out && tp->sacked_out)) + if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) - tp->fackets_out = 0; /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ @@ -2855,11 +2797,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked)) + if (tcp_try_undo_partial(sk, prior_snd_una)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || - tcp_fackets_out(tp) > tp->reordering; + tcp_force_fast_retransmit(sk); } if (tcp_try_undo_dsack(sk)) { tcp_try_keep_open(sk); @@ -2874,6 +2816,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ + /* fall through */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -2914,8 +2857,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { + u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; struct tcp_sock *tp = tcp_sk(sk); - u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); @@ -3057,28 +3000,31 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && - before(shinfo->tskey, tcp_sk(sk)->snd_una)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + before(shinfo->tskey, tcp_sk(sk)->snd_una)) { + tcp_skb_tsorted_save(skb) { + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + } tcp_skb_tsorted_restore(skb); + } } /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, int *acked, +static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, + u32 prior_snd_una, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; - u32 reord = tp->packets_out; + u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ + struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; long seq_rtt_us = -1L; long ca_rtt_us = -1L; - struct sk_buff *skb; u32 pkts_acked = 0; u32 last_in_flight = 0; bool rtt_update; @@ -3086,8 +3032,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = 0; - while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + const u32 start_seq = scb->seq; u8 sacked = scb->sacked; u32 acked_pcount; @@ -3104,8 +3051,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, break; fully_acked = false; } else { - /* Speedup tcp_unlink_write_queue() and next loop */ - prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } @@ -3120,7 +3065,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = last_ackt; last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; - reord = min(pkts_acked, reord); + if (before(start_seq, reord)) + reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; } @@ -3157,12 +3103,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!fully_acked) break; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_rtx_queue_unlink_and_free(skb, sk); } if (!skb) @@ -3198,16 +3144,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets && reord <= tp->fackets_out) - tcp_update_reordering(sk, tp->fackets_out - reord, 0); + if (before(reord, prior_fack)) + tcp_check_sack_reordering(sk, reord, 0); - delta = tcp_is_fack(tp) ? pkts_acked : - prior_sacked - tp->sacked_out; + delta = prior_sacked - tp->sacked_out; tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } - - tp->fackets_out -= min(pkts_acked, tp->fackets_out); - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3248,18 +3190,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif - *acked = pkts_acked; return flag; } static void tcp_ack_probe(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *head = tcp_send_head(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Was it a usable window open? */ - - if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + if (!head) + return; + if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). @@ -3379,7 +3322,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); - if (tcp_send_head(sk)) + if (!tcp_write_queue_empty(sk)) tcp_slow_start_after_idle_check(sk); if (nwin > tp->max_window) { @@ -3400,7 +3343,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, if (*last_oow_ack_time) { s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } @@ -3436,10 +3379,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); u32 count, now; /* First check our per-socket dupack rate limit. */ - if (__tcp_oow_rate_limited(sock_net(sk), + if (__tcp_oow_rate_limited(net, LINUX_MIB_TCPACKSKIPPEDCHALLENGE, &tp->last_oow_ack_time)) return; @@ -3447,16 +3391,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != challenge_timestamp) { - u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; + u32 half = (ack_limit + 1) >> 1; challenge_timestamp = now; - WRITE_ONCE(challenge_count, half + - prandom_u32_max(sysctl_tcp_challenge_ack_limit)); + WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); } count = READ_ONCE(challenge_count); if (count > 0) { WRITE_ONCE(challenge_count, count - 1); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } } @@ -3554,18 +3498,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_fackets; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; - int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; - /* We very likely will need to access write queue head. */ - prefetchw(sk->sk_write_queue.next); + /* We very likely will need to access rtx queue. */ + prefetch(sk->tcp_rtx_queue.rb_node); /* If the ack is older than previous acks * then we can probably ignore it. @@ -3591,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; } - prior_fackets = tp->fackets_out; + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet @@ -3647,8 +3590,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); + + tcp_rack_update_reo_wnd(sk, &rs); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3658,7 +3602,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) @@ -3674,13 +3619,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (tcp_send_head(sk)) - tcp_ack_probe(sk); + tcp_ack_probe(sk); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3697,7 +3642,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); tcp_xmit_recovery(sk, rexmit); } @@ -3722,6 +3668,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } +static void smc_parse_options(const struct tcphdr *th, + struct tcp_options_received *opt_rx, + const unsigned char *ptr, + int opsize) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (th->syn && !(opsize & 1) && + opsize >= TCPOLEN_EXP_SMC_BASE && + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + opt_rx->smc_ok = 1; + } +#endif +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3829,6 +3790,9 @@ void tcp_parse_options(const struct net *net, tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); + else + smc_parse_options(th, opt_rx, ptr, + opsize); break; } @@ -3996,6 +3960,8 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ void tcp_reset(struct sock *sk) { + trace_tcp_receive_reset(sk); + /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: @@ -4118,7 +4084,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4153,7 +4119,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) @@ -4269,11 +4235,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } -enum tcp_queue { - OOO_QUEUE, - RCV_QUEUE, -}; - /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket @@ -4289,7 +4250,6 @@ enum tcp_queue { * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, - enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4314,10 +4274,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; - if (dest == OOO_QUEUE) - TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; - else - to->tstamp = from->tstamp; + to->tstamp = from->tstamp; } return true; @@ -4342,7 +4299,7 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_first(&tp->out_of_order_queue); while (p) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4354,9 +4311,6 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); - /* Replace tstamp which was stomped by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4368,8 +4322,7 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, - tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4410,7 +4363,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node **p, *q, *parent; + struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; @@ -4423,10 +4376,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Stash tstamp to avoid being stomped on by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - TCP_SKB_CB(skb)->swtstamp = skb->tstamp; - /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -4454,7 +4403,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); @@ -4473,7 +4422,7 @@ coalesce_done: parent = NULL; while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; @@ -4505,7 +4454,7 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } @@ -4518,9 +4467,7 @@ insert: merge_right: /* Remove other segments covered by skb. */ - while ((q = rb_next(&skb->rbnode)) != NULL) { - skb1 = rb_entry(q, struct sk_buff, rbnode); - + while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4535,7 +4482,7 @@ merge_right: tcp_drop(sk, skb1); } /* If there is no skb after us, we are the last_skb ! */ - if (!q) + if (!skb1) tp->ooo_last_skb = skb; add_sack: @@ -4557,7 +4504,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, RCV_QUEUE, tail, + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { @@ -4721,7 +4668,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li if (list) return !skb_queue_is_last(list, skb) ? skb->next : NULL; - return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); + return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, @@ -4742,7 +4689,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -4750,7 +4697,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else @@ -4797,7 +4744,7 @@ restart: * overlaps to the next one. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && - (tcp_win_from_space(skb->truesize) > skb->len || + (tcp_win_from_space(sk, skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; break; @@ -4869,26 +4816,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; - struct rb_node *p; u32 start, end; - p = rb_first(&tp->out_of_order_queue); - skb = rb_entry_safe(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { - p = rb_last(&tp->out_of_order_queue); - /* Note: This is possible p is NULL here. We do not - * use rb_entry_safe(), as ooo_last_skb is valid only - * if rbtree is not empty. - */ - tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; for (head = skb;;) { - skb = tcp_skb_next(skb, NULL); + skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. @@ -4931,14 +4871,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk) do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); + tcp_drop(sk, rb_to_skb(node)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; node = prev; } while (node); - tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + tp->ooo_last_skb = rb_to_skb(prev); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection @@ -5113,7 +5053,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); - if (ptr && !sysctl_tcp_stdurg) + if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) ptr--; ptr += ntohl(th->seq); @@ -5533,20 +5473,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) security_inet_conn_established(sk, skb); } - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ tp->lsndtime = tcp_jiffies32; - tcp_init_buffer_space(sk); - if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -5560,7 +5493,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; + struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; @@ -5595,9 +5528,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ - tcp_for_write_queue_from(data, sk) { - if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data, 1)) + skb_rbtree_walk_from(data) { + if (__tcp_retransmit_skb(sk, data, 1)) break; } tcp_rearm_rto(sk); @@ -5615,6 +5547,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, return false; } +static void smc_check_reset_syn(struct tcp_sock *tp) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && !tp->rx_opt.smc_ok) + tp->syn_smc = 0; + } +#endif +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5710,10 +5652,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tcp_is_sack(tp) && sysctl_tcp_fack) - tcp_enable_fack(tp); - - tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -5722,6 +5660,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. */ tp->copied_seq = tp->rcv_nxt; + smc_check_reset_syn(tp); + smp_mb(); tcp_finish_connect(sk, skb); @@ -5939,15 +5879,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (req) { inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); } else { - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); - - tcp_mtup_init(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; - tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5967,19 +5910,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (req) { - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. - */ - tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); @@ -6076,6 +6006,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; + /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, @@ -6184,6 +6115,9 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); +#if IS_ENABLED(CONFIG_SMC) + ireq->smc_ok = rx_opt->smc_ok; +#endif } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, @@ -6196,7 +6130,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, if (req) { struct inet_request_sock *ireq = inet_rsk(req); - kmemcheck_annotate_bitfield(ireq, flags); ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; @@ -6359,7 +6292,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init_rwin(req, sk, dst); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); - fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5b027c69cbc5..c6bc0c4d19c6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,6 +85,8 @@ #include <crypto/hash.h> #include <linux/scatterlist.h> +#include <trace/events/tcp.h> + #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -480,7 +482,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); BUG_ON(!skb); tcp_mstamp_refresh(tp); @@ -701,8 +703,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) + if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); @@ -1783,8 +1787,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v4_timewait_ack(sk, skb); break; @@ -1864,6 +1869,8 @@ void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + trace_tcp_destroy_sock(sk); + tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); @@ -1896,6 +1903,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_fastopen_destroy_cipher(sk); tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); @@ -2401,8 +2409,8 @@ struct proto tcp_prot = { .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -2422,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; + module_put(net->ipv4.tcp_congestion_control->owner); + for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); free_percpu(net->ipv4.tcp_sk); @@ -2476,6 +2486,50 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_early_retrans = 3; + net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; + net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ + net->ipv4.sysctl_tcp_retrans_collapse = 1; + net->ipv4.sysctl_tcp_max_reordering = 300; + net->ipv4.sysctl_tcp_dsack = 1; + net->ipv4.sysctl_tcp_app_win = 31; + net->ipv4.sysctl_tcp_adv_win_scale = 1; + net->ipv4.sysctl_tcp_frto = 2; + net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; + /* This limits the percentage of the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. + */ + net->ipv4.sysctl_tcp_tso_win_divisor = 3; + /* Default TSQ limit of four TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* rfc5961 challenge ack rate limiting */ + net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; + net->ipv4.sysctl_tcp_min_tso_segs = 2; + net->ipv4.sysctl_tcp_min_rtt_wlen = 300; + net->ipv4.sysctl_tcp_autocorking = 1; + net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; + net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; + net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; + if (net != &init_net) { + memcpy(net->ipv4.sysctl_tcp_rmem, + init_net.ipv4.sysctl_tcp_rmem, + sizeof(init_net.ipv4.sysctl_tcp_rmem)); + memcpy(net->ipv4.sysctl_tcp_wmem, + init_net.ipv4.sysctl_tcp_wmem, + sizeof(init_net.ipv4.sysctl_tcp_wmem)); + } + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); + + /* Reno is always built in */ + if (!net_eq(net, &init_net) && + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; + else + net->ipv4.tcp_congestion_control = &tcp_reno; return 0; fail: @@ -2486,7 +2540,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0f0d740f6c8b..7097f92d16e5 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -21,8 +21,6 @@ #include <net/tcp.h> #include <net/genetlink.h> -int sysctl_tcp_nometrics_save __read_mostly; - static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash); @@ -331,7 +329,7 @@ void tcp_update_metrics(struct sock *sk) int m; sk_dst_confirm(sk); - if (sysctl_tcp_nometrics_save || !dst) + if (net->ipv4.sysctl_tcp_nometrics_save || !dst) return; rcu_read_lock(); @@ -472,10 +470,8 @@ void tcp_init_metrics(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); - if (val && tp->reordering != val) { - tcp_disable_fack(tp); + if (val && tp->reordering != val) tp->reordering = val; - } crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); @@ -893,10 +889,14 @@ static void tcp_metrics_flush_all(struct net *net) for (row = 0; row < max_rows; row++, hb++) { struct tcp_metrics_block __rcu **pp; + bool match; + spin_lock_bh(&tcp_metrics_lock); pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { - if (net_eq(tm_net(tm), net)) { + match = net ? net_eq(tm_net(tm), net) : + !atomic_read(&tm_net(tm)->count); + if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); } else { @@ -1019,14 +1019,14 @@ static int __net_init tcp_net_metrics_init(struct net *net) return 0; } -static void __net_exit tcp_net_metrics_exit(struct net *net) +static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) { - tcp_metrics_flush_all(net); + tcp_metrics_flush_all(NULL); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { - .init = tcp_net_metrics_init, - .exit = tcp_net_metrics_exit, + .init = tcp_net_metrics_init, + .exit_batch = tcp_net_metrics_exit_batch, }; void __init tcp_metrics_init(void) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..e36eff0403f4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -23,13 +23,12 @@ #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/workqueue.h> +#include <linux/static_key.h> #include <net/tcp.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/busy_poll.h> -int sysctl_tcp_abort_on_overflow __read_mostly; - static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -180,7 +179,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ - if (sysctl_tcp_rfc1337 == 0) { + if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; @@ -298,8 +297,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()) - BUG(); + BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); } } while (0); #endif @@ -371,7 +369,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(full_space, + tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, @@ -417,6 +415,21 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +static void smc_check_reset_syn_req(struct tcp_sock *oldtp, + struct request_sock *req, + struct tcp_sock *newtp) +{ +#if IS_ENABLED(CONFIG_SMC) + struct inet_request_sock *ireq; + + if (static_branch_unlikely(&tcp_have_smc)) { + ireq = inet_rsk(req); + if (oldtp->syn_smc && !ireq->smc_ok) + newtp->syn_smc = 0; + } +#endif +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -434,6 +447,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; @@ -446,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); @@ -458,7 +475,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->packets_out = 0; newtp->retrans_out = 0; newtp->sacked_out = 0; - newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; newtp->lsndtime = tcp_jiffies32; @@ -492,10 +508,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sysctl_tcp_fack) - tcp_enable_fack(newtp); - } + newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; @@ -534,6 +547,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->syn_data_acked = 0; newtp->rack.mstamp = 0; newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); } @@ -764,7 +781,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: - if (!sysctl_tcp_abort_on_overflow) { + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; } diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 125fc1450b01..0b5a05bd82e3 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -39,7 +39,7 @@ * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8 * nv_rtt_factor RTT averaging factor - * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur + * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping @@ -61,7 +61,7 @@ static int nv_min_cwnd __read_mostly = 2; static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ -static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ +static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */ static int nv_cwnd_growth_rate_neg __read_mostly = 8; static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ static int nv_dec_eval_min_calls __read_mostly = 60; @@ -101,6 +101,11 @@ struct tcpnv { u32 nv_last_rtt; /* last rtt */ u32 nv_min_rtt; /* active min rtt. Used to determine slope */ u32 nv_min_rtt_new; /* min rtt for future use */ + u32 nv_base_rtt; /* If non-zero it represents the threshold for + * congestion */ + u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is + * set to 80% of nv_base_rtt. It helps reduce + * unfairness between flows */ u32 nv_rtt_max_rate; /* max rate seen during current RTT */ u32 nv_rtt_start_seq; /* current RTT ends when packet arrives * acking beyond nv_rtt_start_seq */ @@ -132,9 +137,24 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) static void tcpnv_init(struct sock *sk) { struct tcpnv *ca = inet_csk_ca(sk); + int base_rtt; tcpnv_reset(ca, sk); + /* See if base_rtt is available from socket_ops bpf program. + * It is meant to be used in environments, such as communication + * within a datacenter, where we have reasonable estimates of + * RTTs + */ + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + if (base_rtt > 0) { + ca->nv_base_rtt = base_rtt; + ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ + } else { + ca->nv_base_rtt = 0; + ca->nv_lower_bound_rtt = 0; + } + ca->nv_allow_cwnd_growth = 1; ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; ca->nv_min_rtt = NV_INIT_RTT; @@ -144,6 +164,19 @@ static void tcpnv_init(struct sock *sk) ca->cwnd_growth_factor = 0; } +/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt) + * bounds to RTT. + */ +inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val) +{ + if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt) + return ca->nv_lower_bound_rtt; + else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt) + return ca->nv_base_rtt; + else + return val; +} + static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -209,7 +242,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) struct tcp_sock *tp = tcp_sk(sk); struct tcpnv *ca = inet_csk_ca(sk); unsigned long now = jiffies; - s64 rate64 = 0; + u64 rate64; u32 rate, max_win, cwnd_by_slope; u32 avg_rtt; u32 bytes_acked = 0; @@ -251,8 +284,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) } /* rate in 100's bits per second */ - rate64 = ((u64)sample->in_flight) * 8000000; - rate = (u32)div64_u64(rate64, (u64)(avg_rtt ?: 1) * 100); + rate64 = ((u64)sample->in_flight) * 80000; + do_div(rate64, avg_rtt ?: 1); + rate = (u32)rate64; /* Remember the maximum rate seen during this RTT * Note: It may be more than one RTT. This function should be @@ -265,6 +299,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) if (ca->nv_eval_call_cnt < 255) ca->nv_eval_call_cnt++; + /* Apply bounds to rtt. Only used to update min_rtt */ + avg_rtt = nv_get_bounded_rtt(ca, avg_rtt); + /* update min rtt if necessary */ if (avg_rtt < ca->nv_min_rtt) ca->nv_min_rtt = avg_rtt; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 11f69bbf9307..b6a2aa1dcf56 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -149,11 +149,19 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, * is freed by GSO engine */ if (copy_destructor) { + int delta; + swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); sum_truesize += skb->truesize; - refcount_add(sum_truesize - gso_skb->truesize, - &skb->sk->sk_wmem_alloc); + delta = sum_truesize - gso_skb->truesize; + /* In some pathological cases, delta can be negative. + * We need to either use refcount_add() or refcount_sub_and_test() + */ + if (likely(delta >= 0)) + refcount_add(delta, &skb->sk->sk_wmem_alloc); + else + WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc)); } delta = htonl(oldlen + (skb_tail_pointer(skb) - diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 478909f4694d..540b7d92cc70 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,40 +41,25 @@ #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> +#include <linux/static_key.h> -/* People can turn this off for buggy TCP's found in printers etc. */ -int sysctl_tcp_retrans_collapse __read_mostly = 1; - -/* People can turn this on to work with those rare, broken TCPs that - * interpret the window field as a signed quantity. - */ -int sysctl_tcp_workaround_signed_windows __read_mostly = 0; - -/* Default TSQ limit of four TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 262144; - -/* This limits the percentage of the congestion window which we - * will allow a single TSO frame to consume. Building TSO frames - * which are too large can cause TCP streams to be bursty. - */ -int sysctl_tcp_tso_win_divisor __read_mostly = 3; - -/* By default, RFC2861 behavior. */ -int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +#include <trace/events/tcp.h> static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_write_queue); + tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -203,7 +188,7 @@ u32 tcp_default_init_rwnd(u32 mss) * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ -void tcp_select_initial_window(int __space, __u32 mss, +void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) @@ -227,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ - if (sysctl_tcp_workaround_signed_windows) + if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; @@ -235,7 +220,7 @@ void tcp_select_initial_window(int __space, __u32 mss, (*rcv_wscale) = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { @@ -287,7 +272,8 @@ static u16 tcp_select_window(struct sock *sk) /* Make sure we do not exceed the maximum possible * scaled window. */ - if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) + if (!tp->rx_opt.rcv_wscale && + sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -395,7 +381,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum = 0; TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; @@ -418,6 +403,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) +#define OPTION_SMC (1 << 9) + +static void smc_options_write(__be32 *ptr, u16 *options) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (unlikely(OPTION_SMC & *options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } + } +#endif +} struct tcp_out_options { u16 options; /* bit field of OPTION_* */ @@ -536,6 +537,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } ptr += (len + 3) >> 2; } + + smc_options_write(ptr, &options); +} + +static void smc_set_option(const struct tcp_sock *tp, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif +} + +static void smc_set_option_cond(const struct tcp_sock *tp, + const struct inet_request_sock *ireq, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && ireq->smc_ok) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif } /* Compute TCP options for SYN packets. This is not the final @@ -603,11 +639,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + smc_set_option(tp, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct request_sock *req, +static unsigned int tcp_synack_options(const struct sock *sk, + struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, @@ -663,6 +702,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, } } + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -973,6 +1014,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) HRTIMER_MODE_ABS_PINNED); } +static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->skb_mstamp = tp->tcp_mstamp; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1005,10 +1052,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else - skb = skb_clone(skb, gfp_mask); + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + if (unlikely(!skb)) return -ENOBUFS; } @@ -1129,7 +1180,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = net_xmit_eval(err); } if (!err && oskb) { - oskb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1167,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) } } -/* When a modification to fackets out becomes necessary, we need to check - * skb is counted to fackets_out or not. - */ -static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, - int decr) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->sacked_out || tcp_is_reno(tp)) - return; - - if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) - tp->fackets_out -= decr; -} - /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ @@ -1202,11 +1238,9 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); - tcp_adjust_fackets_out(sk, skb, decr); - if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); @@ -1241,12 +1275,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->eor = 0; } +/* Insert buff after skb on the write or rtx queue of sk. */ +static void tcp_insert_write_queue_after(struct sk_buff *skb, + struct sk_buff *buff, + struct sock *sk, + enum tcp_queue tcp_queue) +{ + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + __skb_queue_after(&sk->sk_write_queue, skb, buff); + else + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); @@ -1329,7 +1376,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); + if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } @@ -1607,7 +1656,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if (sysctl_tcp_slow_start_after_idle && + if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); @@ -1616,10 +1665,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) - * 3) no more data to send (null tcp_send_head ) + * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ - if (!tcp_send_head(sk) && sk->sk_socket && + if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); @@ -1671,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, { u32 bytes, segs; - bytes = min(sk->sk_pacing_rate >> 10, + bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); /* Goal is to send at least one packet per ms, @@ -1694,7 +1743,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; return tso_segs ? : - tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); + tcp_tso_autosize(sk, mss_now, + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); } /* Returns the portion of skb which can be sent right away */ @@ -1815,7 +1865,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { struct sk_buff *buff; @@ -1824,7 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now, gfp); + return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) @@ -1860,7 +1911,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); return 0; } @@ -1910,7 +1961,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); @@ -1930,8 +1981,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, goto send_now; } - head = tcp_write_queue_head(sk); - + /* TODO : use tsorted_sent_queue ? */ + head = tcp_rtx_queue_head(sk); + if (!head) + goto send_now; age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) @@ -2145,18 +2198,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, { unsigned int limit; - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); - limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); + limit = min_t(u32, limit, + sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { - /* Always send the 1st or 2nd skb in write queue. + /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ - if (skb == sk->sk_write_queue.next || - skb->prev == sk->sk_write_queue.next) + if (tcp_rtx_queue_empty(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); @@ -2207,7 +2260,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ - if (tcp_write_queue_empty(sk)) + if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); @@ -2263,7 +2316,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); goto repair; /* Skip network transmission */ } @@ -2302,7 +2355,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, limit, mss_now, gfp))) break; if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) @@ -2342,7 +2396,7 @@ repair: tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return !tp->packets_out && tcp_send_head(sk); + return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2350,6 +2404,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, rto_delta_us; + int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. @@ -2357,16 +2412,17 @@ bool tcp_schedule_loss_probe(struct sock *sk) if (tp->fastopen_rsk) return false; + early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || icsk->icsk_ca_state != TCP_CA_Open) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - tcp_send_head(sk)) + !tcp_write_queue_empty(sk)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account @@ -2419,18 +2475,14 @@ void tcp_send_loss_probe(struct sock *sk) int mss = tcp_current_mss(sk); skb = tcp_send_head(sk); - if (skb) { - if (tcp_snd_wnd_test(tp, skb, mss)) { - pcount = tp->packets_out; - tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - if (tp->packets_out > pcount) - goto probe_sent; - goto rearm_timer; - } - skb = tcp_write_queue_prev(sk, skb); - } else { - skb = tcp_write_queue_tail(sk); + if (skb && tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; } + skb = skb_rb_last(&sk->tcp_rtx_queue); /* At most one outstanding TLP retransmission. */ if (tp->tlp_high_seq) @@ -2448,10 +2500,11 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_next(sk, skb); + skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) @@ -2651,7 +2704,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + struct sk_buff *next_skb = skb_rb_next(skb); int skb_size, next_skb_size; skb_size = skb->len; @@ -2668,8 +2721,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } tcp_highest_sack_replace(sk, next_skb, skb); - tcp_unlink_write_queue(next_skb, sk); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2697,7 +2748,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); - sk_wmem_free_skb(sk, next_skb); + tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } @@ -2708,8 +2759,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) return false; if (skb_cloned(skb)) return false; - if (skb == tcp_send_head(sk)) - return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -2727,12 +2776,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, struct sk_buff *skb = to, *tmp; bool first = true; - if (!sysctl_tcp_retrans_collapse) + if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; - tcp_for_write_queue_from_safe(skb, tmp, sk) { + skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; @@ -2807,7 +2856,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) len = cur_mss * segs; if (skb->len > len) { - if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, + cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone(skb, GFP_ATOMIC)) @@ -2841,11 +2891,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + tcp_skb_tsorted_save(skb) { + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } tcp_skb_tsorted_restore(skb); + if (!err) { - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); tcp_rate_skb_sent(sk, skb); } } else { @@ -2854,6 +2907,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } @@ -2890,36 +2944,25 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. - * If doing SACK, the first ACK which comes back for a timeout - * based retransmit packet might feed us FACK information again. - * If so, we use it to avoid unnecessarily retransmissions. */ void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - struct sk_buff *hole = NULL; u32 max_segs; int mib_idx; if (!tp->packets_out) return; - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - } else { - skb = tcp_write_queue_head(sk); - } - + rtx_head = tcp_rtx_queue_head(sk); + skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { __u8 sacked; int segs; - if (skb == tcp_send_head(sk)) - break; - if (tcp_pacing_check(sk)) break; @@ -2964,7 +3007,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk) && + if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, @@ -3006,12 +3049,15 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { + if (!tskb && tcp_under_memory_pressure(sk)) + tskb = skb_rb_last(&sk->tcp_rtx_queue); + + if (tskb) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (!tcp_send_head(sk)) { + if (tcp_write_queue_empty(sk)) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have @@ -3028,6 +3074,7 @@ coalesce: goto coalesce; return; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ @@ -3064,6 +3111,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + + /* skb of trace_tcp_send_reset() keeps the skb that caused RST, + * skb here is different to the troublesome skb, so use NULL + */ + trace_tcp_send_reset(sk, NULL); } /* Send a crossed SYN-ACK during socket establishment. @@ -3076,20 +3128,24 @@ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { - pr_debug("%s: wrong queue state\n", __func__); + pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + struct sk_buff *nskb; + + tcp_skb_tsorted_save(skb) { + nskb = skb_copy(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; - tcp_unlink_write_queue(skb, sk); + INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); - __tcp_add_write_queue_head(sk, nskb); - sk_wmem_free_skb(sk, skb); + tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk->sk_wmem_queued += nskb->truesize; sk_mem_charge(sk, nskb->truesize); skb = nskb; @@ -3166,8 +3222,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); - tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + - sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -3268,7 +3324,7 @@ static void tcp_connect_init(struct sock *sk) if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, @@ -3307,7 +3363,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); tp->write_seq = tcb->end_seq; @@ -3355,6 +3410,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) int copied = copy_from_iter(skb_put(syn_data, space), space, &fo->data->msg_iter); if (unlikely(!copied)) { + tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } @@ -3385,12 +3441,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); + tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - /* data was not sent, this is our new send_head */ - sk->sk_send_head = syn_data; + /* data was not sent, put it in write_queue */ + __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: @@ -3433,6 +3490,7 @@ int tcp_connect(struct sock *sk) tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : @@ -3627,7 +3685,8 @@ int tcp_write_wakeup(struct sock *sk, int mib) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); @@ -3657,7 +3716,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; @@ -3698,6 +3757,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) tcp_sk(sk)->total_retrans++; + trace_tcp_retransmit_synack(sk, req); } return res; } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index be8ef1e5dfef..d3ea89020c69 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -2,8 +2,6 @@ #include <linux/tcp.h> #include <net/tcp.h> -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; - static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -46,7 +44,8 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + u32 min_rtt = tcp_min_rtt(tp); + struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; @@ -56,48 +55,36 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. */ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) - reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); + reo_wnd = min(reo_wnd, tp->srtt_us >> 3); + } - tcp_for_write_queue(skb, sk) { + list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, + tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + s32 remaining; - if (skb == tcp_send_head(sk)) - break; - - /* Skip ones already (s)acked */ - if (!after(scb->end_seq, tp->snd_una) || - scb->sacked & TCPCB_SACKED_ACKED) + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, - tp->rack.end_seq, scb->end_seq)) { - /* Step 3 in draft-cheng-tcpm-rack-00.txt: - * A packet is lost if its elapsed time is beyond - * the recent RTT plus the reordering window. - */ - u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, - skb->skb_mstamp); - s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - - if (remaining < 0) { - tcp_rack_mark_skb_lost(sk, skb); - continue; - } - - /* Skip ones marked lost but not yet retransmitted */ - if ((scb->sacked & TCPCB_LOST) && - !(scb->sacked & TCPCB_SACKED_RETRANS)) - continue; + if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) + break; + /* A packet is lost if it has not been s/acked beyond + * the recent RTT plus the reordering window. + */ + remaining = tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); + } else { /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - - } else if (!(scb->sacked & TCPCB_RETRANS)) { - /* Original data are sent sequentially so stop early - * b/c the rest are all sent after rack_sent - */ - break; } } } @@ -176,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk) if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) tcp_rearm_rto(sk); } + +/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. + * + * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded + * by srtt), since there is possibility that spurious retransmission was + * due to reordering delay longer than reo_wnd. + * + * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) + * no. of successful recoveries (accounts for full DSACK-based loss + * recovery undo). After that, reset it to default (min_rtt/4). + * + * At max, reo_wnd is incremented only once per rtt. So that the new + * DSACK on which we are reacting, is due to the spurious retx (approx) + * after the reo_wnd has been updated last time. + * + * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than + * absolute value to account for change in rtt. + */ +void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND || + !rs->prior_delivered) + return; + + /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ + if (before(rs->prior_delivered, tp->rack.last_delivered)) + tp->rack.dsack_seen = 0; + + /* Adjust the reo_wnd if update is pending */ + if (tp->rack.dsack_seen) { + tp->rack.reo_wnd_steps = min_t(u32, 0xFF, + tp->rack.reo_wnd_steps + 1); + tp->rack.dsack_seen = 0; + tp->rack.last_delivered = tp->delivered; + tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; + } else if (!tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_steps = 1; + } +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 655dd8d7f064..16df6dd44b98 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,8 +22,6 @@ #include <linux/gfp.h> #include <net/tcp.h> -int sysctl_tcp_thin_linear_timeouts __read_mostly; - /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. @@ -109,26 +107,23 @@ static int tcp_orphan_retries(struct sock *sk, bool alive) static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) { - struct net *net = sock_net(sk); + const struct net *net = sock_net(sk); + int mss; /* Black hole detection */ - if (net->ipv4.sysctl_tcp_mtu_probing) { - if (!icsk->icsk_mtup.enabled) { - icsk->icsk_mtup.enabled = 1; - icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } else { - struct net *net = sock_net(sk); - struct tcp_sock *tp = tcp_sk(sk); - int mss; - - mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; - mss = min(net->ipv4.sysctl_tcp_base_mss, mss); - mss = max(mss, 68 - tp->tcp_header_len); - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } + if (!net->ipv4.sysctl_tcp_mtu_probing) + return; + + if (!icsk->icsk_mtup.enabled) { + icsk->icsk_mtup.enabled = 1; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; + } else { + mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; + mss = min(net->ipv4.sysctl_tcp_base_mss, mss); + mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } @@ -156,8 +151,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) - start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return false; + start_ts = tcp_skb_timestamp(head); + } if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -283,15 +283,17 @@ out: * * Returns: Nothing (void) */ -static void tcp_delack_timer(unsigned long data) +static void tcp_delack_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); } else { - inet_csk(sk)->icsk_ack.blocked = 1; + icsk->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) @@ -304,11 +306,12 @@ static void tcp_delack_timer(unsigned long data) static void tcp_probe_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; u32 start_ts; - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; return; } @@ -321,9 +324,9 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + start_ts = tcp_skb_timestamp(skb); if (!start_ts) - tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > jiffies_to_msecs(icsk->icsk_user_timeout)) @@ -408,7 +411,7 @@ void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - WARN_ON(tcp_write_queue_empty(sk)); + WARN_ON(tcp_rtx_queue_empty(sk)); tp->tlp_high_seq = 0; @@ -441,7 +444,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); + tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -473,7 +476,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { + if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ @@ -514,7 +517,7 @@ out_reset_timer: * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && - (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; @@ -570,9 +573,11 @@ out: sk_mem_reclaim(sk); } -static void tcp_write_timer(unsigned long data) +static void tcp_write_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { @@ -607,9 +612,9 @@ void tcp_set_keepalive(struct sock *sk, int val) EXPORT_SYMBOL_GPL(tcp_set_keepalive); -static void tcp_keepalive_timer (unsigned long data) +static void tcp_keepalive_timer (struct timer_list *t) { - struct sock *sk = (struct sock *) data; + struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; @@ -647,7 +652,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || tcp_send_head(sk)) + if (tp->packets_out || !tcp_write_queue_empty(sk)) goto resched; elapsed = keepalive_time_elapsed(tp); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 218cfcc77650..ee113ff15fd0 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { - return min(tp->snd_ssthresh, tp->snd_cwnd-1); + return min(tp->snd_ssthresh, tp->snd_cwnd); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ebfbccae62fd..e4ff25c947c5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1209,8 +1209,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && - !skb_queue_empty(&up->reader_queue)) + if (size < (sk->sk_rcvbuf >> 2)) return; } else { size += up->forward_deficit; @@ -1853,7 +1852,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ /* if we're overly short, let UDP handle it */ - encap_rcv = ACCESS_ONCE(up->encap_rcv); + encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; @@ -2298,7 +2297,7 @@ void udp_destroy_sock(struct sock *sk) unlock_sock_fast(sk, slow); if (static_key_false(&udp_encap_needed) && up->encap_type) { void (*encap_destroy)(struct sock *sk); - encap_destroy = ACCESS_ONCE(up->encap_destroy); + encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } |