diff options
Diffstat (limited to 'net')
215 files changed, 5349 insertions, 1600 deletions
diff --git a/net/Kconfig b/net/Kconfig index 7b6cd340b72b..a1005007224c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -402,6 +402,14 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config LWTUNNEL_BPF + bool "Execute BPF program as route nexthop action" + depends on LWTUNNEL + default y if LWTUNNEL=y + ---help--- + Allows to run BPF programs as a nexthop action following a route + lookup for incoming and outgoing packets. + config DST_CACHE bool default n diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 672150b42c61..61a431a9772b 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -851,6 +851,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_softif_destroy_sysfs(hard_iface->soft_iface); } + hard_iface->soft_iface = NULL; batadv_hardif_put(hard_iface); out: diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index f1564520dfae..981e8c5b07e9 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -834,6 +834,7 @@ static int batadv_tp_send(void *arg) primary_if = batadv_primary_if_get_selected(bat_priv); if (unlikely(!primary_if)) { err = BATADV_TP_REASON_DST_UNREACHABLE; + tp_vars->reason = err; goto out; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 447f9490b692..30ecbfb40adf 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3287,7 +3287,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, &tvlv_tt_data, &tt_change, &tt_len); - if (!tt_len) + if (!tt_len || !tvlv_len) goto unlock; /* Copy the last orig_node's OGM buffer */ @@ -3305,7 +3305,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, &tvlv_tt_data, &tt_change, &tt_len); - if (!tt_len) + if (!tt_len || !tvlv_len) goto out; /* fill the rest of the tvlv with the real TT entries */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index d020299baba4..1904a93f47d5 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -1090,7 +1090,6 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, { struct hci_conn *hcon; struct hci_dev *hdev; - bdaddr_t *src = BDADDR_ANY; int n; n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", @@ -1101,7 +1100,8 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, if (n < 7) return -EINVAL; - hdev = hci_get_route(addr, src); + /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */ + hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC); if (!hdev) return -ENOENT; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 3809617aa98d..dc59eae54717 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -613,7 +613,7 @@ int hci_conn_del(struct hci_conn *conn) return 0; } -struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) +struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) { int use_src = bacmp(src, BDADDR_ANY); struct hci_dev *hdev = NULL, *d; @@ -634,7 +634,29 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) */ if (use_src) { - if (!bacmp(&d->bdaddr, src)) { + bdaddr_t id_addr; + u8 id_addr_type; + + if (src_type == BDADDR_BREDR) { + if (!lmp_bredr_capable(d)) + continue; + bacpy(&id_addr, &d->bdaddr); + id_addr_type = BDADDR_BREDR; + } else { + if (!lmp_le_capable(d)) + continue; + + hci_copy_identity_address(d, &id_addr, + &id_addr_type); + + /* Convert from HCI to three-value type */ + if (id_addr_type == ADDR_LE_DEV_PUBLIC) + id_addr_type = BDADDR_LE_PUBLIC; + else + id_addr_type = BDADDR_LE_RANDOM; + } + + if (!bacmp(&id_addr, src) && id_addr_type == src_type) { hdev = d; break; } } else { diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index d4cad29b033f..577f1c01454a 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7060,7 +7060,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst, dst_type, __le16_to_cpu(psm)); - hdev = hci_get_route(dst, &chan->src); + hdev = hci_get_route(dst, &chan->src, chan->src_type); if (!hdev) return -EHOSTUNREACH; diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 8e385a0ae60e..2f2cb5e27cdd 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -178,7 +178,7 @@ static void rfcomm_reparent_device(struct rfcomm_dev *dev) struct hci_dev *hdev; struct hci_conn *conn; - hdev = hci_get_route(&dev->dst, &dev->src); + hdev = hci_get_route(&dev->dst, &dev->src, BDADDR_BREDR); if (!hdev) return; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index f52bcbf2e58c..3125ce670c2f 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -219,7 +219,7 @@ static int sco_connect(struct sock *sk) BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst); - hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src); + hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); if (!hdev) return -EHOSTUNREACH; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 43faf2aea2ab..fae391f1871f 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -57,7 +57,7 @@ #define SMP_TIMEOUT msecs_to_jiffies(30000) #define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \ - 0x1f : 0x07) + 0x3f : 0x07) #define KEY_DIST_MASK 0x07 /* Maximum message length that can be passed to aes_cmac */ @@ -76,6 +76,7 @@ enum { SMP_FLAG_DHKEY_PENDING, SMP_FLAG_REMOTE_OOB, SMP_FLAG_LOCAL_OOB, + SMP_FLAG_CT2, }; struct smp_dev { @@ -357,6 +358,22 @@ static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16], return err; } +static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16], + const u8 salt[16], u8 res[16]) +{ + int err; + + SMP_DBG("w %16phN salt %16phN", w, salt); + + err = aes_cmac(tfm_cmac, salt, w, 16, res); + if (err) + return err; + + SMP_DBG("res %16phN", res); + + return err; +} + /* The following functions map to the legacy SMP crypto functions e, c1, * s1 and ah. */ @@ -1130,20 +1147,31 @@ static void sc_add_ltk(struct smp_chan *smp) static void sc_generate_link_key(struct smp_chan *smp) { - /* These constants are as specified in the core specification. - * In ASCII they spell out to 'tmp1' and 'lebr'. - */ - const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 }; + /* From core spec. Spells out in ASCII as 'lebr'. */ const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c }; smp->link_key = kzalloc(16, GFP_KERNEL); if (!smp->link_key) return; - if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) { - kzfree(smp->link_key); - smp->link_key = NULL; - return; + if (test_bit(SMP_FLAG_CT2, &smp->flags)) { + /* SALT = 0x00000000000000000000000000000000746D7031 */ + const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 }; + + if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) { + kzfree(smp->link_key); + smp->link_key = NULL; + return; + } + } else { + /* From core spec. Spells out in ASCII as 'tmp1'. */ + const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 }; + + if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) { + kzfree(smp->link_key); + smp->link_key = NULL; + return; + } } if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) { @@ -1169,10 +1197,7 @@ static void smp_allow_key_dist(struct smp_chan *smp) static void sc_generate_ltk(struct smp_chan *smp) { - /* These constants are as specified in the core specification. - * In ASCII they spell out to 'tmp2' and 'brle'. - */ - const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 }; + /* From core spec. Spells out in ASCII as 'brle'. */ const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 }; struct hci_conn *hcon = smp->conn->hcon; struct hci_dev *hdev = hcon->hdev; @@ -1187,8 +1212,19 @@ static void sc_generate_ltk(struct smp_chan *smp) if (key->type == HCI_LK_DEBUG_COMBINATION) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); - if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk)) - return; + if (test_bit(SMP_FLAG_CT2, &smp->flags)) { + /* SALT = 0x00000000000000000000000000000000746D7032 */ + const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 }; + + if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk)) + return; + } else { + /* From core spec. Spells out in ASCII as 'tmp2'. */ + const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 }; + + if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk)) + return; + } if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk)) return; @@ -1669,6 +1705,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp, if (!rsp) { memset(req, 0, sizeof(*req)); + req->auth_req = SMP_AUTH_CT2; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->max_key_size = conn->hcon->enc_key_size; @@ -1680,6 +1717,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp, memset(rsp, 0, sizeof(*rsp)); + rsp->auth_req = SMP_AUTH_CT2; rsp->max_key_size = conn->hcon->enc_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; @@ -1744,6 +1782,9 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) build_bredr_pairing_cmd(smp, req, &rsp); + if (req->auth_req & SMP_AUTH_CT2) + set_bit(SMP_FLAG_CT2, &smp->flags); + key_size = min(req->max_key_size, rsp.max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; @@ -1761,9 +1802,13 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) build_pairing_cmd(conn, req, &rsp, auth); - if (rsp.auth_req & SMP_AUTH_SC) + if (rsp.auth_req & SMP_AUTH_SC) { set_bit(SMP_FLAG_SC, &smp->flags); + if (rsp.auth_req & SMP_AUTH_CT2) + set_bit(SMP_FLAG_CT2, &smp->flags); + } + if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) sec_level = BT_SECURITY_MEDIUM; else @@ -1917,6 +1962,9 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) */ smp->remote_key_dist &= rsp->resp_key_dist; + if ((req->auth_req & SMP_AUTH_CT2) && (auth & SMP_AUTH_CT2)) + set_bit(SMP_FLAG_CT2, &smp->flags); + /* For BR/EDR this means we're done and can start phase 3 */ if (conn->hcon->type == ACL_LINK) { /* Clear bits which are generated but not distributed */ @@ -2312,8 +2360,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) authreq = seclevel_to_authreq(sec_level); - if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) + if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) { authreq |= SMP_AUTH_SC; + if (hci_dev_test_flag(hcon->hdev, HCI_SSP_ENABLED)) + authreq |= SMP_AUTH_CT2; + } /* Require MITM if IO Capability allows or the security level * requires it. diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index ffcc70b6b199..0ff6247eaa6c 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -57,6 +57,7 @@ struct smp_cmd_pairing { #define SMP_AUTH_MITM 0x04 #define SMP_AUTH_SC 0x08 #define SMP_AUTH_KEYPRESS 0x10 +#define SMP_AUTH_CT2 0x20 #define SMP_CMD_PAIRING_CONFIRM 0x03 struct smp_cmd_pairing_confirm { diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 073d54afa056..b30e77e8427c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -365,13 +365,18 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, __be32 group, u8 *igmp_type) { + struct igmpv3_query *ihv3; + size_t igmp_hdr_size; struct sk_buff *skb; struct igmphdr *ih; struct ethhdr *eth; struct iphdr *iph; + igmp_hdr_size = sizeof(*ih); + if (br->multicast_igmp_version == 3) + igmp_hdr_size = sizeof(*ihv3); skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) + - sizeof(*ih) + 4); + igmp_hdr_size + 4); if (!skb) goto out; @@ -396,7 +401,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, iph->version = 4; iph->ihl = 6; iph->tos = 0xc0; - iph->tot_len = htons(sizeof(*iph) + sizeof(*ih) + 4); + iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4); iph->id = 0; iph->frag_off = htons(IP_DF); iph->ttl = 1; @@ -412,17 +417,37 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, skb_put(skb, 24); skb_set_transport_header(skb, skb->len); - ih = igmp_hdr(skb); *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; - ih->type = IGMP_HOST_MEMBERSHIP_QUERY; - ih->code = (group ? br->multicast_last_member_interval : - br->multicast_query_response_interval) / - (HZ / IGMP_TIMER_SCALE); - ih->group = group; - ih->csum = 0; - ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - skb_put(skb, sizeof(*ih)); + switch (br->multicast_igmp_version) { + case 2: + ih = igmp_hdr(skb); + ih->type = IGMP_HOST_MEMBERSHIP_QUERY; + ih->code = (group ? br->multicast_last_member_interval : + br->multicast_query_response_interval) / + (HZ / IGMP_TIMER_SCALE); + ih->group = group; + ih->csum = 0; + ih->csum = ip_compute_csum((void *)ih, sizeof(*ih)); + break; + case 3: + ihv3 = igmpv3_query_hdr(skb); + ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY; + ihv3->code = (group ? br->multicast_last_member_interval : + br->multicast_query_response_interval) / + (HZ / IGMP_TIMER_SCALE); + ihv3->group = group; + ihv3->qqic = br->multicast_query_interval / HZ; + ihv3->nsrcs = 0; + ihv3->resv = 0; + ihv3->suppress = 0; + ihv3->qrv = 2; + ihv3->csum = 0; + ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3)); + break; + } + + skb_put(skb, igmp_hdr_size); __skb_pull(skb, sizeof(*eth)); out: @@ -434,15 +459,20 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, const struct in6_addr *grp, u8 *igmp_type) { - struct sk_buff *skb; + struct mld2_query *mld2q; + unsigned long interval; struct ipv6hdr *ip6h; struct mld_msg *mldq; + size_t mld_hdr_size; + struct sk_buff *skb; struct ethhdr *eth; u8 *hopopt; - unsigned long interval; + mld_hdr_size = sizeof(*mldq); + if (br->multicast_mld_version == 2) + mld_hdr_size = sizeof(*mld2q); skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) + - 8 + sizeof(*mldq)); + 8 + mld_hdr_size); if (!skb) goto out; @@ -461,7 +491,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h = ipv6_hdr(skb); *(__force __be32 *)ip6h = htonl(0x60000000); - ip6h->payload_len = htons(8 + sizeof(*mldq)); + ip6h->payload_len = htons(8 + mld_hdr_size); ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); @@ -489,26 +519,47 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, /* ICMPv6 */ skb_set_transport_header(skb, skb->len); - mldq = (struct mld_msg *) icmp6_hdr(skb); - interval = ipv6_addr_any(grp) ? br->multicast_query_response_interval : br->multicast_last_member_interval; - *igmp_type = ICMPV6_MGM_QUERY; - mldq->mld_type = ICMPV6_MGM_QUERY; - mldq->mld_code = 0; - mldq->mld_cksum = 0; - mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); - mldq->mld_reserved = 0; - mldq->mld_mca = *grp; - - /* checksum */ - mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - sizeof(*mldq), IPPROTO_ICMPV6, - csum_partial(mldq, - sizeof(*mldq), 0)); - skb_put(skb, sizeof(*mldq)); + switch (br->multicast_mld_version) { + case 1: + mldq = (struct mld_msg *)icmp6_hdr(skb); + mldq->mld_type = ICMPV6_MGM_QUERY; + mldq->mld_code = 0; + mldq->mld_cksum = 0; + mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); + mldq->mld_reserved = 0; + mldq->mld_mca = *grp; + mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + sizeof(*mldq), IPPROTO_ICMPV6, + csum_partial(mldq, + sizeof(*mldq), + 0)); + break; + case 2: + mld2q = (struct mld2_query *)icmp6_hdr(skb); + mld2q->mld2q_mrc = ntohs((u16)jiffies_to_msecs(interval)); + mld2q->mld2q_type = ICMPV6_MGM_QUERY; + mld2q->mld2q_code = 0; + mld2q->mld2q_cksum = 0; + mld2q->mld2q_resv1 = 0; + mld2q->mld2q_resv2 = 0; + mld2q->mld2q_suppress = 0; + mld2q->mld2q_qrv = 2; + mld2q->mld2q_nsrcs = 0; + mld2q->mld2q_qqic = br->multicast_query_interval / HZ; + mld2q->mld2q_mca = *grp; + mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + sizeof(*mld2q), + IPPROTO_ICMPV6, + csum_partial(mld2q, + sizeof(*mld2q), + 0)); + break; + } + skb_put(skb, mld_hdr_size); __skb_pull(skb, sizeof(*eth)); @@ -608,7 +659,8 @@ err: } struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, - struct net_bridge_port *port, struct br_ip *group) + struct net_bridge_port *p, + struct br_ip *group) { struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; @@ -624,7 +676,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, } hash = br_ip_hash(mdb, group); - mp = br_multicast_get_group(br, port, group, hash); + mp = br_multicast_get_group(br, p, group, hash); switch (PTR_ERR(mp)) { case 0: break; @@ -681,9 +733,9 @@ static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, struct br_ip *group) { - struct net_bridge_mdb_entry *mp; - struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + struct net_bridge_mdb_entry *mp; unsigned long now = jiffies; int err; @@ -861,9 +913,9 @@ static void br_multicast_send_query(struct net_bridge *br, struct net_bridge_port *port, struct bridge_mcast_own_query *own_query) { - unsigned long time; - struct br_ip br_group; struct bridge_mcast_other_query *other_query = NULL; + struct br_ip br_group; + unsigned long time; if (!netif_running(br->dev) || br->multicast_disabled || !br->multicast_querier) @@ -1831,7 +1883,9 @@ void br_multicast_init(struct net_bridge *br) br->ip4_other_query.delay_time = 0; br->ip4_querier.port = NULL; + br->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) + br->multicast_mld_version = 1; br->ip6_other_query.delay_time = 0; br->ip6_querier.port = NULL; #endif @@ -2132,6 +2186,44 @@ unlock: return err; } +int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) +{ + /* Currently we support only version 2 and 3 */ + switch (val) { + case 2: + case 3: + break; + default: + return -EINVAL; + } + + spin_lock_bh(&br->multicast_lock); + br->multicast_igmp_version = val; + spin_unlock_bh(&br->multicast_lock); + + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val) +{ + /* Currently we support version 1 and 2 */ + switch (val) { + case 1: + case 2: + break; + default: + return -EINVAL; + } + + spin_lock_bh(&br->multicast_lock); + br->multicast_mld_version = val; + spin_unlock_bh(&br->multicast_lock); + + return 0; +} +#endif + /** * br_multicast_list_adjacent - Returns snooped multicast addresses * @dev: The bridge port adjacent to which to retrieve addresses diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 83d937f4415e..b12501a77f18 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1008,10 +1008,10 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, struct nf_hook_state state; int ret; - elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); - - while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF)) - elem = rcu_dereference(elem->next); + for (elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); + elem && nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF; + elem = rcu_dereference(elem->next)) + ; if (!elem) return okfn(net, sk, skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e99037c6f7b7..71c7453268c1 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -858,6 +858,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1069,6 +1071,26 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); br->multicast_stats_enabled = !!mcast_stats; } + + if (data[IFLA_BR_MCAST_IGMP_VERSION]) { + __u8 igmp_version; + + igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]); + err = br_multicast_set_igmp_version(br, igmp_version); + if (err) + return err; + } + +#if IS_ENABLED(CONFIG_IPV6) + if (data[IFLA_BR_MCAST_MLD_VERSION]) { + __u8 mld_version; + + mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]); + err = br_multicast_set_mld_version(br, mld_version); + if (err) + return err; + } +#endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (data[IFLA_BR_NF_CALL_IPTABLES]) { @@ -1135,6 +1157,8 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ @@ -1210,9 +1234,15 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, br->multicast_last_member_count) || nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, - br->multicast_startup_query_count)) + br->multicast_startup_query_count) || + nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, + br->multicast_igmp_version)) return -EMSGSIZE; - +#if IS_ENABLED(CONFIG_IPV6) + if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION, + br->multicast_mld_version)) + return -EMSGSIZE; +#endif clockval = jiffies_to_clock_t(br->multicast_last_member_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, IFLA_BR_PAD)) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1b63177e0ccd..26aec2366bc3 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -333,6 +333,8 @@ struct net_bridge u32 multicast_last_member_count; u32 multicast_startup_query_count; + u8 multicast_igmp_version; + unsigned long multicast_last_member_interval; unsigned long multicast_membership_interval; unsigned long multicast_querier_interval; @@ -353,6 +355,7 @@ struct net_bridge struct bridge_mcast_other_query ip6_other_query; struct bridge_mcast_own_query ip6_own_query; struct bridge_mcast_querier ip6_querier; + u8 multicast_mld_version; #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif @@ -582,6 +585,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); int br_multicast_toggle(struct net_bridge *br, unsigned long val); int br_multicast_set_querier(struct net_bridge *br, unsigned long val); int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); +int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); +#if IS_ENABLED(CONFIG_IPV6) +int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val); +#endif struct net_bridge_mdb_entry * br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst); struct net_bridge_mdb_entry * diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index e120307c6e36..a18148213b08 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -440,6 +440,23 @@ static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, } static DEVICE_ATTR_RW(hash_max); +static ssize_t multicast_igmp_version_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + + return sprintf(buf, "%u\n", br->multicast_igmp_version); +} + +static ssize_t multicast_igmp_version_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_multicast_set_igmp_version); +} +static DEVICE_ATTR_RW(multicast_igmp_version); + static ssize_t multicast_last_member_count_show(struct device *d, struct device_attribute *attr, char *buf) @@ -642,6 +659,25 @@ static ssize_t multicast_stats_enabled_store(struct device *d, return store_bridge_parm(d, buf, len, set_stats_enabled); } static DEVICE_ATTR_RW(multicast_stats_enabled); + +#if IS_ENABLED(CONFIG_IPV6) +static ssize_t multicast_mld_version_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + + return sprintf(buf, "%u\n", br->multicast_mld_version); +} + +static ssize_t multicast_mld_version_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_multicast_set_mld_version); +} +static DEVICE_ATTR_RW(multicast_mld_version); +#endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( @@ -809,6 +845,10 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, &dev_attr_multicast_stats_enabled.attr, + &dev_attr_multicast_igmp_version.attr, +#if IS_ENABLED(CONFIG_IPV6) + &dev_attr_multicast_mld_version.attr, +#endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, @@ -898,6 +938,7 @@ int br_sysfs_addbr(struct net_device *dev) if (!br->ifobj) { pr_info("%s: can't add kobject (directory) %s/%s\n", __func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR); + err = -ENOMEM; goto out3; } return 0; diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c index c197b1f844ee..bd2b3c78f59b 100644 --- a/net/bridge/netfilter/nf_log_bridge.c +++ b/net/bridge/netfilter/nf_log_bridge.c @@ -24,7 +24,8 @@ static void nf_log_bridge_packet(struct net *net, u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { - nf_log_l2packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + nf_log_l2packet(net, pf, eth_hdr(skb)->h_proto, hooknum, skb, + in, out, loginfo, prefix); } static struct nf_logger nf_bridge_logger __read_mostly = { diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index aa209b1066c9..92cbbd2afddb 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -1107,10 +1107,7 @@ static struct net_proto_family caif_family_ops = { static int __init caif_sktinit_module(void) { - int err = sock_register(&caif_family_ops); - if (!err) - return err; - return 0; + return sock_register(&caif_family_ops); } static void __exit caif_sktexit_module(void) diff --git a/net/can/bcm.c b/net/can/bcm.c index 8af9d25ff988..436a7537e6a9 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -77,7 +77,7 @@ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) -#define CAN_BCM_VERSION "20160617" +#define CAN_BCM_VERSION "20161123" MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); MODULE_LICENSE("Dual BSD/GPL"); @@ -109,8 +109,9 @@ struct bcm_op { u32 count; u32 nframes; u32 currframe; - struct canfd_frame *frames; - struct canfd_frame *last_frames; + /* void pointers to arrays of struct can[fd]_frame */ + void *frames; + void *last_frames; struct canfd_frame sframe; struct canfd_frame last_sframe; struct sock *sk; @@ -681,7 +682,7 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) if (op->flags & RX_FILTER_ID) { /* the easiest case */ - bcm_rx_update_and_send(op, &op->last_frames[0], rxframe); + bcm_rx_update_and_send(op, op->last_frames, rxframe); goto rx_starttimer; } @@ -1068,7 +1069,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (msg_head->nframes) { /* update CAN frames content */ - err = memcpy_from_msg((u8 *)op->frames, msg, + err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) return err; @@ -1118,7 +1119,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } if (msg_head->nframes) { - err = memcpy_from_msg((u8 *)op->frames, msg, + err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) { if (op->frames != &op->sframe) @@ -1163,6 +1164,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* check flags */ if (op->flags & RX_RTR_FRAME) { + struct canfd_frame *frame0 = op->frames; /* no timers in RTR-mode */ hrtimer_cancel(&op->thrtimer); @@ -1174,8 +1176,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, * prevent a full-load-loopback-test ... ;-] */ if ((op->flags & TX_CP_CAN_ID) || - (op->frames[0].can_id == op->can_id)) - op->frames[0].can_id = op->can_id & ~CAN_RTR_FLAG; + (frame0->can_id == op->can_id)) + frame0->can_id = op->can_id & ~CAN_RTR_FLAG; } else { if (op->flags & SETTIMER) { diff --git a/net/core/Makefile b/net/core/Makefile index d6508c2ddca5..f6761b6e3b29 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 49816af8586b..9482037a5c8c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -214,6 +214,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, if (error) goto no_packet; + *peeked = 0; do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. @@ -227,22 +228,22 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, spin_lock_irqsave(&queue->lock, cpu_flags); skb_queue_walk(queue, skb) { *last = skb; - *peeked = skb->peeked; if (flags & MSG_PEEK) { if (_off >= skb->len && (skb->len || _off || skb->peeked)) { _off -= skb->len; continue; } - - skb = skb_set_peeked(skb); - error = PTR_ERR(skb); - if (IS_ERR(skb)) { - spin_unlock_irqrestore(&queue->lock, - cpu_flags); - goto no_packet; + if (!skb->len) { + skb = skb_set_peeked(skb); + if (IS_ERR(skb)) { + error = PTR_ERR(skb); + spin_unlock_irqrestore(&queue->lock, + cpu_flags); + goto no_packet; + } } - + *peeked = 1; atomic_inc(&skb->users); } else { __skb_unlink(skb, queue); diff --git a/net/core/dev.c b/net/core/dev.c index f71b34ab57a5..1d33ce03365f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3447,6 +3447,8 @@ EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); +struct static_key rfs_needed __read_mostly; +EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -5260,7 +5262,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - return; + goto out; break; } @@ -5278,7 +5280,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) } } - __kfree_skb_flush(); local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); @@ -5288,6 +5289,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); +out: + __kfree_skb_flush(); } struct netdev_adjacent { @@ -6691,26 +6694,42 @@ EXPORT_SYMBOL(dev_change_proto_down); * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @fd: new program fd or negative value to clear + * @flags: xdp-related flags * * Set or clear a bpf program for a device */ -int dev_change_xdp_fd(struct net_device *dev, int fd) +int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - struct netdev_xdp xdp = {}; + struct netdev_xdp xdp; int err; + ASSERT_RTNL(); + if (!ops->ndo_xdp) return -EOPNOTSUPP; if (fd >= 0) { + if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + + err = ops->ndo_xdp(dev, &xdp); + if (err < 0) + return err; + if (xdp.prog_attached) + return -EBUSY; + } + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } + memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_SETUP_PROG; xdp.prog = prog; + err = ops->ndo_xdp(dev, &xdp); if (err < 0 && prog) bpf_prog_put(prog); diff --git a/net/core/devlink.c b/net/core/devlink.c index c14f8b661db9..2b5bf9efa720 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1394,26 +1394,45 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, - u32 seq, int flags, u16 mode) + u32 seq, int flags) { + const struct devlink_ops *ops = devlink->ops; void *hdr; + int err = 0; + u16 mode; + u8 inline_mode; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto out; - if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode)) - goto nla_put_failure; + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + goto out; + err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); + if (err) + goto out; + + if (ops->eswitch_inline_mode_get) { + err = ops->eswitch_inline_mode_get(devlink, &inline_mode); + if (err) + goto out; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, + inline_mode); + if (err) + goto out; + } genlmsg_end(msg, hdr); return 0; -nla_put_failure: +out: genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + return err; } static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, @@ -1422,22 +1441,17 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; struct sk_buff *msg; - u16 mode; int err; if (!ops || !ops->eswitch_mode_get) return -EOPNOTSUPP; - err = ops->eswitch_mode_get(devlink, &mode); - if (err) - return err; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, - info->snd_portid, info->snd_seq, 0, mode); + info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); @@ -1453,15 +1467,32 @@ static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; u16 mode; + u8 inline_mode; + int err = 0; - if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) - return -EINVAL; + if (!ops) + return -EOPNOTSUPP; - mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { + if (!ops->eswitch_mode_set) + return -EOPNOTSUPP; + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + err = ops->eswitch_mode_set(devlink, mode); + if (err) + return err; + } - if (ops && ops->eswitch_mode_set) - return ops->eswitch_mode_set(devlink, mode); - return -EOPNOTSUPP; + if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { + if (!ops->eswitch_inline_mode_set) + return -EOPNOTSUPP; + inline_mode = nla_get_u8( + info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); + err = ops->eswitch_inline_mode_set(devlink, inline_mode); + if (err) + return err; + } + + return 0; } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { @@ -1478,6 +1509,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e9b4556751ff..e23766c7e3ba 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2466,7 +2466,9 @@ static int get_phy_tunable(struct net_device *dev, void __user *useraddr) data = kmalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; + mutex_lock(&phydev->lock); ret = phydev->drv->get_tunable(phydev, &tuna, data); + mutex_unlock(&phydev->lock); if (ret) goto out; useraddr += sizeof(tuna); @@ -2501,7 +2503,9 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr) ret = -EFAULT; if (copy_from_user(data, useraddr, tuna.len)) goto out; + mutex_lock(&phydev->lock); ret = phydev->drv->set_tunable(phydev, &tuna, data); + mutex_unlock(&phydev->lock); out: kfree(data); @@ -2566,6 +2570,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GEEE: case ETHTOOL_GTUNABLE: case ETHTOOL_PHY_GTUNABLE: + case ETHTOOL_GLINKSETTINGS: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) diff --git a/net/core/filter.c b/net/core/filter.c index dece94fef005..b1461708a977 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -30,6 +30,7 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> +#include <linux/if_arp.h> #include <linux/gfp.h> #include <net/ip.h> #include <net/protocol.h> @@ -78,6 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; + err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); + if (err) + return err; + err = security_sock_rcv_skb(sk, skb); if (err) return err; @@ -1684,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header)) { + kfree_skb(skb); + return -ERANGE; + } + bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); @@ -1692,17 +1703,10 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags) { - switch (dev->type) { - case ARPHRD_TUNNEL: - case ARPHRD_TUNNEL6: - case ARPHRD_SIT: - case ARPHRD_IPGRE: - case ARPHRD_VOID: - case ARPHRD_NONE: - return __bpf_redirect_no_mac(skb, dev, flags); - default: + if (dev_is_mac_header_xmit(dev)) return __bpf_redirect_common(skb, dev, flags); - } + else + return __bpf_redirect_no_mac(skb, dev, flags); } BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2190,16 +2194,79 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_skb_data(void *func) +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + u32 max_len = __bpf_skb_max_len(skb); + u32 new_len = skb->len + head_room; + int ret; + + if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + new_len < skb->len)) + return -EINVAL; + + ret = skb_cow(skb, head_room); + if (likely(!ret)) { + /* Idea for this helper is that we currently only + * allow to expand on mac header. This means that + * skb->protocol network header, etc, stay as is. + * Compared to bpf_skb_change_tail(), we're more + * flexible due to not needing to linearize or + * reset GSO. Intention for this helper is to be + * used by an L3 skb that needs to push mac header + * for redirection into L2 device. + */ + __skb_push(skb, head_room); + memset(skb->data, 0, head_room); + skb_reset_mac_header(skb); + } + + bpf_compute_data_end(skb); + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_head_proto = { + .func = bpf_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + void *data = xdp->data + offset; + + if (unlikely(data < xdp->data_hard_start || + data > xdp->data_end - ETH_HLEN)) + return -EINVAL; + + xdp->data = data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || func == bpf_skb_store_bytes || func == bpf_skb_change_proto || + func == bpf_skb_change_head || func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace) + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head) return true; return false; @@ -2625,12 +2692,87 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_xdp_adjust_head: + return &bpf_xdp_adjust_head_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_inout_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; default: return sk_filter_func_proto(func_id); } } -static bool __is_valid_access(int off, int size, enum bpf_access_type type) +static const struct bpf_func_proto * +lwt_xmit_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + default: + return lwt_inout_func_proto(func_id); + } +} + +static bool __is_valid_access(int off, int size) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -2664,7 +2806,64 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return __is_valid_access(off, size, type); + return __is_valid_access(off, size); +} + +static bool lwt_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, priority): + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_access(off, size); +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + break; + default: + return false; + } + } + + if (off < 0 || off + size > sizeof(struct bpf_sock)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; } static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -2733,11 +2932,10 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; } - return __is_valid_access(off, size, type); + return __is_valid_access(off, size); } -static bool __is_valid_xdp_access(int off, int size, - enum bpf_access_type type) +static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) return false; @@ -2765,7 +2963,7 @@ static bool xdp_is_valid_access(int off, int size, break; } - return __is_valid_xdp_access(off, size, type); + return __is_valid_xdp_access(off, size); } void bpf_warn_invalid_xdp_action(u32 act) @@ -2925,6 +3123,51 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, + int dst_reg, int src_reg, + int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + + case offsetof(struct bpf_sock, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sock, sk_family)); + break; + + case offsetof(struct bpf_sock, type): + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock, protocol): + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2992,6 +3235,31 @@ static const struct bpf_verifier_ops xdp_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +static const struct bpf_verifier_ops cg_skb_ops = { + .get_func_proto = cg_skb_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_inout_ops = { + .get_func_proto = lwt_inout_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_xmit_ops = { + .get_func_proto = lwt_xmit_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + +static const struct bpf_verifier_ops cg_sock_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = sock_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3012,12 +3280,42 @@ static struct bpf_prog_type_list xdp_type __read_mostly = { .type = BPF_PROG_TYPE_XDP, }; +static struct bpf_prog_type_list cg_skb_type __read_mostly = { + .ops = &cg_skb_ops, + .type = BPF_PROG_TYPE_CGROUP_SKB, +}; + +static struct bpf_prog_type_list lwt_in_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_IN, +}; + +static struct bpf_prog_type_list lwt_out_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_OUT, +}; + +static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { + .ops = &lwt_xmit_ops, + .type = BPF_PROG_TYPE_LWT_XMIT, +}; + +static struct bpf_prog_type_list cg_sock_type __read_mostly = { + .ops = &cg_sock_ops, + .type = BPF_PROG_TYPE_CGROUP_SOCK +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); + bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&cg_sock_type); + bpf_register_prog_type(&lwt_in_type); + bpf_register_prog_type(&lwt_out_type); + bpf_register_prog_type(&lwt_xmit_type); return 0; } diff --git a/net/core/flow.c b/net/core/flow.c index 3937b1b68d5b..18e8893d4be5 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -95,7 +95,6 @@ static void flow_cache_gc_task(struct work_struct *work) list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) { flow_entry_kill(fce, xfrm); atomic_dec(&xfrm->flow_cache_gc_count); - WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0); } } @@ -236,9 +235,8 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, if (fcp->hash_count > fc->high_watermark) flow_cache_shrink(fc, fcp); - if (fcp->hash_count > 2 * fc->high_watermark || - atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) { - atomic_inc(&net->xfrm.flow_cache_genid); + if (atomic_read(&net->xfrm.flow_cache_gc_count) > + 2 * num_online_cpus() * fc->high_watermark) { flo = ERR_PTR(-ENOBUFS); goto ret_object; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b481a4a6d3ec..d6447dc10371 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -58,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, EXPORT_SYMBOL(skb_flow_dissector_init); /** + * skb_flow_get_be16 - extract be16 entity + * @skb: sk_buff to extract from + * @poff: offset to extract at + * @data: raw buffer pointer to the packet + * @hlen: packet header length + * + * The function will try to retrieve a be32 entity at + * offset poff + */ +__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data, + int hlen) +{ + __be16 *u, _u; + + u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); + if (u) + return *u; + + return 0; +} + +/** * __skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset @@ -117,6 +139,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; struct flow_dissector_key_keyid *key_keyid; @@ -546,6 +569,14 @@ ip_proto_again: data, hlen); } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP)) { + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); + } + out_good: ret = true; @@ -1013,4 +1044,4 @@ static int __init init_default_flow_dissectors(void) return 0; } -late_initcall_sync(init_default_flow_dissectors); +core_initcall(init_default_flow_dissectors); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index cad8e791f28e..101b5d0e2142 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -7,6 +7,7 @@ * 2 of the License, or (at your option) any later version. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Eric Dumazet <edumazet@google.com> * * Changes: * Jamal Hadi Salim - moved it to net/core and reshulfed @@ -30,165 +31,79 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> -#include <linux/rbtree.h> #include <linux/slab.h> +#include <linux/seqlock.h> #include <net/sock.h> #include <net/gen_stats.h> -/* - This code is NOT intended to be used for statistics collection, - its purpose is to provide a base for statistical multiplexing - for controlled load service. - If you need only statistics, run a user level daemon which - periodically reads byte counters. - - Unfortunately, rate estimation is not a very easy task. - F.e. I did not find a simple way to estimate the current peak rate - and even failed to formulate the problem 8)8) - - So I preferred not to built an estimator into the scheduler, - but run this task separately. - Ideally, it should be kernel thread(s), but for now it runs - from timers, which puts apparent top bounds on the number of rated - flows, has minimal overhead on small, but is enough - to handle controlled load service, sets of aggregates. - - We measure rate over A=(1<<interval) seconds and evaluate EWMA: - - avrate = avrate*(1-W) + rate*W - - where W is chosen as negative power of 2: W = 2^(-ewma_log) - - The resulting time constant is: - - T = A/(-ln(1-W)) - - - NOTES. - - * avbps and avpps are scaled by 2^5. - * both values are reported as 32 bit unsigned values. bps can - overflow for fast links : max speed being 34360Mbit/sec - * Minimal interval is HZ/4=250msec (it is the greatest common divisor - for HZ=100 and HZ=1024 8)), maximal interval - is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals - are too expensive, longer ones can be implemented - at user level painlessly. +/* This code is NOT intended to be used for statistics collection, + * its purpose is to provide a base for statistical multiplexing + * for controlled load service. + * If you need only statistics, run a user level daemon which + * periodically reads byte counters. */ -#define EST_MAX_INTERVAL 5 - -struct gen_estimator -{ - struct list_head list; +struct net_rate_estimator { struct gnet_stats_basic_packed *bstats; - struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; seqcount_t *running; - int ewma_log; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + u8 ewma_log; + u8 intvl_log; /* period : (250ms << intvl_log) */ + + seqcount_t seq; u32 last_packets; - unsigned long avpps; u64 last_bytes; + + u64 avpps; u64 avbps; - struct rcu_head e_rcu; - struct rb_node node; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; - struct rcu_head head; -}; -struct gen_estimator_head -{ - struct timer_list timer; - struct list_head list; + unsigned long next_jiffies; + struct timer_list timer; + struct rcu_head rcu; }; -static struct gen_estimator_head elist[EST_MAX_INTERVAL+1]; - -/* Protects against NULL dereference */ -static DEFINE_RWLOCK(est_lock); - -/* Protects against soft lockup during large deletion */ -static struct rb_root est_root = RB_ROOT; -static DEFINE_SPINLOCK(est_tree_lock); - -static void est_timer(unsigned long arg) +static void est_fetch_counters(struct net_rate_estimator *e, + struct gnet_stats_basic_packed *b) { - int idx = (int)arg; - struct gen_estimator *e; + if (e->stats_lock) + spin_lock(e->stats_lock); - rcu_read_lock(); - list_for_each_entry_rcu(e, &elist[idx].list, list) { - struct gnet_stats_basic_packed b = {0}; - unsigned long rate; - u64 brate; - - if (e->stats_lock) - spin_lock(e->stats_lock); - read_lock(&est_lock); - if (e->bstats == NULL) - goto skip; - - __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats); - - brate = (b.bytes - e->last_bytes)<<(7 - idx); - e->last_bytes = b.bytes; - e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); - WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5); - - rate = b.packets - e->last_packets; - rate <<= (7 - idx); - e->last_packets = b.packets; - e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); - WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5); -skip: - read_unlock(&est_lock); - if (e->stats_lock) - spin_unlock(e->stats_lock); - } + __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats); + + if (e->stats_lock) + spin_unlock(e->stats_lock); - if (!list_empty(&elist[idx].list)) - mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); - rcu_read_unlock(); } -static void gen_add_node(struct gen_estimator *est) +static void est_timer(unsigned long arg) { - struct rb_node **p = &est_root.rb_node, *parent = NULL; + struct net_rate_estimator *est = (struct net_rate_estimator *)arg; + struct gnet_stats_basic_packed b; + u64 rate, brate; - while (*p) { - struct gen_estimator *e; + est_fetch_counters(est, &b); + brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log); + brate -= (est->avbps >> est->ewma_log); - parent = *p; - e = rb_entry(parent, struct gen_estimator, node); + rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log); + rate -= (est->avpps >> est->ewma_log); - if (est->bstats > e->bstats) - p = &parent->rb_right; - else - p = &parent->rb_left; - } - rb_link_node(&est->node, parent, p); - rb_insert_color(&est->node, &est_root); -} + write_seqcount_begin(&est->seq); + est->avbps += brate; + est->avpps += rate; + write_seqcount_end(&est->seq); -static -struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est64 *rate_est) -{ - struct rb_node *p = est_root.rb_node; - - while (p) { - struct gen_estimator *e; + est->last_bytes = b.bytes; + est->last_packets = b.packets; - e = rb_entry(p, struct gen_estimator, node); + est->next_jiffies += ((HZ/4) << est->intvl_log); - if (bstats > e->bstats) - p = p->rb_right; - else if (bstats < e->bstats || rate_est != e->rate_est) - p = p->rb_left; - else - return e; + if (unlikely(time_after_eq(jiffies, est->next_jiffies))) { + /* Ouch... timer was delayed. */ + est->next_jiffies = jiffies + 1; } - return NULL; + mod_timer(&est->timer, est->next_jiffies); } /** @@ -211,83 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, - struct gnet_stats_rate_est64 *rate_est, + struct net_rate_estimator __rcu **rate_est, spinlock_t *stats_lock, seqcount_t *running, struct nlattr *opt) { - struct gen_estimator *est; struct gnet_estimator *parm = nla_data(opt); - struct gnet_stats_basic_packed b = {0}; - int idx; + struct net_rate_estimator *old, *est; + struct gnet_stats_basic_packed b; + int intvl_log; if (nla_len(opt) < sizeof(*parm)) return -EINVAL; + /* allowed timer periods are : + * -2 : 250ms, -1 : 500ms, 0 : 1 sec + * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec + */ if (parm->interval < -2 || parm->interval > 3) return -EINVAL; est = kzalloc(sizeof(*est), GFP_KERNEL); - if (est == NULL) + if (!est) return -ENOBUFS; - __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats); - - idx = parm->interval + 2; + seqcount_init(&est->seq); + intvl_log = parm->interval + 2; est->bstats = bstats; - est->rate_est = rate_est; est->stats_lock = stats_lock; est->running = running; est->ewma_log = parm->ewma_log; - est->last_bytes = b.bytes; - est->avbps = rate_est->bps<<5; - est->last_packets = b.packets; - est->avpps = rate_est->pps<<10; + est->intvl_log = intvl_log; est->cpu_bstats = cpu_bstats; - spin_lock_bh(&est_tree_lock); - if (!elist[idx].timer.function) { - INIT_LIST_HEAD(&elist[idx].list); - setup_timer(&elist[idx].timer, est_timer, idx); + est_fetch_counters(est, &b); + est->last_bytes = b.bytes; + est->last_packets = b.packets; + old = rcu_dereference_protected(*rate_est, 1); + if (old) { + del_timer_sync(&old->timer); + est->avbps = old->avbps; + est->avpps = old->avpps; } - if (list_empty(&elist[idx].list)) - mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); - - list_add_rcu(&est->list, &elist[idx].list); - gen_add_node(est); - spin_unlock_bh(&est_tree_lock); + est->next_jiffies = jiffies + ((HZ/4) << intvl_log); + setup_timer(&est->timer, est_timer, (unsigned long)est); + mod_timer(&est->timer, est->next_jiffies); + rcu_assign_pointer(*rate_est, est); + if (old) + kfree_rcu(old, rcu); return 0; } EXPORT_SYMBOL(gen_new_estimator); /** * gen_kill_estimator - remove a rate estimator - * @bstats: basic statistics - * @rate_est: rate estimator statistics + * @rate_est: rate estimator * - * Removes the rate estimator specified by &bstats and &rate_est. + * Removes the rate estimator. * - * Note : Caller should respect an RCU grace period before freeing stats_lock */ -void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est64 *rate_est) +void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est) { - struct gen_estimator *e; - - spin_lock_bh(&est_tree_lock); - while ((e = gen_find_node(bstats, rate_est))) { - rb_erase(&e->node, &est_root); + struct net_rate_estimator *est; - write_lock(&est_lock); - e->bstats = NULL; - write_unlock(&est_lock); - - list_del_rcu(&e->list); - kfree_rcu(e, e_rcu); + est = xchg((__force struct net_rate_estimator **)rate_est, NULL); + if (est) { + del_timer_sync(&est->timer); + kfree_rcu(est, rcu); } - spin_unlock_bh(&est_tree_lock); } EXPORT_SYMBOL(gen_kill_estimator); @@ -307,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator); */ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, - struct gnet_stats_rate_est64 *rate_est, + struct net_rate_estimator __rcu **rate_est, spinlock_t *stats_lock, seqcount_t *running, struct nlattr *opt) { - gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, + stats_lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); /** * gen_estimator_active - test if estimator is currently in use - * @bstats: basic statistics - * @rate_est: rate estimator statistics + * @rate_est: rate estimator * * Returns true if estimator is active, and false if not. */ -bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est64 *rate_est) +bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est) { - bool res; + return !!rcu_access_pointer(*rate_est); +} +EXPORT_SYMBOL(gen_estimator_active); - ASSERT_RTNL(); +bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est, + struct gnet_stats_rate_est64 *sample) +{ + struct net_rate_estimator *est; + unsigned seq; + + rcu_read_lock(); + est = rcu_dereference(*rate_est); + if (!est) { + rcu_read_unlock(); + return false; + } - spin_lock_bh(&est_tree_lock); - res = gen_find_node(bstats, rate_est) != NULL; - spin_unlock_bh(&est_tree_lock); + do { + seq = read_seqcount_begin(&est->seq); + sample->bps = est->avbps >> 8; + sample->pps = est->avpps >> 8; + } while (read_seqcount_retry(&est->seq, seq)); - return res; + rcu_read_unlock(); + return true; } -EXPORT_SYMBOL(gen_estimator_active); +EXPORT_SYMBOL(gen_estimator_read); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 508e051304fb..87f28557b329 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); /** * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV * @d: dumping handle - * @b: basic statistics - * @r: rate estimator statistics + * @rate_est: rate estimator * * Appends the rate estimator statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); */ int gnet_stats_copy_rate_est(struct gnet_dump *d, - const struct gnet_stats_basic_packed *b, - struct gnet_stats_rate_est64 *r) + struct net_rate_estimator __rcu **rate_est) { + struct gnet_stats_rate_est64 sample; struct gnet_stats_rate_est est; int res; - if (b && !gen_estimator_active(b, r)) + if (!gen_estimator_read(rate_est, &sample)) return 0; - - est.bps = min_t(u64, UINT_MAX, r->bps); + est.bps = min_t(u64, UINT_MAX, sample.bps); /* we have some time before reaching 2^32 packets per second */ - est.pps = r->pps; + est.pps = sample.pps; if (d->compat_tc_stats) { d->tc_stats.bps = est.bps; @@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, if (d->tail) { res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est), TCA_STATS_PAD); - if (res < 0 || est.bps == r->bps) + if (res < 0 || est.bps == sample.bps) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r), - TCA_STATS_PAD); + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample, + sizeof(sample), TCA_STATS_PAD); } return 0; diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c new file mode 100644 index 000000000000..71bb3e2eca08 --- /dev/null +++ b/net/core/lwt_bpf.c @@ -0,0 +1,396 @@ +/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/bpf.h> +#include <net/lwtunnel.h> + +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + +struct bpf_lwt { + struct bpf_lwt_prog in; + struct bpf_lwt_prog out; + struct bpf_lwt_prog xmit; + int family; +}; + +#define MAX_PROG_NAME 256 + +static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct bpf_lwt *)lwt->data; +} + +#define NO_REDIRECT false +#define CAN_REDIRECT true + +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, + struct dst_entry *dst, bool can_redirect) +{ + int ret; + + /* Preempt disable is needed to protect per-cpu redirect_info between + * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and + * access to maps strictly require a rcu_read_lock() for protection, + * mixing with BH RCU lock doesn't work. + */ + preempt_disable(); + rcu_read_lock(); + bpf_compute_data_end(skb); + ret = bpf_prog_run_save_cb(lwt->prog, skb); + rcu_read_unlock(); + + switch (ret) { + case BPF_OK: + break; + + case BPF_REDIRECT: + if (unlikely(!can_redirect)) { + pr_warn_once("Illegal redirect return code in prog %s\n", + lwt->name ? : "<unknown>"); + ret = BPF_OK; + } else { + ret = skb_do_redirect(skb); + if (ret == 0) + ret = BPF_REDIRECT; + } + break; + + case BPF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + + default: + pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); + kfree_skb(skb); + ret = -EINVAL; + break; + } + + preempt_enable(); + + return ret; +} + +static int bpf_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->in.prog) { + ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_input)) { + pr_warn_once("orig_input not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_input(skb); +} + +static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->out.prog) { + ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_output)) { + pr_warn_once("orig_output not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_output(net, sk, skb); +} + +static int xmit_check_hhlen(struct sk_buff *skb) +{ + int hh_len = skb_dst(skb)->dev->hard_header_len; + + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) + return -ENOMEM; + } + + return 0; +} + +static int bpf_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->xmit.prog) { + int ret; + + ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); + switch (ret) { + case BPF_OK: + /* If the header was expanded, headroom might be too + * small for L2 header to come, expand as needed. + */ + ret = xmit_check_hhlen(skb); + if (unlikely(ret)) + return ret; + + return LWTUNNEL_XMIT_CONTINUE; + case BPF_REDIRECT: + return LWTUNNEL_XMIT_DONE; + default: + return ret; + } + } + + return LWTUNNEL_XMIT_CONTINUE; +} + +static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) +{ + if (prog->prog) + bpf_prog_put(prog->prog); + + kfree(prog->name); +} + +static void bpf_destroy_state(struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + bpf_lwt_prog_destroy(&bpf->in); + bpf_lwt_prog_destroy(&bpf->out); + bpf_lwt_prog_destroy(&bpf->xmit); +} + +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, + enum bpf_prog_type type) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); + if (!prog->name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, type); + if (IS_ERR(p)) + return PTR_ERR(p); + + prog->prog = p; + + return 0; +} + +static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { + [LWT_BPF_IN] = { .type = NLA_NESTED, }, + [LWT_BPF_OUT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, +}; + +static int bpf_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct nlattr *tb[LWT_BPF_MAX + 1]; + struct lwtunnel_state *newts; + struct bpf_lwt *bpf; + int ret; + + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*bpf)); + if (!newts) + return -ENOMEM; + + newts->type = LWTUNNEL_ENCAP_BPF; + bpf = bpf_lwt_lwtunnel(newts); + + if (tb[LWT_BPF_IN]) { + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, + BPF_PROG_TYPE_LWT_IN); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_OUT]) { + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, + BPF_PROG_TYPE_LWT_OUT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT]) { + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, + BPF_PROG_TYPE_LWT_XMIT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT_HEADROOM]) { + u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); + + if (headroom > LWT_BPF_MAX_HEADROOM) { + ret = -ERANGE; + goto errout; + } + + newts->headroom = headroom; + } + + bpf->family = family; + *ts = newts; + + return 0; + +errout: + bpf_destroy_state(newts); + kfree(newts); + return ret; +} + +static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, + struct bpf_lwt_prog *prog) +{ + struct nlattr *nest; + + if (!prog->prog) + return 0; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (prog->name && + nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) + return -EMSGSIZE; + + return 0; +} + +static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + int nest_len = nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ + 0; + + return nest_len + /* LWT_BPF_IN */ + nest_len + /* LWT_BPF_OUT */ + nest_len + /* LWT_BPF_XMIT */ + 0; +} + +int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +{ + /* FIXME: + * The LWT state is currently rebuilt for delete requests which + * results in a new bpf_prog instance. Comparing names for now. + */ + if (!a->name && !b->name) + return 0; + + if (!a->name || !b->name) + return 1; + + return strcmp(a->name, b->name); +} + +static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); + struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); + + return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || + bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || + bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); +} + +static const struct lwtunnel_encap_ops bpf_encap_ops = { + .build_state = bpf_build_state, + .destroy_state = bpf_destroy_state, + .input = bpf_input, + .output = bpf_output, + .xmit = bpf_xmit, + .fill_encap = bpf_fill_encap_info, + .get_encap_size = bpf_encap_nlsize, + .cmp_encap = bpf_encap_cmp, +}; + +static int __init bpf_lwt_init(void) +{ + return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); +} + +subsys_initcall(bpf_lwt_init) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 03976e939818..a5d4e866ce88 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; + case LWTUNNEL_ENCAP_BPF: + return "BPF"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2ae929f9bd06..782dd8663665 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2291,13 +2291,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; n != NULL; n = rcu_dereference_bh(n->next)) { - if (!net_eq(dev_net(n->dev), net)) - continue; - if (neigh_ifindex_filtered(n->dev, filter_idx)) - continue; - if (neigh_master_filtered(n->dev, filter_master_idx)) - continue; - if (idx < s_idx) + if (idx < s_idx || !net_eq(dev_net(n->dev), net)) + goto next; + if (neigh_ifindex_filtered(n->dev, filter_idx) || + neigh_master_filtered(n->dev, filter_master_idx)) goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -2332,9 +2329,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { - if (pneigh_net(n) != net) - continue; - if (idx < s_idx) + if (idx < s_idx || pneigh_net(n) != net) goto next; if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 35d37b196e67..50fdc1b59777 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -39,6 +39,9 @@ EXPORT_SYMBOL(init_net); static bool init_net_initialized; +#define MIN_PERNET_OPS_ID \ + ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) + #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; @@ -46,11 +49,11 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; - size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) - ng->len = max_gen_ptrs; + ng->s.len = max_gen_ptrs; return ng; } @@ -60,13 +63,14 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) struct net_generic *ng, *old_ng; BUG_ON(!mutex_is_locked(&net_mutex)); - BUG_ON(id == 0); + BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, lockdep_is_held(&net_mutex)); - ng = old_ng; - if (old_ng->len >= id) - goto assign; + if (old_ng->s.len > id) { + old_ng->ptr[id] = data; + return 0; + } ng = net_alloc_generic(); if (ng == NULL) @@ -83,12 +87,12 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) * the old copy for kfree after a grace period. */ - memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); + memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID], + (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *)); + ng->ptr[id] = data; rcu_assign_pointer(net->gen, ng); - kfree_rcu(old_ng, rcu); -assign: - ng->ptr[id - 1] = data; + kfree_rcu(old_ng, s.rcu); return 0; } @@ -218,6 +222,8 @@ int peernet2id_alloc(struct net *net, struct net *peer) bool alloc; int id; + if (atomic_read(&net->count) == 0) + return NETNSA_NSID_NOT_ASSIGNED; spin_lock_irqsave(&net->nsid_lock, flags); alloc = atomic_read(&peer->count) == 0 ? false : true; id = __peernet2id_alloc(net, peer, &alloc); @@ -872,7 +878,7 @@ static int register_pernet_operations(struct list_head *list, if (ops->id) { again: - error = ida_get_new_above(&net_generic_ids, 1, ops->id); + error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id); if (error < 0) { if (error == -EAGAIN) { ida_pre_get(&net_generic_ids, GFP_KERNEL); @@ -880,7 +886,7 @@ again: } return error; } - max_gen_ptrs = max(max_gen_ptrs, *ops->id); + max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1); } error = __register_pernet_operations(list, ops); if (error) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index db313ec7af32..c482491a63d8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -840,18 +840,20 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, if (dev->dev.parent && dev_is_pci(dev->dev.parent) && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); - size_t size = nla_total_size(sizeof(struct nlattr)); - size += nla_total_size(num_vfs * sizeof(struct nlattr)); + size_t size = nla_total_size(0); size += num_vfs * - (nla_total_size(sizeof(struct ifla_vf_mac)) + - nla_total_size(MAX_VLAN_LIST_LEN * - sizeof(struct nlattr)) + + (nla_total_size(0) + + nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + + nla_total_size(0) + /* nest IFLA_VF_STATS */ /* IFLA_VF_STATS_RX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ @@ -899,7 +901,8 @@ static size_t rtnl_port_size(const struct net_device *dev, static size_t rtnl_xdp_size(const struct net_device *dev) { - size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */ + size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ + nla_total_size(1); /* XDP_ATTACHED */ if (!dev->netdev_ops->ndo_xdp) return 0; @@ -928,8 +931,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ - + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */ - + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */ + + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1502,6 +1505,7 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, + [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) @@ -1606,7 +1610,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (link_dump_filtered(dev, master_idx, kind_ops)) - continue; + goto cont; if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, @@ -2161,6 +2165,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; + u32 xdp_flags = 0; err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy); @@ -2171,9 +2176,19 @@ static int do_setlink(const struct sk_buff *skb, err = -EINVAL; goto errout; } + + if (xdp[IFLA_XDP_FLAGS]) { + xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); + if (xdp_flags & ~XDP_FLAGS_MASK) { + err = -EINVAL; + goto errout; + } + } + if (xdp[IFLA_XDP_FD]) { err = dev_change_xdp_fd(dev, - nla_get_s32(xdp[IFLA_XDP_FD])); + nla_get_s32(xdp[IFLA_XDP_FD]), + xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; @@ -2734,7 +2749,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) ext_filter_mask)); } - return min_ifinfo_dump_size; + return nlmsg_total_size(min_ifinfo_dump_size); } static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) @@ -2849,7 +2864,10 @@ nla_put_failure: static inline size_t rtnl_fdb_nlmsg_size(void) { - return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN); + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(ETH_ALEN) + /* NDA_LLADDR */ + nla_total_size(sizeof(u16)) + /* NDA_VLAN */ + 0; } static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, @@ -3159,7 +3177,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; - nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); return err; @@ -3665,7 +3683,7 @@ static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, if (!size) continue; - if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; attr = nla_reserve_64bit(skb, attr_id, size, @@ -3706,7 +3724,7 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev) for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { - if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; size = rtnl_get_offload_stats_attr_size(attr_id); nla_size += nla_total_size_64bit(size); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index fd3ce461fbe6..88a8e429fc3e 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -12,6 +12,7 @@ #include <net/secure_seq.h> #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#include <net/tcp.h> #define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; @@ -40,8 +41,8 @@ static u32 seq_scale(u32 seq) #endif #if IS_ENABLED(CONFIG_IPV6) -__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport) +u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport, u32 *tsoff) { u32 secret[MD5_MESSAGE_BYTES / 4]; u32 hash[MD5_DIGEST_WORDS]; @@ -58,6 +59,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, md5_transform(hash, secret); + *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0; return seq_scale(hash[0]); } EXPORT_SYMBOL(secure_tcpv6_sequence_number); @@ -86,8 +88,8 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #ifdef CONFIG_INET -__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport, u32 *tsoff) { u32 hash[MD5_DIGEST_WORDS]; @@ -99,6 +101,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, md5_transform(hash, net_secret); + *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0; return seq_scale(hash[0]); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4c96cb18c214..84151cf40aeb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2656,7 +2656,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) struct skb_frag_struct *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); - BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + if (skb_headlen(skb)) + return 0; todo = shiftlen; from = 0; @@ -3712,20 +3714,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); +static bool is_icmp_err_skb(const struct sk_buff *skb) +{ + return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); +} + struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { struct sk_buff_head *q = &sk->sk_error_queue; - struct sk_buff *skb, *skb_next; + struct sk_buff *skb, *skb_next = NULL; + bool icmp_next = false; unsigned long flags; - int err = 0; spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) - err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + icmp_next = is_icmp_err_skb(skb_next); spin_unlock_irqrestore(&q->lock, flags); - if (err) + if (is_icmp_err_skb(skb) && !icmp_next) + sk->sk_err = 0; + + if (skb_next) sk->sk_error_report(sk); return skb; @@ -3837,10 +3848,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!skb_may_tx_timestamp(sk, tsonly)) return; - if (tsonly) - skb = alloc_skb(0, GFP_ATOMIC); - else + if (tsonly) { +#ifdef CONFIG_INET + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) + skb = tcp_get_timestamping_opt_stats(sk); + else +#endif + skb = alloc_skb(0, GFP_ATOMIC); + } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + } if (!skb) return; @@ -4912,3 +4931,31 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off, return clone; } EXPORT_SYMBOL(pskb_extract); + +/** + * skb_condense - try to get rid of fragments/frag_list if possible + * @skb: buffer + * + * Can be used to save memory before skb is added to a busy queue. + * If packet has bytes in frags and enough tail room in skb->head, + * pull all of them, so that we can free the frags right now and adjust + * truesize. + * Notes: + * We do not reallocate skb->head thus can not fail. + * Caller must re-evaluate skb->truesize if needed. + */ +void skb_condense(struct sk_buff *skb) +{ + if (!skb->data_len || + skb->data_len > skb->end - skb->tail || + skb_cloned(skb)) + return; + + /* Nice, we can free page frag(s) right now */ + __pskb_pull_tail(skb, skb->data_len); + + /* Now adjust skb->truesize, since __pskb_pull_tail() does + * not do this. + */ + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); +} diff --git a/net/core/sock.c b/net/core/sock.c index 14e6145be33b..9fa46b956bdc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -715,7 +715,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, val = min_t(u32, val, sysctl_wmem_max); set_sndbuf: sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF); + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; @@ -751,7 +751,7 @@ set_rcvbuf: * returning the value we actually used in getsockopt * is the most desirable behavior. */ - sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF); + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); break; case SO_RCVBUFFORCE: @@ -854,6 +854,13 @@ set_rcvbuf: sk->sk_tskey = 0; } } + + if (val & SOF_TIMESTAMPING_OPT_STATS && + !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { + ret = -EINVAL; + break; + } + sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0df2aa652530..2a46e4009f62 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); - if (sock_table) + if (sock_table) { static_key_slow_inc(&rps_needed); + static_key_slow_inc(&rfs_needed); + } if (orig_sock_table) { static_key_slow_dec(&rps_needed); + static_key_slow_dec(&rfs_needed); synchronize_rcu(); vfree(orig_sock_table); } diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 4f6c1862dfd2..3202d75329b5 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1353,6 +1353,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) dcb_unlock: spin_unlock_bh(&dcb_lock); nla_put_failure: + err = -EMSGSIZE; return err; } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index fda321d814d6..d859a5c36e70 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -694,6 +694,7 @@ int dccp_invalid_packet(struct sk_buff *skb) { const struct dccp_hdr *dh; unsigned int cscov; + u8 dccph_doff; if (skb->pkt_type != PACKET_HOST) return 1; @@ -715,18 +716,19 @@ int dccp_invalid_packet(struct sk_buff *skb) /* * If P.Data Offset is too small for packet type, drop packet and return */ - if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { - DCCP_WARN("P.Data Offset(%u) too small\n", dh->dccph_doff); + dccph_doff = dh->dccph_doff; + if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { + DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff); return 1; } /* * If P.Data Offset is too too large for packet, drop packet and return */ - if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) { - DCCP_WARN("P.Data Offset(%u) too large\n", dh->dccph_doff); + if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) { + DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff); return 1; } - + dh = dccp_hdr(skb); /* * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet * has short sequence numbers), drop packet and return diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index a6902c1e2f28..7899919cd9f0 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -233,6 +233,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, genphy_read_status(phydev); if (ds->ops->adjust_link) ds->ops->adjust_link(ds, port, phydev); + + put_device(&phydev->mdio.dev); } return 0; @@ -504,15 +506,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, void dsa_cpu_dsa_destroy(struct device_node *port_dn) { - struct phy_device *phydev; - - if (of_phy_is_fixed_link(port_dn)) { - phydev = of_phy_find_device(port_dn); - if (phydev) { - phy_device_free(phydev); - fixed_phy_unregister(phydev); - } - } + if (of_phy_is_fixed_link(port_dn)) + of_phy_deregister_fixed_link(port_dn); } static void dsa_switch_destroy(struct dsa_switch *ds) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f8a7d9aab437..5fff951a0a49 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -28,8 +28,10 @@ static struct dsa_switch_tree *dsa_get_dst(u32 tree) struct dsa_switch_tree *dst; list_for_each_entry(dst, &dsa_switch_trees, list) - if (dst->tree == tree) + if (dst->tree == tree) { + kref_get(&dst->refcount); return dst; + } return NULL; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d0c7bce88743..68c9eea00518 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1127,7 +1127,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, p->phy_interface = mode; phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); - if (of_phy_is_fixed_link(port_dn)) { + if (!phy_dn && of_phy_is_fixed_link(port_dn)) { /* In the case of a fixed PHY, the DT node associated * to the fixed PHY is the Port DT node */ @@ -1137,7 +1137,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, return ret; } phy_is_fixed = true; - phy_dn = port_dn; + phy_dn = of_node_get(port_dn); } if (ds->ops->get_phy_flags) @@ -1156,6 +1156,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, ret = dsa_slave_phy_connect(p, slave_dev, phy_id); if (ret) { netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); + of_node_put(phy_dn); return ret; } } else { @@ -1164,6 +1165,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_flags, p->phy_interface); } + + of_node_put(phy_dn); } if (p->phy && phy_is_fixed) @@ -1176,6 +1179,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, ret = dsa_slave_phy_connect(p, slave_dev, p->port); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret); + if (phy_is_fixed) + of_phy_deregister_fixed_link(port_dn); return ret; } } @@ -1293,10 +1298,18 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, void dsa_slave_destroy(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); + struct dsa_switch *ds = p->parent; + struct device_node *port_dn; + + port_dn = ds->ports[p->port].dn; netif_carrier_off(slave_dev); - if (p->phy) + if (p->phy) { phy_disconnect(p->phy); + + if (of_phy_is_fixed_link(port_dn)) + of_phy_deregister_fixed_link(port_dn); + } unregister_netdev(slave_dev); free_netdev(slave_dev); } diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 77d73014bde3..dc2960be51e0 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -286,9 +286,12 @@ int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info) if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') return -EINVAL; /* name should be null-terminated */ + rc = -ENODEV; dev = dev_get_by_name(genl_info_net(info), name); if (!dev) - return -ENODEV; + return rc; + if (dev->type != ARPHRD_IEEE802154) + goto out; phy = dev->ieee802154_ptr->wpan_phy; BUG_ON(!phy); @@ -342,6 +345,7 @@ nla_put_failure: nlmsg_free(msg); out_dev: wpan_phy_put(phy); +out: if (dev) dev_put(dev); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 28e051a8e847..6e7baaf814c6 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -723,6 +723,7 @@ config DEFAULT_TCP_CONG default "reno" if DEFAULT_RENO default "dctcp" if DEFAULT_DCTCP default "cdg" if DEFAULT_CDG + default "bbr" if DEFAULT_BBR default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5ddf5cda07f4..1830e6f0e9cc 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -374,8 +374,18 @@ lookup_protocol: if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) + if (err) { + sk_common_release(sk); + goto out; + } + } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { sk_common_release(sk); + goto out; + } } out: return err; @@ -1233,7 +1243,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ - if (fixedid && !(iph->frag_off & htons(IP_DF))) + if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) goto out; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d95631d09248..20fb25e3027b 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -476,7 +476,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) esph = (void *)skb_push(skb, 4); *seqhi = esph->spi; esph->spi = esph->seq_no; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi); + esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; aead_request_set_callback(req, 0, esp_input_done_esn, skb); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d93eea8e2409..dbad5a1c161a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -151,7 +151,7 @@ static void fib_replace_table(struct net *net, struct fib_table *old, int fib_unmerge(struct net *net) { - struct fib_table *old, *new; + struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); @@ -162,11 +162,21 @@ int fib_unmerge(struct net *net) if (!new) return -ENOMEM; + /* table is already unmerged */ + if (new == old) + return 0; + /* replace merged table with clean table */ - if (new != old) { - fib_replace_table(net, old, new); - fib_free_table(old); - } + fib_replace_table(net, old, new); + fib_free_table(old); + + /* attempt to fetch main table if it has been allocated */ + main_table = fib_get_table(net, RT_TABLE_MAIN); + if (!main_table) + return 0; + + /* flush local entries from main table */ + fib_table_flush_external(main_table); return 0; } @@ -1209,6 +1219,8 @@ static int __net_init ip_fib_net_init(struct net *net) int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; + net->ipv4.fib_seq = 0; + /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 388d3e21629b..c1bc1e92de0e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi) #endif call_rcu(&fi->rcu, free_fib_info_rcu); } +EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 4cff74d4133f..1b0e7d1f5217 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -84,25 +84,114 @@ #include <trace/events/fib.h> #include "fib_lookup.h" -static BLOCKING_NOTIFIER_HEAD(fib_chain); +static unsigned int fib_seq_sum(void) +{ + unsigned int fib_seq = 0; + struct net *net; + + rtnl_lock(); + for_each_net(net) + fib_seq += net->ipv4.fib_seq; + rtnl_unlock(); + + return fib_seq; +} + +static ATOMIC_NOTIFIER_HEAD(fib_chain); + +static int call_fib_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->net = net; + return nb->notifier_call(nb, event_type, info); +} + +static void fib_rules_notify(struct net *net, struct notifier_block *nb, + enum fib_event_type event_type) +{ +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct fib_notifier_info info; + + if (net->ipv4.fib_has_custom_rules) + call_fib_notifier(nb, net, event_type, &info); +#endif +} + +static void fib_notify(struct net *net, struct notifier_block *nb, + enum fib_event_type event_type); + +static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, u32 dst, + int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id, u32 nlflags) +{ + struct fib_entry_notifier_info info = { + .dst = dst, + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .tb_id = tb_id, + .nlflags = nlflags, + }; + return call_fib_notifier(nb, net, event_type, &info.info); +} -int register_fib_notifier(struct notifier_block *nb) +static bool fib_dump_is_consistent(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + unsigned int fib_seq) { - return blocking_notifier_chain_register(&fib_chain, nb); + atomic_notifier_chain_register(&fib_chain, nb); + if (fib_seq == fib_seq_sum()) + return true; + atomic_notifier_chain_unregister(&fib_chain, nb); + if (cb) + cb(nb); + return false; +} + +#define FIB_DUMP_MAX_RETRIES 5 +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)) +{ + int retries = 0; + + do { + unsigned int fib_seq = fib_seq_sum(); + struct net *net; + + /* Mutex semantics guarantee that every change done to + * FIB tries before we read the change sequence counter + * is now visible to us. + */ + rcu_read_lock(); + for_each_net_rcu(net) { + fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD); + fib_notify(net, nb, FIB_EVENT_ENTRY_ADD); + } + rcu_read_unlock(); + + if (fib_dump_is_consistent(nb, cb, fib_seq)) + return 0; + } while (++retries < FIB_DUMP_MAX_RETRIES); + + return -EBUSY; } EXPORT_SYMBOL(register_fib_notifier); int unregister_fib_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_unregister(&fib_chain, nb); + return atomic_notifier_chain_unregister(&fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + net->ipv4.fib_seq++; info->net = net; - return blocking_notifier_call_chain(&fib_chain, event_type, info); + return atomic_notifier_call_chain(&fib_chain, event_type, info); } static int call_fib_entry_notifiers(struct net *net, @@ -719,6 +808,13 @@ static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; + unsigned char slen_max; + + /* only vector 0 can have a suffix length greater than or equal to + * tn->pos + tn->bits, the second highest node will have a suffix + * length at most of tn->pos + tn->bits - 1 + */ + slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen); /* search though the list of children looking for nodes that might * have a suffix greater than the one we currently have. This is @@ -736,12 +832,8 @@ static unsigned char update_suffix(struct key_vector *tn) slen = n->slen; i &= ~(stride - 1); - /* if slen covers all but the last bit we can stop here - * there will be nothing longer than that since only node - * 0 and 1 << (bits - 1) could have that as their suffix - * length. - */ - if ((slen + 1) >= (tn->pos + tn->bits)) + /* stop searching if we have hit the maximum possible value */ + if (slen >= slen_max) break; } @@ -913,39 +1005,27 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn) return collapse(t, tn); /* update parent in case halve failed */ - tp = node_parent(tn); - - /* Return if at least one deflate was run */ - if (max_work != MAX_WORK) - return tp; - - /* push the suffix length to the parent node */ - if (tn->slen > tn->pos) { - unsigned char slen = update_suffix(tn); - - if (slen > tp->slen) - tp->slen = slen; - } - - return tp; + return node_parent(tn); } -static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l) +static void node_pull_suffix(struct key_vector *tn, unsigned char slen) { - while ((tp->slen > tp->pos) && (tp->slen > l->slen)) { - if (update_suffix(tp) > l->slen) + unsigned char node_slen = tn->slen; + + while ((node_slen > tn->pos) && (node_slen > slen)) { + slen = update_suffix(tn); + if (node_slen == slen) break; - tp = node_parent(tp); + + tn = node_parent(tn); + node_slen = tn->slen; } } -static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l) +static void node_push_suffix(struct key_vector *tn, unsigned char slen) { - /* if this is a new leaf then tn will be NULL and we can sort - * out parent suffix lengths as a part of trie_rebalance - */ - while (tn->slen < l->slen) { - tn->slen = l->slen; + while (tn->slen < slen) { + tn->slen = slen; tn = node_parent(tn); } } @@ -1066,6 +1146,7 @@ static int fib_insert_node(struct trie *t, struct key_vector *tp, } /* Case 3: n is NULL, and will just insert a new leaf */ + node_push_suffix(tp, new->fa_slen); NODE_INIT_PARENT(l, tp); put_child_root(tp, key, l); trie_rebalance(t, tp); @@ -1107,7 +1188,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, /* if we added to the tail node then we need to update slen */ if (l->slen < new->fa_slen) { l->slen = new->fa_slen; - leaf_push_suffix(tp, l); + node_push_suffix(tp, new->fa_slen); } return 0; @@ -1499,6 +1580,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, * out parent suffix lengths as a part of trie_rebalance */ if (hlist_empty(&l->leaf)) { + if (tp->slen == l->slen) + node_pull_suffix(tp, tp->pos); put_child_root(tp, l->key, NULL); node_free(l); trie_rebalance(t, tp); @@ -1511,7 +1594,7 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, /* update the trie with the latest suffix length */ l->slen = fa->fa_slen; - leaf_pull_suffix(tp, l); + node_pull_suffix(tp, fa->fa_slen); } /* Caller must hold RTNL. */ @@ -1743,8 +1826,10 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) local_l = fib_find_node(lt, &local_tp, l->key); if (fib_insert_alias(lt, local_tp, local_l, new_fa, - NULL, l->key)) + NULL, l->key)) { + kmem_cache_free(fn_alias_kmem, new_fa); goto out; + } } /* stop loop if key wrapped back to 0 */ @@ -1760,6 +1845,75 @@ out: return NULL; } +/* Caller must hold RTNL */ +void fib_table_flush_external(struct fib_table *tb) +{ + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct hlist_node *tmp; + struct fib_alias *fa; + + /* walk trie in reverse order */ + for (;;) { + unsigned char slen = 0; + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; + + /* cannot resize the trie vector */ + if (IS_TRIE(pn)) + break; + + /* update the suffix to address pulled leaves */ + if (pn->slen > pn->pos) + update_suffix(pn); + + /* resize completed node */ + pn = resize(t, pn); + cindex = get_index(pkey, pn); + + continue; + } + + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; + + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; + + continue; + } + + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + /* if alias was cloned to local then we just + * need to remove the local copy from main + */ + if (tb->tb_id != fa->tb_id) { + hlist_del_rcu(&fa->fa_list); + alias_free_mem_rcu(fa); + continue; + } + + /* record local slen */ + slen = fa->fa_slen; + } + + /* update leaf slen */ + n->slen = slen; + + if (hlist_empty(&n->leaf)) { + put_child_root(pn, n->key, NULL); + node_free(n); + } + } +} + /* Caller must hold RTNL. */ int fib_table_flush(struct net *net, struct fib_table *tb) { @@ -1782,6 +1936,10 @@ int fib_table_flush(struct net *net, struct fib_table *tb) if (IS_TRIE(pn)) break; + /* update the suffix to address pulled leaves */ + if (pn->slen > pn->pos) + update_suffix(pn); + /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); @@ -1834,6 +1992,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb) return found; } +static void fib_leaf_notify(struct net *net, struct key_vector *l, + struct fib_table *tb, struct notifier_block *nb, + enum fib_event_type event_type) +{ + struct fib_alias *fa; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi) + continue; + + /* local and main table can share the same trie, + * so don't notify twice for the same entry. + */ + if (tb->tb_id != fa->tb_id) + continue; + + call_fib_entry_notifier(nb, net, event_type, l->key, + KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, + fa->fa_type, fa->tb_id, 0); + } +} + +static void fib_table_notify(struct net *net, struct fib_table *tb, + struct notifier_block *nb, + enum fib_event_type event_type) +{ + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *l, *tp = t->kv; + t_key key = 0; + + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { + fib_leaf_notify(net, l, tb, nb, event_type); + + key = l->key + 1; + /* stop in case of wrap around */ + if (key < l->key) + break; + } +} + +static void fib_notify(struct net *net, struct notifier_block *nb, + enum fib_event_type event_type) +{ + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb_hlist) + fib_table_notify(net, tb, nb, event_type); + } +} + static void __trie_free_rcu(struct rcu_head *head) { struct fib_table *tb = container_of(head, struct fib_table, rcu); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 691146abde2d..f79d7a8ab1c6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1047,12 +1047,12 @@ int icmp_rcv(struct sk_buff *skb) if (success) { consume_skb(skb); - return 0; + return NET_RX_SUCCESS; } drop: kfree_skb(skb); - return 0; + return NET_RX_DROP; csum_error: __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS); error: diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 606cc3e85d2b..15db786d50ed 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -162,7 +162,7 @@ static int unsolicited_report_interval(struct in_device *in_dev) } static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); -static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); +static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_clear_delrec(struct in_device *in_dev); static int sf_setstate(struct ip_mc_list *pmc); static void sf_markstate(struct ip_mc_list *pmc); @@ -1130,10 +1130,15 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) spin_unlock_bh(&in_dev->mc_tomb_lock); } -static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) +/* + * restore ip_mc_list deleted records + */ +static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc, *pmc_prev; - struct ip_sf_list *psf, *psf_next; + struct ip_sf_list *psf; + struct net *net = dev_net(in_dev->dev); + __be32 multiaddr = im->multiaddr; spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; @@ -1149,16 +1154,26 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) in_dev->mc_tomb = pmc->next; } spin_unlock_bh(&in_dev->mc_tomb_lock); + + spin_lock_bh(&im->lock); if (pmc) { - for (psf = pmc->tomb; psf; psf = psf_next) { - psf_next = psf->sf_next; - kfree(psf); + im->interface = pmc->interface; + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + im->sfmode = pmc->sfmode; + if (pmc->sfmode == MCAST_INCLUDE) { + im->tomb = pmc->tomb; + im->sources = pmc->sources; + for (psf = im->sources; psf; psf = psf->sf_next) + psf->sf_crcount = im->crcount; } in_dev_put(pmc->interface); - kfree(pmc); } + spin_unlock_bh(&im->lock); } +/* + * flush ip_mc_list deleted records + */ static void igmpv3_clear_delrec(struct in_device *in_dev) { struct ip_mc_list *pmc, *nextpmc; @@ -1366,7 +1381,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) ip_mc_hash_add(in_dev, im); #ifdef CONFIG_IP_MULTICAST - igmpv3_del_delrec(in_dev, im->multiaddr); + igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); if (!in_dev->dead) @@ -1626,8 +1641,12 @@ void ip_mc_remap(struct in_device *in_dev) ASSERT_RTNL(); - for_each_pmc_rtnl(in_dev, pmc) + for_each_pmc_rtnl(in_dev, pmc) { +#ifdef CONFIG_IP_MULTICAST + igmpv3_del_delrec(in_dev, pmc); +#endif igmp_group_added(pmc); + } } /* Device going down */ @@ -1648,7 +1667,6 @@ void ip_mc_down(struct in_device *in_dev) in_dev->mr_gq_running = 0; if (del_timer(&in_dev->mr_gq_timer)) __in_dev_put(in_dev); - igmpv3_clear_delrec(in_dev); #endif ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); @@ -1688,8 +1706,12 @@ void ip_mc_up(struct in_device *in_dev) #endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); - for_each_pmc_rtnl(in_dev, pmc) + for_each_pmc_rtnl(in_dev, pmc) { +#ifdef CONFIG_IP_MULTICAST + igmpv3_del_delrec(in_dev, pmc); +#endif igmp_group_added(pmc); + } } /* @@ -1704,13 +1726,13 @@ void ip_mc_destroy_dev(struct in_device *in_dev) /* Deactivate timers */ ip_mc_down(in_dev); +#ifdef CONFIG_IP_MULTICAST + igmpv3_clear_delrec(in_dev); +#endif while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; - - /* We've dropped the groups in ip_mc_down already */ - ip_mc_clear_src(i); ip_ma_put(i); } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 358f2c82b030..9ffc2625cddd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -74,6 +74,7 @@ #include <net/checksum.h> #include <net/inetpeer.h> #include <net/lwtunnel.h> +#include <linux/bpf-cgroup.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> @@ -107,6 +108,8 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) if (unlikely(!skb)) return 0; + skb->protocol = htons(ETH_P_IP); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); @@ -285,6 +288,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -303,6 +313,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_mc_finish_output(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + + return dev_loopback_xmit(net, sk, skb); +} + int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); @@ -340,7 +364,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -356,7 +380,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index c3776ff6749f..b3cc1335adbc 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -24,10 +24,11 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; + struct net_device *dev = skb_dst(skb)->dev; unsigned int hh_len; if (addr_type == RTN_UNSPEC) - addr_type = inet_addr_type(net, saddr); + addr_type = inet_addr_type_dev_table(net, dev, saddr); if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) flags |= FLOWI_FLAG_ANYSRC; else @@ -40,6 +41,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t fl4.saddr = saddr; fl4.flowi4_tos = RT_TOS(iph->tos); fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + if (!fl4.flowi4_oif) + fl4.flowi4_oif = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; fl4.flowi4_flags = flags; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 39004da318e2..1258a9ab62ef 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -411,17 +411,15 @@ static inline int check_target(struct arpt_entry *e, const char *name) } static inline int -find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) +find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, + struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; - unsigned long pcnt; int ret; - pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(pcnt)) + if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; - e->counters.pcnt = pcnt; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, @@ -439,7 +437,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) err: module_put(t->u.kernel.target->me); out: - xt_percpu_counter_free(e->counters.pcnt); + xt_percpu_counter_free(&e->counters); return ret; } @@ -519,7 +517,7 @@ static inline void cleanup_entry(struct arpt_entry *e) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - xt_percpu_counter_free(e->counters.pcnt); + xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in @@ -528,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e) static int translate_table(struct xt_table_info *newinfo, void *entry0, const struct arpt_replace *repl) { + struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct arpt_entry *iter; unsigned int *offsets; unsigned int i; @@ -590,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { - ret = find_check_entry(iter, repl->name, repl->size); + ret = find_check_entry(iter, repl->name, repl->size, + &alloc_state); if (ret != 0) break; ++i; @@ -1197,8 +1197,8 @@ static int translate_compat_table(struct xt_table_info **pinfo, newinfo->number = compatr->num_entries; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { - newinfo->hook_entry[i] = info->hook_entry[i]; - newinfo->underflow[i] = info->underflow[i]; + newinfo->hook_entry[i] = compatr->hook_entry[i]; + newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 46815c8a60d7..308b456723f0 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -531,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name) static int find_check_entry(struct ipt_entry *e, struct net *net, const char *name, - unsigned int size) + unsigned int size, + struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; @@ -539,12 +540,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - unsigned long pcnt; - pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(pcnt)) + if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; - e->counters.pcnt = pcnt; j = 0; mtpar.net = net; @@ -582,7 +580,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, cleanup_match(ematch, net); } - xt_percpu_counter_free(e->counters.pcnt); + xt_percpu_counter_free(&e->counters); return ret; } @@ -670,7 +668,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - xt_percpu_counter_free(e->counters.pcnt); + xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in @@ -679,6 +677,7 @@ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ipt_replace *repl) { + struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct ipt_entry *iter; unsigned int *offsets; unsigned int i; @@ -738,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { - ret = find_check_entry(iter, net, repl->name, repl->size); + ret = find_check_entry(iter, net, repl->name, repl->size, + &alloc_state); if (ret != 0) break; ++i; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index e6e206fa86c8..21db00d0362b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -419,7 +419,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) } cipinfo->config = config; - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -444,7 +444,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) clusterip_config_put(cipinfo->config); - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_get(par->net, par->family); } #ifdef CONFIG_COMPAT diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 34cfb9b0bc0a..a03e4e7ef5f9 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) pr_debug("bad rangesize %u\n", mr->rangesize); return -EINVAL; } - return 0; + return nf_ct_netns_get(par->net, par->family); } static unsigned int @@ -59,6 +59,11 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) xt_out(par)); } +static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); +} + static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV4, @@ -67,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = { .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg_check, + .destroy = masquerade_tg_destroy, .me = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 361411688221..30c0de53e254 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -418,12 +418,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par) e->ip.invflags & XT_INV_PROTO) return -EINVAL; - return nf_ct_l3proto_try_module_get(par->family); + return nf_ct_netns_get(par->net, par->family); } static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg4_reg __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 59b49945b481..f273098e48fd 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -83,10 +83,12 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) return true ^ invert; iph = ip_hdr(skb); - if (ipv4_is_multicast(iph->daddr)) { - if (ipv4_is_zeronet(iph->saddr)) - return ipv4_is_local_multicast(iph->daddr) ^ invert; + if (ipv4_is_zeronet(iph->saddr)) { + if (ipv4_is_lbcast(iph->daddr) || + ipv4_is_local_multicast(iph->daddr)) + return true ^ invert; } + flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 7130ed5dc1fa..fcfd071f4705 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -31,6 +31,13 @@ #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #include <net/netfilter/nf_log.h> +static int conntrack4_net_id __read_mostly; +static DEFINE_MUTEX(register_ipv4_hooks); + +struct conntrack4_net { + unsigned int users; +}; + static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_tuple *tuple) { @@ -307,9 +314,42 @@ static struct nf_sockopt_ops so_getorigdst = { .owner = THIS_MODULE, }; -static int ipv4_init_net(struct net *net) +static int ipv4_hooks_register(struct net *net) { - return 0; + struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); + int err = 0; + + mutex_lock(®ister_ipv4_hooks); + + cnet->users++; + if (cnet->users > 1) + goto out_unlock; + + err = nf_defrag_ipv4_enable(net); + if (err) { + cnet->users = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + + if (err) + cnet->users = 0; + out_unlock: + mutex_unlock(®ister_ipv4_hooks); + return err; +} + +static void ipv4_hooks_unregister(struct net *net) +{ + struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); + + mutex_lock(®ister_ipv4_hooks); + if (cnet->users && (--cnet->users == 0)) + nf_unregister_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + mutex_unlock(®ister_ipv4_hooks); } struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { @@ -325,7 +365,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .nlattr_to_tuple = ipv4_nlattr_to_tuple, .nla_policy = ipv4_nla_policy, #endif - .init_net = ipv4_init_net, + .net_ns_get = ipv4_hooks_register, + .net_ns_put = ipv4_hooks_unregister, .me = THIS_MODULE, }; @@ -340,6 +381,15 @@ static struct nf_conntrack_l4proto *builtin_l4proto4[] = { &nf_conntrack_l4proto_tcp4, &nf_conntrack_l4proto_udp4, &nf_conntrack_l4proto_icmp, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + &nf_conntrack_l4proto_sctp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite4, +#endif }; static int ipv4_net_init(struct net *net) @@ -369,6 +419,8 @@ static void ipv4_net_exit(struct net *net) static struct pernet_operations ipv4_net_ops = { .init = ipv4_net_init, .exit = ipv4_net_exit, + .id = &conntrack4_net_id, + .size = sizeof(struct conntrack4_net), }; static int __init nf_conntrack_l3proto_ipv4_init(void) @@ -376,7 +428,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) int ret = 0; need_conntrack(); - nf_defrag_ipv4_enable(); ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { @@ -390,17 +441,10 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) goto cleanup_sockopt; } - ret = nf_register_hooks(ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register hooks.\n"); - goto cleanup_pernet; - } - ret = nf_ct_l4proto_register(builtin_l4proto4, ARRAY_SIZE(builtin_l4proto4)); if (ret < 0) - goto cleanup_hooks; + goto cleanup_pernet; ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); if (ret < 0) { @@ -412,8 +456,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) cleanup_l4proto: nf_ct_l4proto_unregister(builtin_l4proto4, ARRAY_SIZE(builtin_l4proto4)); - cleanup_hooks: - nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); cleanup_pernet: unregister_pernet_subsys(&ipv4_net_ops); cleanup_sockopt: @@ -427,7 +469,6 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); nf_ct_l4proto_unregister(builtin_l4proto4, ARRAY_SIZE(builtin_l4proto4)); - nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); unregister_pernet_subsys(&ipv4_net_ops); nf_unregister_sockopt(&so_getorigdst); } diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index d88da36b383c..49bd6a54404f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -11,6 +11,7 @@ #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> +#include <net/netns/generic.h> #include <net/route.h> #include <net/ip.h> @@ -22,6 +23,8 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> +static DEFINE_MUTEX(defrag4_mutex); + static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { @@ -102,18 +105,50 @@ static struct nf_hook_ops ipv4_defrag_ops[] = { }, }; +static void __net_exit defrag4_net_exit(struct net *net) +{ + if (net->nf.defrag_ipv4) { + nf_unregister_net_hooks(net, ipv4_defrag_ops, + ARRAY_SIZE(ipv4_defrag_ops)); + net->nf.defrag_ipv4 = false; + } +} + +static struct pernet_operations defrag4_net_ops = { + .exit = defrag4_net_exit, +}; + static int __init nf_defrag_init(void) { - return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); + return register_pernet_subsys(&defrag4_net_ops); } static void __exit nf_defrag_fini(void) { - nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); + unregister_pernet_subsys(&defrag4_net_ops); } -void nf_defrag_ipv4_enable(void) +int nf_defrag_ipv4_enable(struct net *net) { + int err = 0; + + might_sleep(); + + if (net->nf.defrag_ipv4) + return 0; + + mutex_lock(&defrag4_mutex); + if (net->nf.defrag_ipv4) + goto out_unlock; + + err = nf_register_net_hooks(net, ipv4_defrag_ops, + ARRAY_SIZE(ipv4_defrag_ops)); + if (err == 0) + net->nf.defrag_ipv4 = true; + + out_unlock: + mutex_unlock(&defrag4_mutex); + return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 1b49966484b3..965b1a161369 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -101,12 +101,13 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, } iph = ip_hdr(pkt->skb); - if (ipv4_is_multicast(iph->daddr) && - ipv4_is_zeronet(iph->saddr) && - ipv4_is_local_multicast(iph->daddr)) { - nft_fib_store_result(dest, priv->result, pkt, - get_ifindex(pkt->skb->dev)); - return; + if (ipv4_is_zeronet(iph->saddr)) { + if (ipv4_is_lbcast(iph->daddr) || + ipv4_is_local_multicast(iph->daddr)) { + nft_fib_store_result(dest, priv->result, pkt, + get_ifindex(pkt->skb->dev)); + return; + } } if (priv->flags & NFTA_FIB_F_MARK) @@ -122,6 +123,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, fl4.saddr = get_saddr(iph->daddr); } + *dest = 0; + if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) return; @@ -198,7 +201,7 @@ nft_fib4_select_ops(const struct nft_ctx *ctx, if (!tb[NFTA_FIB_RESULT]) return ERR_PTR(-EINVAL); - result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT])); + result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); switch (result) { case NFT_FIB_RESULT_OIF: diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 4f697e431811..a0ea8aad1bf1 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -35,12 +35,19 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, &range, nft_out(pkt)); } +static void +nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_IPV4); +} + static struct nft_expr_type nft_masq_ipv4_type; static const struct nft_expr_ops nft_masq_ipv4_ops = { .type = &nft_masq_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), .eval = nft_masq_ipv4_eval, .init = nft_masq_init, + .destroy = nft_masq_ipv4_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, }; @@ -77,5 +84,5 @@ module_init(nft_masq_ipv4_module_init); module_exit(nft_masq_ipv4_module_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index 16df0493c5ce..1650ed23c15d 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -38,12 +38,19 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt)); } +static void +nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_IPV4); +} + static struct nft_expr_type nft_redir_ipv4_type; static const struct nft_expr_ops nft_redir_ipv4_ops = { .type = &nft_redir_ipv4_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), .eval = nft_redir_ipv4_eval, .init = nft_redir_init, + .destroy = nft_redir_ipv4_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, }; @@ -71,5 +78,5 @@ module_init(nft_redir_ipv4_module_init); module_exit(nft_redir_ipv4_module_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index d11129f1178d..5b2635e69a92 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -657,6 +657,10 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, if (len > 0xFFFF) return -EMSGSIZE; + /* Must have at least a full ICMP header. */ + if (len < icmph_len) + return -EINVAL; + /* * Check the flags. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d37fc6f7e679..fa5c037227cb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -531,13 +531,14 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { + const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; u32 mark = skb->mark; - __build_flow_key(sock_net(sk), fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) @@ -1602,6 +1603,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_unlock_bh(&fnhe_lock); } +static void set_lwt_redirect(struct rtable *rth) +{ + if (lwtunnel_output_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_output = rth->dst.output; + rth->dst.output = lwtunnel_output; + } + + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } +} + /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1691,14 +1705,7 @@ rt_cache: rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - if (lwtunnel_output_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_output = rth->dst.output; - rth->dst.output = lwtunnel_output; - } - if (lwtunnel_input_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_input = rth->dst.input; - rth->dst.input = lwtunnel_input; - } + set_lwt_redirect(rth); skb_dst_set(skb, &rth->dst); out: err = 0; @@ -1925,8 +1932,18 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } + if (do_cache) { - if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { + struct fib_nh *nh = &FIB_RES_NH(res); + + rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + WARN_ON(rth->dst.input == lwtunnel_input); + rth->dst.lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } + + if (unlikely(!rt_cache_route(nh, rth))) { rth->dst.flags |= DST_NOCACHE; rt_add_uncached_list(rth); } @@ -2154,8 +2171,7 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - if (lwtunnel_output_redirect(rth->dst.lwtstate)) - rth->dst.output = lwtunnel_output; + set_lwt_redirect(rth); return rth; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 0dc6286272aa..3e88467d70ee 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -334,6 +334,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq = tcp_rsk(req); treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; + treq->ts_off = 0; req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 913f9bbfc030..1ef3165114ba 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -663,9 +663,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, if (tcp_should_autocork(sk, skb, size_goal)) { /* avoid atomic op if TSQ_THROTTLED bit is already set */ - if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { + if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); - set_bit(TSQ_THROTTLED, &tp->tsq_flags); + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. @@ -996,8 +996,11 @@ do_error: goto out; out_err: /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && + err == -EAGAIN)) { sk->sk_write_space(sk); + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } return sk_stream_error(sk, flags, err); } @@ -1331,8 +1334,11 @@ do_error: out_err: err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && + err == -EAGAIN)) { sk->sk_write_space(sk); + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } release_sock(sk); return err; } @@ -2702,6 +2708,25 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_setsockopt); #endif +static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, + struct tcp_info *info) +{ + u64 stats[__TCP_CHRONO_MAX], total = 0; + enum tcp_chrono i; + + for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { + stats[i] = tp->chrono_stat[i - 1]; + if (i == tp->chrono_type) + stats[i] += tcp_time_stamp - tp->chrono_start; + stats[i] *= USEC_PER_SEC / HZ; + total += stats[i]; + } + + info->tcpi_busy_time = total; + info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; + info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; +} + /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { @@ -2794,6 +2819,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); + tcp_get_info_chrono_stats(tp, info); unlock_sock_fast(sk, slow); @@ -2815,6 +2841,26 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) } EXPORT_SYMBOL_GPL(tcp_get_info); +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *stats; + struct tcp_info info; + + stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + if (!stats) + return NULL; + + tcp_get_info_chrono_stats(tp, &info); + nla_put_u64_64bit(stats, TCP_NLA_BUSY, + info.tcpi_busy_time, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, + info.tcpi_rwnd_limited, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, + info.tcpi_sndbuf_limited, TCP_NLA_PAD); + return stats; +} + static int do_tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 1294af4e0127..79c4817abc94 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -68,8 +68,9 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret = 0; - /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) { + /* all algorithms must implement these */ + if (!ca->ssthresh || !ca->undo_cwnd || + !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } @@ -200,8 +201,10 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; - if (sk->sk_state != TCP_CLOSE) + if (sk->sk_state != TCP_CLOSE) { + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); tcp_init_congestion_control(sk); + } } /* Manage refcounts on socket close. */ @@ -441,10 +444,19 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); +u32 tcp_reno_undo_cwnd(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return max(tp->snd_cwnd, tp->snd_ssthresh << 1); +} +EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); + struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, }; diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 51139175bf61..5f5e5936760e 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -188,8 +188,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk) static void dctcp_update_alpha(struct sock *sk, u32 flags) { + const struct tcp_sock *tp = tcp_sk(sk); struct dctcp *ca = inet_csk_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); u32 acked_bytes = tp->snd_una - ca->prior_snd_una; /* If ack did not advance snd_una, count dupack as MSS size. @@ -229,13 +229,6 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) WRITE_ONCE(ca->dctcp_alpha, alpha); dctcp_reset(tp, ca); } - - if (flags & CA_ACK_ECE) { - unsigned int cwnd = dctcp_ssthresh(sk); - - if (cwnd != tp->snd_cwnd) - tp->snd_cwnd = cwnd; - } } static void dctcp_state(struct sock *sk, u8 new_state) @@ -342,6 +335,7 @@ static struct tcp_congestion_ops dctcp __read_mostly = { static struct tcp_congestion_ops dctcp_reno __read_mostly = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, .get_info = dctcp_get_info, .owner = THIS_MODULE, .name = "dctcp-reno", diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index db7842495a64..6d9879e93648 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -94,6 +94,7 @@ static const struct hstcp_aimd_val { struct hstcp { u32 ai; + u32 loss_cwnd; }; static void hstcp_init(struct sock *sk) @@ -150,16 +151,24 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 hstcp_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - const struct hstcp *ca = inet_csk_ca(sk); + struct hstcp *ca = inet_csk_ca(sk); + ca->loss_cwnd = tp->snd_cwnd; /* Do multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); } +static u32 hstcp_cwnd_undo(struct sock *sk) +{ + const struct hstcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, + .undo_cwnd = hstcp_cwnd_undo, .cong_avoid = hstcp_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 083831e359df..0f7175c3338e 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -166,6 +166,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = hybla_cong_avoid, .set_state = hybla_state, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index c8e6d86be114..60352ff4f5a8 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -48,6 +48,7 @@ struct illinois { u32 end_seq; /* right edge of current RTT */ u32 alpha; /* Additive increase */ u32 beta; /* Muliplicative decrease */ + u32 loss_cwnd; /* cwnd on loss */ u16 acked; /* # packets acked by current ACK */ u8 rtt_above; /* average rtt has gone above threshold */ u8 rtt_low; /* # of rtts measurements below threshold */ @@ -296,10 +297,18 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); + ca->loss_cwnd = tp->snd_cwnd; /* Multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); } +static u32 tcp_illinois_cwnd_undo(struct sock *sk) +{ + const struct illinois *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + /* Extract info for Tcp socket info provided via netlink. */ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) @@ -327,6 +336,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, static struct tcp_congestion_ops tcp_illinois __read_mostly = { .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, + .undo_cwnd = tcp_illinois_cwnd_undo, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a70046fea0e8..6c790754ae3e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,6 +85,7 @@ int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +EXPORT_SYMBOL(sysctl_tcp_timestamps); /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; @@ -128,6 +129,23 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define REXMIT_LOST 1 /* retransmit packets marked lost */ #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ +static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb) +{ + static bool __once __read_mostly; + + if (!__once) { + struct net_device *dev; + + __once = true; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); + pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", + dev ? dev->name : "Unknown driver"); + rcu_read_unlock(); + } +} + /* Adapt the MSS value used to make delayed ack decision to the * real world. */ @@ -144,7 +162,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { - icsk->icsk_ack.rcv_mss = len; + icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, + tcp_sk(sk)->advmss); + if (unlikely(icsk->icsk_ack.rcv_mss != len)) + tcp_gro_dev_warn(sk, skb); } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. @@ -2394,10 +2415,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->undo_cwnd) - tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); - else - tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); + tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; @@ -3181,6 +3199,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->lost_skb_hint = NULL; } + if (!skb) + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; @@ -5059,8 +5080,11 @@ static void tcp_check_space(struct sock *sk) /* pairs with tcp_poll() */ smp_mb__after_atomic(); if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } } } @@ -6304,6 +6328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; tcp_rsk(req)->af_specific = af_ops; + tcp_rsk(req)->ts_off = 0; tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; @@ -6325,6 +6350,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; + if (isn && tmp_opt.tstamp_ok) + af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); + if (!want_cookie && !isn) { /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering @@ -6365,7 +6393,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_release; } - isn = af_ops->init_seq(skb); + isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); } if (!dst) { dst = af_ops->route_req(sk, &fl, req, NULL); @@ -6377,6 +6405,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + tcp_rsk(req)->ts_off = 0; req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5555eb86e549..30d81f533ada 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -95,12 +95,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static __u32 tcp_v4_init_sequence(const struct sk_buff *skb) +static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); + tcp_hdr(skb)->source, tsoff); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -237,7 +237,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, - usin->sin_port); + usin->sin_port, + &tp->tsoffset); inet->inet_id = tp->write_seq ^ jiffies; @@ -442,7 +443,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); } else { - if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } goto out; @@ -824,7 +825,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp, + tcp_time_stamp + tcp_rsk(req)->ts_off, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index c67ece1390c2..046fd3910873 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -316,6 +316,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) static struct tcp_congestion_ops tcp_lp __read_mostly = { .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_lp_cong_avoid, .pkts_acked = tcp_lp_pkts_acked, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6234ebaa7db1..28ce5ee831f5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -532,7 +532,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } - newtp->tsoffset = 0; + newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ if (newtp->af_specific->md5_lookup(sk, newsk)) @@ -581,6 +581,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; + if (tmp_opt.rcv_tsecr) + tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f57b5aa51b59..b45101f3d2bd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -640,7 +640,7 @@ static unsigned int tcp_synack_options(struct request_sock *req, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp(skb); + opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off; opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -769,25 +769,26 @@ static void tcp_tasklet_func(unsigned long data) list_del(&tp->tsq_node); sk = (struct sock *)tp; - bh_lock_sock(sk); - - if (!sock_owned_by_user(sk)) { - tcp_tsq_handler(sk); - } else { - /* defer the work to tcp_release_cb() */ - set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); + clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); + + if (!sk->sk_lock.owned && + test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) { + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); + tcp_tsq_handler(sk); + } + bh_unlock_sock(sk); } - bh_unlock_sock(sk); - clear_bit(TSQ_QUEUED, &tp->tsq_flags); sk_free(sk); } } -#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ - (1UL << TCP_WRITE_TIMER_DEFERRED) | \ - (1UL << TCP_DELACK_TIMER_DEFERRED) | \ - (1UL << TCP_MTU_REDUCED_DEFERRED)) +#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ + TCPF_WRITE_TIMER_DEFERRED | \ + TCPF_DELACK_TIMER_DEFERRED | \ + TCPF_MTU_REDUCED_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket @@ -797,18 +798,17 @@ static void tcp_tasklet_func(unsigned long data) */ void tcp_release_cb(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nflags; /* perform an atomic operation only if at least one flag is set */ do { - flags = tp->tsq_flags; + flags = sk->sk_tsq_flags; if (!(flags & TCP_DEFERRED_ALL)) return; nflags = flags & ~TCP_DEFERRED_ALL; - } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); + } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); - if (flags & (1UL << TCP_TSQ_DEFERRED)) + if (flags & TCPF_TSQ_DEFERRED) tcp_tsq_handler(sk); /* Here begins the tricky part : @@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk) */ sock_release_ownership(sk); - if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { + if (flags & TCPF_WRITE_TIMER_DEFERRED) { tcp_write_timer_handler(sk); __sock_put(sk); } - if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { + if (flags & TCPF_DELACK_TIMER_DEFERRED) { tcp_delack_timer_handler(sk); __sock_put(sk); } - if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { + if (flags & TCPF_MTU_REDUCED_DEFERRED) { inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } @@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); + unsigned long flags, nval, oval; int wmem; /* Keep one reference on sk_wmem_alloc. @@ -877,16 +878,25 @@ void tcp_wfree(struct sk_buff *skb) if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) goto out; - if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && - !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { - unsigned long flags; + for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { struct tsq_tasklet *tsq; + bool empty; + + if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) + goto out; + + nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; + nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); + if (nval != oval) + continue; /* queue this socket to tasklet queue */ local_irq_save(flags); tsq = this_cpu_ptr(&tsq_tasklet); + empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); - tasklet_schedule(&tsq->tasklet); + if (empty) + tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); return; } @@ -1514,6 +1524,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (sysctl_tcp_slow_start_after_idle && (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) tcp_cwnd_application_limited(sk); + + /* The following conditions together indicate the starvation + * is caused by insufficient sender buffer: + * 1) just sent some data (see tcp_write_xmit) + * 2) not cwnd limited (this else condition) + * 3) no more data to send (null tcp_send_head ) + * 4) application is hitting buffer limit (SOCK_NOSPACE) + */ + if (!tcp_send_head(sk) && sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && + (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); } } @@ -1910,26 +1932,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) */ static int tcp_mtu_probe(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *nskb, *next; struct net *net = sock_net(sk); - int len; int probe_size; int size_needed; - int copy; + int copy, len; int mss_now; int interval; /* Not currently probing/verifying, * not in recovery, * have enough cwnd, and - * not SACKing (the variable headers throw things off) */ - if (!icsk->icsk_mtup.enabled || - icsk->icsk_mtup.probe_size || - inet_csk(sk)->icsk_ca_state != TCP_CA_Open || - tp->snd_cwnd < 11 || - tp->rx_opt.num_sacks || tp->rx_opt.dsack) + * not SACKing (the variable headers throw things off) + */ + if (likely(!icsk->icsk_mtup.enabled || + icsk->icsk_mtup.probe_size || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open || + tp->snd_cwnd < 11 || + tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; /* Use binary search for probe_size between tcp_mss_base, @@ -2069,7 +2091,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit <<= factor; if (atomic_read(&sk->sk_wmem_alloc) > limit) { - set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); + /* Always send the 1st or 2nd skb in write queue. + * No need to wait for TX completion to call us back, + * after softirq/tasklet schedule. + * This helps when TX completions are delayed too much. + */ + if (skb == sk->sk_write_queue.next || + skb->prev == sk->sk_write_queue.next) + return false; + + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. @@ -2081,6 +2112,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, return false; } +static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) +{ + const u32 now = tcp_time_stamp; + + if (tp->chrono_type > TCP_CHRONO_UNSPEC) + tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; + tp->chrono_start = now; + tp->chrono_type = new; +} + +void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* If there are multiple conditions worthy of tracking in a + * chronograph then the highest priority enum takes precedence + * over the other conditions. So that if something "more interesting" + * starts happening, stop the previous chrono and start a new one. + */ + if (type > tp->chrono_type) + tcp_chrono_set(tp, type); +} + +void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) +{ + struct tcp_sock *tp = tcp_sk(sk); + + + /* There are multiple conditions worthy of tracking in a + * chronograph, so that the highest priority enum takes + * precedence over the other conditions (see tcp_chrono_start). + * If a condition stops, we only stop chrono tracking if + * it's the "most interesting" or current chrono we are + * tracking and starts busy chrono if we have pending data. + */ + if (tcp_write_queue_empty(sk)) + tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); + else if (type == tp->chrono_type) + tcp_chrono_set(tp, TCP_CHRONO_BUSY); +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -2103,7 +2175,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; - bool is_cwnd_limited = false; + bool is_cwnd_limited = false, is_rwnd_limited = false; u32 max_segs; sent_pkts = 0; @@ -2140,8 +2212,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } - if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { + is_rwnd_limited = true; break; + } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, @@ -2167,6 +2241,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; + if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) + clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); if (tcp_small_queue_check(sk, skb, 0)) break; @@ -2186,6 +2262,11 @@ repair: break; } + if (is_rwnd_limited) + tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); + else + tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2514,7 +2595,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, } /* Collapses two adjacent SKB's during retransmission. */ -static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) +static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); @@ -2525,14 +2606,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); + if (next_skb_size) { + if (next_skb_size <= skb_availroom(skb)) + skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), + next_skb_size); + else if (!skb_shift(skb, next_skb, next_skb_size)) + return false; + } tcp_highest_sack_combine(sk, next_skb, skb); tcp_unlink_write_queue(next_skb, sk); - if (next_skb_size) - skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), - next_skb_size); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2561,6 +2645,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); sk_wmem_free_skb(sk, next_skb); + return true; } /* Check if coalescing SKBs is legal. */ @@ -2610,16 +2695,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (space < 0) break; - /* Punt if not enough space exists in the first SKB for - * the data in the second - */ - if (skb->len > skb_availroom(to)) - break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) break; - tcp_collapse_retrans(sk, to); + if (!tcp_collapse_retrans(sk, to)) + break; } } @@ -3298,6 +3379,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) fo->copied = space; tcp_connect_queue_skb(sk, syn_data); + if (syn_data->len) + tcp_chrono_start(sk, TCP_CHRONO_BUSY); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); @@ -3462,8 +3545,6 @@ void tcp_send_ack(struct sock *sk) /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 - * We also avoid tcp_wfree() overhead (cache line miss accessing - * tp->tsq_flags) by using regular sock_wfree() */ skb_set_tcp_pure_ack(buff); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index bf5ea9e9bbc1..f2123075ce6e 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,6 +15,10 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 +struct scalable { + u32 loss_cwnd; +}; + static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -32,12 +36,23 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + struct scalable *ca = inet_csk_ca(sk); + + ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } +static u32 tcp_scalable_cwnd_undo(struct sock *sk) +{ + const struct scalable *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, + .undo_cwnd = tcp_scalable_cwnd_undo, .cong_avoid = tcp_scalable_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3ea1cf804748..3705075f42c3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -310,7 +310,7 @@ static void tcp_delack_timer(unsigned long data) inet_csk(sk)->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); @@ -592,7 +592,7 @@ static void tcp_write_timer(unsigned long data) tcp_write_timer_handler(sk); } else { /* delegate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 4c4bac1b5eab..218cfcc77650 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas __read_mostly = { .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_vegas_cong_avoid, .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 40171e163cff..76005d4b8dfc 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -30,6 +30,7 @@ struct veno { u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ u32 inc; /* decide whether to increase cwnd */ u32 diff; /* calculate the diff rate */ + u32 loss_cwnd; /* cwnd when loss occured */ }; /* There are several situations when we must "re-start" Veno: @@ -193,6 +194,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); + veno->loss_cwnd = tp->snd_cwnd; if (veno->diff < beta) /* in "non-congestive state", cut cwnd by 1/5 */ return max(tp->snd_cwnd * 4 / 5, 2U); @@ -201,9 +203,17 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } +static u32 tcp_veno_cwnd_undo(struct sock *sk) +{ + const struct veno *veno = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd); +} + static struct tcp_congestion_ops tcp_veno __read_mostly = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, + .undo_cwnd = tcp_veno_cwnd_undo, .cong_avoid = tcp_veno_cong_avoid, .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 4b03a2e2a050..fed66dc0e0f5 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -278,6 +278,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = tcp_westwood_event, .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 9c5fc973267f..e6ff99c4bd3b 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -37,6 +37,7 @@ struct yeah { u32 fast_count; u32 pkts_acked; + u32 loss_cwnd; }; static void tcp_yeah_init(struct sock *sk) @@ -219,13 +220,22 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) yeah->fast_count = 0; yeah->reno_count = max(yeah->reno_count>>1, 2U); + yeah->loss_cwnd = tp->snd_cwnd; return max_t(int, tp->snd_cwnd - reduction, 2); } +static u32 tcp_yeah_cwnd_undo(struct sock *sk) +{ + const struct yeah *yeah = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd); +} + static struct tcp_congestion_ops tcp_yeah __read_mostly = { .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, + .undo_cwnd = tcp_yeah_cwnd_undo, .cong_avoid = tcp_yeah_cong_avoid, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e1fc0116e8d5..9ca279b130d5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1177,42 +1177,102 @@ out: /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, int size, int partial) { + struct udp_sock *up = udp_sk(sk); int amt; - atomic_sub(size, &sk->sk_rmem_alloc); + if (likely(partial)) { + up->forward_deficit += size; + size = up->forward_deficit; + if (size < (sk->sk_rcvbuf >> 2) && + !skb_queue_empty(&sk->sk_receive_queue)) + return; + } else { + size += up->forward_deficit; + } + up->forward_deficit = 0; + sk->sk_forward_alloc += size; amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1); sk->sk_forward_alloc -= amt; if (amt) __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT); + + atomic_sub(size, &sk->sk_rmem_alloc); } -/* Note: called with sk_receive_queue.lock held */ +/* Note: called with sk_receive_queue.lock held. + * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch + * This avoids a cache line miss while receive_queue lock is held. + * Look at __udp_enqueue_schedule_skb() to find where this copy is done. + */ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) { - udp_rmem_release(sk, skb->truesize, 1); + udp_rmem_release(sk, skb->dev_scratch, 1); } EXPORT_SYMBOL(udp_skb_destructor); +/* Idea of busylocks is to let producers grab an extra spinlock + * to relieve pressure on the receive_queue spinlock shared by consumer. + * Under flood, this means that only one producer can be in line + * trying to acquire the receive_queue spinlock. + * These busylock can be allocated on a per cpu manner, instead of a + * per socket one (that would consume a cache line per socket) + */ +static int udp_busylocks_log __read_mostly; +static spinlock_t *udp_busylocks __read_mostly; + +static spinlock_t *busylock_acquire(void *ptr) +{ + spinlock_t *busy; + + busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); + spin_lock(busy); + return busy; +} + +static void busylock_release(spinlock_t *busy) +{ + if (busy) + spin_unlock(busy); +} + int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, delta, amt, err = -ENOMEM; - int size = skb->truesize; + spinlock_t *busy = NULL; + int size; /* try to avoid the costly atomic add/sub pair when the receive * queue is full; always allow at least a packet */ rmem = atomic_read(&sk->sk_rmem_alloc); - if (rmem && (rmem + size > sk->sk_rcvbuf)) + if (rmem > sk->sk_rcvbuf) goto drop; + /* Under mem pressure, it might be helpful to help udp_recvmsg() + * having linear skbs : + * - Reduce memory overhead and thus increase receive queue capacity + * - Less cache line misses at copyout() time + * - Less work at consume_skb() (less alien page frag freeing) + */ + if (rmem > (sk->sk_rcvbuf >> 1)) { + skb_condense(skb); + + busy = busylock_acquire(sk); + } + size = skb->truesize; + /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss + * in udp_skb_destructor() + */ + skb->dev_scratch = size; + /* we drop only if the receive buf is full and the receive * queue contains some other skb */ rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if ((rmem > sk->sk_rcvbuf) && (rmem > size)) + if (rmem > (size + sk->sk_rcvbuf)) goto uncharge_drop; spin_lock(&list->lock); @@ -1233,7 +1293,6 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) /* no need to setup a destructor, we will explicitly release the * forward allocated memory on dequeue */ - skb->dev = NULL; sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); @@ -1242,6 +1301,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); + busylock_release(busy); return 0; uncharge_drop: @@ -1249,6 +1309,7 @@ uncharge_drop: drop: atomic_inc(&sk->sk_drops); + busylock_release(busy); return err; } EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); @@ -1389,7 +1450,8 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) { + if (copied < ulen || peeking || + (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { checksum_valid = !udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; @@ -1561,7 +1623,7 @@ static void udp_v4_rehash(struct sock *sk) udp_lib_rehash(sk, new_hash); } -static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1742,10 +1804,10 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & - udp_table.mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask; + udptable->mask; + hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udp_table.hash2[hash2]; + hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -2602,6 +2664,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd); void __init udp_init(void) { unsigned long limit; + unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -2612,4 +2675,13 @@ void __init udp_init(void) sysctl_udp_rmem_min = SK_MEM_QUANTUM; sysctl_udp_wmem_min = SK_MEM_QUANTUM; + + /* 16 spinlocks per cpu */ + udp_busylocks_log = ilog2(nr_cpu_ids) + 4; + udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, + GFP_KERNEL); + if (!udp_busylocks) + panic("UDP: failed to alloc udp_busylocks\n"); + for (i = 0; i < (1U << udp_busylocks_log); i++) + spin_lock_init(udp_busylocks + i); } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 7e0fe4bdd967..feb50a16398d 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -25,7 +25,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 86219c0a0104..c1e124bc8e1e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -183,7 +183,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, static void addrconf_dad_start(struct inet6_ifaddr *ifp); static void addrconf_dad_work(struct work_struct *w); -static void addrconf_dad_completed(struct inet6_ifaddr *ifp); +static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id); static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(unsigned long data); static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); @@ -242,6 +242,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { #ifdef CONFIG_IPV6_SEG6_HMAC .seg6_require_hmac = 0, #endif + .enhanced_dad = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -292,6 +293,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { #ifdef CONFIG_IPV6_SEG6_HMAC .seg6_require_hmac = 0, #endif + .enhanced_dad = 1, }; /* Check if a valid qdisc is available */ @@ -2906,6 +2908,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; spin_unlock_bh(&ifp->lock); + rt_genid_bump_ipv6(dev_net(idev->dev)); ipv6_ifa_notify(RTM_NEWADDR, ifp); in6_ifa_put(ifp); } @@ -3734,12 +3737,21 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) { unsigned long rand_num; struct inet6_dev *idev = ifp->idev; + u64 nonce; if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1); + nonce = 0; + if (idev->cnf.enhanced_dad || + dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) { + do + get_random_bytes(&nonce, 6); + while (nonce == 0); + } + ifp->dad_nonce = nonce; ifp->dad_probes = idev->cnf.dad_transmits; addrconf_mod_dad_work(ifp, rand_num); } @@ -3748,7 +3760,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; - bool notify = false; + bool bump_id, notify = false; addrconf_join_solict(dev, &ifp->addr); @@ -3763,11 +3775,12 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) idev->cnf.accept_dad < 1 || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { + bump_id = ifp->flags & IFA_F_TENTATIVE; ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); - addrconf_dad_completed(ifp); + addrconf_dad_completed(ifp, bump_id); return; } @@ -3827,8 +3840,8 @@ static void addrconf_dad_work(struct work_struct *w) struct inet6_ifaddr, dad_work); struct inet6_dev *idev = ifp->idev; + bool bump_id, disable_ipv6 = false; struct in6_addr mcaddr; - bool disable_ipv6 = false; enum { DAD_PROCESS, @@ -3898,11 +3911,12 @@ static void addrconf_dad_work(struct work_struct *w) * DAD was successful */ + bump_id = ifp->flags & IFA_F_TENTATIVE; ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); spin_unlock(&ifp->lock); write_unlock_bh(&idev->lock); - addrconf_dad_completed(ifp); + addrconf_dad_completed(ifp, bump_id); goto out; } @@ -3915,7 +3929,8 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any); + ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any, + ifp->dad_nonce); out: in6_ifa_put(ifp); rtnl_unlock(); @@ -3939,7 +3954,7 @@ static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp) return true; } -static void addrconf_dad_completed(struct inet6_ifaddr *ifp) +static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id) { struct net_device *dev = ifp->idev->dev; struct in6_addr lladdr; @@ -3991,6 +4006,9 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) spin_unlock(&ifp->lock); write_unlock_bh(&ifp->idev->lock); } + + if (bump_id) + rt_genid_bump_ipv6(dev_net(dev)); } static void addrconf_dad_run(struct inet6_dev *idev) @@ -4956,6 +4974,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, #ifdef CONFIG_IPV6_SEG6_HMAC array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac; #endif + array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; } static inline size_t inet6_ifla6_size(void) @@ -6064,6 +6083,13 @@ static const struct ctl_table addrconf_sysctl[] = { }, #endif { + .procname = "enhanced_dad", + .data = &ipv6_devconf.enhanced_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { /* sentinel */ } }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d424f3a3737a..237e654ba717 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -258,6 +258,14 @@ lookup_protocol: goto out; } } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { + sk_common_release(sk); + goto out; + } + } out: return err; out_rcu_unlock: diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c5d76d2edd26..0489e19258ad 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -140,7 +140,8 @@ void ip6_datagram_release_cb(struct sock *sk) } EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); -static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); @@ -253,6 +254,7 @@ ipv4_connected: out: return err; } +EXPORT_SYMBOL_GPL(__ip6_datagram_connect); int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 218f0cba231c..cbcdd5db31f4 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -418,7 +418,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) esph = (void *)skb_push(skb, 4); *seqhi = esph->spi; esph->spi = esph->seq_no; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi); + esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; aead_request_set_callback(req, 0, esp_input_done_esn, skb); } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index eb948ffd734b..17fa28f7a0ff 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -448,8 +448,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; - else - iif = l3mdev_master_ifindex(skb_dst(skb)->dev); + else { + dst = skb_dst(skb); + iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev); + } /* * Must not send error if the source does not uniquely diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 1fcf61f1cbc3..89c59e656f44 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -99,7 +99,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, segs = ops->callbacks.gso_segment(skb, features); } - if (IS_ERR(segs)) + if (IS_ERR_OR_NULL(segs)) goto out; gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 312cbd0e5038..70d0de404197 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -39,6 +39,7 @@ #include <linux/module.h> #include <linux/slab.h> +#include <linux/bpf-cgroup.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -131,6 +132,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d3c619eda051..8b186b56183a 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1034,6 +1034,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, int mtu; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; + bool use_cache = false; u8 hop_limit; int err = -1; @@ -1066,7 +1067,15 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); - } else if (!fl6->flowi6_mark) + } else if (!(t->parms.flags & + (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { + /* enable the cache only only if the routing decision does + * not depend on the current inner header value + */ + use_cache = true; + } + + if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) @@ -1150,7 +1159,7 @@ route_lookup: if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { - if (!fl6->flowi6_mark && ndst) + if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); @@ -1172,7 +1181,6 @@ route_lookup: if (err) return err; - skb->protocol = htons(ETH_P_IPV6); skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index c476bb8e9cdb..f4b4a4a5f4ba 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1122,6 +1122,33 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .priority = 100, }; +static bool is_vti6_tunnel(const struct net_device *dev) +{ + return dev->netdev_ops == &vti6_netdev_ops; +} + +static int vti6_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct ip6_tnl *t = netdev_priv(dev); + + if (!is_vti6_tunnel(dev)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_DOWN: + if (!net_eq(t->net, dev_net(dev))) + xfrm_garbage_collect(t->net); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block vti6_notifier_block __read_mostly = { + .notifier_call = vti6_device_event, +}; + /** * vti6_tunnel_init - register protocol and reserve needed resources * @@ -1132,6 +1159,8 @@ static int __init vti6_tunnel_init(void) const char *msg; int err; + register_netdevice_notifier(&vti6_notifier_block); + msg = "tunnel device"; err = register_pernet_device(&vti6_net_ops); if (err < 0) @@ -1164,6 +1193,7 @@ xfrm_proto_ah_failed: xfrm_proto_esp_failed: unregister_pernet_device(&vti6_net_ops); pernet_dev_failed: + unregister_netdevice_notifier(&vti6_notifier_block); pr_err("vti6 init: failed to register %s\n", msg); return err; } @@ -1178,6 +1208,7 @@ static void __exit vti6_tunnel_cleanup(void) xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); unregister_pernet_device(&vti6_net_ops); + unregister_netdevice_notifier(&vti6_notifier_block); } module_init(vti6_tunnel_init); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d8e671457d10..7ebac630d3c6 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -233,6 +233,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: case ND_OPT_MTU: + case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { ND_PRINTK(2, warn, @@ -568,7 +569,8 @@ static void ndisc_send_unsol_na(struct net_device *dev) } void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr) + const struct in6_addr *daddr, const struct in6_addr *saddr, + u64 nonce) { struct sk_buff *skb; struct in6_addr addr_buf; @@ -588,6 +590,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_SOLICITATION); + if (nonce != 0) + optlen += 8; skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -605,6 +609,13 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_SOLICITATION); + if (nonce != 0) { + u8 *opt = skb_put(skb, 8); + + opt[0] = ND_OPT_NONCE; + opt[1] = 8 >> 3; + memcpy(opt + 2, &nonce, 6); + } ndisc_send_skb(skb, daddr, saddr); } @@ -693,12 +704,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, target, target, saddr); + ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, target, &mcaddr, saddr); + ndisc_send_ns(dev, target, &mcaddr, saddr, 0); } } @@ -742,6 +753,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) int dad = ipv6_addr_any(saddr); bool inc; int is_router = -1; + u64 nonce = 0; if (skb->len < sizeof(struct nd_msg)) { ND_PRINTK(2, warn, "NS: packet too short\n"); @@ -786,6 +798,8 @@ static void ndisc_recv_ns(struct sk_buff *skb) return; } } + if (ndopts.nd_opts_nonce) + memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6); inc = ipv6_addr_is_multicast(daddr); @@ -794,6 +808,15 @@ static void ndisc_recv_ns(struct sk_buff *skb) have_ifp: if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { + if (nonce != 0 && ifp->dad_nonce == nonce) { + u8 *np = (u8 *)&nonce; + /* Matching nonce if looped back */ + ND_PRINTK(2, notice, + "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", + ifp->idev->dev->name, + &ifp->addr, np); + goto out; + } /* * We are colliding with another node * who is doing DAD diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 6ff42b8301cc..d56d8ac09a94 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -562,7 +562,8 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name) static int find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, - unsigned int size) + unsigned int size, + struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; @@ -570,12 +571,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - unsigned long pcnt; - pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(pcnt)) + if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; - e->counters.pcnt = pcnt; j = 0; mtpar.net = net; @@ -612,7 +610,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, cleanup_match(ematch, net); } - xt_percpu_counter_free(e->counters.pcnt); + xt_percpu_counter_free(&e->counters); return ret; } @@ -699,8 +697,7 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - - xt_percpu_counter_free(e->counters.pcnt); + xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in @@ -709,6 +706,7 @@ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ip6t_replace *repl) { + struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct ip6t_entry *iter; unsigned int *offsets; unsigned int i; @@ -768,7 +766,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { - ret = find_check_entry(iter, net, repl->name, repl->size); + ret = find_check_entry(iter, net, repl->name, repl->size, + &alloc_state); if (ret != 0) break; ++i; diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 99a1216287c8..98c8dd38575a 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -440,12 +440,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par) e->ipv6.invflags & XT_INV_PROTO) return -EINVAL; - return nf_ct_l3proto_try_module_get(par->family); + return nf_ct_netns_get(par->net, par->family); } static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg6_reg __read_mostly = { diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 500be28ff563..4e3402486833 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -34,6 +34,13 @@ #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/netfilter/nf_log.h> +static int conntrack6_net_id; +static DEFINE_MUTEX(register_ipv6_hooks); + +struct conntrack6_net { + unsigned int users; +}; + static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_tuple *tuple) { @@ -308,6 +315,42 @@ static int ipv6_nlattr_tuple_size(void) } #endif +static int ipv6_hooks_register(struct net *net) +{ + struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); + int err = 0; + + mutex_lock(®ister_ipv6_hooks); + cnet->users++; + if (cnet->users > 1) + goto out_unlock; + + err = nf_defrag_ipv6_enable(net); + if (err < 0) { + cnet->users = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + if (err) + cnet->users = 0; + out_unlock: + mutex_unlock(®ister_ipv6_hooks); + return err; +} + +static void ipv6_hooks_unregister(struct net *net) +{ + struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); + + mutex_lock(®ister_ipv6_hooks); + if (cnet->users && (--cnet->users == 0)) + nf_unregister_net_hooks(net, ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + mutex_unlock(®ister_ipv6_hooks); +} + struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .l3proto = PF_INET6, .name = "ipv6", @@ -321,6 +364,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .nlattr_to_tuple = ipv6_nlattr_to_tuple, .nla_policy = ipv6_nla_policy, #endif + .net_ns_get = ipv6_hooks_register, + .net_ns_put = ipv6_hooks_unregister, .me = THIS_MODULE, }; @@ -340,6 +385,15 @@ static struct nf_conntrack_l4proto *builtin_l4proto6[] = { &nf_conntrack_l4proto_tcp6, &nf_conntrack_l4proto_udp6, &nf_conntrack_l4proto_icmpv6, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp6, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + &nf_conntrack_l4proto_sctp6, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite6, +#endif }; static int ipv6_net_init(struct net *net) @@ -370,6 +424,8 @@ static void ipv6_net_exit(struct net *net) static struct pernet_operations ipv6_net_ops = { .init = ipv6_net_init, .exit = ipv6_net_exit, + .id = &conntrack6_net_id, + .size = sizeof(struct conntrack6_net), }; static int __init nf_conntrack_l3proto_ipv6_init(void) @@ -377,7 +433,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) int ret = 0; need_conntrack(); - nf_defrag_ipv6_enable(); ret = nf_register_sockopt(&so_getorigdst6); if (ret < 0) { @@ -389,18 +444,10 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) if (ret < 0) goto cleanup_sockopt; - ret = nf_register_hooks(ipv6_conntrack_ops, - ARRAY_SIZE(ipv6_conntrack_ops)); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: can't register pre-routing defrag " - "hook.\n"); - goto cleanup_pernet; - } - ret = nf_ct_l4proto_register(builtin_l4proto6, ARRAY_SIZE(builtin_l4proto6)); if (ret < 0) - goto cleanup_hooks; + goto cleanup_pernet; ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6); if (ret < 0) { @@ -411,8 +458,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) cleanup_l4proto: nf_ct_l4proto_unregister(builtin_l4proto6, ARRAY_SIZE(builtin_l4proto6)); - cleanup_hooks: - nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); cleanup_pernet: unregister_pernet_subsys(&ipv6_net_ops); cleanup_sockopt: @@ -426,7 +471,6 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void) nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6); nf_ct_l4proto_unregister(builtin_l4proto6, ARRAY_SIZE(builtin_l4proto6)); - nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); unregister_pernet_subsys(&ipv6_net_ops); nf_unregister_sockopt(&so_getorigdst6); } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index e4347aeb2e65..9948b5ce52da 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -576,11 +576,11 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) /* Jumbo payload inhibits frag. header */ if (ipv6_hdr(skb)->payload_len == 0) { pr_debug("payload len = 0\n"); - return -EINVAL; + return 0; } if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) - return -EINVAL; + return 0; if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) return -ENOMEM; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index f7aab5ab93a5..8e0bdd058787 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -30,6 +30,8 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +static DEFINE_MUTEX(defrag6_mutex); + static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, struct sk_buff *skb) { @@ -69,7 +71,7 @@ static unsigned int ipv6_defrag(void *priv, if (err == -EINPROGRESS) return NF_STOLEN; - return NF_ACCEPT; + return err == 0 ? NF_ACCEPT : NF_DROP; } static struct nf_hook_ops ipv6_defrag_ops[] = { @@ -87,6 +89,19 @@ static struct nf_hook_ops ipv6_defrag_ops[] = { }, }; +static void __net_exit defrag6_net_exit(struct net *net) +{ + if (net->nf.defrag_ipv6) { + nf_unregister_net_hooks(net, ipv6_defrag_ops, + ARRAY_SIZE(ipv6_defrag_ops)); + net->nf.defrag_ipv6 = false; + } +} + +static struct pernet_operations defrag6_net_ops = { + .exit = defrag6_net_exit, +}; + static int __init nf_defrag_init(void) { int ret = 0; @@ -96,9 +111,9 @@ static int __init nf_defrag_init(void) pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); return ret; } - ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + ret = register_pernet_subsys(&defrag6_net_ops); if (ret < 0) { - pr_err("nf_defrag_ipv6: can't register hooks\n"); + pr_err("nf_defrag_ipv6: can't register pernet ops\n"); goto cleanup_frag6; } return ret; @@ -111,12 +126,31 @@ cleanup_frag6: static void __exit nf_defrag_fini(void) { - nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + unregister_pernet_subsys(&defrag6_net_ops); nf_ct_frag6_cleanup(); } -void nf_defrag_ipv6_enable(void) +int nf_defrag_ipv6_enable(struct net *net) { + int err = 0; + + might_sleep(); + + if (net->nf.defrag_ipv6) + return 0; + + mutex_lock(&defrag6_mutex); + if (net->nf.defrag_ipv6) + goto out_unlock; + + err = nf_register_net_hooks(net, ipv6_defrag_ops, + ARRAY_SIZE(ipv6_defrag_ops)); + if (err == 0) + net->nf.defrag_ipv6 = true; + + out_unlock: + mutex_unlock(&defrag6_mutex); + return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index a5400223fd74..10090400c72f 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -156,6 +156,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.daddr = oip6h->saddr; fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; + fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index d526bb594956..c947aad8bcc6 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -235,7 +235,7 @@ nft_fib6_select_ops(const struct nft_ctx *ctx, if (!tb[NFTA_FIB_RESULT]) return ERR_PTR(-EINVAL); - result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT])); + result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); switch (result) { case NFT_FIB_RESULT_OIF: diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index a2aff1277b40..6c5b5b1830a7 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -36,12 +36,19 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, nft_out(pkt)); } +static void +nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_IPV6); +} + static struct nft_expr_type nft_masq_ipv6_type; static const struct nft_expr_ops nft_masq_ipv6_ops = { .type = &nft_masq_ipv6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), .eval = nft_masq_ipv6_eval, .init = nft_masq_init, + .destroy = nft_masq_ipv6_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, }; @@ -78,5 +85,5 @@ module_init(nft_masq_ipv6_module_init); module_exit(nft_masq_ipv6_module_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq"); diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index bfcd5af6bc15..f5ac080fc084 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -39,12 +39,19 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt)); } +static void +nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_IPV6); +} + static struct nft_expr_type nft_redir_ipv6_type; static const struct nft_expr_ops nft_redir_ipv6_ops = { .type = &nft_redir_ipv6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), .eval = nft_redir_ipv6_eval, .init = nft_redir_init, + .destroy = nft_redir_ipv6_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, }; @@ -72,5 +79,5 @@ module_init(nft_redir_ipv6_module_init); module_exit(nft_redir_ipv6_module_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir"); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 7cca8ac66fe9..cd4252346a32 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -155,6 +155,8 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) if (unlikely(!skb)) return 0; + skb->protocol = htons(ETH_P_IPV6); + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b317bb135ed4..2413a0637d99 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -527,7 +527,7 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL); + ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); dev_put(work->dev); kfree(work); } @@ -2000,8 +2000,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) It is very good, but in some (rare!) circumstances (SIT, PtP, NBMA NOARP links) it is handy to allow some exceptions. --ANK + We allow IPv4-mapped nexthops to support RFC4798-type + addressing */ - if (!(gwa_type & IPV6_ADDR_UNICAST)) + if (!(gwa_type & (IPV6_ADDR_UNICAST | + IPV6_ADDR_MAPPED))) goto out; if (cfg->fc_table) { diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 97830a6a9cbb..a4d49760bf43 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -209,6 +209,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq->snt_synack.v64 = 0; treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; + treq->ts_off = 0; /* * We need to lookup the dst_entry to get the correct window size. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 28ec0a2e7b72..73bc8fc68acd 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -101,12 +101,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) } } -static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) +static u32 tcp_v6_init_sequence(const struct sk_buff *skb, u32 *tsoff) { return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); + tcp_hdr(skb)->source, tsoff); } static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, @@ -283,7 +283,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_sport, - inet->inet_dport); + inet->inet_dport, + &tp->tsoffset); err = tcp_connect(sk); if (err) @@ -398,7 +399,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, - &tp->tsq_flags)) + &sk->sk_tsq_flags)) sock_hold(sk); goto out; } @@ -956,7 +957,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, + tcp_time_stamp + tcp_rsk(req)->ts_off, + req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0, 0); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4f99417d9b40..649efc26a252 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -363,7 +363,8 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) { + if (copied < ulen || peeking || + (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { checksum_valid = !udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; @@ -511,7 +512,7 @@ out: return; } -static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -690,10 +691,10 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, if (use_hash2) { hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & - udp_table.mask; - hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask; + udptable->mask; + hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udp_table.hash2[hash2]; + hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index f6eb1ab34f4b..e78bdc76dcc3 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -26,7 +26,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); -int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udpv6_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index fce25afb652a..8938b6ba57a0 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -61,7 +61,8 @@ static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif if ((l2tp->conn_id == tunnel_id) && net_eq(sock_net(sk), net) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + (!sk->sk_bound_dev_if || !dif || + sk->sk_bound_dev_if == dif)) goto found; } @@ -182,15 +183,17 @@ pass_up: struct iphdr *iph = (struct iphdr *) skb_network_header(skb); read_lock_bh(&l2tp_ip_lock); - sk = __l2tp_ip_bind_lookup(net, iph->daddr, 0, tunnel_id); + sk = __l2tp_ip_bind_lookup(net, iph->daddr, inet_iif(skb), + tunnel_id); + if (!sk) { + read_unlock_bh(&l2tp_ip_lock); + goto discard; + } + + sock_hold(sk); read_unlock_bh(&l2tp_ip_lock); } - if (sk == NULL) - goto discard; - - sock_hold(sk); - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; @@ -251,22 +254,17 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int ret; int chk_addr_ret; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; if (addr_len < sizeof(struct sockaddr_l2tpip)) return -EINVAL; if (addr->l2tp_family != AF_INET) return -EINVAL; - ret = -EADDRINUSE; - read_lock_bh(&l2tp_ip_lock); - if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, - sk->sk_bound_dev_if, addr->l2tp_conn_id)) - goto out_in_use; + lock_sock(sk); - read_unlock_bh(&l2tp_ip_lock); + ret = -EINVAL; + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out; - lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip)) goto out; @@ -280,14 +278,22 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ - sk_dst_reset(sk); + write_lock_bh(&l2tp_ip_lock); + if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, + sk->sk_bound_dev_if, addr->l2tp_conn_id)) { + write_unlock_bh(&l2tp_ip_lock); + ret = -EADDRINUSE; + goto out; + } + + sk_dst_reset(sk); l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id; - write_lock_bh(&l2tp_ip_lock); sk_add_bind_node(sk, &l2tp_ip_bind_table); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); + ret = 0; sock_reset_flag(sk, SOCK_ZAPPED); @@ -295,11 +301,6 @@ out: release_sock(sk); return ret; - -out_in_use: - read_unlock_bh(&l2tp_ip_lock); - - return ret; } static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -307,21 +308,24 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr; int rc; - if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ - return -EINVAL; - if (addr_len < sizeof(*lsa)) return -EINVAL; if (ipv4_is_multicast(lsa->l2tp_addr.s_addr)) return -EINVAL; - rc = ip4_datagram_connect(sk, uaddr, addr_len); - if (rc < 0) - return rc; - lock_sock(sk); + /* Must bind first - autobinding does not work */ + if (sock_flag(sk, SOCK_ZAPPED)) { + rc = -EINVAL; + goto out_sk; + } + + rc = __ip4_datagram_connect(sk, uaddr, addr_len); + if (rc < 0) + goto out_sk; + l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; write_lock_bh(&l2tp_ip_lock); @@ -329,7 +333,9 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len sk_add_bind_node(sk, &l2tp_ip_bind_table); write_unlock_bh(&l2tp_ip_lock); +out_sk: release_sock(sk); + return rc; } diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 1cea54feab27..f092ac441fdd 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -72,8 +72,9 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net, if ((l2tp->conn_id == tunnel_id) && net_eq(sock_net(sk), net) && - !(addr && ipv6_addr_equal(addr, laddr)) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + (!addr || ipv6_addr_equal(addr, laddr)) && + (!sk->sk_bound_dev_if || !dif || + sk->sk_bound_dev_if == dif)) goto found; } @@ -196,16 +197,17 @@ pass_up: struct ipv6hdr *iph = ipv6_hdr(skb); read_lock_bh(&l2tp_ip6_lock); - sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, - 0, tunnel_id); + sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, inet6_iif(skb), + tunnel_id); + if (!sk) { + read_unlock_bh(&l2tp_ip6_lock); + goto discard; + } + + sock_hold(sk); read_unlock_bh(&l2tp_ip6_lock); } - if (sk == NULL) - goto discard; - - sock_hold(sk); - if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; @@ -266,11 +268,10 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr; struct net *net = sock_net(sk); __be32 v4addr = 0; + int bound_dev_if; int addr_type; int err; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; if (addr->l2tp_family != AF_INET6) return -EINVAL; if (addr_len < sizeof(*addr)) @@ -286,41 +287,34 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr_type & IPV6_ADDR_MULTICAST) return -EADDRNOTAVAIL; - err = -EADDRINUSE; - read_lock_bh(&l2tp_ip6_lock); - if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, - sk->sk_bound_dev_if, addr->l2tp_conn_id)) - goto out_in_use; - read_unlock_bh(&l2tp_ip6_lock); - lock_sock(sk); err = -EINVAL; + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out_unlock; + if (sk->sk_state != TCP_CLOSE) goto out_unlock; + bound_dev_if = sk->sk_bound_dev_if; + /* Check if the address belongs to the host. */ rcu_read_lock(); if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (addr_type & IPV6_ADDR_LINKLOCAL) { - if (addr_len >= sizeof(struct sockaddr_in6) && - addr->l2tp_scope_id) { - /* Override any existing binding, if another - * one is supplied by user. - */ - sk->sk_bound_dev_if = addr->l2tp_scope_id; - } + if (addr->l2tp_scope_id) + bound_dev_if = addr->l2tp_scope_id; /* Binding to link-local address requires an - interface */ - if (!sk->sk_bound_dev_if) + * interface. + */ + if (!bound_dev_if) goto out_unlock_rcu; err = -ENODEV; - dev = dev_get_by_index_rcu(sock_net(sk), - sk->sk_bound_dev_if); + dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if); if (!dev) goto out_unlock_rcu; } @@ -335,13 +329,22 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) } rcu_read_unlock(); - inet->inet_rcv_saddr = inet->inet_saddr = v4addr; + write_lock_bh(&l2tp_ip6_lock); + if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, bound_dev_if, + addr->l2tp_conn_id)) { + write_unlock_bh(&l2tp_ip6_lock); + err = -EADDRINUSE; + goto out_unlock; + } + + inet->inet_saddr = v4addr; + inet->inet_rcv_saddr = v4addr; + sk->sk_bound_dev_if = bound_dev_if; sk->sk_v6_rcv_saddr = addr->l2tp_addr; np->saddr = addr->l2tp_addr; l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id; - write_lock_bh(&l2tp_ip6_lock); sk_add_bind_node(sk, &l2tp_ip6_bind_table); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip6_lock); @@ -354,10 +357,7 @@ out_unlock_rcu: rcu_read_unlock(); out_unlock: release_sock(sk); - return err; -out_in_use: - read_unlock_bh(&l2tp_ip6_lock); return err; } @@ -370,9 +370,6 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_type; int rc; - if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ - return -EINVAL; - if (addr_len < sizeof(*lsa)) return -EINVAL; @@ -389,10 +386,18 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, return -EINVAL; } - rc = ip6_datagram_connect(sk, uaddr, addr_len); - lock_sock(sk); + /* Must bind first - autobinding does not work */ + if (sock_flag(sk, SOCK_ZAPPED)) { + rc = -EINVAL; + goto out_sk; + } + + rc = __ip6_datagram_connect(sk, uaddr, addr_len); + if (rc < 0) + goto out_sk; + l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; write_lock_bh(&l2tp_ip6_lock); @@ -400,6 +405,7 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, sk_add_bind_node(sk, &l2tp_ip6_bind_table); write_unlock_bh(&l2tp_ip6_lock); +out_sk: release_sock(sk); return rc; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 236d47e76ced..1711bae4abf2 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -688,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) } /* No need to do anything if the driver does all */ - if (!local->ops->set_tim) + if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) return; if (sta->dead) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 62ccaf6f585d..2c21b7039136 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1500,7 +1500,6 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct fq *fq = &local->fq; struct ieee80211_vif *vif; struct txq_info *txqi; @@ -1525,8 +1524,6 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, if (!txqi) return false; - info->control.vif = vif; - spin_lock_bh(&fq->lock); ieee80211_txq_enqueue(local, txqi, skb); spin_unlock_bh(&fq->lock); @@ -3238,7 +3235,6 @@ static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; - *ieee80211_get_qos_ctl(hdr) = tid; hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); } else { info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; @@ -3363,6 +3359,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT; + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + *ieee80211_get_qos_ctl(hdr) = tid; + } + __skb_queue_head_init(&tx.skbs); tx.flags = IEEE80211_TX_UNICAST; @@ -3451,6 +3452,11 @@ begin: goto begin; } + if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags)) + info->flags |= IEEE80211_TX_CTL_AMPDU; + else + info->flags &= ~IEEE80211_TX_CTL_AMPDU; + if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) { struct sta_info *sta = container_of(txq->sta, struct sta_info, sta); diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index ee715764a828..6832bf6ab69f 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -270,6 +270,22 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2); } + /* + * This is a workaround for VHT-enabled STAs which break the spec + * and have the VHT-MCS Rx map filled in with value 3 for all eight + * spacial streams, an example is AR9462. + * + * As per spec, in section 22.1.1 Introduction to the VHT PHY + * A VHT STA shall support at least single spactial stream VHT-MCSs + * 0 to 7 (transmit and receive) in all supported channel widths. + */ + if (vht_cap->vht_mcs.rx_mcs_map == cpu_to_le16(0xFFFF)) { + vht_cap->vht_supported = false; + sdata_info(sdata, "Ignoring VHT IE from %pM due to invalid rx_mcs_map\n", + sta->addr); + return; + } + /* finally set up the bandwidth */ switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 0e4334cbde17..15fe97644ffe 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1252,7 +1252,7 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (!nla) continue; - switch(index) { + switch (index) { case RTA_OIF: cfg->rc_ifindex = nla_get_u32(nla); break; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 44410d30d461..63729b489c2c 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -146,38 +146,38 @@ config NF_CONNTRACK_LABELS to connection tracking entries. It selected by the connlabel match. config NF_CT_PROTO_DCCP - tristate 'DCCP protocol connection tracking support' + bool 'DCCP protocol connection tracking support' depends on NETFILTER_ADVANCED - default IP_DCCP + default y help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on DCCP connections. - If unsure, say 'N'. + If unsure, say Y. config NF_CT_PROTO_GRE tristate config NF_CT_PROTO_SCTP - tristate 'SCTP protocol connection tracking support' + bool 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED - default IP_SCTP + default y help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. - If you want to compile it as a module, say M here and read - <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + If unsure, say Y. config NF_CT_PROTO_UDPLITE - tristate 'UDP-Lite protocol connection tracking support' + bool 'UDP-Lite protocol connection tracking support' depends on NETFILTER_ADVANCED + default y help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on UDP-Lite connections. - To compile it as a module, choose M here. If unsure, say N. + If unsure, say Y. config NF_CONNTRACK_AMANDA tristate "Amanda backup protocol support" @@ -384,17 +384,17 @@ config NF_NAT_NEEDED default y config NF_NAT_PROTO_DCCP - tristate + bool depends on NF_NAT && NF_CT_PROTO_DCCP default NF_NAT && NF_CT_PROTO_DCCP config NF_NAT_PROTO_UDPLITE - tristate + bool depends on NF_NAT && NF_CT_PROTO_UDPLITE default NF_NAT && NF_CT_PROTO_UDPLITE config NF_NAT_PROTO_SCTP - tristate + bool default NF_NAT && NF_CT_PROTO_SCTP depends on NF_NAT && NF_CT_PROTO_SCTP select LIBCRC32C @@ -551,6 +551,12 @@ config NFT_NAT This option adds the "nat" expression that you can use to perform typical Network Address Translation (NAT) packet transformations. +config NFT_OBJREF + tristate "Netfilter nf_tables stateful object reference module" + help + This option adds the "objref" expression that allows you to refer to + stateful objects, such as counters and quotas. + config NFT_QUEUE depends on NETFILTER_NETLINK_QUEUE tristate "Netfilter nf_tables queue module" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5bbf767672ec..ca30d1960f1d 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -5,6 +5,9 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o +nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o +nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o +nf_conntrack-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o obj-$(CONFIG_NETFILTER) = netfilter.o @@ -16,11 +19,7 @@ obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o # connection tracking obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o -# SCTP protocol connection tracking -obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o -obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o -obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o # netlink interface for nf_conntrack obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o @@ -45,6 +44,11 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o +# NAT protocols (nf_nat) +nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o +nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o +nf_nat-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o + # generic transport layer logging obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o @@ -54,11 +58,6 @@ obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o obj-$(CONFIG_NF_NAT) += nf_nat.o obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o -# NAT protocols (nf_nat) -obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o -obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o -obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o - # NAT helpers obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o @@ -89,6 +88,7 @@ obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o +obj-$(CONFIG_NFT_OBJREF) += nft_objref.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index de30e08d58f2..ce6adfae521a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -102,17 +102,14 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) if (!entry) return -ENOMEM; - entry->orig_ops = reg; - entry->ops = *reg; - entry->next = NULL; + nf_hook_entry_init(entry, reg); mutex_lock(&nf_hook_mutex); /* Find the spot in the list */ - while ((p = nf_entry_dereference(*pp)) != NULL) { - if (reg->priority < p->orig_ops->priority) + for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) { + if (reg->priority < nf_hook_entry_priority(p)) break; - pp = &p->next; } rcu_assign_pointer(entry->next, p); rcu_assign_pointer(*pp, entry); @@ -139,12 +136,11 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) return; mutex_lock(&nf_hook_mutex); - while ((p = nf_entry_dereference(*pp)) != NULL) { - if (p->orig_ops == reg) { + for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) { + if (nf_hook_entry_ops(p) == reg) { rcu_assign_pointer(*pp, p->next); break; } - pp = &p->next; } mutex_unlock(&nf_hook_mutex); if (!p) { @@ -311,7 +307,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, int ret; do { - verdict = entry->ops.hook(entry->ops.priv, skb, state); + verdict = nf_hook_entry_hookfn(entry, skb, state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: entry = rcu_dereference(entry->next); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 038c2ba0ae0f..3d02b0c13547 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3260,7 +3260,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); - if (IS_ERR(svc) || svc == NULL) + if (IS_ERR_OR_NULL(svc)) goto out_err; /* Dump the destinations */ diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 01d3d894de46..4e1a98fcc8c3 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -254,6 +254,54 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, return true; } +static inline bool decrement_ttl(struct netns_ipvs *ipvs, + int skb_af, + struct sk_buff *skb) +{ + struct net *net = ipvs->net; + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + struct dst_entry *dst = skb_dst(skb); + + /* check and decrement ttl */ + if (ipv6_hdr(skb)->hop_limit <= 1) { + /* Force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_TIME_EXCEED, + ICMPV6_EXC_HOPLIMIT, 0); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INHDRERRORS); + + return false; + } + + /* don't propagate ttl change to cloned packets */ + if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + return false; + + ipv6_hdr(skb)->hop_limit--; + } else +#endif + { + if (ip_hdr(skb)->ttl <= 1) { + /* Tell the sender its packet died... */ + __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); + icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); + return false; + } + + /* don't propagate ttl change to cloned packets */ + if (!skb_make_writable(skb, sizeof(struct iphdr))) + return false; + + /* Decrease ttl */ + ip_decrease_ttl(ip_hdr(skb)); + } + + return true; +} + /* Get route to destination or remote server */ static int __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, @@ -326,6 +374,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, return local; } + if (!decrement_ttl(ipvs, skb_af, skb)) + goto err_put; + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { mtu = dst_mtu(&rt->dst); } else { @@ -473,6 +524,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, return local; } + if (!decrement_ttl(ipvs, skb_af, skb)) + goto err_put; + /* MTU checking */ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) mtu = dst_mtu(&rt->dst); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 9bd34647225a..2d6ee1803415 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -125,6 +125,54 @@ void nf_ct_l3proto_module_put(unsigned short l3proto) } EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + const struct nf_conntrack_l3proto *l3proto; + int ret; + + might_sleep(); + + ret = nf_ct_l3proto_try_module_get(nfproto); + if (ret < 0) + return ret; + + /* we already have a reference, can't fail */ + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(nfproto); + rcu_read_unlock(); + + if (!l3proto->net_ns_get) + return 0; + + ret = l3proto->net_ns_get(net); + if (ret < 0) + nf_ct_l3proto_module_put(nfproto); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_netns_get); + +void nf_ct_netns_put(struct net *net, u8 nfproto) +{ + const struct nf_conntrack_l3proto *l3proto; + + might_sleep(); + + /* same as nf_conntrack_netns_get(), reference assumed */ + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(nfproto); + rcu_read_unlock(); + + if (WARN_ON(!l3proto)) + return; + + if (l3proto->net_ns_put) + l3proto->net_ns_put(net); + + nf_ct_l3proto_module_put(nfproto); +} +EXPORT_SYMBOL_GPL(nf_ct_netns_put); + struct nf_conntrack_l4proto * nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) { @@ -190,20 +238,19 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); +#ifdef CONFIG_SYSCTL +extern unsigned int nf_conntrack_default_on; + int nf_ct_l3proto_pernet_register(struct net *net, struct nf_conntrack_l3proto *proto) { - int ret; - - if (proto->init_net) { - ret = proto->init_net(net); - if (ret < 0) - return ret; - } + if (nf_conntrack_default_on == 0) + return 0; - return 0; + return proto->net_ns_get ? proto->net_ns_get(net) : 0; } EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); +#endif void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) { @@ -224,6 +271,16 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); void nf_ct_l3proto_pernet_unregister(struct net *net, struct nf_conntrack_l3proto *proto) { + /* + * nf_conntrack_default_on *might* have registered hooks. + * ->net_ns_put must cope with more puts() than get(), i.e. + * if nf_conntrack_default_on was 0 at time of + * nf_ct_l3proto_pernet_register invocation this net_ns_put() + * should be a noop. + */ + if (proto->net_ns_put) + proto->net_ns_put(net); + /* Remove all contrack entries for this protocol */ nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 073b047314dc..b68ce6ac13b3 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -9,7 +9,6 @@ * */ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/spinlock.h> @@ -384,17 +383,9 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = }, }; -/* this module per-net specifics */ -static unsigned int dccp_net_id __read_mostly; -struct dccp_net { - struct nf_proto_net pn; - int dccp_loose; - unsigned int dccp_timeout[CT_DCCP_MAX + 1]; -}; - -static inline struct dccp_net *dccp_pernet(struct net *net) +static inline struct nf_dccp_net *dccp_pernet(struct net *net) { - return net_generic(net, dccp_net_id); + return &net->ct.nf_ct_proto.dccp; } static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -424,7 +415,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { struct net *net = nf_ct_net(ct); - struct dccp_net *dn; + struct nf_dccp_net *dn; struct dccp_hdr _dh, *dh; const char *msg; u_int8_t state; @@ -719,7 +710,7 @@ static int dccp_nlattr_size(void) static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - struct dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = dccp_pernet(net); unsigned int *timeouts = data; int i; @@ -820,7 +811,7 @@ static struct ctl_table dccp_sysctl_table[] = { #endif /* CONFIG_SYSCTL */ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, - struct dccp_net *dn) + struct nf_dccp_net *dn) { #ifdef CONFIG_SYSCTL if (pn->ctl_table) @@ -850,7 +841,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, static int dccp_init_net(struct net *net, u_int16_t proto) { - struct dccp_net *dn = dccp_pernet(net); + struct nf_dccp_net *dn = dccp_pernet(net); struct nf_proto_net *pn = &dn->pn; if (!pn->users) { @@ -868,7 +859,7 @@ static int dccp_init_net(struct net *net, u_int16_t proto) return dccp_kmemdup_sysctl_table(net, pn, dn); } -static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { +struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, .name = "dccp", @@ -898,11 +889,11 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { .nla_policy = dccp_timeout_nla_policy, }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .net_id = &dccp_net_id, .init_net = dccp_init_net, }; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4); -static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { +struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { .l3proto = AF_INET6, .l4proto = IPPROTO_DCCP, .name = "dccp", @@ -932,56 +923,6 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { .nla_policy = dccp_timeout_nla_policy, }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .net_id = &dccp_net_id, .init_net = dccp_init_net, }; - -static struct nf_conntrack_l4proto *dccp_proto[] = { - &dccp_proto4, - &dccp_proto6, -}; - -static __net_init int dccp_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, dccp_proto, - ARRAY_SIZE(dccp_proto)); -} - -static __net_exit void dccp_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, dccp_proto, - ARRAY_SIZE(dccp_proto)); -} - -static struct pernet_operations dccp_net_ops = { - .init = dccp_net_init, - .exit = dccp_net_exit, - .id = &dccp_net_id, - .size = sizeof(struct dccp_net), -}; - -static int __init nf_conntrack_proto_dccp_init(void) -{ - int ret; - - ret = register_pernet_subsys(&dccp_net_ops); - if (ret < 0) - return ret; - ret = nf_ct_l4proto_register(dccp_proto, ARRAY_SIZE(dccp_proto)); - if (ret < 0) - unregister_pernet_subsys(&dccp_net_ops); - return ret; -} - -static void __exit nf_conntrack_proto_dccp_fini(void) -{ - nf_ct_l4proto_unregister(dccp_proto, ARRAY_SIZE(dccp_proto)); - unregister_pernet_subsys(&dccp_net_ops); -} - -module_init(nf_conntrack_proto_dccp_init); -module_exit(nf_conntrack_proto_dccp_fini); - -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("DCCP connection tracking protocol helper"); -MODULE_LICENSE("GPL"); +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index d096c2d6b87b..a0efde38da44 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -15,7 +15,6 @@ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> -#include <linux/module.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/sctp.h> @@ -144,15 +143,9 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { } }; -static unsigned int sctp_net_id __read_mostly; -struct sctp_net { - struct nf_proto_net pn; - unsigned int timeouts[SCTP_CONNTRACK_MAX]; -}; - -static inline struct sctp_net *sctp_pernet(struct net *net) +static inline struct nf_sctp_net *sctp_pernet(struct net *net) { - return net_generic(net, sctp_net_id); + return &net->ct.nf_ct_proto.sctp; } static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -600,7 +593,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = sctp_pernet(net); int i; /* set default SCTP timeouts. */ @@ -708,7 +701,7 @@ static struct ctl_table sctp_sysctl_table[] = { #endif static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct sctp_net *sn) + struct nf_sctp_net *sn) { #ifdef CONFIG_SYSCTL if (pn->ctl_table) @@ -735,7 +728,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, static int sctp_init_net(struct net *net, u_int16_t proto) { - struct sctp_net *sn = sctp_pernet(net); + struct nf_sctp_net *sn = sctp_pernet(net); struct nf_proto_net *pn = &sn->pn; if (!pn->users) { @@ -748,7 +741,7 @@ static int sctp_init_net(struct net *net, u_int16_t proto) return sctp_kmemdup_sysctl_table(pn, sn); } -static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { +struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, .name = "sctp", @@ -778,11 +771,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .nla_policy = sctp_timeout_nla_policy, }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .net_id = &sctp_net_id, .init_net = sctp_init_net, }; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4); -static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { +struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_SCTP, .name = "sctp", @@ -812,57 +805,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ #endif - .net_id = &sctp_net_id, .init_net = sctp_init_net, }; - -static struct nf_conntrack_l4proto *sctp_proto[] = { - &nf_conntrack_l4proto_sctp4, - &nf_conntrack_l4proto_sctp6, -}; - -static int sctp_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, sctp_proto, - ARRAY_SIZE(sctp_proto)); -} - -static void sctp_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, sctp_proto, - ARRAY_SIZE(sctp_proto)); -} - -static struct pernet_operations sctp_net_ops = { - .init = sctp_net_init, - .exit = sctp_net_exit, - .id = &sctp_net_id, - .size = sizeof(struct sctp_net), -}; - -static int __init nf_conntrack_proto_sctp_init(void) -{ - int ret; - - ret = register_pernet_subsys(&sctp_net_ops); - if (ret < 0) - return ret; - ret = nf_ct_l4proto_register(sctp_proto, ARRAY_SIZE(sctp_proto)); - if (ret < 0) - unregister_pernet_subsys(&sctp_net_ops); - return ret; -} - -static void __exit nf_conntrack_proto_sctp_fini(void) -{ - nf_ct_l4proto_unregister(sctp_proto, ARRAY_SIZE(sctp_proto)); - unregister_pernet_subsys(&sctp_net_ops); -} - -module_init(nf_conntrack_proto_sctp_init); -module_exit(nf_conntrack_proto_sctp_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kiran Kumar Immidi"); -MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); -MODULE_ALIAS("ip_conntrack_proto_sctp"); +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6); diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 7808604c70a2..c35f7bf05d8c 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/timer.h> -#include <linux/module.h> #include <linux/udp.h> #include <linux/seq_file.h> #include <linux/skbuff.h> @@ -24,26 +23,14 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_log.h> -enum udplite_conntrack { - UDPLITE_CT_UNREPLIED, - UDPLITE_CT_REPLIED, - UDPLITE_CT_MAX -}; - static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = { [UDPLITE_CT_UNREPLIED] = 30*HZ, [UDPLITE_CT_REPLIED] = 180*HZ, }; -static unsigned int udplite_net_id __read_mostly; -struct udplite_net { - struct nf_proto_net pn; - unsigned int timeouts[UDPLITE_CT_MAX]; -}; - -static inline struct udplite_net *udplite_pernet(struct net *net) +static inline struct nf_udplite_net *udplite_pernet(struct net *net) { - return net_generic(net, udplite_net_id); + return &net->ct.nf_ct_proto.udplite; } static bool udplite_pkt_to_tuple(const struct sk_buff *skb, @@ -178,7 +165,7 @@ static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; - struct udplite_net *un = udplite_pernet(net); + struct nf_udplite_net *un = udplite_pernet(net); /* set default timeouts for UDPlite. */ timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED]; @@ -237,7 +224,7 @@ static struct ctl_table udplite_sysctl_table[] = { #endif /* CONFIG_SYSCTL */ static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct udplite_net *un) + struct nf_udplite_net *un) { #ifdef CONFIG_SYSCTL if (pn->ctl_table) @@ -257,7 +244,7 @@ static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn, static int udplite_init_net(struct net *net, u_int16_t proto) { - struct udplite_net *un = udplite_pernet(net); + struct nf_udplite_net *un = udplite_pernet(net); struct nf_proto_net *pn = &un->pn; if (!pn->users) { @@ -270,7 +257,7 @@ static int udplite_init_net(struct net *net, u_int16_t proto) return udplite_kmemdup_sysctl_table(pn, un); } -static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = +struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, @@ -299,11 +286,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .nla_policy = udplite_timeout_nla_policy, }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .net_id = &udplite_net_id, .init_net = udplite_init_net, }; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4); -static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = +struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, @@ -332,54 +319,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .nla_policy = udplite_timeout_nla_policy, }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .net_id = &udplite_net_id, .init_net = udplite_init_net, }; - -static struct nf_conntrack_l4proto *udplite_proto[] = { - &nf_conntrack_l4proto_udplite4, - &nf_conntrack_l4proto_udplite6, -}; - -static int udplite_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, udplite_proto, - ARRAY_SIZE(udplite_proto)); -} - -static void udplite_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, udplite_proto, - ARRAY_SIZE(udplite_proto)); -} - -static struct pernet_operations udplite_net_ops = { - .init = udplite_net_init, - .exit = udplite_net_exit, - .id = &udplite_net_id, - .size = sizeof(struct udplite_net), -}; - -static int __init nf_conntrack_proto_udplite_init(void) -{ - int ret; - - ret = register_pernet_subsys(&udplite_net_ops); - if (ret < 0) - return ret; - ret = nf_ct_l4proto_register(udplite_proto, ARRAY_SIZE(udplite_proto)); - if (ret < 0) - unregister_pernet_subsys(&udplite_net_ops); - return ret; -} - -static void __exit nf_conntrack_proto_udplite_exit(void) -{ - nf_ct_l4proto_unregister(udplite_proto, ARRAY_SIZE(udplite_proto)); - unregister_pernet_subsys(&udplite_net_ops); -} - -module_init(nf_conntrack_proto_udplite_init); -module_exit(nf_conntrack_proto_udplite_exit); - -MODULE_LICENSE("GPL"); +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5f446cd9f3fd..d009ae663453 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -452,6 +452,9 @@ static int log_invalid_proto_max __read_mostly = 255; /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; +extern unsigned int nf_conntrack_default_on; +unsigned int nf_conntrack_default_on __read_mostly = 1; + static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -517,6 +520,13 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "nf_conntrack_default_on", + .data = &nf_conntrack_default_on, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index 44ae986c383f..c9d7f95768ab 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -14,6 +14,29 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) +{ + if (skb_mac_header_was_set(skb)) + skb_push(skb, skb->mac_len); + + skb->dev = dev; + dev_queue_xmit(skb); +} + +void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif) +{ + struct net_device *dev; + + dev = dev_get_by_index_rcu(nft_net(pkt), oif); + if (!dev) { + kfree_skb(pkt->skb); + return; + } + + nf_do_netdev_egress(pkt->skb, dev); +} +EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress); + void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) { struct net_device *dev; @@ -24,14 +47,8 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) return; skb = skb_clone(pkt->skb, GFP_ATOMIC); - if (skb == NULL) - return; - - if (skb_mac_header_was_set(skb)) - skb_push(skb, skb->mac_len); - - skb->dev = dev; - dev_queue_xmit(skb); + if (skb) + nf_do_netdev_egress(skb, dev); } EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index ed9b80815fa0..dc61399e30be 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -177,6 +177,7 @@ EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); /* bridge and netdev logging families share this code. */ void nf_log_l2packet(struct net *net, u_int8_t pf, + __be16 protocol, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -184,7 +185,7 @@ void nf_log_l2packet(struct net *net, u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { - switch (eth_hdr(skb)->h_proto) { + switch (protocol) { case htons(ETH_P_IP): nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out, loginfo, "%s", prefix); diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c index 1f645949f3d8..350eb147754d 100644 --- a/net/netfilter/nf_log_netdev.c +++ b/net/netfilter/nf_log_netdev.c @@ -23,7 +23,8 @@ static void nf_log_netdev_packet(struct net *net, u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { - nf_log_l2packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out, + loginfo, prefix); } static struct nf_logger nf_netdev_logger __read_mostly = { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index bbb8f3df79f7..94b14c5a8b17 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -42,7 +42,7 @@ struct nf_nat_conn_key { const struct nf_conntrack_zone *zone; }; -static struct rhashtable nf_nat_bysource_table; +static struct rhltable nf_nat_bysource_table; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -193,9 +193,12 @@ static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, const struct nf_nat_conn_key *key = arg->key; const struct nf_conn *ct = obj; - return same_src(ct, key->tuple) && - net_eq(nf_ct_net(ct), key->net) && - nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL); + if (!same_src(ct, key->tuple) || + !net_eq(nf_ct_net(ct), key->net) || + !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL)) + return 1; + + return 0; } static struct rhashtable_params nf_nat_bysource_params = { @@ -204,7 +207,6 @@ static struct rhashtable_params nf_nat_bysource_params = { .obj_cmpfn = nf_nat_bysource_cmp, .nelem_hint = 256, .min_size = 1024, - .nulls_base = (1U << RHT_BASE_SHIFT), }; /* Only called for SRC manip */ @@ -223,12 +225,15 @@ find_appropriate_src(struct net *net, .tuple = tuple, .zone = zone }; + struct rhlist_head *hl; - ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key, - nf_nat_bysource_params); - if (!ct) + hl = rhltable_lookup(&nf_nat_bysource_table, &key, + nf_nat_bysource_params); + if (!hl) return 0; + ct = container_of(hl, typeof(*ct), nat_bysource); + nf_ct_invert_tuplepr(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; @@ -446,11 +451,17 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) { + struct nf_nat_conn_key key = { + .net = nf_ct_net(ct), + .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + .zone = nf_ct_zone(ct), + }; int err; - err = rhashtable_insert_fast(&nf_nat_bysource_table, - &ct->nat_bysource, - nf_nat_bysource_params); + err = rhltable_insert_key(&nf_nat_bysource_table, + &key, + &ct->nat_bysource, + nf_nat_bysource_params); if (err) return NF_DROP; } @@ -567,8 +578,8 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ ct->status &= ~IPS_NAT_DONE_MASK; - rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -671,6 +682,18 @@ int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto) &nf_nat_l4proto_tcp); RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP], &nf_nat_l4proto_udp); +#ifdef CONFIG_NF_NAT_PROTO_DCCP + RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP], + &nf_nat_l4proto_dccp); +#endif +#ifdef CONFIG_NF_NAT_PROTO_SCTP + RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP], + &nf_nat_l4proto_sctp); +#endif +#ifdef CONFIG_NF_NAT_PROTO_UDPLITE + RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDPLITE], + &nf_nat_l4proto_udplite); +#endif mutex_unlock(&nf_nat_proto_mutex); RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto); @@ -698,8 +721,8 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (!nat) return; - rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -834,13 +857,13 @@ static int __init nf_nat_init(void) { int ret; - ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); + ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); if (ret) return ret; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - rhashtable_destroy(&nf_nat_bysource_table); + rhltable_destroy(&nf_nat_bysource_table); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -864,7 +887,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: - rhashtable_destroy(&nf_nat_bysource_table); + rhltable_destroy(&nf_nat_bysource_table); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -883,7 +906,7 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - rhashtable_destroy(&nf_nat_bysource_table); + rhltable_destroy(&nf_nat_bysource_table); } MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c index 15c47b246d0d..269fcd5dc34c 100644 --- a/net/netfilter/nf_nat_proto_dccp.c +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -10,8 +10,6 @@ */ #include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> #include <linux/skbuff.h> #include <linux/dccp.h> @@ -73,7 +71,7 @@ dccp_manip_pkt(struct sk_buff *skb, return true; } -static const struct nf_nat_l4proto nf_nat_l4proto_dccp = { +const struct nf_nat_l4proto nf_nat_l4proto_dccp = { .l4proto = IPPROTO_DCCP, .manip_pkt = dccp_manip_pkt, .in_range = nf_nat_l4proto_in_range, @@ -82,35 +80,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_dccp = { .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; - -static int __init nf_nat_proto_dccp_init(void) -{ - int err; - - err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp); - if (err < 0) - goto err1; - err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp); - if (err < 0) - goto err2; - return 0; - -err2: - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp); -err1: - return err; -} - -static void __exit nf_nat_proto_dccp_fini(void) -{ - nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp); - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp); - -} - -module_init(nf_nat_proto_dccp_init); -module_exit(nf_nat_proto_dccp_fini); - -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("DCCP NAT protocol helper"); -MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index cbc7ade1487b..31d358691af0 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -7,9 +7,7 @@ */ #include <linux/types.h> -#include <linux/init.h> #include <linux/sctp.h> -#include <linux/module.h> #include <net/sctp/checksum.h> #include <net/netfilter/nf_nat_l4proto.h> @@ -49,12 +47,15 @@ sctp_manip_pkt(struct sk_buff *skb, hdr->dest = tuple->dst.u.sctp.port; } - hdr->checksum = sctp_compute_cksum(skb, hdroff); + if (skb->ip_summed != CHECKSUM_PARTIAL) { + hdr->checksum = sctp_compute_cksum(skb, hdroff); + skb->ip_summed = CHECKSUM_NONE; + } return true; } -static const struct nf_nat_l4proto nf_nat_l4proto_sctp = { +const struct nf_nat_l4proto nf_nat_l4proto_sctp = { .l4proto = IPPROTO_SCTP, .manip_pkt = sctp_manip_pkt, .in_range = nf_nat_l4proto_in_range, @@ -63,34 +64,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_sctp = { .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; - -static int __init nf_nat_proto_sctp_init(void) -{ - int err; - - err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp); - if (err < 0) - goto err1; - err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp); - if (err < 0) - goto err2; - return 0; - -err2: - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp); -err1: - return err; -} - -static void __exit nf_nat_proto_sctp_exit(void) -{ - nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp); - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp); -} - -module_init(nf_nat_proto_sctp_init); -module_exit(nf_nat_proto_sctp_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SCTP NAT protocol helper"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c index 58340c97bd83..366bfbfd82a1 100644 --- a/net/netfilter/nf_nat_proto_udplite.c +++ b/net/netfilter/nf_nat_proto_udplite.c @@ -8,11 +8,9 @@ */ #include <linux/types.h> -#include <linux/init.h> #include <linux/udp.h> #include <linux/netfilter.h> -#include <linux/module.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_l3proto.h> #include <net/netfilter/nf_nat_l4proto.h> @@ -64,7 +62,7 @@ udplite_manip_pkt(struct sk_buff *skb, return true; } -static const struct nf_nat_l4proto nf_nat_l4proto_udplite = { +const struct nf_nat_l4proto nf_nat_l4proto_udplite = { .l4proto = IPPROTO_UDPLITE, .manip_pkt = udplite_manip_pkt, .in_range = nf_nat_l4proto_in_range, @@ -73,34 +71,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_udplite = { .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; - -static int __init nf_nat_proto_udplite_init(void) -{ - int err; - - err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite); - if (err < 0) - goto err1; - err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite); - if (err < 0) - goto err2; - return 0; - -err2: - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite); -err1: - return err; -} - -static void __exit nf_nat_proto_udplite_fini(void) -{ - nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite); - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite); -} - -module_init(nf_nat_proto_udplite_init); -module_exit(nf_nat_proto_udplite_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("UDP-Lite NAT protocol helper"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 77cba9f6ccb6..4a7662486f44 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -185,7 +185,7 @@ static unsigned int nf_iterate(struct sk_buff *skb, do { repeat: - verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state); + verdict = nf_hook_entry_hookfn((*entryp), skb, state); if (verdict != NF_ACCEPT) { if (verdict != NF_REPEAT) return verdict; @@ -200,7 +200,6 @@ repeat: void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { struct nf_hook_entry *hook_entry = entry->hook; - struct nf_hook_ops *elem = &hook_entry->ops; struct sk_buff *skb = entry->skb; const struct nf_afinfo *afinfo; int err; @@ -209,7 +208,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) /* Continue traversal iff userspace said ok... */ if (verdict == NF_REPEAT) - verdict = elem->hook(elem->priv, skb, &entry->state); + verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); if (verdict == NF_ACCEPT) { afinfo = nf_get_afinfo(entry->state.pf); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 026581b04ea8..a019a87e58ee 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -22,6 +22,7 @@ #include <net/sock.h> static LIST_HEAD(nf_tables_expressions); +static LIST_HEAD(nf_tables_objects); /** * nft_register_afinfo - register nf_tables address family info @@ -110,12 +111,12 @@ static void nft_ctx_init(struct nft_ctx *ctx, ctx->seq = nlh->nlmsg_seq; } -static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, - u32 size) +static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, + int msg_type, u32 size, gfp_t gfp) { struct nft_trans *trans; - trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL); + trans = kzalloc(sizeof(struct nft_trans) + size, gfp); if (trans == NULL) return NULL; @@ -125,6 +126,12 @@ static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, return trans; } +static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, + int msg_type, u32 size) +{ + return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); +} + static void nft_trans_destroy(struct nft_trans *trans) { list_del(&trans->list); @@ -304,6 +311,38 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) return err; } +static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type, + struct nft_object *obj) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWOBJ) + nft_activate_next(ctx->net, obj); + + nft_trans_obj(trans) = obj; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +} + +static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) +{ + int err; + + err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj); + if (err < 0) + return err; + + nft_deactivate_next(ctx->net, obj); + ctx->table->use--; + + return err; +} + /* * Tables */ @@ -688,6 +727,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); + INIT_LIST_HEAD(&table->objects); table->flags = flags; nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); @@ -709,6 +749,7 @@ static int nft_flush_table(struct nft_ctx *ctx) { int err; struct nft_chain *chain, *nc; + struct nft_object *obj, *ne; struct nft_set *set, *ns; list_for_each_entry(chain, &ctx->table->chains, list) { @@ -735,6 +776,12 @@ static int nft_flush_table(struct nft_ctx *ctx) goto out; } + list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { + err = nft_delobj(ctx, obj); + if (err < 0) + goto out; + } + list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) { if (!nft_is_active_next(ctx->net, chain)) continue; @@ -2411,6 +2458,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, [NFTA_SET_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, + [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -2462,6 +2510,7 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } +EXPORT_SYMBOL_GPL(nf_tables_set_lookup); struct nft_set *nf_tables_set_lookup_byid(const struct net *net, const struct nlattr *nla, @@ -2480,6 +2529,7 @@ struct nft_set *nf_tables_set_lookup_byid(const struct net *net, } return ERR_PTR(-ENOENT); } +EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid); static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) @@ -2568,9 +2618,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))) goto nla_put_failure; } + if (set->flags & NFT_SET_OBJECT && + nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype))) + goto nla_put_failure; if (set->timeout && - nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout), + nla_put_be64(skb, NFTA_SET_TIMEOUT, + cpu_to_be64(jiffies_to_msecs(set->timeout)), NFTA_SET_PAD)) goto nla_put_failure; if (set->gc_int && @@ -2796,7 +2850,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, unsigned int size; bool create; u64 timeout; - u32 ktype, dtype, flags, policy, gc_int; + u32 ktype, dtype, flags, policy, gc_int, objtype; struct nft_set_desc desc; unsigned char *udata; u16 udlen; @@ -2826,11 +2880,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | - NFT_SET_MAP | NFT_SET_EVAL)) + NFT_SET_MAP | NFT_SET_EVAL | + NFT_SET_OBJECT)) return -EINVAL; - /* Only one of both operations is supported */ - if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) == - (NFT_SET_MAP | NFT_SET_EVAL)) + /* Only one of these operations is supported */ + if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) == + (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; } @@ -2855,11 +2910,25 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } else if (flags & NFT_SET_MAP) return -EINVAL; + if (nla[NFTA_SET_OBJ_TYPE] != NULL) { + if (!(flags & NFT_SET_OBJECT)) + return -EINVAL; + + objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); + if (objtype == NFT_OBJECT_UNSPEC || + objtype > NFT_OBJECT_MAX) + return -EINVAL; + } else if (flags & NFT_SET_OBJECT) + return -EINVAL; + else + objtype = NFT_OBJECT_UNSPEC; + timeout = 0; if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_TIMEOUT])); + timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( + nla[NFTA_SET_TIMEOUT]))); } gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { @@ -2941,6 +3010,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->ktype = ktype; set->klen = desc.klen; set->dtype = dtype; + set->objtype = objtype; set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; @@ -3062,6 +3132,7 @@ bind: list_add_tail_rcu(&binding->list, &set->bindings); return 0; } +EXPORT_SYMBOL_GPL(nf_tables_bind_set); void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) @@ -3072,6 +3143,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, nft_is_active(ctx->net, set)) nf_tables_set_destroy(ctx, set); } +EXPORT_SYMBOL_GPL(nf_tables_unbind_set); const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_KEY] = { @@ -3083,6 +3155,10 @@ const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_EXPR] = { .align = __alignof__(struct nft_expr), }, + [NFT_SET_EXT_OBJREF] = { + .len = sizeof(struct nft_object *), + .align = __alignof__(struct nft_object *), + }, [NFT_SET_EXT_FLAGS] = { .len = sizeof(u8), .align = __alignof__(u8), @@ -3171,6 +3247,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && + nla_put_string(skb, NFTA_SET_ELEM_OBJREF, + (*nft_set_ext_obj(ext))->name) < 0) + goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(*nft_set_ext_flags(ext)))) @@ -3178,7 +3259,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - cpu_to_be64(*nft_set_ext_timeout(ext)), + cpu_to_be64(jiffies_to_msecs( + *nft_set_ext_timeout(ext))), NFTA_SET_ELEM_PAD)) goto nla_put_failure; @@ -3447,7 +3529,7 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) *nft_set_ext_expiration(ext) = - jiffies + msecs_to_jiffies(timeout); + jiffies + timeout; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -3464,7 +3546,8 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_data_uninit(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); - + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) + (*nft_set_ext_obj(ext))->use--; kfree(elem); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); @@ -3489,11 +3572,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + u8 genmask = nft_genmask_next(ctx->net); struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; + struct nft_object *obj = NULL; struct nft_userdata *udata; struct nft_data data; enum nft_registers dreg; @@ -3535,7 +3620,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_ELEM_TIMEOUT])); + timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( + nla[NFTA_SET_ELEM_TIMEOUT]))); } else if (set->flags & NFT_SET_TIMEOUT) { timeout = set->timeout; } @@ -3555,6 +3641,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); } + if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { + if (!(set->flags & NFT_SET_OBJECT)) { + err = -EINVAL; + goto err2; + } + obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], + set->objtype, genmask); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto err2; + } + nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); + } + if (nla[NFTA_SET_ELEM_DATA] != NULL) { err = nft_data_init(ctx, &data, sizeof(data), &d2, nla[NFTA_SET_ELEM_DATA]); @@ -3613,6 +3713,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } + if (obj) { + *nft_set_ext_obj(ext) = obj; + obj->use++; + } trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) @@ -3622,10 +3726,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = set->ops->insert(ctx->net, set, &elem, &ext2); if (err) { if (err == -EEXIST) { - if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && - nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && - memcmp(nft_set_ext_data(ext), - nft_set_ext_data(ext2), set->dlen) != 0) + if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && + memcmp(nft_set_ext_data(ext), + nft_set_ext_data(ext2), set->dlen) != 0) || + (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) && + *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2))) err = -EBUSY; else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; @@ -3775,6 +3882,34 @@ err1: return err; } +static int nft_flush_set(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_iter *iter, + const struct nft_set_elem *elem) +{ + struct nft_trans *trans; + int err; + + trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, + sizeof(struct nft_trans_elem), GFP_ATOMIC); + if (!trans) + return -ENOMEM; + + if (!set->ops->deactivate_one(ctx->net, set, elem->priv)) { + err = -ENOENT; + goto err1; + } + + nft_trans_elem_set(trans) = (struct nft_set *)set; + nft_trans_elem(trans) = *((struct nft_set_elem *)elem); + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +err1: + kfree(trans); + return err; +} + static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -3785,9 +3920,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int rem, err = 0; - if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) - return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -3799,6 +3931,18 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; + if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) { + struct nft_set_dump_args args = { + .iter = { + .genmask = genmask, + .fn = nft_flush_set, + }, + }; + set->ops->walk(&ctx, set, &args.iter); + + return args.iter.err; + } + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); if (err < 0) @@ -3834,6 +3978,500 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, } EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc); +/* + * Stateful objects + */ + +/** + * nft_register_obj- register nf_tables stateful object type + * @obj: object type + * + * Registers the object type for use with nf_tables. Returns zero on + * success or a negative errno code otherwise. + */ +int nft_register_obj(struct nft_object_type *obj_type) +{ + if (obj_type->type == NFT_OBJECT_UNSPEC) + return -EINVAL; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_add_rcu(&obj_type->list, &nf_tables_objects); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return 0; +} +EXPORT_SYMBOL_GPL(nft_register_obj); + +/** + * nft_unregister_obj - unregister nf_tables object type + * @obj: object type + * + * Unregisters the object type for use with nf_tables. + */ +void nft_unregister_obj(struct nft_object_type *obj_type) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_del_rcu(&obj_type->list); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_obj); + +struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) +{ + struct nft_object *obj; + + list_for_each_entry(obj, &table->objects, list) { + if (!nla_strcmp(nla, obj->name) && + objtype == obj->type->type && + nft_active_genmask(obj, genmask)) + return obj; + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); + +static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { + [NFTA_OBJ_TABLE] = { .type = NLA_STRING }, + [NFTA_OBJ_NAME] = { .type = NLA_STRING }, + [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, + [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, +}; + +static struct nft_object *nft_obj_init(const struct nft_object_type *type, + const struct nlattr *attr) +{ + struct nlattr *tb[type->maxattr + 1]; + struct nft_object *obj; + int err; + + if (attr) { + err = nla_parse_nested(tb, type->maxattr, attr, type->policy); + if (err < 0) + goto err1; + } else { + memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1)); + } + + err = -ENOMEM; + obj = kzalloc(sizeof(struct nft_object) + type->size, GFP_KERNEL); + if (obj == NULL) + goto err1; + + err = type->init((const struct nlattr * const *)tb, obj); + if (err < 0) + goto err2; + + obj->type = type; + return obj; +err2: + kfree(obj); +err1: + return ERR_PTR(err); +} + +static int nft_object_dump(struct sk_buff *skb, unsigned int attr, + struct nft_object *obj, bool reset) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, attr); + if (!nest) + goto nla_put_failure; + if (obj->type->dump(skb, obj, reset) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_object_type *__nft_obj_type_get(u32 objtype) +{ + const struct nft_object_type *type; + + list_for_each_entry(type, &nf_tables_objects, list) { + if (objtype == type->type) + return type; + } + return NULL; +} + +static const struct nft_object_type *nft_obj_type_get(u32 objtype) +{ + const struct nft_object_type *type; + + type = __nft_obj_type_get(objtype); + if (type != NULL && try_module_get(type->owner)) + return type; + +#ifdef CONFIG_MODULES + if (type == NULL) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-obj-%u", objtype); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_obj_type_get(objtype)) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-ENOENT); +} + +static int nf_tables_newobj(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nft_object_type *type; + u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + struct nft_af_info *afi; + struct nft_table *table; + struct nft_object *obj; + struct nft_ctx ctx; + u32 objtype; + int err; + + if (!nla[NFTA_OBJ_TYPE] || + !nla[NFTA_OBJ_NAME] || + !nla[NFTA_OBJ_DATA]) + return -EINVAL; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + if (err != -ENOENT) + return err; + + obj = NULL; + } + + if (obj != NULL) { + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + return 0; + } + + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + + type = nft_obj_type_get(objtype); + if (IS_ERR(type)) + return PTR_ERR(type); + + obj = nft_obj_init(type, nla[NFTA_OBJ_DATA]); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto err1; + } + obj->table = table; + nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN); + + err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); + if (err < 0) + goto err2; + + list_add_tail_rcu(&obj->list, &table->objects); + table->use++; + return 0; +err2: + if (obj->type->destroy) + obj->type->destroy(obj); + kfree(obj); +err1: + module_put(type->owner); + return err; +} + +static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table, + struct nft_object *obj, bool reset) +{ + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); + + if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || + nla_put_string(skb, NFTA_OBJ_NAME, obj->name) || + nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->type->type)) || + nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || + nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +struct nft_obj_filter { + char table[NFT_OBJ_MAXNAMELEN]; + u32 type; +}; + +static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + unsigned int idx = 0, s_idx = cb->args[0]; + struct nft_obj_filter *filter = cb->data; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + struct nft_object *obj; + bool reset = false; + + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + reset = true; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (family != NFPROTO_UNSPEC && family != afi->family) + continue; + + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(obj, &table->objects, list) { + if (!nft_is_active(net, obj)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (filter->table[0] && + strcmp(filter->table, table->name)) + goto cont; + if (filter->type != NFT_OBJECT_UNSPEC && + obj->type->type != filter->type) + goto cont; + + if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWOBJ, + NLM_F_MULTI | NLM_F_APPEND, + afi->family, table, obj, reset) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } + } +done: + rcu_read_unlock(); + + cb->args[0] = idx; + return skb->len; +} + +static int nf_tables_dump_obj_done(struct netlink_callback *cb) +{ + kfree(cb->data); + + return 0; +} + +static struct nft_obj_filter * +nft_obj_filter_alloc(const struct nlattr * const nla[]) +{ + struct nft_obj_filter *filter; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); + + if (nla[NFTA_OBJ_TABLE]) + nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE], + NFT_TABLE_MAXNAMELEN); + if (nla[NFTA_OBJ_TYPE]) + filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + + return filter; +} + +static int nf_tables_getobj(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); + int family = nfmsg->nfgen_family; + const struct nft_af_info *afi; + const struct nft_table *table; + struct nft_object *obj; + struct sk_buff *skb2; + bool reset = false; + u32 objtype; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_obj, + .done = nf_tables_dump_obj_done, + }; + + if (nla[NFTA_OBJ_TABLE] || + nla[NFTA_OBJ_TYPE]) { + struct nft_obj_filter *filter; + + filter = nft_obj_filter_alloc(nla); + if (IS_ERR(filter)) + return -ENOMEM; + + c.data = filter; + } + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + if (!nla[NFTA_OBJ_NAME] || + !nla[NFTA_OBJ_TYPE]) + return -EINVAL; + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + reset = true; + + err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, + family, table, obj, reset); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); +err: + kfree_skb(skb2); + return err; + + return 0; +} + +static void nft_obj_destroy(struct nft_object *obj) +{ + if (obj->type->destroy) + obj->type->destroy(obj); + + module_put(obj->type->owner); + kfree(obj); +} + +static int nf_tables_delobj(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + struct nft_af_info *afi; + struct nft_table *table; + struct nft_object *obj; + struct nft_ctx ctx; + u32 objtype; + + if (!nla[NFTA_OBJ_TYPE] || + !nla[NFTA_OBJ_NAME]) + return -EINVAL; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (IS_ERR(obj)) + return PTR_ERR(obj); + if (obj->use > 0) + return -EBUSY; + + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + + return nft_delobj(&ctx, obj); +} + +int nft_obj_notify(struct net *net, struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + int family, int report, gfp_t gfp) +{ + struct sk_buff *skb; + int err; + + if (!report && + !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, gfp); + if (skb == NULL) + goto err; + + err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family, + table, obj, false); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); +err: + if (err < 0) { + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + } + return err; +} +EXPORT_SYMBOL_GPL(nft_obj_notify); + +static int nf_tables_obj_notify(const struct nft_ctx *ctx, + struct nft_object *obj, int event) +{ + return nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, + ctx->seq, event, ctx->afi->family, ctx->report, + GFP_KERNEL); +} + static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { @@ -3994,6 +4632,26 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_GETGEN] = { .call = nf_tables_getgen, }, + [NFT_MSG_NEWOBJ] = { + .call_batch = nf_tables_newobj, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, + [NFT_MSG_GETOBJ] = { + .call = nf_tables_getobj, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, + [NFT_MSG_DELOBJ] = { + .call_batch = nf_tables_delobj, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, + [NFT_MSG_GETOBJ_RESET] = { + .call = nf_tables_getobj, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) @@ -4036,6 +4694,9 @@ static void nf_tables_commit_release(struct nft_trans *trans) nft_set_elem_destroy(nft_trans_elem_set(trans), nft_trans_elem(trans).priv, true); break; + case NFT_MSG_DELOBJ: + nft_obj_destroy(nft_trans_obj(trans)); + break; } kfree(trans); } @@ -4143,6 +4804,17 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) atomic_dec(&te->set->nelems); te->set->ndeact--; break; + case NFT_MSG_NEWOBJ: + nft_clear(net, nft_trans_obj(trans)); + nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), + NFT_MSG_NEWOBJ); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELOBJ: + list_del_rcu(&nft_trans_obj(trans)->list); + nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), + NFT_MSG_DELOBJ); + break; } } @@ -4177,6 +4849,9 @@ static void nf_tables_abort_release(struct nft_trans *trans) nft_set_elem_destroy(nft_trans_elem_set(trans), nft_trans_elem(trans).priv, true); break; + case NFT_MSG_NEWOBJ: + nft_obj_destroy(nft_trans_obj(trans)); + break; } kfree(trans); } @@ -4257,6 +4932,15 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; + case NFT_MSG_NEWOBJ: + trans->ctx.table->use--; + list_del_rcu(&nft_trans_obj(trans)->list); + break; + case NFT_MSG_DELOBJ: + trans->ctx.table->use++; + nft_clear(trans->ctx.net, nft_trans_obj(trans)); + nft_trans_destroy(trans); + break; } } @@ -4803,6 +5487,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) { struct nft_table *table, *nt; struct nft_chain *chain, *nc; + struct nft_object *obj, *ne; struct nft_rule *rule, *nr; struct nft_set *set, *ns; struct nft_ctx ctx = { @@ -4829,6 +5514,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) table->use--; nft_set_destroy(set); } + list_for_each_entry_safe(obj, ne, &table->objects, list) { + list_del(&obj->list); + table->use--; + nft_obj_destroy(obj); + } list_for_each_entry_safe(chain, nc, &table->chains, list) { list_del(&chain->list); table->use--; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 763cb4d54e8d..200922bb2036 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1152,6 +1152,7 @@ MODULE_ALIAS_NF_LOGGER(AF_INET, 1); MODULE_ALIAS_NF_LOGGER(AF_INET6, 1); MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1); MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */ +MODULE_ALIAS_NF_LOGGER(5, 1); /* NFPROTO_NETDEV */ module_init(nfnetlink_log_init); module_exit(nfnetlink_log_fini); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 77db8358ab14..f6a02c5071c2 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -31,11 +31,10 @@ struct nft_counter_percpu_priv { struct nft_counter_percpu __percpu *counter; }; -static void nft_counter_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); struct nft_counter_percpu *this_cpu; local_bh_disable(); @@ -47,10 +46,64 @@ static void nft_counter_eval(const struct nft_expr *expr, local_bh_enable(); } -static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, +static inline void nft_counter_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_counter_percpu_priv *priv = nft_obj_data(obj); + + nft_counter_do_eval(priv, regs, pkt); +} + +static int nft_counter_do_init(const struct nlattr * const tb[], + struct nft_counter_percpu_priv *priv) +{ + struct nft_counter_percpu __percpu *cpu_stats; + struct nft_counter_percpu *this_cpu; + + cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); + if (cpu_stats == NULL) + return -ENOMEM; + + preempt_disable(); + this_cpu = this_cpu_ptr(cpu_stats); + if (tb[NFTA_COUNTER_PACKETS]) { + this_cpu->counter.packets = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + } + if (tb[NFTA_COUNTER_BYTES]) { + this_cpu->counter.bytes = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + } + preempt_enable(); + priv->counter = cpu_stats; + return 0; +} + +static int nft_counter_obj_init(const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_counter_percpu_priv *priv = nft_obj_data(obj); + + return nft_counter_do_init(tb, priv); +} + +static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv) +{ + free_percpu(priv->counter); +} + +static void nft_counter_obj_destroy(struct nft_object *obj) +{ + struct nft_counter_percpu_priv *priv = nft_obj_data(obj); + + nft_counter_do_destroy(priv); +} + +static void nft_counter_fetch(struct nft_counter_percpu __percpu *counter, struct nft_counter *total) { - const struct nft_counter_percpu *cpu_stats; + struct nft_counter_percpu *cpu_stats; u64 bytes, packets; unsigned int seq; int cpu; @@ -69,12 +122,52 @@ static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, } } -static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +static u64 __nft_counter_reset(u64 *counter) +{ + u64 ret, old; + + do { + old = *counter; + ret = cmpxchg64(counter, old, 0); + } while (ret != old); + + return ret; +} + +static void nft_counter_reset(struct nft_counter_percpu __percpu *counter, + struct nft_counter *total) +{ + struct nft_counter_percpu *cpu_stats; + u64 bytes, packets; + unsigned int seq; + int cpu; + + memset(total, 0, sizeof(*total)); + for_each_possible_cpu(cpu) { + bytes = packets = 0; + + cpu_stats = per_cpu_ptr(counter, cpu); + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + packets += __nft_counter_reset(&cpu_stats->counter.packets); + bytes += __nft_counter_reset(&cpu_stats->counter.bytes); + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + + total->packets += packets; + total->bytes += bytes; + } +} + +static int nft_counter_do_dump(struct sk_buff *skb, + const struct nft_counter_percpu_priv *priv, + bool reset) { - struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); struct nft_counter total; - nft_counter_fetch(priv->counter, &total); + if (reset) + nft_counter_reset(priv->counter, &total); + else + nft_counter_fetch(priv->counter, &total); if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), NFTA_COUNTER_PAD) || @@ -87,36 +180,54 @@ nla_put_failure: return -1; } +static int nft_counter_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + struct nft_counter_percpu_priv *priv = nft_obj_data(obj); + + return nft_counter_do_dump(skb, priv, reset); +} + static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; +static struct nft_object_type nft_counter_obj __read_mostly = { + .type = NFT_OBJECT_COUNTER, + .size = sizeof(struct nft_counter_percpu_priv), + .maxattr = NFTA_COUNTER_MAX, + .policy = nft_counter_policy, + .eval = nft_counter_obj_eval, + .init = nft_counter_obj_init, + .destroy = nft_counter_obj_destroy, + .dump = nft_counter_obj_dump, + .owner = THIS_MODULE, +}; + +static void nft_counter_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + + nft_counter_do_eval(priv, regs, pkt); +} + +static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + + return nft_counter_do_dump(skb, priv, false); +} + static int nft_counter_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - struct nft_counter_percpu __percpu *cpu_stats; - struct nft_counter_percpu *this_cpu; - cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); - if (cpu_stats == NULL) - return -ENOMEM; - - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); - if (tb[NFTA_COUNTER_PACKETS]) { - this_cpu->counter.packets = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); - } - if (tb[NFTA_COUNTER_BYTES]) { - this_cpu->counter.bytes = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); - } - preempt_enable(); - priv->counter = cpu_stats; - return 0; + return nft_counter_do_init(tb, priv); } static void nft_counter_destroy(const struct nft_ctx *ctx, @@ -124,7 +235,7 @@ static void nft_counter_destroy(const struct nft_ctx *ctx, { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - free_percpu(priv->counter); + nft_counter_do_destroy(priv); } static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) @@ -174,12 +285,26 @@ static struct nft_expr_type nft_counter_type __read_mostly = { static int __init nft_counter_module_init(void) { - return nft_register_expr(&nft_counter_type); + int err; + + err = nft_register_obj(&nft_counter_obj); + if (err < 0) + return err; + + err = nft_register_expr(&nft_counter_type); + if (err < 0) + goto err1; + + return 0; +err1: + nft_unregister_obj(&nft_counter_obj); + return err; } static void __exit nft_counter_module_exit(void) { nft_unregister_expr(&nft_counter_type); + nft_unregister_obj(&nft_counter_obj); } module_init(nft_counter_module_init); @@ -188,3 +313,4 @@ module_exit(nft_counter_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_EXPR("counter"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 6837348c8993..e6baeaebe653 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -208,37 +208,37 @@ static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_SREG] = { .type = NLA_U32 }, }; -static int nft_ct_l3proto_try_module_get(uint8_t family) +static int nft_ct_netns_get(struct net *net, uint8_t family) { int err; if (family == NFPROTO_INET) { - err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4); + err = nf_ct_netns_get(net, NFPROTO_IPV4); if (err < 0) goto err1; - err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6); + err = nf_ct_netns_get(net, NFPROTO_IPV6); if (err < 0) goto err2; } else { - err = nf_ct_l3proto_try_module_get(family); + err = nf_ct_netns_get(net, family); if (err < 0) goto err1; } return 0; err2: - nf_ct_l3proto_module_put(NFPROTO_IPV4); + nf_ct_netns_put(net, NFPROTO_IPV4); err1: return err; } -static void nft_ct_l3proto_module_put(uint8_t family) +static void nft_ct_netns_put(struct net *net, uint8_t family) { if (family == NFPROTO_INET) { - nf_ct_l3proto_module_put(NFPROTO_IPV4); - nf_ct_l3proto_module_put(NFPROTO_IPV6); + nf_ct_netns_put(net, NFPROTO_IPV4); + nf_ct_netns_put(net, NFPROTO_IPV6); } else - nf_ct_l3proto_module_put(family); + nf_ct_netns_put(net, family); } static int nft_ct_get_init(const struct nft_ctx *ctx, @@ -342,7 +342,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_ct_l3proto_try_module_get(ctx->afi->family); + err = nft_ct_netns_get(ctx->net, ctx->afi->family); if (err < 0) return err; @@ -390,7 +390,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, if (err < 0) goto err1; - err = nft_ct_l3proto_try_module_get(ctx->afi->family); + err = nft_ct_netns_get(ctx->net, ctx->afi->family); if (err < 0) goto err1; @@ -405,7 +405,7 @@ err1: static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { - nft_ct_l3proto_module_put(ctx->afi->family); + nf_ct_netns_put(ctx->net, ctx->afi->family); } static void nft_ct_set_destroy(const struct nft_ctx *ctx, @@ -423,7 +423,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, break; } - nft_ct_l3proto_module_put(ctx->afi->family); + nft_ct_netns_put(ctx->net, ctx->afi->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 249c9b80c150..29a4906adc27 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -86,7 +86,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == 0) return -EINVAL; - priv->result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT])); + priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]); switch (priv->result) { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 763ebc3e0b2b..ce13a50b9189 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -26,8 +26,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; - nf_dup_netdev_egress(pkt, oif); - regs->verdict.code = NF_DROP; + nf_fwd_netdev_egress(pkt, oif); + regs->verdict.code = NF_STOLEN; } static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 97ad8e30e4b4..eb2721af898d 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -53,6 +53,7 @@ static int nft_hash_init(const struct nft_ctx *ctx, { struct nft_hash *priv = nft_expr_priv(expr); u32 len; + int err; if (!tb[NFTA_HASH_SREG] || !tb[NFTA_HASH_DREG] || @@ -66,8 +67,10 @@ static int nft_hash_init(const struct nft_ctx *ctx, priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - len = ntohl(nla_get_be32(tb[NFTA_HASH_LEN])); - if (len == 0 || len > U8_MAX) + err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len); + if (err < 0) + return err; + if (len == 0) return -ERANGE; priv->len = len; diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 81b5ad6165ac..11ce016cd479 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -77,7 +77,7 @@ int nft_masq_init(const struct nft_ctx *ctx, } } - return 0; + return nf_ct_netns_get(ctx->net, ctx->afi->family); } EXPORT_SYMBOL_GPL(nft_masq_init); @@ -105,4 +105,4 @@ nla_put_failure: EXPORT_SYMBOL_GPL(nft_masq_dump); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index ee2d71753746..19a7bf3236f9 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -209,7 +209,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; } - return 0; + return nf_ct_netns_get(ctx->net, family); } static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -257,12 +257,21 @@ nla_put_failure: return -1; } +static void +nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + const struct nft_nat *priv = nft_expr_priv(expr); + + nf_ct_netns_put(ctx->net, priv->family); +} + static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_eval, .init = nft_nat_init, + .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, }; diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c new file mode 100644 index 000000000000..415a65ba2b85 --- /dev/null +++ b/net/netfilter/nft_objref.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2012-2016 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) + +static void nft_objref_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_object *obj = nft_objref_priv(expr); + + obj->type->eval(obj, regs, pkt); +} + +static int nft_objref_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_object *obj = nft_objref_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + u32 objtype; + + if (!tb[NFTA_OBJREF_IMM_NAME] || + !tb[NFTA_OBJREF_IMM_TYPE]) + return -EINVAL; + + objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); + obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, + genmask); + if (IS_ERR(obj)) + return -ENOENT; + + nft_objref_priv(expr) = obj; + obj->use++; + + return 0; +} + +static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_object *obj = nft_objref_priv(expr); + + if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) || + nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->type->type))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static void nft_objref_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_object *obj = nft_objref_priv(expr); + + obj->use--; +} + +static struct nft_expr_type nft_objref_type; +static const struct nft_expr_ops nft_objref_ops = { + .type = &nft_objref_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), + .eval = nft_objref_eval, + .init = nft_objref_init, + .destroy = nft_objref_destroy, + .dump = nft_objref_dump, +}; + +struct nft_objref_map { + struct nft_set *set; + enum nft_registers sreg:8; + struct nft_set_binding binding; +}; + +static void nft_objref_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + const struct nft_set *set = priv->set; + const struct nft_set_ext *ext; + struct nft_object *obj; + bool found; + + found = set->ops->lookup(nft_net(pkt), set, ®s->data[priv->sreg], + &ext); + if (!found) { + regs->verdict.code = NFT_BREAK; + return; + } + obj = *nft_set_ext_obj(ext); + obj->type->eval(obj, regs, pkt); +} + +static int nft_objref_map_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set *set; + int err; + + set = nf_tables_set_lookup(ctx->table, tb[NFTA_OBJREF_SET_NAME], genmask); + if (IS_ERR(set)) { + if (tb[NFTA_OBJREF_SET_ID]) { + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_OBJREF_SET_ID], + genmask); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } + + if (!(set->flags & NFT_SET_OBJECT)) + return -EINVAL; + + priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]); + err = nft_validate_register_load(priv->sreg, set->klen); + if (err < 0) + return err; + + priv->binding.flags = set->flags & NFT_SET_OBJECT; + + err = nf_tables_bind_set(ctx, set, &priv->binding); + if (err < 0) + return err; + + priv->set = set; + return 0; +} + +static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_objref_map *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) || + nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static void nft_objref_map_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + +static struct nft_expr_type nft_objref_type; +static const struct nft_expr_ops nft_objref_map_ops = { + .type = &nft_objref_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), + .eval = nft_objref_map_eval, + .init = nft_objref_map_init, + .destroy = nft_objref_map_destroy, + .dump = nft_objref_map_dump, +}; + +static const struct nft_expr_ops * +nft_objref_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_OBJREF_SET_SREG] && + (tb[NFTA_OBJREF_SET_NAME] || + tb[NFTA_OBJREF_SET_ID])) + return &nft_objref_map_ops; + else if (tb[NFTA_OBJREF_IMM_NAME] && + tb[NFTA_OBJREF_IMM_TYPE]) + return &nft_objref_ops; + + return ERR_PTR(-EOPNOTSUPP); +} + +static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { + [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING }, + [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 }, + [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 }, + [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING }, + [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, +}; + +static struct nft_expr_type nft_objref_type __read_mostly = { + .name = "objref", + .select_ops = nft_objref_select_ops, + .policy = nft_objref_policy, + .maxattr = NFTA_OBJREF_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_objref_module_init(void) +{ + return nft_register_expr(&nft_objref_type); +} + +static void __exit nft_objref_module_exit(void) +{ + nft_unregister_expr(&nft_objref_type); +} + +module_init(nft_objref_module_init); +module_exit(nft_objref_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_EXPR("objref"); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 98fb5d7b8087..36d2b1096546 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -17,6 +18,10 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> +/* For layer 4 checksum field offset. */ +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmpv6.h> /* add vlan header into the user buffer for if tag was removed by offloads */ static bool @@ -164,6 +169,87 @@ const struct nft_expr_ops nft_payload_fast_ops = { .dump = nft_payload_dump, }; +static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) +{ + *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); + if (*sum == 0) + *sum = CSUM_MANGLED_0; +} + +static bool nft_payload_udp_checksum(struct sk_buff *skb, unsigned int thoff) +{ + struct udphdr *uh, _uh; + + uh = skb_header_pointer(skb, thoff, sizeof(_uh), &_uh); + if (!uh) + return false; + + return uh->check; +} + +static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, + struct sk_buff *skb, + unsigned int *l4csum_offset) +{ + switch (pkt->tprot) { + case IPPROTO_TCP: + *l4csum_offset = offsetof(struct tcphdr, check); + break; + case IPPROTO_UDP: + if (!nft_payload_udp_checksum(skb, pkt->xt.thoff)) + return -1; + /* Fall through. */ + case IPPROTO_UDPLITE: + *l4csum_offset = offsetof(struct udphdr, check); + break; + case IPPROTO_ICMPV6: + *l4csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + break; + default: + return -1; + } + + *l4csum_offset += pkt->xt.thoff; + return 0; +} + +static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, + struct sk_buff *skb, + __wsum fsum, __wsum tsum) +{ + int l4csum_offset; + __sum16 sum; + + /* If we cannot determine layer 4 checksum offset or this packet doesn't + * require layer 4 checksum recalculation, skip this packet. + */ + if (nft_payload_l4csum_offset(pkt, skb, &l4csum_offset) < 0) + return 0; + + if (skb_copy_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + /* Checksum mangling for an arbitrary amount of bytes, based on + * inet_proto_csum_replace*() functions. + */ + if (skb->ip_summed != CHECKSUM_PARTIAL) { + nft_csum_replace(&sum, fsum, tsum); + if (skb->ip_summed == CHECKSUM_COMPLETE) { + skb->csum = ~csum_add(csum_sub(~(skb->csum), fsum), + tsum); + } + } else { + sum = ~csum_fold(csum_add(csum_sub(csum_unfold(sum), fsum), + tsum)); + } + + if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) || + skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + return 0; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -204,14 +290,15 @@ static void nft_payload_set_eval(const struct nft_expr *expr, fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); - sum = csum_fold(csum_add(csum_sub(~csum_unfold(sum), fsum), - tsum)); - if (sum == 0) - sum = CSUM_MANGLED_0; + nft_csum_replace(&sum, fsum, tsum); if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) goto err; + + if (priv->csum_flags && + nft_payload_l4csum_update(pkt, skb, fsum, tsum) < 0) + goto err; } if (!skb_make_writable(skb, max(offset + priv->len, 0)) || @@ -240,6 +327,15 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) priv->csum_offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET])); + if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) { + u32 flags; + + flags = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_FLAGS])); + if (flags & ~NFT_PAYLOAD_L4CSUM_PSEUDOHDR) + return -EINVAL; + + priv->csum_flags = flags; + } switch (priv->csum_type) { case NFT_PAYLOAD_CSUM_NONE: @@ -262,7 +358,8 @@ static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) || nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) || nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET, - htonl(priv->csum_offset))) + htonl(priv->csum_offset)) || + nla_put_be32(skb, NFTA_PAYLOAD_CSUM_FLAGS, htonl(priv->csum_flags))) goto nla_put_failure; return 0; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index c00104c07095..bd6efc53f26d 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -17,38 +17,59 @@ struct nft_quota { u64 quota; - bool invert; - atomic64_t remain; + unsigned long flags; + atomic64_t consumed; }; static inline bool nft_overquota(struct nft_quota *priv, - const struct nft_pktinfo *pkt) + const struct sk_buff *skb) { - return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0; + return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota; } -static void nft_quota_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static inline bool nft_quota_invert(struct nft_quota *priv) { - struct nft_quota *priv = nft_expr_priv(expr); + return priv->flags & NFT_QUOTA_F_INV; +} - if (nft_overquota(priv, pkt) ^ priv->invert) +static inline void nft_quota_do_eval(struct nft_quota *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = { [NFTA_QUOTA_BYTES] = { .type = NLA_U64 }, [NFTA_QUOTA_FLAGS] = { .type = NLA_U32 }, + [NFTA_QUOTA_CONSUMED] = { .type = NLA_U64 }, }; -static int nft_quota_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +#define NFT_QUOTA_DEPLETED_BIT 1 /* From NFT_QUOTA_F_DEPLETED. */ + +static void nft_quota_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_quota *priv = nft_expr_priv(expr); - u32 flags = 0; - u64 quota; + struct nft_quota *priv = nft_obj_data(obj); + bool overquota; + + overquota = nft_overquota(priv, pkt->skb); + if (overquota ^ nft_quota_invert(priv)) + regs->verdict.code = NFT_BREAK; + + if (overquota && + !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) + nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0, + NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC); +} + +static int nft_quota_do_init(const struct nlattr * const tb[], + struct nft_quota *priv) +{ + unsigned long flags = 0; + u64 quota, consumed = 0; if (!tb[NFTA_QUOTA_BYTES]) return -EINVAL; @@ -57,26 +78,60 @@ static int nft_quota_init(const struct nft_ctx *ctx, if (quota > S64_MAX) return -EOVERFLOW; + if (tb[NFTA_QUOTA_CONSUMED]) { + consumed = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_CONSUMED])); + if (consumed > quota) + return -EINVAL; + } + if (tb[NFTA_QUOTA_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS])); if (flags & ~NFT_QUOTA_F_INV) return -EINVAL; + if (flags & NFT_QUOTA_F_DEPLETED) + return -EOPNOTSUPP; } priv->quota = quota; - priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false; - atomic64_set(&priv->remain, quota); + priv->flags = flags; + atomic64_set(&priv->consumed, consumed); return 0; } -static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_quota_obj_init(const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_quota *priv = nft_obj_data(obj); + + return nft_quota_do_init(tb, priv); +} + +static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, + bool reset) { - const struct nft_quota *priv = nft_expr_priv(expr); - u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0; + u32 flags = priv->flags; + u64 consumed; + + if (reset) { + consumed = atomic64_xchg(&priv->consumed, 0); + if (test_and_clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) + flags |= NFT_QUOTA_F_DEPLETED; + } else { + consumed = atomic64_read(&priv->consumed); + } + + /* Since we inconditionally increment consumed quota for each packet + * that we see, don't go over the quota boundary in what we send to + * userspace. + */ + if (consumed > priv->quota) + consumed = priv->quota; if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), NFTA_QUOTA_PAD) || + nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed), + NFTA_QUOTA_PAD) || nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) goto nla_put_failure; return 0; @@ -85,6 +140,50 @@ nla_put_failure: return -1; } +static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj, + bool reset) +{ + struct nft_quota *priv = nft_obj_data(obj); + + return nft_quota_do_dump(skb, priv, reset); +} + +static struct nft_object_type nft_quota_obj __read_mostly = { + .type = NFT_OBJECT_QUOTA, + .size = sizeof(struct nft_quota), + .maxattr = NFTA_QUOTA_MAX, + .policy = nft_quota_policy, + .init = nft_quota_obj_init, + .eval = nft_quota_obj_eval, + .dump = nft_quota_obj_dump, + .owner = THIS_MODULE, +}; + +static void nft_quota_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_quota *priv = nft_expr_priv(expr); + + nft_quota_do_eval(priv, regs, pkt); +} + +static int nft_quota_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_quota *priv = nft_expr_priv(expr); + + return nft_quota_do_init(tb, priv); +} + +static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_quota *priv = nft_expr_priv(expr); + + return nft_quota_do_dump(skb, priv, false); +} + static struct nft_expr_type nft_quota_type; static const struct nft_expr_ops nft_quota_ops = { .type = &nft_quota_type, @@ -105,12 +204,26 @@ static struct nft_expr_type nft_quota_type __read_mostly = { static int __init nft_quota_module_init(void) { - return nft_register_expr(&nft_quota_type); + int err; + + err = nft_register_obj(&nft_quota_obj); + if (err < 0) + return err; + + err = nft_register_expr(&nft_quota_type); + if (err < 0) + goto err1; + + return 0; +err1: + nft_unregister_obj(&nft_quota_obj); + return err; } static void __exit nft_quota_module_exit(void) { - nft_unregister_expr(&nft_quota_type); + nft_unregister_expr(&nft_quota_type); + nft_unregister_obj(&nft_quota_obj); } module_init(nft_quota_module_init); @@ -119,3 +232,4 @@ module_exit(nft_quota_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("quota"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA); diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 009062606697..9edc74eedc10 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -59,6 +59,12 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr int err; u32 op; + if (!tb[NFTA_RANGE_SREG] || + !tb[NFTA_RANGE_OP] || + !tb[NFTA_RANGE_FROM_DATA] || + !tb[NFTA_RANGE_TO_DATA]) + return -EINVAL; + err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from), &desc_from, tb[NFTA_RANGE_FROM_DATA]); if (err < 0) diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 03f7bf40ae75..40dcd05146d5 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -79,7 +79,7 @@ int nft_redir_init(const struct nft_ctx *ctx, return -EINVAL; } - return 0; + return nf_ct_netns_get(ctx->net, ctx->afi->family); } EXPORT_SYMBOL_GPL(nft_redir_init); @@ -108,4 +108,4 @@ nla_put_failure: EXPORT_SYMBOL_GPL(nft_redir_dump); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index a3dface3e6e6..1e20e2bbb6d9 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -167,6 +167,19 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, nft_set_elem_clear_busy(&he->ext); } +static bool nft_hash_deactivate_one(const struct net *net, + const struct nft_set *set, void *priv) +{ + struct nft_hash_elem *he = priv; + + if (!nft_set_elem_mark_busy(&he->ext) || + !nft_is_active(net, &he->ext)) { + nft_set_elem_change_active(net, set, &he->ext); + return true; + } + return false; +} + static void *nft_hash_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) @@ -181,13 +194,10 @@ static void *nft_hash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); - if (he != NULL) { - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) - nft_set_elem_change_active(net, set, &he->ext); - else - he = NULL; - } + if (he != NULL && + !nft_hash_deactivate_one(net, set, he)) + he = NULL; + rcu_read_unlock(); return he; @@ -387,6 +397,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .insert = nft_hash_insert, .activate = nft_hash_activate, .deactivate = nft_hash_deactivate, + .deactivate_one = nft_hash_deactivate_one, .remove = nft_hash_remove, .lookup = nft_hash_lookup, .update = nft_hash_update, diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 36493a7cae88..08376e50f6cd 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -171,6 +171,15 @@ static void nft_rbtree_activate(const struct net *net, nft_set_elem_change_active(net, set, &rbe->ext); } +static bool nft_rbtree_deactivate_one(const struct net *net, + const struct nft_set *set, void *priv) +{ + struct nft_rbtree_elem *rbe = priv; + + nft_set_elem_change_active(net, set, &rbe->ext); + return true; +} + static void *nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) @@ -204,7 +213,7 @@ static void *nft_rbtree_deactivate(const struct net *net, parent = parent->rb_right; continue; } - nft_set_elem_change_active(net, set, &rbe->ext); + nft_rbtree_deactivate_one(net, set, rbe); return rbe; } } @@ -295,6 +304,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = { .insert = nft_rbtree_insert, .remove = nft_rbtree_remove, .deactivate = nft_rbtree_deactivate, + .deactivate_one = nft_rbtree_deactivate_one, .activate = nft_rbtree_activate, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index ad818e52859b..2ff499680cc6 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -40,6 +40,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) +#define XT_PCPU_BLOCK_SIZE 4096 struct compat_delta { unsigned int offset; /* offset in kernel */ @@ -958,7 +959,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!info) { - info = vmalloc(sz); + info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | + __GFP_NORETRY | __GFP_HIGHMEM, + PAGE_KERNEL); if (!info) return NULL; } @@ -1615,6 +1618,59 @@ void xt_proto_fini(struct net *net, u_int8_t af) } EXPORT_SYMBOL_GPL(xt_proto_fini); +/** + * xt_percpu_counter_alloc - allocate x_tables rule counter + * + * @state: pointer to xt_percpu allocation state + * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct + * + * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then + * contain the address of the real (percpu) counter. + * + * Rule evaluation needs to use xt_get_this_cpu_counter() helper + * to fetch the real percpu counter. + * + * To speed up allocation and improve data locality, a 4kb block is + * allocated. + * + * xt_percpu_counter_alloc_state contains the base address of the + * allocated page and the current sub-offset. + * + * returns false on error. + */ +bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state, + struct xt_counters *counter) +{ + BUILD_BUG_ON(XT_PCPU_BLOCK_SIZE < (sizeof(*counter) * 2)); + + if (nr_cpu_ids <= 1) + return true; + + if (!state->mem) { + state->mem = __alloc_percpu(XT_PCPU_BLOCK_SIZE, + XT_PCPU_BLOCK_SIZE); + if (!state->mem) + return false; + } + counter->pcnt = (__force unsigned long)(state->mem + state->off); + state->off += sizeof(*counter); + if (state->off > (XT_PCPU_BLOCK_SIZE - sizeof(*counter))) { + state->mem = NULL; + state->off = 0; + } + return true; +} +EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc); + +void xt_percpu_counter_free(struct xt_counters *counters) +{ + unsigned long pcnt = counters->pcnt; + + if (nr_cpu_ids > 1 && (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0) + free_percpu((void __percpu *)pcnt); +} +EXPORT_SYMBOL_GPL(xt_percpu_counter_free); + static int __net_init xt_net_init(struct net *net) { int i; diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index e04dc282e3bb..da56c06a443c 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -106,7 +106,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -115,7 +115,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par) static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_target connsecmark_tg_reg __read_mostly = { diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 6669e68d589e..95c750358747 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -216,7 +216,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err1; #endif - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) goto err1; @@ -260,7 +260,7 @@ out: err3: nf_ct_tmpl_free(ct); err2: - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); err1: return ret; } @@ -341,7 +341,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, if (help) module_put(help->helper->me); - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); xt_ct_destroy_timeout(ct); nf_ct_put(info->ct); diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c index 94d0b5411192..e45a01255e70 100644 --- a/net/netfilter/xt_NETMAP.c +++ b/net/netfilter/xt_NETMAP.c @@ -60,7 +60,12 @@ static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return -EINVAL; - return 0; + return nf_ct_netns_get(par->net, par->family); +} + +static void netmap_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } static unsigned int @@ -111,7 +116,7 @@ static int netmap_tg4_check(const struct xt_tgchk_param *par) pr_debug("bad rangesize %u.\n", mr->rangesize); return -EINVAL; } - return 0; + return nf_ct_netns_get(par->net, par->family); } static struct xt_target netmap_tg_reg[] __read_mostly = { @@ -127,6 +132,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = { (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .checkentry = netmap_tg6_checkentry, + .destroy = netmap_tg_destroy, .me = THIS_MODULE, }, { @@ -141,6 +147,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = { (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .checkentry = netmap_tg4_check, + .destroy = netmap_tg_destroy, .me = THIS_MODULE, }, }; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index dbd6c4a12b97..91a373a3f534 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -63,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est) mutex_lock(&xt_rateest_mutex); if (--est->refcnt == 0) { hlist_del(&est->list); - gen_kill_estimator(&est->bstats, &est->rstats); + gen_kill_estimator(&est->rate_est); /* * gen_estimator est_timer() might access est->lock or bstats, * wait a RCU grace period before freeing 'est' @@ -132,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; - ret = gen_new_estimator(&est->bstats, NULL, &est->rstats, + ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index 651dce65a30b..98a4c6d4f1cb 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -40,7 +40,13 @@ static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; - return 0; + + return nf_ct_netns_get(par->net, par->family); +} + +static void redirect_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } /* FIXME: Take multiple ranges --RR */ @@ -56,7 +62,7 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par) pr_debug("bad rangesize %u.\n", mr->rangesize); return -EINVAL; } - return 0; + return nf_ct_netns_get(par->net, par->family); } static unsigned int @@ -72,6 +78,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = { .revision = 0, .table = "nat", .checkentry = redirect_tg6_checkentry, + .destroy = redirect_tg_destroy, .target = redirect_tg6, .targetsize = sizeof(struct nf_nat_range), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -85,6 +92,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = { .table = "nat", .target = redirect_tg4, .checkentry = redirect_tg4_check, + .destroy = redirect_tg_destroy, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index dbd72cc40e42..80cb7babeb64 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -531,6 +531,11 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) static int tproxy_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_ip6 *i = par->entryinfo; + int err; + + err = nf_defrag_ipv6_enable(par->net); + if (err) + return err; if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && !(i->invflags & IP6T_INV_PROTO)) @@ -545,6 +550,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par) static int tproxy_tg4_check(const struct xt_tgchk_param *par) { const struct ipt_ip *i = par->entryinfo; + int err; + + err = nf_defrag_ipv4_enable(par->net); + if (err) + return err; if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && !(i->invflags & IPT_INV_PROTO)) @@ -596,11 +606,6 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { static int __init tproxy_tg_init(void) { - nf_defrag_ipv4_enable(); -#ifdef XT_TPROXY_HAVE_IPV6 - nf_defrag_ipv6_enable(); -#endif - return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); } diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index dffee9d47ec4..2dedaa23ab0a 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/filter.h> +#include <linux/bpf.h> #include <linux/netfilter/xt_bpf.h> #include <linux/netfilter/x_tables.h> @@ -20,15 +21,15 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_bpf"); MODULE_ALIAS("ip6t_bpf"); -static int bpf_mt_check(const struct xt_mtchk_param *par) +static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len, + struct bpf_prog **ret) { - struct xt_bpf_info *info = par->matchinfo; struct sock_fprog_kern program; - program.len = info->bpf_program_num_elem; - program.filter = info->bpf_program; + program.len = len; + program.filter = insns; - if (bpf_prog_create(&info->filter, &program)) { + if (bpf_prog_create(ret, &program)) { pr_info("bpf: check failed: parse error\n"); return -EINVAL; } @@ -36,6 +37,42 @@ static int bpf_mt_check(const struct xt_mtchk_param *par) return 0; } +static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + *ret = prog; + return 0; +} + +static int bpf_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_bpf_info *info = par->matchinfo; + + return __bpf_mt_check_bytecode(info->bpf_program, + info->bpf_program_num_elem, + &info->filter); +} + +static int bpf_mt_check_v1(const struct xt_mtchk_param *par) +{ + struct xt_bpf_info_v1 *info = par->matchinfo; + + if (info->mode == XT_BPF_MODE_BYTECODE) + return __bpf_mt_check_bytecode(info->bpf_program, + info->bpf_program_num_elem, + &info->filter); + else if (info->mode == XT_BPF_MODE_FD_PINNED || + info->mode == XT_BPF_MODE_FD_ELF) + return __bpf_mt_check_fd(info->fd, &info->filter); + else + return -EINVAL; +} + static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; @@ -43,31 +80,58 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) return BPF_PROG_RUN(info->filter, skb); } +static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_bpf_info_v1 *info = par->matchinfo; + + return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb); +} + static void bpf_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_bpf_info *info = par->matchinfo; + + bpf_prog_destroy(info->filter); +} + +static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par) +{ + const struct xt_bpf_info_v1 *info = par->matchinfo; + bpf_prog_destroy(info->filter); } -static struct xt_match bpf_mt_reg __read_mostly = { - .name = "bpf", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = bpf_mt_check, - .match = bpf_mt, - .destroy = bpf_mt_destroy, - .matchsize = sizeof(struct xt_bpf_info), - .me = THIS_MODULE, +static struct xt_match bpf_mt_reg[] __read_mostly = { + { + .name = "bpf", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = bpf_mt_check, + .match = bpf_mt, + .destroy = bpf_mt_destroy, + .matchsize = sizeof(struct xt_bpf_info), + .me = THIS_MODULE, + }, + { + .name = "bpf", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = bpf_mt_check_v1, + .match = bpf_mt_v1, + .destroy = bpf_mt_destroy_v1, + .matchsize = sizeof(struct xt_bpf_info_v1), + .me = THIS_MODULE, + }, }; static int __init bpf_mt_init(void) { - return xt_register_match(&bpf_mt_reg); + return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg)); } static void __exit bpf_mt_exit(void) { - xt_unregister_match(&bpf_mt_reg); + xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg)); } module_init(bpf_mt_init); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index d4bec261e74e..cad0b7b5eb35 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -110,7 +110,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par) sinfo->direction != XT_CONNBYTES_DIR_BOTH) return -EINVAL; - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -129,7 +129,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par) static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_match connbytes_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index 03d66f1c5e69..7827128d5a95 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -61,7 +61,7 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) return -EINVAL; } - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -70,14 +70,14 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) ret = nf_connlabels_get(par->net, info->bit); if (ret < 0) - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); return ret; } static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) { nf_connlabels_put(par->net); - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_match connlabels_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index bb3845339efd..2aff2b7c4689 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -368,7 +368,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd)); - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info("cannot load conntrack support for " "address family %u\n", par->family); @@ -378,7 +378,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) /* init private data */ info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); if (info->data == NULL) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); return -ENOMEM; } @@ -414,7 +414,7 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) const struct xt_connlimit_info *info = par->matchinfo; unsigned int i; - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) destroy_tree(&info->data->climit_root4[i]); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index b83e158e116a..9935d5029b0e 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -77,7 +77,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par) { int ret; - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -86,7 +86,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par) static void connmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static bool @@ -107,7 +107,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par) { int ret; - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -116,7 +116,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par) static void connmark_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_target connmark_tg_reg __read_mostly = { diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 2dea15ebc55b..c0fb217bc649 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -271,7 +271,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -280,7 +280,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par) static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_match conntrack_mt_reg[] __read_mostly = { diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index f679dd4c272a..38a78151c0e9 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -59,7 +59,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par) struct xt_helper_info *info = par->matchinfo; int ret; - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -71,7 +71,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par) static void helper_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_match helper_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index ec06fb1cb16f..1cde0e4985b7 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -44,12 +44,18 @@ ports_match_v1(const struct xt_multiport_v1 *minfo, switch (minfo->flags) { case XT_MULTIPORT_SOURCE: - return (src >= s && src <= e) ^ minfo->invert; + if (src >= s && src <= e) + return true ^ minfo->invert; + break; case XT_MULTIPORT_DESTINATION: - return (dst >= s && dst <= e) ^ minfo->invert; + if (dst >= s && dst <= e) + return true ^ minfo->invert; + break; case XT_MULTIPORT_EITHER: - return ((dst >= s && dst <= e) || - (src >= s && src <= e)) ^ minfo->invert; + if ((dst >= s && dst <= e) || + (src >= s && src <= e)) + return true ^ minfo->invert; + break; default: break; } @@ -59,11 +65,17 @@ ports_match_v1(const struct xt_multiport_v1 *minfo, switch (minfo->flags) { case XT_MULTIPORT_SOURCE: - return (src == s) ^ minfo->invert; + if (src == s) + return true ^ minfo->invert; + break; case XT_MULTIPORT_DESTINATION: - return (dst == s) ^ minfo->invert; + if (dst == s) + return true ^ minfo->invert; + break; case XT_MULTIPORT_EITHER: - return (src == s || dst == s) ^ minfo->invert; + if (src == s || dst == s) + return true ^ minfo->invert; + break; default: break; } diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index bea7464cc43f..8107b3eb865f 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -23,7 +23,17 @@ static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par) par->target->name); return -EINVAL; } - return 0; + return nf_ct_netns_get(par->net, par->family); +} + +static int xt_nat_checkentry(const struct xt_tgchk_param *par) +{ + return nf_ct_netns_get(par->net, par->family); +} + +static void xt_nat_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } static void xt_nat_convert_range(struct nf_nat_range *dst, @@ -106,6 +116,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = { .name = "SNAT", .revision = 0, .checkentry = xt_nat_checkentry_v0, + .destroy = xt_nat_destroy, .target = xt_snat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family = NFPROTO_IPV4, @@ -118,6 +129,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = { .name = "DNAT", .revision = 0, .checkentry = xt_nat_checkentry_v0, + .destroy = xt_nat_destroy, .target = xt_dnat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family = NFPROTO_IPV4, @@ -129,6 +141,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = { { .name = "SNAT", .revision = 1, + .checkentry = xt_nat_checkentry, + .destroy = xt_nat_destroy, .target = xt_snat_target_v1, .targetsize = sizeof(struct nf_nat_range), .table = "nat", @@ -139,6 +153,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = { { .name = "DNAT", .revision = 1, + .checkentry = xt_nat_checkentry, + .destroy = xt_nat_destroy, .target = xt_dnat_target_v1, .targetsize = sizeof(struct nf_nat_range), .table = "nat", diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 7720b036d76a..1db02f6fca54 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -18,35 +18,33 @@ static bool xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateest_match_info *info = par->matchinfo; - struct gnet_stats_rate_est64 *r; + struct gnet_stats_rate_est64 sample = {0}; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; - spin_lock_bh(&info->est1->lock); - r = &info->est1->rstats; + gen_estimator_read(&info->est1->rate_est, &sample); + if (info->flags & XT_RATEEST_MATCH_DELTA) { - bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0; - pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0; + bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0; + pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0; } else { - bps1 = r->bps; - pps1 = r->pps; + bps1 = sample.bps; + pps1 = sample.pps; } - spin_unlock_bh(&info->est1->lock); if (info->flags & XT_RATEEST_MATCH_ABS) { bps2 = info->bps2; pps2 = info->pps2; } else { - spin_lock_bh(&info->est2->lock); - r = &info->est2->rstats; + gen_estimator_read(&info->est2->rate_est, &sample); + if (info->flags & XT_RATEEST_MATCH_DELTA) { - bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0; - pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0; + bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0; + pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0; } else { - bps2 = r->bps; - pps2 = r->pps; + bps2 = sample.bps; + pps2 = sample.pps; } - spin_unlock_bh(&info->est2->lock); } switch (info->mode) { diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 2198914707f5..770bbec878f1 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -147,9 +147,28 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) } #endif +static int socket_mt_enable_defrag(struct net *net, int family) +{ + switch (family) { + case NFPROTO_IPV4: + return nf_defrag_ipv4_enable(net); +#ifdef XT_SOCKET_HAVE_IPV6 + case NFPROTO_IPV6: + return nf_defrag_ipv6_enable(net); +#endif + } + WARN_ONCE(1, "Unknown family %d\n", family); + return 0; +} + static int socket_mt_v1_check(const struct xt_mtchk_param *par) { const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + int err; + + err = socket_mt_enable_defrag(par->net, par->family); + if (err) + return err; if (info->flags & ~XT_SOCKET_FLAGS_V1) { pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1); @@ -161,6 +180,11 @@ static int socket_mt_v1_check(const struct xt_mtchk_param *par) static int socket_mt_v2_check(const struct xt_mtchk_param *par) { const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo; + int err; + + err = socket_mt_enable_defrag(par->net, par->family); + if (err) + return err; if (info->flags & ~XT_SOCKET_FLAGS_V2) { pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2); @@ -173,7 +197,11 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par) { const struct xt_socket_mtinfo3 *info = (struct xt_socket_mtinfo3 *)par->matchinfo; + int err; + err = socket_mt_enable_defrag(par->net, par->family); + if (err) + return err; if (info->flags & ~XT_SOCKET_FLAGS_V3) { pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V3); @@ -268,11 +296,6 @@ static struct xt_match socket_mt_reg[] __read_mostly = { static int __init socket_mt_init(void) { - nf_defrag_ipv4_enable(); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - nf_defrag_ipv6_enable(); -#endif - return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); } diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index a507922d80cd..5746a33789a5 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -43,7 +43,7 @@ static int state_mt_check(const struct xt_mtchk_param *par) { int ret; - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -52,7 +52,7 @@ static int state_mt_check(const struct xt_mtchk_param *par) static void state_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_match state_mt_reg __read_mostly = { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 62bea4591054..246f29d365c0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -329,7 +329,6 @@ static void netlink_sock_destruct(struct sock *sk) if (nlk->cb_running) { if (nlk->cb.done) nlk->cb.done(&nlk->cb); - module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); } @@ -346,6 +345,14 @@ static void netlink_sock_destruct(struct sock *sk) WARN_ON(nlk_sk(sk)->groups); } +static void netlink_sock_destruct_work(struct work_struct *work) +{ + struct netlink_sock *nlk = container_of(work, struct netlink_sock, + work); + + sk_free(&nlk->sk); +} + /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves @@ -648,8 +655,18 @@ out_module: static void deferred_put_nlk_sk(struct rcu_head *head) { struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); + struct sock *sk = &nlk->sk; + + if (!atomic_dec_and_test(&sk->sk_refcnt)) + return; + + if (nlk->cb_running && nlk->cb.done) { + INIT_WORK(&nlk->work, netlink_sock_destruct_work); + schedule_work(&nlk->work); + return; + } - sock_put(&nlk->sk); + sk_free(sk); } static int netlink_release(struct socket *sock) diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 3cfd6cc60504..4fdb38318977 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -3,6 +3,7 @@ #include <linux/rhashtable.h> #include <linux/atomic.h> +#include <linux/workqueue.h> #include <net/sock.h> #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) @@ -33,6 +34,7 @@ struct netlink_sock { struct rhash_head node; struct rcu_head rcu; + struct work_struct work; }; static inline struct netlink_sock *nlk_sk(struct sock *sk) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 9b8a028b7dad..6b78bab27755 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -370,8 +370,11 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, skb_orphan(skb); memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); err = nf_ct_frag6_gather(net, skb, user); - if (err) + if (err) { + if (err != -EINPROGRESS) + kfree_skb(skb); return err; + } key->ip.proto = ipv6_hdr(skb)->nexthdr; ovs_cb.mru = IP6CB(skb)->frag_max_size; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fab9bbfdead5..89f2e8c1f4dc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3593,19 +3593,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: - po->tp_version = val; - return 0; + break; default: return -EINVAL; } + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->tp_version = val; + ret = 0; + } + release_sock(sk); + return ret; } case PACKET_RESERVE: { @@ -4109,6 +4115,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; + lock_sock(sk); /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { net_warn_ratelimited("Tx-ring is not supported.\n"); @@ -4190,7 +4197,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; } - lock_sock(sk); /* Detach socket from network */ spin_lock(&po->bind_lock); @@ -4239,11 +4245,11 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); } - release_sock(sk); if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: + release_sock(sk); return err; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 1a0399dea764..57bb52361e0f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -665,6 +665,8 @@ out_recv: out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); out_slab: + if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) + pr_warn("could not unregister rds_tcp_dev_notifier\n"); kmem_cache_destroy(rds_tcp_conn_slab); out: return ret; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f893d180da1c..2095c83ce773 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -41,8 +41,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p) spin_lock_bh(&hinfo->lock); hlist_del(&p->tcfa_head); spin_unlock_bh(&hinfo->lock); - gen_kill_estimator(&p->tcfa_bstats, - &p->tcfa_rate_est); + gen_kill_estimator(&p->tcfa_rate_est); /* * gen_estimator est_timer() might access p->tcfa_lock * or bstats, wait a RCU grace period before freeing p @@ -237,8 +236,7 @@ EXPORT_SYMBOL(tcf_hash_check); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) { if (est) - gen_kill_estimator(&a->tcfa_bstats, - &a->tcfa_rate_est); + gen_kill_estimator(&a->tcfa_rate_est); call_rcu(&a->tcfa_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_cleanup); @@ -670,8 +668,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, goto errout; if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || - gnet_stats_copy_rate_est(&d, &p->tcfa_bstats, - &p->tcfa_rate_est) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfa_qstats, p->tcfa_qstats.qlen) < 0) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 1aa4ecf41baf..1c60317f0121 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -28,7 +28,6 @@ struct tcf_bpf_cfg { struct bpf_prog *filter; struct sock_filter *bpf_ops; const char *bpf_name; - u32 bpf_fd; u16 bpf_num_ops; bool is_ebpf; }; @@ -118,13 +117,19 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, struct sk_buff *skb) { - if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd)) - return -EMSGSIZE; + struct nlattr *nla; if (prog->bpf_name && nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; + nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST, + sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } @@ -233,7 +238,6 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) } } - cfg->bpf_fd = bpf_fd; cfg->bpf_name = name; cfg->filter = fp; cfg->is_ebpf = true; @@ -332,8 +336,6 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (cfg.bpf_num_ops) prog->bpf_num_ops = cfg.bpf_num_ops; - if (cfg.bpf_fd) - prog->bpf_fd = cfg.bpf_fd; prog->tcf_action = parm->action; rcu_assign_pointer(prog->filter, cfg.filter); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b2d417b8f46c..2d9fa6e0a1b4 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/gfp.h> +#include <linux/if_arp.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/pkt_sched.h> @@ -73,20 +74,6 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; -static bool dev_is_mac_header_xmit(const struct net_device *dev) -{ - switch (dev->type) { - case ARPHRD_TUNNEL: - case ARPHRD_TUNNEL6: - case ARPHRD_SIT: - case ARPHRD_IPGRE: - case ARPHRD_VOID: - case ARPHRD_NONE: - return false; - } - return true; -} - static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) @@ -328,6 +315,17 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; +static int tcf_mirred_device(const struct tc_action *a, struct net *net, + struct net_device **mirred_dev) +{ + int ifindex = tcf_mirred_ifindex(a); + + *mirred_dev = __dev_get_by_index(net, ifindex); + if (!*mirred_dev) + return -EINVAL; + return 0; +} + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, @@ -340,6 +338,7 @@ static struct tc_action_ops act_mirred_ops = { .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), + .get_dev = tcf_mirred_device, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index eda322045e75..b27c4daec88f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -108,6 +108,17 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind) kfree(keys); } +static bool offset_valid(struct sk_buff *skb, int offset) +{ + if (offset > 0 && offset > skb->len) + return false; + + if (offset < 0 && -offset > skb_headroom(skb)) + return false; + + return true; +} + static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -134,6 +145,11 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, if (tkey->offmask) { char *d, _d; + if (!offset_valid(skb, off + tkey->at)) { + pr_info("tc filter pedit 'at' offset %d out of bounds\n", + off + tkey->at); + goto bad; + } d = skb_header_pointer(skb, off + tkey->at, 1, &_d); if (!d) @@ -146,10 +162,10 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, " offset must be on 32 bit boundaries\n"); goto bad; } - if (offset > 0 && offset > skb->len) { - pr_info("tc filter pedit" - " offset %d can't exceed pkt length %d\n", - offset, skb->len); + + if (!offset_valid(skb, off + offset)) { + pr_info("tc filter pedit offset %d out of bounds\n", + offset); goto bad; } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index c990b73a6c85..0ba91d1ce994 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, goto failure_unlock; } else if (tb[TCA_POLICE_AVRATE] && (ret == ACT_P_CREATED || - !gen_estimator_active(&police->tcf_bstats, - &police->tcf_rate_est))) { + !gen_estimator_active(&police->tcf_rate_est))) { err = -EINVAL; goto failure_unlock; } @@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, bstats_update(&police->tcf_bstats, skb); tcf_lastuse_update(&police->tcf_tm); - if (police->tcfp_ewma_rate && - police->tcf_rate_est.bps >= police->tcfp_ewma_rate) { - police->tcf_qstats.overlimits++; - if (police->tcf_action == TC_ACT_SHOT) - police->tcf_qstats.drops++; - spin_unlock(&police->tcf_lock); - return police->tcf_action; + if (police->tcfp_ewma_rate) { + struct gnet_stats_rate_est64 sample; + + if (!gen_estimator_read(&police->tcf_rate_est, &sample) || + sample.bps >= police->tcfp_ewma_rate) { + police->tcf_qstats.overlimits++; + if (police->tcf_action == TC_ACT_SHOT) + police->tcf_qstats.drops++; + spin_unlock(&police->tcf_lock); + return police->tcf_action; + } } if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2b2a7974e4bb..3fbba79a4ef0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -430,7 +430,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) { + if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, + n->nlmsg_flags, event) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -681,6 +682,30 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); +int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, + struct net_device **hw_dev) +{ +#ifdef CONFIG_NET_CLS_ACT + const struct tc_action *a; + LIST_HEAD(actions); + + if (tc_no_actions(exts)) + return -EINVAL; + + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (a->ops->get_dev) { + a->ops->get_dev(a, dev_net(dev), hw_dev); + break; + } + } + if (*hw_dev) + return 0; +#endif + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(tcf_exts_get_dev); + static int __init tc_filter_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index eb219b78cd49..5877f6061b57 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -62,9 +62,6 @@ static unsigned long basic_get(struct tcf_proto *tp, u32 handle) struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f; - if (head == NULL) - return 0UL; - list_for_each_entry(f, &head->flist, link) { if (f->handle == handle) { l = (unsigned long) f; @@ -109,7 +106,6 @@ static bool basic_destroy(struct tcf_proto *tp, bool force) tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, basic_delete_filter); } - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 52dc85acca7d..adc776048d1a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -45,10 +45,7 @@ struct cls_bpf_prog { u32 gen_flags; struct tcf_exts exts; u32 handle; - union { - u32 bpf_fd; - u16 bpf_num_ops; - }; + u16 bpf_num_ops; struct sock_filter *bpf_ops; const char *bpf_name; struct tcf_proto *tp; @@ -244,7 +241,7 @@ static int cls_bpf_init(struct tcf_proto *tp) return 0; } -static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) +static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) { tcf_exts_destroy(&prog->exts); @@ -258,22 +255,22 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) kfree(prog); } -static void __cls_bpf_delete_prog(struct rcu_head *rcu) +static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) { - struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu); - - cls_bpf_delete_prog(prog->tp, prog); + __cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu)); } -static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) +static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) { - struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg; - cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); - call_rcu(&prog->rcu, __cls_bpf_delete_prog); + call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu); +} +static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) +{ + __cls_bpf_delete(tp, (struct cls_bpf_prog *) arg); return 0; } @@ -285,14 +282,9 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force) if (!force && !list_empty(&head->plist)) return false; - list_for_each_entry_safe(prog, tmp, &head->plist, link) { - cls_bpf_stop_offload(tp, prog); - list_del_rcu(&prog->link); - tcf_unbind_filter(tp, &prog->res); - call_rcu(&prog->rcu, __cls_bpf_delete_prog); - } + list_for_each_entry_safe(prog, tmp, &head->plist, link) + __cls_bpf_delete(tp, prog); - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; } @@ -303,9 +295,6 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) struct cls_bpf_prog *prog; unsigned long ret = 0UL; - if (head == NULL) - return 0UL; - list_for_each_entry(prog, &head->plist, link) { if (prog->handle == handle) { ret = (unsigned long) prog; @@ -377,7 +366,6 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, } prog->bpf_ops = NULL; - prog->bpf_fd = bpf_fd; prog->bpf_name = name; prog->filter = fp; @@ -519,14 +507,14 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, ret = cls_bpf_offload(tp, prog, oldprog); if (ret) { - cls_bpf_delete_prog(tp, prog); + __cls_bpf_delete_prog(prog); return ret; } if (oldprog) { list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); - call_rcu(&oldprog->rcu, __cls_bpf_delete_prog); + call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu); } else { list_add_rcu(&prog->link, &head->plist); } @@ -561,13 +549,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, struct sk_buff *skb) { - if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd)) - return -EMSGSIZE; + struct nlattr *nla; if (prog->bpf_name && nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; + nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 85233c470035..c1f20077837f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -137,11 +137,10 @@ static bool cls_cgroup_destroy(struct tcf_proto *tp, bool force) if (!force) return false; - - if (head) { - RCU_INIT_POINTER(tp->root, NULL); + /* Head can still be NULL due to cls_cgroup_init(). */ + if (head) call_rcu(&head->rcu, cls_cgroup_destroy_rcu); - } + return true; } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index e39672394c7b..6575aba87630 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -596,7 +596,6 @@ static bool flow_destroy(struct tcf_proto *tp, bool force) list_del_rcu(&f->list); call_rcu(&f->rcu, flow_destroy_filter); } - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e8dd09af0d0c..e040c5140f61 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/rhashtable.h> +#include <linux/workqueue.h> #include <linux/if_ether.h> #include <linux/in6.h> @@ -38,6 +39,7 @@ struct fl_flow_key { struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; + struct flow_dissector_key_icmp icmp; struct flow_dissector_key_keyid enc_key_id; union { struct flow_dissector_key_ipv4_addrs enc_ipv4; @@ -65,7 +67,10 @@ struct cls_fl_head { bool mask_assigned; struct list_head filters; struct rhashtable_params ht_params; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; struct cls_fl_filter { @@ -78,6 +83,8 @@ struct cls_fl_filter { u32 handle; u32 flags; struct rcu_head rcu; + struct tc_to_netdev tc; + struct net_device *hw_dev; }; static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) @@ -201,85 +208,110 @@ static void fl_destroy_filter(struct rcu_head *head) kfree(f); } -static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie) +static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { - struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload offload = {0}; - struct tc_to_netdev tc; + struct net_device *dev = f->hw_dev; + struct tc_to_netdev *tc = &f->tc; - if (!tc_should_offload(dev, tp, 0)) + if (!tc_can_offload(dev, tp)) return; offload.command = TC_CLSFLOWER_DESTROY; - offload.cookie = cookie; + offload.cookie = (unsigned long)f; - tc.type = TC_SETUP_CLSFLOWER; - tc.cls_flower = &offload; + tc->type = TC_SETUP_CLSFLOWER; + tc->cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc); } static int fl_hw_replace_filter(struct tcf_proto *tp, struct flow_dissector *dissector, struct fl_flow_key *mask, - struct fl_flow_key *key, - struct tcf_exts *actions, - unsigned long cookie, u32 flags) + struct cls_fl_filter *f) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload offload = {0}; - struct tc_to_netdev tc; + struct tc_to_netdev *tc = &f->tc; int err; - if (!tc_should_offload(dev, tp, flags)) - return tc_skip_sw(flags) ? -EINVAL : 0; + if (!tc_can_offload(dev, tp)) { + if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) || + (f->hw_dev && !tc_can_offload(f->hw_dev, tp))) { + f->hw_dev = dev; + return tc_skip_sw(f->flags) ? -EINVAL : 0; + } + dev = f->hw_dev; + tc->egress_dev = true; + } else { + f->hw_dev = dev; + } offload.command = TC_CLSFLOWER_REPLACE; - offload.cookie = cookie; + offload.cookie = (unsigned long)f; offload.dissector = dissector; offload.mask = mask; - offload.key = key; - offload.exts = actions; + offload.key = &f->key; + offload.exts = &f->exts; - tc.type = TC_SETUP_CLSFLOWER; - tc.cls_flower = &offload; + tc->type = TC_SETUP_CLSFLOWER; + tc->cls_flower = &offload; err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, - &tc); + tc); - if (tc_skip_sw(flags)) + if (tc_skip_sw(f->flags)) return err; - return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { - struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload offload = {0}; - struct tc_to_netdev tc; + struct net_device *dev = f->hw_dev; + struct tc_to_netdev *tc = &f->tc; - if (!tc_should_offload(dev, tp, 0)) + if (!tc_can_offload(dev, tp)) return; offload.command = TC_CLSFLOWER_STATS; offload.cookie = (unsigned long)f; offload.exts = &f->exts; - tc.type = TC_SETUP_CLSFLOWER; - tc.cls_flower = &offload; + tc->type = TC_SETUP_CLSFLOWER; + tc->cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) { list_del_rcu(&f->list); - fl_hw_destroy_filter(tp, (unsigned long)f); + if (!tc_skip_hw(f->flags)) + fl_hw_destroy_filter(tp, f); tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, fl_destroy_filter); } +static void fl_destroy_sleepable(struct work_struct *work) +{ + struct cls_fl_head *head = container_of(work, struct cls_fl_head, + work); + if (head->mask_assigned) + rhashtable_destroy(&head->ht); + kfree(head); + module_put(THIS_MODULE); +} + +static void fl_destroy_rcu(struct rcu_head *rcu) +{ + struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu); + + INIT_WORK(&head->work, fl_destroy_sleepable); + schedule_work(&head->work); +} + static bool fl_destroy(struct tcf_proto *tp, bool force) { struct cls_fl_head *head = rtnl_dereference(tp->root); @@ -290,10 +322,10 @@ static bool fl_destroy(struct tcf_proto *tp, bool force) list_for_each_entry_safe(f, next, &head->filters, list) __fl_delete(tp, f); - RCU_INIT_POINTER(tp->root, NULL); - if (head->mask_assigned) - rhashtable_destroy(&head->ht); - kfree_rcu(head, rcu); + + __module_get(THIS_MODULE); + call_rcu(&head->rcu, fl_destroy_rcu); + return true; } @@ -355,6 +387,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV4_CODE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -389,6 +431,39 @@ static void fl_set_key_vlan(struct nlattr **tb, } } +static void fl_set_key_flag(u32 flower_key, u32 flower_mask, + u32 *dissector_key, u32 *dissector_mask, + u32 flower_flag_bit, u32 dissector_flag_bit) +{ + if (flower_mask & flower_flag_bit) { + *dissector_mask |= dissector_flag_bit; + if (flower_key & flower_flag_bit) + *dissector_key |= dissector_flag_bit; + } +} + +static void fl_set_key_flags(struct nlattr **tb, + u32 *flags_key, u32 *flags_mask) +{ + u32 key, mask; + + if (!tb[TCA_FLOWER_KEY_FLAGS]) + return; + + key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS])); + + if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) + mask = ~0; + else + mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); + + *flags_key = 0; + *flags_mask = 0; + + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { @@ -471,6 +546,26 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST, &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, sizeof(key->tp.dst)); + } else if (key->basic.n_proto == htons(ETH_P_IP) && + key->basic.ip_proto == IPPROTO_ICMP) { + fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE, + &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK, + sizeof(key->icmp.type)); + fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, + &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + sizeof(key->icmp.code)); + } else if (key->basic.n_proto == htons(ETH_P_IPV6) && + key->basic.ip_proto == IPPROTO_ICMPV6) { + fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE, + &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, + sizeof(key->icmp.type)); + fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, + &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + sizeof(key->icmp.code)); } if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || @@ -515,6 +610,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)); + fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); + return 0; } @@ -581,6 +678,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ICMP, icmp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); @@ -743,20 +842,21 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout; } - err = fl_hw_replace_filter(tp, - &head->dissector, - &mask.key, - &fnew->key, - &fnew->exts, - (unsigned long)fnew, - fnew->flags); - if (err) - goto errout; + if (!tc_skip_hw(fnew->flags)) { + err = fl_hw_replace_filter(tp, + &head->dissector, + &mask.key, + fnew); + if (err) + goto errout; + } if (fold) { - rhashtable_remove_fast(&head->ht, &fold->ht_node, - head->ht_params); - fl_hw_destroy_filter(tp, (unsigned long)fold); + if (!tc_skip_sw(fold->flags)) + rhashtable_remove_fast(&head->ht, &fold->ht_node, + head->ht_params); + if (!tc_skip_hw(fold->flags)) + fl_hw_destroy_filter(tp, fold); } *arg = (unsigned long) fnew; @@ -782,8 +882,9 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg) struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = (struct cls_fl_filter *) arg; - rhashtable_remove_fast(&head->ht, &f->ht_node, - head->ht_params); + if (!tc_skip_sw(f->flags)) + rhashtable_remove_fast(&head->ht, &f->ht_node, + head->ht_params); __fl_delete(tp, f); return 0; } @@ -847,6 +948,42 @@ static int fl_dump_key_vlan(struct sk_buff *skb, return 0; } +static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask, + u32 *flower_key, u32 *flower_mask, + u32 flower_flag_bit, u32 dissector_flag_bit) +{ + if (dissector_mask & dissector_flag_bit) { + *flower_mask |= flower_flag_bit; + if (dissector_key & dissector_flag_bit) + *flower_key |= flower_flag_bit; + } +} + +static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) +{ + u32 key, mask; + __be32 _key, _mask; + int err; + + if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask))) + return 0; + + key = 0; + mask = 0; + + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + + _key = cpu_to_be32(key); + _mask = cpu_to_be32(mask); + + err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key); + if (err) + return err; + + return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); +} + static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { @@ -879,7 +1016,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; } - fl_hw_update_stats(tp, f); + if (!tc_skip_hw(f->flags)) + fl_hw_update_stats(tp, f); if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, @@ -943,6 +1081,28 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; + else if (key->basic.n_proto == htons(ETH_P_IP) && + key->basic.ip_proto == IPPROTO_ICMP && + (fl_dump_key_val(skb, &key->icmp.type, + TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK, + sizeof(key->icmp.type)) || + fl_dump_key_val(skb, &key->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + sizeof(key->icmp.code)))) + goto nla_put_failure; + else if (key->basic.n_proto == htons(ETH_P_IPV6) && + key->basic.ip_proto == IPPROTO_ICMPV6 && + (fl_dump_key_val(skb, &key->icmp.type, + TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, + sizeof(key->icmp.type)) || + fl_dump_key_val(skb, &key->icmp.code, + TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV6_CODE_MASK, + sizeof(key->icmp.code)))) + goto nla_put_failure; if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, @@ -981,6 +1141,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->enc_tp.dst))) goto nla_put_failure; + if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) + goto nla_put_failure; + nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); if (tcf_exts_dump(skb, &f->exts)) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 25927b6c4436..f935429bd5ef 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -114,7 +114,6 @@ static bool mall_destroy(struct tcf_proto *tp, bool force) call_rcu(&f->rcu, mall_destroy_filter); } - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 4f05a19fb073..322438fb3ffc 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -152,7 +152,8 @@ static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; nhptr = ip_hdr(skb); #endif - + if (unlikely(!head)) + return -1; restart: #if RSVP_DST_LEN == 4 diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 96144bdf30db..0751245a6ace 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -543,7 +543,6 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force) walker.fn = tcindex_destroy_element; tcindex_walk(tp, &walker); - RCU_INIT_POINTER(tp->root, NULL); call_rcu(&p->rcu, __tcindex_destroy); return true; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f337f1bdd1d4..d7b93429f0cc 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1395,7 +1395,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), &d, cpu_bstats, &q->bstats) < 0 || - gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || + gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index beb554aa8cfb..9ffe1c220b02 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -122,7 +122,7 @@ struct cbq_class { psched_time_t penalized; struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est64 rate_est; + struct net_rate_estimator __rcu *rate_est; struct tc_cbq_xstats xstats; struct tcf_proto __rcu *filter_list; @@ -1346,7 +1346,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || - gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0) return -1; @@ -1405,7 +1405,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) tcf_destroy_chain(&cl->filter_list); qdisc_destroy(cl->q); qdisc_put_rtab(cl->R_tab); - gen_kill_estimator(&cl->bstats, &cl->rate_est); + gen_kill_estimator(&cl->rate_est); if (cl != &q->link) kfree(cl); } diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 8af5c59eef84..bb4cbdf75004 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -25,7 +25,7 @@ struct drr_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est64 rate_est; + struct net_rate_estimator __rcu *rate_est; struct list_head alist; struct Qdisc *qdisc; @@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl) { - gen_kill_estimator(&cl->bstats, &cl->rate_est); + gen_kill_estimator(&cl->rate_est); qdisc_destroy(cl->qdisc); kfree(cl); } @@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || - gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0) return -1; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6cfb6e9038c2..6eb9c8e88519 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -709,7 +709,7 @@ void qdisc_destroy(struct Qdisc *qdisc) qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif - gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); + gen_kill_estimator(&qdisc->rate_est); if (ops->reset) ops->reset(qdisc); if (ops->destroy) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 000f1d36128e..3ffaa6fb0990 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -114,7 +114,7 @@ struct hfsc_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est64 rate_est; + struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ unsigned int level; /* class level in hierarchy */ @@ -1091,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) tcf_destroy_chain(&cl->filter_list); qdisc_destroy(cl->qdisc); - gen_kill_estimator(&cl->bstats, &cl->rate_est); + gen_kill_estimator(&cl->rate_est); if (cl != &q->root) kfree(cl); } @@ -1348,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.rtwork = cl->cl_cumul; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || - gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0) return -1; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 9926fe4f3b6f..760f39e7caee 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -111,7 +111,7 @@ struct htb_class { unsigned int children; struct htb_class *parent; /* parent class */ - struct gnet_stats_rate_est64 rate_est; + struct net_rate_estimator __rcu *rate_est; /* * Written often fields @@ -1145,7 +1145,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || - gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) return -1; @@ -1228,7 +1228,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) WARN_ON(!cl->un.leaf.q); qdisc_destroy(cl->un.leaf.q); } - gen_kill_estimator(&cl->bstats, &cl->rate_est); + gen_kill_estimator(&cl->rate_est); tcf_destroy_chain(&cl->filter_list); kfree(cl); } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index ca0516e6f743..f9e712ce2d15 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -137,7 +137,7 @@ struct qfq_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est64 rate_est; + struct net_rate_estimator __rcu *rate_est; struct Qdisc *qdisc; struct list_head alist; /* Link for active-classes list. */ struct qfq_aggregate *agg; /* Parent aggregate. */ @@ -508,7 +508,7 @@ set_change_agg: new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); if (new_agg == NULL) { err = -ENOBUFS; - gen_kill_estimator(&cl->bstats, &cl->rate_est); + gen_kill_estimator(&cl->rate_est); goto destroy_class; } sch_tree_lock(sch); @@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) struct qfq_sched *q = qdisc_priv(sch); qfq_rm_from_agg(q, cl); - gen_kill_estimator(&cl->bstats, &cl->rate_est); + gen_kill_estimator(&cl->rate_est); qdisc_destroy(cl->qdisc); kfree(cl); } @@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || - gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) return -1; diff --git a/net/socket.c b/net/socket.c index f9e26c68c3cf..58353835497d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1,3 +1,4 @@ + /* * NET An implementation of the SOCKET network access protocol. * @@ -341,8 +342,23 @@ static const struct xattr_handler sockfs_xattr_handler = { .get = sockfs_xattr_get, }; +static int sockfs_security_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *suffix, const void *value, + size_t size, int flags) +{ + /* Handled by LSM. */ + return -EAGAIN; +} + +static const struct xattr_handler sockfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .set = sockfs_security_xattr_set, +}; + static const struct xattr_handler *sockfs_xattr_handlers[] = { &sockfs_xattr_handler, + &sockfs_security_xattr_handler, NULL }; @@ -678,9 +694,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) empty = 0; - if (!empty) + if (!empty) { put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, sizeof(tss), &tss); + + if (skb->len && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS)) + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, + skb->len, skb->data); + } } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); @@ -1898,7 +1919,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct sockaddr_storage address; struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; unsigned char ctl[sizeof(struct cmsghdr) + 20] - __attribute__ ((aligned(sizeof(__kernel_size_t)))); + __aligned(sizeof(__kernel_size_t)); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; int ctl_len; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c3f652395a80..3bc1d61694cb 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1002,14 +1002,8 @@ static void svc_age_temp_xprts(unsigned long closure) void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) { struct svc_xprt *xprt; - struct svc_sock *svsk; - struct socket *sock; struct list_head *le, *next; LIST_HEAD(to_be_closed); - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; spin_lock_bh(&serv->sv_lock); list_for_each_safe(le, next, &serv->sv_tempsocks) { @@ -1027,10 +1021,7 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) list_del_init(le); xprt = list_entry(le, struct svc_xprt, xpt_list); dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + xprt->xpt_ops->xpo_kill_temp_xprt(xprt); svc_close_xprt(xprt); } } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 78da4aee3543..135ec2c11b3b 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -451,6 +451,21 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); } +static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) +{ + struct svc_sock *svsk; + struct socket *sock; + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + svsk = container_of(xprt, struct svc_sock, sk_xprt); + sock = svsk->sk_sock; + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); +} + /* * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo */ @@ -660,6 +675,10 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt) return NULL; } +static void svc_udp_kill_temp_xprt(struct svc_xprt *xprt) +{ +} + static struct svc_xprt *svc_udp_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -679,6 +698,7 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, .xpo_secure_port = svc_sock_secure_port, + .xpo_kill_temp_xprt = svc_udp_kill_temp_xprt, }; static struct svc_xprt_class svc_udp_class = { @@ -1254,6 +1274,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, + .xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt, }; static struct svc_xprt_class svc_tcp_class = { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 6864fb967038..1334de2715c2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -67,6 +67,7 @@ static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); static int svc_rdma_has_wspace(struct svc_xprt *xprt); static int svc_rdma_secure_port(struct svc_rqst *); +static void svc_rdma_kill_temp_xprt(struct svc_xprt *); static struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, @@ -79,6 +80,7 @@ static struct svc_xprt_ops svc_rdma_ops = { .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, .xpo_secure_port = svc_rdma_secure_port, + .xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt, }; struct svc_xprt_class svc_rdma_class = { @@ -1317,6 +1319,10 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp) return 1; } +static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) +{ +} + int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { struct ib_send_wr *bad_wr, *n_wr; diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 975dbeb60ab0..52d74760fb68 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -421,6 +421,10 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, dev = dev_get_by_name(net, driver_name); if (!dev) return -ENODEV; + if (tipc_mtu_bad(dev, 0)) { + dev_put(dev); + return -EINVAL; + } /* Associate TIPC bearer with L2 bearer */ rcu_assign_pointer(b->media_ptr, dev); @@ -610,8 +614,6 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, if (!b) return NOTIFY_DONE; - b->mtu = dev->mtu; - switch (evt) { case NETDEV_CHANGE: if (netif_carrier_ok(dev)) @@ -624,6 +626,11 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, tipc_reset_bearer(net, b); break; case NETDEV_CHANGEMTU: + if (tipc_mtu_bad(dev, 0)) { + bearer_disable(net, b); + break; + } + b->mtu = dev->mtu; tipc_reset_bearer(net, b); break; case NETDEV_CHANGEADDR: diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 78892e2f53e3..278ff7f616f9 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -39,6 +39,7 @@ #include "netlink.h" #include "core.h" +#include "msg.h" #include <net/genetlink.h> #define MAX_MEDIA 3 @@ -59,6 +60,9 @@ #define TIPC_MEDIA_TYPE_IB 2 #define TIPC_MEDIA_TYPE_UDP 3 +/* minimum bearer MTU */ +#define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE) + /** * struct tipc_media_addr - destination address used by TIPC bearers * @value: address info (format defined by media) @@ -215,4 +219,13 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq); +/* check if device MTU is too low for tipc headers */ +static inline bool tipc_mtu_bad(struct net_device *dev, unsigned int reserve) +{ + if (dev->mtu >= TIPC_MIN_BEARER_MTU + reserve) + return false; + netdev_warn(dev, "MTU too low for tipc bearer\n"); + return true; +} + #endif /* _TIPC_BEARER_H */ diff --git a/net/tipc/link.c b/net/tipc/link.c index 1055164c6232..bda89bf9f4ff 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -47,8 +47,8 @@ #include <linux/pkt_sched.h> struct tipc_stats { - u32 sent_info; /* used in counting # sent packets */ - u32 recv_info; /* used in counting # recv'd packets */ + u32 sent_pkts; + u32 recv_pkts; u32 sent_states; u32 recv_states; u32 sent_probes; @@ -857,7 +857,6 @@ void tipc_link_reset(struct tipc_link *l) l->acked = 0; l->silent_intv_cnt = 0; l->rst_cnt = 0; - l->stats.recv_info = 0; l->stale_count = 0; l->bc_peer_is_up = false; memset(&l->mon_state, 0, sizeof(l->mon_state)); @@ -888,6 +887,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *transmq = &l->transmq; struct sk_buff_head *backlogq = &l->backlogq; struct sk_buff *skb, *_skb, *bskb; + int pkt_cnt = skb_queue_len(list); /* Match msg importance against this and all higher backlog limits: */ if (!skb_queue_empty(backlogq)) { @@ -901,6 +901,11 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return -EMSGSIZE; } + if (pkt_cnt > 1) { + l->stats.sent_fragmented++; + l->stats.sent_fragments += pkt_cnt; + } + /* Prepare each packet for sending, and add to relevant queue: */ while (skb_queue_len(list)) { skb = skb_peek(list); @@ -920,6 +925,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; l->rcv_unacked = 0; + l->stats.sent_pkts++; seqno++; continue; } @@ -968,6 +974,7 @@ void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) msg_set_ack(hdr, ack); msg_set_bcast_ack(hdr, bc_ack); l->rcv_unacked = 0; + l->stats.sent_pkts++; seqno++; } l->snd_nxt = seqno; @@ -1260,7 +1267,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Deliver packet */ l->rcv_nxt++; - l->stats.recv_info++; + l->stats.recv_pkts++; if (!tipc_data_input(l, skb, l->inputq)) rc |= tipc_link_input(l, skb, l->inputq); if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) @@ -1492,8 +1499,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) l->tolerance = peers_tol; - if (peers_prio && in_range(peers_prio, TIPC_MIN_LINK_PRI, - TIPC_MAX_LINK_PRI)) { + /* Update own prio if peer indicates a different value */ + if ((peers_prio != l->priority) && + in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) { l->priority = peers_prio; rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } @@ -1799,10 +1807,6 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) void tipc_link_reset_stats(struct tipc_link *l) { memset(&l->stats, 0, sizeof(l->stats)); - if (!link_is_bc_sndlink(l)) { - l->stats.sent_info = l->snd_nxt; - l->stats.recv_info = l->rcv_nxt; - } } static void link_print(struct tipc_link *l, const char *str) @@ -1866,12 +1870,12 @@ static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s) }; struct nla_map map[] = { - {TIPC_NLA_STATS_RX_INFO, s->recv_info}, + {TIPC_NLA_STATS_RX_INFO, 0}, {TIPC_NLA_STATS_RX_FRAGMENTS, s->recv_fragments}, {TIPC_NLA_STATS_RX_FRAGMENTED, s->recv_fragmented}, {TIPC_NLA_STATS_RX_BUNDLES, s->recv_bundles}, {TIPC_NLA_STATS_RX_BUNDLED, s->recv_bundled}, - {TIPC_NLA_STATS_TX_INFO, s->sent_info}, + {TIPC_NLA_STATS_TX_INFO, 0}, {TIPC_NLA_STATS_TX_FRAGMENTS, s->sent_fragments}, {TIPC_NLA_STATS_TX_FRAGMENTED, s->sent_fragmented}, {TIPC_NLA_STATS_TX_BUNDLES, s->sent_bundles}, @@ -1946,9 +1950,9 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->rcv_nxt)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->stats.recv_pkts)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->snd_nxt)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->stats.sent_pkts)) goto attr_msg_full; if (tipc_link_is_up(link)) @@ -2003,12 +2007,12 @@ static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb, }; struct nla_map map[] = { - {TIPC_NLA_STATS_RX_INFO, stats->recv_info}, + {TIPC_NLA_STATS_RX_INFO, stats->recv_pkts}, {TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments}, {TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented}, {TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles}, {TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled}, - {TIPC_NLA_STATS_TX_INFO, stats->sent_info}, + {TIPC_NLA_STATS_TX_INFO, stats->sent_pkts}, {TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments}, {TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented}, {TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles}, @@ -2075,9 +2079,9 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, 0)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, 0)) goto attr_msg_full; prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index ed97a5876ebe..9e109bb1a207 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -455,14 +455,14 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, int i, applied_bef; state->probing = false; - if (!dlen) - return; /* Sanity check received domain record */ - if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) { - pr_warn_ratelimited("Received illegal domain record\n"); + if (dlen < dom_rec_len(arrv_dom, 0)) + return; + if (dlen != dom_rec_len(arrv_dom, new_member_cnt)) + return; + if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) return; - } /* Synch generation numbers with peer if link just came up */ if (!state->synched) { diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 22d92f0ec5ac..333c5dae0072 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1,7 +1,7 @@ /* * net/tipc/socket.c: TIPC socket API * - * Copyright (c) 2001-2007, 2012-2015, Ericsson AB + * Copyright (c) 2001-2007, 2012-2016, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems * All rights reserved. * @@ -127,54 +127,8 @@ static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; static const struct proto_ops msg_ops; static struct proto tipc_proto; - static const struct rhashtable_params tsk_rht_params; -/* - * Revised TIPC socket locking policy: - * - * Most socket operations take the standard socket lock when they start - * and hold it until they finish (or until they need to sleep). Acquiring - * this lock grants the owner exclusive access to the fields of the socket - * data structures, with the exception of the backlog queue. A few socket - * operations can be done without taking the socket lock because they only - * read socket information that never changes during the life of the socket. - * - * Socket operations may acquire the lock for the associated TIPC port if they - * need to perform an operation on the port. If any routine needs to acquire - * both the socket lock and the port lock it must take the socket lock first - * to avoid the risk of deadlock. - * - * The dispatcher handling incoming messages cannot grab the socket lock in - * the standard fashion, since invoked it runs at the BH level and cannot block. - * Instead, it checks to see if the socket lock is currently owned by someone, - * and either handles the message itself or adds it to the socket's backlog - * queue; in the latter case the queued message is processed once the process - * owning the socket lock releases it. - * - * NOTE: Releasing the socket lock while an operation is sleeping overcomes - * the problem of a blocked socket operation preventing any other operations - * from occurring. However, applications must be careful if they have - * multiple threads trying to send (or receive) on the same socket, as these - * operations might interfere with each other. For example, doing a connect - * and a receive at the same time might allow the receive to consume the - * ACK message meant for the connect. While additional work could be done - * to try and overcome this, it doesn't seem to be worthwhile at the present. - * - * NOTE: Releasing the socket lock while an operation is sleeping also ensures - * that another operation that must be performed in a non-blocking manner is - * not delayed for very long because the lock has already been taken. - * - * NOTE: This code assumes that certain fields of a port/socket pair are - * constant over its lifetime; such fields can be examined without taking - * the socket lock and/or port lock, and do not need to be re-read even - * after resuming processing after waiting. These fields include: - * - socket type - * - pointer to socket sk structure (aka tipc_sock structure) - * - pointer to port structure - * - port reference - */ - static u32 tsk_own_node(struct tipc_sock *tsk) { return msg_prevnode(&tsk->phdr); @@ -230,7 +184,7 @@ static struct tipc_sock *tipc_sk(const struct sock *sk) static bool tsk_conn_cong(struct tipc_sock *tsk) { - return tsk->snt_unacked >= tsk->snd_win; + return tsk->snt_unacked > tsk->snd_win; } /* tsk_blocks(): translate a buffer size in bytes to number of diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 78cab9c5a445..b58dc95f3d35 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -697,6 +697,11 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, udp_conf.local_ip.s_addr = htonl(INADDR_ANY); udp_conf.use_udp_checksums = false; ub->ifindex = dev->ifindex; + if (tipc_mtu_bad(dev, sizeof(struct iphdr) + + sizeof(struct udphdr))) { + err = -EINVAL; + goto err; + } b->mtu = dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr); #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6a705d0ff889..1752d6b10ac4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2199,7 +2199,8 @@ out: * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last, unsigned int last_len) + struct sk_buff *last, unsigned int last_len, + bool freezable) { struct sk_buff *tail; DEFINE_WAIT(wait); @@ -2220,7 +2221,10 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); - timeo = freezable_schedule_timeout(timeo); + if (freezable) + timeo = freezable_schedule_timeout(timeo); + else + timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) @@ -2250,7 +2254,8 @@ struct unix_stream_read_state { unsigned int splice_flags; }; -static int unix_stream_read_generic(struct unix_stream_read_state *state) +static int unix_stream_read_generic(struct unix_stream_read_state *state, + bool freezable) { struct scm_cookie scm; struct socket *sock = state->socket; @@ -2330,7 +2335,7 @@ again: mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, - last_len); + last_len, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); @@ -2472,7 +2477,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, .flags = flags }; - return unix_stream_read_generic(&state); + return unix_stream_read_generic(&state, true); } static int unix_stream_splice_actor(struct sk_buff *skb, @@ -2503,7 +2508,7 @@ static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, flags & SPLICE_F_NONBLOCK) state.flags = MSG_DONTWAIT; - return unix_stream_read_generic(&state); + return unix_stream_read_generic(&state, false); } static int unix_shutdown(struct socket *sock, int mode) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 936d7eee62d0..2e47f9f06b96 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -44,6 +44,10 @@ struct virtio_vsock { spinlock_t send_pkt_list_lock; struct list_head send_pkt_list; + struct work_struct loopback_work; + spinlock_t loopback_list_lock; /* protects loopback_list */ + struct list_head loopback_list; + atomic_t queued_replies; /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX] @@ -74,6 +78,42 @@ static u32 virtio_transport_get_local_cid(void) return vsock->guest_cid; } +static void virtio_transport_loopback_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, loopback_work); + LIST_HEAD(pkts); + + spin_lock_bh(&vsock->loopback_list_lock); + list_splice_init(&vsock->loopback_list, &pkts); + spin_unlock_bh(&vsock->loopback_list_lock); + + mutex_lock(&vsock->rx_lock); + while (!list_empty(&pkts)) { + struct virtio_vsock_pkt *pkt; + + pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + + virtio_transport_recv_pkt(pkt); + } + mutex_unlock(&vsock->rx_lock); +} + +static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock, + struct virtio_vsock_pkt *pkt) +{ + int len = pkt->len; + + spin_lock_bh(&vsock->loopback_list_lock); + list_add_tail(&pkt->list, &vsock->loopback_list); + spin_unlock_bh(&vsock->loopback_list_lock); + + queue_work(virtio_vsock_workqueue, &vsock->loopback_work); + + return len; +} + static void virtio_transport_send_pkt_work(struct work_struct *work) { @@ -159,6 +199,9 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) return -ENODEV; } + if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) + return virtio_transport_send_pkt_loopback(vsock, pkt); + if (pkt->reply) atomic_inc(&vsock->queued_replies); @@ -510,10 +553,13 @@ static int virtio_vsock_probe(struct virtio_device *vdev) mutex_init(&vsock->event_lock); spin_lock_init(&vsock->send_pkt_list_lock); INIT_LIST_HEAD(&vsock->send_pkt_list); + spin_lock_init(&vsock->loopback_list_lock); + INIT_LIST_HEAD(&vsock->loopback_list); INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); INIT_WORK(&vsock->event_work, virtio_transport_event_work); INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work); + INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work); mutex_lock(&vsock->rx_lock); virtio_vsock_rx_fill(vsock); @@ -539,6 +585,7 @@ static void virtio_vsock_remove(struct virtio_device *vdev) struct virtio_vsock *vsock = vdev->priv; struct virtio_vsock_pkt *pkt; + flush_work(&vsock->loopback_work); flush_work(&vsock->rx_work); flush_work(&vsock->tx_work); flush_work(&vsock->event_work); @@ -565,6 +612,15 @@ static void virtio_vsock_remove(struct virtio_device *vdev) } spin_unlock_bh(&vsock->send_pkt_list_lock); + spin_lock_bh(&vsock->loopback_list_lock); + while (!list_empty(&vsock->loopback_list)) { + pkt = list_first_entry(&vsock->loopback_list, + struct virtio_vsock_pkt, list); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + spin_unlock_bh(&vsock->loopback_list_lock); + mutex_lock(&the_virtio_vsock_mutex); the_virtio_vsock = NULL; vsock_core_exit(); diff --git a/net/wireless/core.h b/net/wireless/core.h index 9820fa251daa..af6e023020b1 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -71,6 +71,7 @@ struct cfg80211_registered_device { struct list_head bss_list; struct rb_root bss_tree; u32 bss_generation; + u32 bss_entries; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ struct sk_buff *scan_msg; struct cfg80211_sched_scan_request __rcu *sched_scan_req; diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index 71447cf86306..ba0a1f398ce5 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -556,7 +556,7 @@ static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr) memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */ break; - case 0: + default: memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ break; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index b5bd58d0f731..35ad69fd0838 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -57,6 +57,19 @@ * also linked into the probe response struct. */ +/* + * Limit the number of BSS entries stored in mac80211. Each one is + * a bit over 4k at most, so this limits to roughly 4-5M of memory. + * If somebody wants to really attack this though, they'd likely + * use small beacons, and only one type of frame, limiting each of + * the entries to a much smaller size (in order to generate more + * entries in total, so overhead is bigger.) + */ +static int bss_entries_limit = 1000; +module_param(bss_entries_limit, int, 0644); +MODULE_PARM_DESC(bss_entries_limit, + "limit to number of scan BSS entries (per wiphy, default 1000)"); + #define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ) static void bss_free(struct cfg80211_internal_bss *bss) @@ -137,6 +150,10 @@ static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev, list_del_init(&bss->list); rb_erase(&bss->rbn, &rdev->bss_tree); + rdev->bss_entries--; + WARN_ONCE((rdev->bss_entries == 0) ^ list_empty(&rdev->bss_list), + "rdev bss entries[%d]/list[empty:%d] corruption\n", + rdev->bss_entries, list_empty(&rdev->bss_list)); bss_ref_put(rdev, bss); return true; } @@ -163,6 +180,40 @@ static void __cfg80211_bss_expire(struct cfg80211_registered_device *rdev, rdev->bss_generation++; } +static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev) +{ + struct cfg80211_internal_bss *bss, *oldest = NULL; + bool ret; + + lockdep_assert_held(&rdev->bss_lock); + + list_for_each_entry(bss, &rdev->bss_list, list) { + if (atomic_read(&bss->hold)) + continue; + + if (!list_empty(&bss->hidden_list) && + !bss->pub.hidden_beacon_bss) + continue; + + if (oldest && time_before(oldest->ts, bss->ts)) + continue; + oldest = bss; + } + + if (WARN_ON(!oldest)) + return false; + + /* + * The callers make sure to increase rdev->bss_generation if anything + * gets removed (and a new entry added), so there's no need to also do + * it here. + */ + + ret = __cfg80211_unlink_bss(rdev, oldest); + WARN_ON(!ret); + return ret; +} + void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message) { @@ -689,6 +740,7 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev, const u8 *ie; int i, ssidlen; u8 fold = 0; + u32 n_entries = 0; ies = rcu_access_pointer(new->pub.beacon_ies); if (WARN_ON(!ies)) @@ -712,6 +764,12 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev, /* This is the bad part ... */ list_for_each_entry(bss, &rdev->bss_list, list) { + /* + * we're iterating all the entries anyway, so take the + * opportunity to validate the list length accounting + */ + n_entries++; + if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid)) continue; if (bss->pub.channel != new->pub.channel) @@ -740,6 +798,10 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev, new->pub.beacon_ies); } + WARN_ONCE(n_entries != rdev->bss_entries, + "rdev bss entries[%d]/list[len:%d] corruption\n", + rdev->bss_entries, n_entries); + return true; } @@ -894,7 +956,14 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, } } + if (rdev->bss_entries >= bss_entries_limit && + !cfg80211_bss_expire_oldest(rdev)) { + kfree(new); + goto drop; + } + list_add_tail(&new->list, &rdev->bss_list); + rdev->bss_entries++; rb_insert_bss(rdev, new); found = new; } diff --git a/net/wireless/util.c b/net/wireless/util.c index 88725f8eefad..e9d040d29846 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1159,7 +1159,8 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) 58500000, 65000000, 78000000, - 0, + /* not in the spec, but some devices use this: */ + 86500000, }, { 13500000, 27000000, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index fd6986634e6f..5bf7e1bfeac7 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1268,12 +1268,14 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, policy_to_flow_dir(dir)); - if (!err && !xfrm_pol_hold_rcu(pol)) - goto again; - else if (err == -ESRCH) + if (!err) { + if (!xfrm_pol_hold_rcu(pol)) + goto again; + } else if (err == -ESRCH) { pol = NULL; - else + } else { pol = ERR_PTR(err); + } } else pol = NULL; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 08892091cfe3..671a1d0333f0 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2450,7 +2450,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) #ifdef CONFIG_COMPAT if (in_compat_syscall()) - return -ENOTSUPP; + return -EOPNOTSUPP; #endif type = nlh->nlmsg_type; |