summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig8
-rw-r--r--net/batman-adv/hard-interface.c1
-rw-r--r--net/batman-adv/tp_meter.c1
-rw-r--r--net/batman-adv/translation-table.c4
-rw-r--r--net/bluetooth/6lowpan.c4
-rw-r--r--net/bluetooth/hci_conn.c26
-rw-r--r--net/bluetooth/l2cap_core.c2
-rw-r--r--net/bluetooth/rfcomm/tty.c2
-rw-r--r--net/bluetooth/sco.c2
-rw-r--r--net/bluetooth/smp.c85
-rw-r--r--net/bluetooth/smp.h1
-rw-r--r--net/bridge/br_multicast.c166
-rw-r--r--net/bridge/br_netfilter_hooks.c8
-rw-r--r--net/bridge/br_netlink.c34
-rw-r--r--net/bridge/br_private.h7
-rw-r--r--net/bridge/br_sysfs_br.c41
-rw-r--r--net/bridge/netfilter/nf_log_bridge.c3
-rw-r--r--net/caif/caif_socket.c5
-rw-r--r--net/can/bcm.c18
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/datagram.c19
-rw-r--r--net/core/dev.c27
-rw-r--r--net/core/devlink.c70
-rw-r--r--net/core/ethtool.c5
-rw-r--r--net/core/filter.c334
-rw-r--r--net/core/flow.c6
-rw-r--r--net/core/flow_dissector.c33
-rw-r--r--net/core/gen_estimator.c294
-rw-r--r--net/core/gen_stats.c20
-rw-r--r--net/core/lwt_bpf.c396
-rw-r--r--net/core/lwtunnel.c2
-rw-r--r--net/core/neighbour.c15
-rw-r--r--net/core/net_namespace.c30
-rw-r--r--net/core/rtnetlink.c48
-rw-r--r--net/core/secure_seq.c11
-rw-r--r--net/core/skbuff.c63
-rw-r--r--net/core/sock.c11
-rw-r--r--net/core/sysctl_net_core.c5
-rw-r--r--net/dcb/dcbnl.c1
-rw-r--r--net/dccp/ipv4.c12
-rw-r--r--net/dsa/dsa.c13
-rw-r--r--net/dsa/dsa2.c4
-rw-r--r--net/dsa/slave.c19
-rw-r--r--net/ieee802154/nl-phy.c6
-rw-r--r--net/ipv4/Kconfig1
-rw-r--r--net/ipv4/af_inet.c14
-rw-r--r--net/ipv4/esp4.c2
-rw-r--r--net/ipv4/fib_frontend.c22
-rw-r--r--net/ipv4/fib_semantics.c1
-rw-r--r--net/ipv4/fib_trie.c292
-rw-r--r--net/ipv4/icmp.c4
-rw-r--r--net/ipv4/igmp.c50
-rw-r--r--net/ipv4/ip_output.c28
-rw-r--r--net/ipv4/netfilter.c5
-rw-r--r--net/ipv4/netfilter/arp_tables.c20
-rw-r--r--net/ipv4/netfilter/ip_tables.c16
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c4
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c8
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c4
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c8
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c71
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c41
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c17
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c11
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c11
-rw-r--r--net/ipv4/ping.c4
-rw-r--r--net/ipv4/route.c40
-rw-r--r--net/ipv4/syncookies.c1
-rw-r--r--net/ipv4/tcp.c54
-rw-r--r--net/ipv4/tcp_cong.c18
-rw-r--r--net/ipv4/tcp_dctcp.c10
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_hybla.c1
-rw-r--r--net/ipv4/tcp_illinois.c10
-rw-r--r--net/ipv4/tcp_input.c43
-rw-r--r--net/ipv4/tcp_ipv4.c11
-rw-r--r--net/ipv4/tcp_lp.c1
-rw-r--r--net/ipv4/tcp_minisocks.c4
-rw-r--r--net/ipv4/tcp_output.c181
-rw-r--r--net/ipv4/tcp_scalable.c15
-rw-r--r--net/ipv4/tcp_timer.c4
-rw-r--r--net/ipv4/tcp_vegas.c1
-rw-r--r--net/ipv4/tcp_veno.c10
-rw-r--r--net/ipv4/tcp_westwood.c1
-rw-r--r--net/ipv4/tcp_yeah.c10
-rw-r--r--net/ipv4/udp.c96
-rw-r--r--net/ipv4/udp_impl.h2
-rw-r--r--net/ipv6/addrconf.c40
-rw-r--r--net/ipv6/af_inet6.c8
-rw-r--r--net/ipv6/datagram.c4
-rw-r--r--net/ipv6/esp6.c2
-rw-r--r--net/ipv6/icmp.c6
-rw-r--r--net/ipv6/ip6_offload.c2
-rw-r--r--net/ipv6/ip6_output.c9
-rw-r--r--net/ipv6/ip6_tunnel.c14
-rw-r--r--net/ipv6/ip6_vti.c31
-rw-r--r--net/ipv6/ndisc.c29
-rw-r--r--net/ipv6/netfilter/ip6_tables.c17
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c4
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c70
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c4
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c44
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c1
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c11
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c11
-rw-r--r--net/ipv6/output_core.c2
-rw-r--r--net/ipv6/route.c7
-rw-r--r--net/ipv6/syncookies.c1
-rw-r--r--net/ipv6/tcp_ipv6.c12
-rw-r--r--net/ipv6/udp.c11
-rw-r--r--net/ipv6/udp_impl.h2
-rw-r--r--net/l2tp/l2tp_ip.c66
-rw-r--r--net/l2tp/l2tp_ip6.c84
-rw-r--r--net/mac80211/sta_info.c2
-rw-r--r--net/mac80211/tx.c14
-rw-r--r--net/mac80211/vht.c16
-rw-r--r--net/mpls/af_mpls.c2
-rw-r--r--net/netfilter/Kconfig30
-rw-r--r--net/netfilter/Makefile18
-rw-r--r--net/netfilter/core.c16
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c54
-rw-r--r--net/netfilter/nf_conntrack_proto.c73
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c79
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c76
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c79
-rw-r--r--net/netfilter/nf_conntrack_standalone.c10
-rw-r--r--net/netfilter/nf_dup_netdev.c33
-rw-r--r--net/netfilter/nf_log_common.c3
-rw-r--r--net/netfilter/nf_log_netdev.c3
-rw-r--r--net/netfilter/nf_nat_core.c61
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c36
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c40
-rw-r--r--net/netfilter/nf_nat_proto_udplite.c35
-rw-r--r--net/netfilter/nf_queue.c5
-rw-r--r--net/netfilter/nf_tables_api.c732
-rw-r--r--net/netfilter/nfnetlink_log.c1
-rw-r--r--net/netfilter/nft_counter.c186
-rw-r--r--net/netfilter/nft_ct.c26
-rw-r--r--net/netfilter/nft_fib.c2
-rw-r--r--net/netfilter/nft_fwd_netdev.c4
-rw-r--r--net/netfilter/nft_hash.c7
-rw-r--r--net/netfilter/nft_masq.c6
-rw-r--r--net/netfilter/nft_nat.c11
-rw-r--r--net/netfilter/nft_objref.c226
-rw-r--r--net/netfilter/nft_payload.c107
-rw-r--r--net/netfilter/nft_quota.c158
-rw-r--r--net/netfilter/nft_range.c6
-rw-r--r--net/netfilter/nft_redir.c6
-rw-r--r--net/netfilter/nft_set_hash.c25
-rw-r--r--net/netfilter/nft_set_rbtree.c12
-rw-r--r--net/netfilter/x_tables.c58
-rw-r--r--net/netfilter/xt_CONNSECMARK.c4
-rw-r--r--net/netfilter/xt_CT.c6
-rw-r--r--net/netfilter/xt_NETMAP.c11
-rw-r--r--net/netfilter/xt_RATEEST.c4
-rw-r--r--net/netfilter/xt_REDIRECT.c12
-rw-r--r--net/netfilter/xt_TPROXY.c15
-rw-r--r--net/netfilter/xt_bpf.c96
-rw-r--r--net/netfilter/xt_connbytes.c4
-rw-r--r--net/netfilter/xt_connlabel.c6
-rw-r--r--net/netfilter/xt_connlimit.c6
-rw-r--r--net/netfilter/xt_connmark.c8
-rw-r--r--net/netfilter/xt_conntrack.c4
-rw-r--r--net/netfilter/xt_helper.c4
-rw-r--r--net/netfilter/xt_multiport.c26
-rw-r--r--net/netfilter/xt_nat.c18
-rw-r--r--net/netfilter/xt_rateest.c28
-rw-r--r--net/netfilter/xt_socket.c33
-rw-r--r--net/netfilter/xt_state.c4
-rw-r--r--net/netlink/af_netlink.c21
-rw-r--r--net/netlink/af_netlink.h2
-rw-r--r--net/openvswitch/conntrack.c5
-rw-r--r--net/packet/af_packet.c18
-rw-r--r--net/rds/tcp.c2
-rw-r--r--net/sched/act_api.c9
-rw-r--r--net/sched/act_bpf.c14
-rw-r--r--net/sched/act_mirred.c27
-rw-r--r--net/sched/act_pedit.c24
-rw-r--r--net/sched/act_police.c21
-rw-r--r--net/sched/cls_api.c27
-rw-r--r--net/sched/cls_basic.c4
-rw-r--r--net/sched/cls_bpf.c49
-rw-r--r--net/sched/cls_cgroup.c7
-rw-r--r--net/sched/cls_flow.c1
-rw-r--r--net/sched/cls_flower.c261
-rw-r--r--net/sched/cls_matchall.c1
-rw-r--r--net/sched/cls_rsvp.h3
-rw-r--r--net/sched/cls_tcindex.c1
-rw-r--r--net/sched/sch_api.c2
-rw-r--r--net/sched/sch_cbq.c6
-rw-r--r--net/sched/sch_drr.c6
-rw-r--r--net/sched/sch_generic.c2
-rw-r--r--net/sched/sch_hfsc.c6
-rw-r--r--net/sched/sch_htb.c6
-rw-r--r--net/sched/sch_qfq.c8
-rw-r--r--net/socket.c25
-rw-r--r--net/sunrpc/svc_xprt.c11
-rw-r--r--net/sunrpc/svcsock.c21
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c6
-rw-r--r--net/tipc/bearer.c11
-rw-r--r--net/tipc/bearer.h13
-rw-r--r--net/tipc/link.c40
-rw-r--r--net/tipc/monitor.c10
-rw-r--r--net/tipc/socket.c50
-rw-r--r--net/tipc/udp_media.c5
-rw-r--r--net/unix/af_unix.c17
-rw-r--r--net/vmw_vsock/virtio_transport.c56
-rw-r--r--net/wireless/core.h1
-rw-r--r--net/wireless/lib80211_crypt_tkip.c2
-rw-r--r--net/wireless/scan.c69
-rw-r--r--net/wireless/util.c3
-rw-r--r--net/xfrm/xfrm_policy.c10
-rw-r--r--net/xfrm/xfrm_user.c2
215 files changed, 5349 insertions, 1600 deletions
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd340b72b..a1005007224c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -402,6 +402,14 @@ config LWTUNNEL
weight tunnel endpoint. Tunnel encapsulation parameters are stored
with light weight tunnel state associated with fib routes.
+config LWTUNNEL_BPF
+ bool "Execute BPF program as route nexthop action"
+ depends on LWTUNNEL
+ default y if LWTUNNEL=y
+ ---help---
+ Allows to run BPF programs as a nexthop action following a route
+ lookup for incoming and outgoing packets.
+
config DST_CACHE
bool
default n
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 672150b42c61..61a431a9772b 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -851,6 +851,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
batadv_softif_destroy_sysfs(hard_iface->soft_iface);
}
+ hard_iface->soft_iface = NULL;
batadv_hardif_put(hard_iface);
out:
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index f1564520dfae..981e8c5b07e9 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -834,6 +834,7 @@ static int batadv_tp_send(void *arg)
primary_if = batadv_primary_if_get_selected(bat_priv);
if (unlikely(!primary_if)) {
err = BATADV_TP_REASON_DST_UNREACHABLE;
+ tp_vars->reason = err;
goto out;
}
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 447f9490b692..30ecbfb40adf 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -3287,7 +3287,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
&tvlv_tt_data,
&tt_change,
&tt_len);
- if (!tt_len)
+ if (!tt_len || !tvlv_len)
goto unlock;
/* Copy the last orig_node's OGM buffer */
@@ -3305,7 +3305,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
&tvlv_tt_data,
&tt_change,
&tt_len);
- if (!tt_len)
+ if (!tt_len || !tvlv_len)
goto out;
/* fill the rest of the tvlv with the real TT entries */
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index d020299baba4..1904a93f47d5 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -1090,7 +1090,6 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
{
struct hci_conn *hcon;
struct hci_dev *hdev;
- bdaddr_t *src = BDADDR_ANY;
int n;
n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu",
@@ -1101,7 +1100,8 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
if (n < 7)
return -EINVAL;
- hdev = hci_get_route(addr, src);
+ /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */
+ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC);
if (!hdev)
return -ENOENT;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 3809617aa98d..dc59eae54717 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -613,7 +613,7 @@ int hci_conn_del(struct hci_conn *conn)
return 0;
}
-struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
+struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type)
{
int use_src = bacmp(src, BDADDR_ANY);
struct hci_dev *hdev = NULL, *d;
@@ -634,7 +634,29 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
*/
if (use_src) {
- if (!bacmp(&d->bdaddr, src)) {
+ bdaddr_t id_addr;
+ u8 id_addr_type;
+
+ if (src_type == BDADDR_BREDR) {
+ if (!lmp_bredr_capable(d))
+ continue;
+ bacpy(&id_addr, &d->bdaddr);
+ id_addr_type = BDADDR_BREDR;
+ } else {
+ if (!lmp_le_capable(d))
+ continue;
+
+ hci_copy_identity_address(d, &id_addr,
+ &id_addr_type);
+
+ /* Convert from HCI to three-value type */
+ if (id_addr_type == ADDR_LE_DEV_PUBLIC)
+ id_addr_type = BDADDR_LE_PUBLIC;
+ else
+ id_addr_type = BDADDR_LE_RANDOM;
+ }
+
+ if (!bacmp(&id_addr, src) && id_addr_type == src_type) {
hdev = d; break;
}
} else {
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index d4cad29b033f..577f1c01454a 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -7060,7 +7060,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst,
dst_type, __le16_to_cpu(psm));
- hdev = hci_get_route(dst, &chan->src);
+ hdev = hci_get_route(dst, &chan->src, chan->src_type);
if (!hdev)
return -EHOSTUNREACH;
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 8e385a0ae60e..2f2cb5e27cdd 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -178,7 +178,7 @@ static void rfcomm_reparent_device(struct rfcomm_dev *dev)
struct hci_dev *hdev;
struct hci_conn *conn;
- hdev = hci_get_route(&dev->dst, &dev->src);
+ hdev = hci_get_route(&dev->dst, &dev->src, BDADDR_BREDR);
if (!hdev)
return;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index f52bcbf2e58c..3125ce670c2f 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -219,7 +219,7 @@ static int sco_connect(struct sock *sk)
BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst);
- hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src);
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR);
if (!hdev)
return -EHOSTUNREACH;
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 43faf2aea2ab..fae391f1871f 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -57,7 +57,7 @@
#define SMP_TIMEOUT msecs_to_jiffies(30000)
#define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
- 0x1f : 0x07)
+ 0x3f : 0x07)
#define KEY_DIST_MASK 0x07
/* Maximum message length that can be passed to aes_cmac */
@@ -76,6 +76,7 @@ enum {
SMP_FLAG_DHKEY_PENDING,
SMP_FLAG_REMOTE_OOB,
SMP_FLAG_LOCAL_OOB,
+ SMP_FLAG_CT2,
};
struct smp_dev {
@@ -357,6 +358,22 @@ static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16],
return err;
}
+static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16],
+ const u8 salt[16], u8 res[16])
+{
+ int err;
+
+ SMP_DBG("w %16phN salt %16phN", w, salt);
+
+ err = aes_cmac(tfm_cmac, salt, w, 16, res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
/* The following functions map to the legacy SMP crypto functions e, c1,
* s1 and ah.
*/
@@ -1130,20 +1147,31 @@ static void sc_add_ltk(struct smp_chan *smp)
static void sc_generate_link_key(struct smp_chan *smp)
{
- /* These constants are as specified in the core specification.
- * In ASCII they spell out to 'tmp1' and 'lebr'.
- */
- const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
+ /* From core spec. Spells out in ASCII as 'lebr'. */
const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c };
smp->link_key = kzalloc(16, GFP_KERNEL);
if (!smp->link_key)
return;
- if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
- kzfree(smp->link_key);
- smp->link_key = NULL;
- return;
+ if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
+ /* SALT = 0x00000000000000000000000000000000746D7031 */
+ const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };
+
+ if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) {
+ kzfree(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
+ } else {
+ /* From core spec. Spells out in ASCII as 'tmp1'. */
+ const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
+
+ if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
+ kzfree(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
}
if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) {
@@ -1169,10 +1197,7 @@ static void smp_allow_key_dist(struct smp_chan *smp)
static void sc_generate_ltk(struct smp_chan *smp)
{
- /* These constants are as specified in the core specification.
- * In ASCII they spell out to 'tmp2' and 'brle'.
- */
- const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
+ /* From core spec. Spells out in ASCII as 'brle'. */
const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 };
struct hci_conn *hcon = smp->conn->hcon;
struct hci_dev *hdev = hcon->hdev;
@@ -1187,8 +1212,19 @@ static void sc_generate_ltk(struct smp_chan *smp)
if (key->type == HCI_LK_DEBUG_COMBINATION)
set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
- if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
- return;
+ if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
+ /* SALT = 0x00000000000000000000000000000000746D7032 */
+ const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 };
+
+ if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk))
+ return;
+ } else {
+ /* From core spec. Spells out in ASCII as 'tmp2'. */
+ const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
+
+ if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
+ return;
+ }
if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk))
return;
@@ -1669,6 +1705,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
if (!rsp) {
memset(req, 0, sizeof(*req));
+ req->auth_req = SMP_AUTH_CT2;
req->init_key_dist = local_dist;
req->resp_key_dist = remote_dist;
req->max_key_size = conn->hcon->enc_key_size;
@@ -1680,6 +1717,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp,
memset(rsp, 0, sizeof(*rsp));
+ rsp->auth_req = SMP_AUTH_CT2;
rsp->max_key_size = conn->hcon->enc_key_size;
rsp->init_key_dist = req->init_key_dist & remote_dist;
rsp->resp_key_dist = req->resp_key_dist & local_dist;
@@ -1744,6 +1782,9 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
build_bredr_pairing_cmd(smp, req, &rsp);
+ if (req->auth_req & SMP_AUTH_CT2)
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+
key_size = min(req->max_key_size, rsp.max_key_size);
if (check_enc_key_size(conn, key_size))
return SMP_ENC_KEY_SIZE;
@@ -1761,9 +1802,13 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
build_pairing_cmd(conn, req, &rsp, auth);
- if (rsp.auth_req & SMP_AUTH_SC)
+ if (rsp.auth_req & SMP_AUTH_SC) {
set_bit(SMP_FLAG_SC, &smp->flags);
+ if (rsp.auth_req & SMP_AUTH_CT2)
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+ }
+
if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
sec_level = BT_SECURITY_MEDIUM;
else
@@ -1917,6 +1962,9 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
*/
smp->remote_key_dist &= rsp->resp_key_dist;
+ if ((req->auth_req & SMP_AUTH_CT2) && (auth & SMP_AUTH_CT2))
+ set_bit(SMP_FLAG_CT2, &smp->flags);
+
/* For BR/EDR this means we're done and can start phase 3 */
if (conn->hcon->type == ACL_LINK) {
/* Clear bits which are generated but not distributed */
@@ -2312,8 +2360,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
authreq = seclevel_to_authreq(sec_level);
- if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED))
+ if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) {
authreq |= SMP_AUTH_SC;
+ if (hci_dev_test_flag(hcon->hdev, HCI_SSP_ENABLED))
+ authreq |= SMP_AUTH_CT2;
+ }
/* Require MITM if IO Capability allows or the security level
* requires it.
diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h
index ffcc70b6b199..0ff6247eaa6c 100644
--- a/net/bluetooth/smp.h
+++ b/net/bluetooth/smp.h
@@ -57,6 +57,7 @@ struct smp_cmd_pairing {
#define SMP_AUTH_MITM 0x04
#define SMP_AUTH_SC 0x08
#define SMP_AUTH_KEYPRESS 0x10
+#define SMP_AUTH_CT2 0x20
#define SMP_CMD_PAIRING_CONFIRM 0x03
struct smp_cmd_pairing_confirm {
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 073d54afa056..b30e77e8427c 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -365,13 +365,18 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
__be32 group,
u8 *igmp_type)
{
+ struct igmpv3_query *ihv3;
+ size_t igmp_hdr_size;
struct sk_buff *skb;
struct igmphdr *ih;
struct ethhdr *eth;
struct iphdr *iph;
+ igmp_hdr_size = sizeof(*ih);
+ if (br->multicast_igmp_version == 3)
+ igmp_hdr_size = sizeof(*ihv3);
skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) +
- sizeof(*ih) + 4);
+ igmp_hdr_size + 4);
if (!skb)
goto out;
@@ -396,7 +401,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
iph->version = 4;
iph->ihl = 6;
iph->tos = 0xc0;
- iph->tot_len = htons(sizeof(*iph) + sizeof(*ih) + 4);
+ iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4);
iph->id = 0;
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
@@ -412,17 +417,37 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
skb_put(skb, 24);
skb_set_transport_header(skb, skb->len);
- ih = igmp_hdr(skb);
*igmp_type = IGMP_HOST_MEMBERSHIP_QUERY;
- ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
- ih->code = (group ? br->multicast_last_member_interval :
- br->multicast_query_response_interval) /
- (HZ / IGMP_TIMER_SCALE);
- ih->group = group;
- ih->csum = 0;
- ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- skb_put(skb, sizeof(*ih));
+ switch (br->multicast_igmp_version) {
+ case 2:
+ ih = igmp_hdr(skb);
+ ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
+ ih->code = (group ? br->multicast_last_member_interval :
+ br->multicast_query_response_interval) /
+ (HZ / IGMP_TIMER_SCALE);
+ ih->group = group;
+ ih->csum = 0;
+ ih->csum = ip_compute_csum((void *)ih, sizeof(*ih));
+ break;
+ case 3:
+ ihv3 = igmpv3_query_hdr(skb);
+ ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY;
+ ihv3->code = (group ? br->multicast_last_member_interval :
+ br->multicast_query_response_interval) /
+ (HZ / IGMP_TIMER_SCALE);
+ ihv3->group = group;
+ ihv3->qqic = br->multicast_query_interval / HZ;
+ ihv3->nsrcs = 0;
+ ihv3->resv = 0;
+ ihv3->suppress = 0;
+ ihv3->qrv = 2;
+ ihv3->csum = 0;
+ ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3));
+ break;
+ }
+
+ skb_put(skb, igmp_hdr_size);
__skb_pull(skb, sizeof(*eth));
out:
@@ -434,15 +459,20 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
const struct in6_addr *grp,
u8 *igmp_type)
{
- struct sk_buff *skb;
+ struct mld2_query *mld2q;
+ unsigned long interval;
struct ipv6hdr *ip6h;
struct mld_msg *mldq;
+ size_t mld_hdr_size;
+ struct sk_buff *skb;
struct ethhdr *eth;
u8 *hopopt;
- unsigned long interval;
+ mld_hdr_size = sizeof(*mldq);
+ if (br->multicast_mld_version == 2)
+ mld_hdr_size = sizeof(*mld2q);
skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) +
- 8 + sizeof(*mldq));
+ 8 + mld_hdr_size);
if (!skb)
goto out;
@@ -461,7 +491,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
ip6h = ipv6_hdr(skb);
*(__force __be32 *)ip6h = htonl(0x60000000);
- ip6h->payload_len = htons(8 + sizeof(*mldq));
+ ip6h->payload_len = htons(8 + mld_hdr_size);
ip6h->nexthdr = IPPROTO_HOPOPTS;
ip6h->hop_limit = 1;
ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
@@ -489,26 +519,47 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
/* ICMPv6 */
skb_set_transport_header(skb, skb->len);
- mldq = (struct mld_msg *) icmp6_hdr(skb);
-
interval = ipv6_addr_any(grp) ?
br->multicast_query_response_interval :
br->multicast_last_member_interval;
-
*igmp_type = ICMPV6_MGM_QUERY;
- mldq->mld_type = ICMPV6_MGM_QUERY;
- mldq->mld_code = 0;
- mldq->mld_cksum = 0;
- mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
- mldq->mld_reserved = 0;
- mldq->mld_mca = *grp;
-
- /* checksum */
- mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- sizeof(*mldq), IPPROTO_ICMPV6,
- csum_partial(mldq,
- sizeof(*mldq), 0));
- skb_put(skb, sizeof(*mldq));
+ switch (br->multicast_mld_version) {
+ case 1:
+ mldq = (struct mld_msg *)icmp6_hdr(skb);
+ mldq->mld_type = ICMPV6_MGM_QUERY;
+ mldq->mld_code = 0;
+ mldq->mld_cksum = 0;
+ mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
+ mldq->mld_reserved = 0;
+ mldq->mld_mca = *grp;
+ mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ sizeof(*mldq), IPPROTO_ICMPV6,
+ csum_partial(mldq,
+ sizeof(*mldq),
+ 0));
+ break;
+ case 2:
+ mld2q = (struct mld2_query *)icmp6_hdr(skb);
+ mld2q->mld2q_mrc = ntohs((u16)jiffies_to_msecs(interval));
+ mld2q->mld2q_type = ICMPV6_MGM_QUERY;
+ mld2q->mld2q_code = 0;
+ mld2q->mld2q_cksum = 0;
+ mld2q->mld2q_resv1 = 0;
+ mld2q->mld2q_resv2 = 0;
+ mld2q->mld2q_suppress = 0;
+ mld2q->mld2q_qrv = 2;
+ mld2q->mld2q_nsrcs = 0;
+ mld2q->mld2q_qqic = br->multicast_query_interval / HZ;
+ mld2q->mld2q_mca = *grp;
+ mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ sizeof(*mld2q),
+ IPPROTO_ICMPV6,
+ csum_partial(mld2q,
+ sizeof(*mld2q),
+ 0));
+ break;
+ }
+ skb_put(skb, mld_hdr_size);
__skb_pull(skb, sizeof(*eth));
@@ -608,7 +659,8 @@ err:
}
struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
- struct net_bridge_port *port, struct br_ip *group)
+ struct net_bridge_port *p,
+ struct br_ip *group)
{
struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
@@ -624,7 +676,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
}
hash = br_ip_hash(mdb, group);
- mp = br_multicast_get_group(br, port, group, hash);
+ mp = br_multicast_get_group(br, p, group, hash);
switch (PTR_ERR(mp)) {
case 0:
break;
@@ -681,9 +733,9 @@ static int br_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
struct br_ip *group)
{
- struct net_bridge_mdb_entry *mp;
- struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_mdb_entry *mp;
unsigned long now = jiffies;
int err;
@@ -861,9 +913,9 @@ static void br_multicast_send_query(struct net_bridge *br,
struct net_bridge_port *port,
struct bridge_mcast_own_query *own_query)
{
- unsigned long time;
- struct br_ip br_group;
struct bridge_mcast_other_query *other_query = NULL;
+ struct br_ip br_group;
+ unsigned long time;
if (!netif_running(br->dev) || br->multicast_disabled ||
!br->multicast_querier)
@@ -1831,7 +1883,9 @@ void br_multicast_init(struct net_bridge *br)
br->ip4_other_query.delay_time = 0;
br->ip4_querier.port = NULL;
+ br->multicast_igmp_version = 2;
#if IS_ENABLED(CONFIG_IPV6)
+ br->multicast_mld_version = 1;
br->ip6_other_query.delay_time = 0;
br->ip6_querier.port = NULL;
#endif
@@ -2132,6 +2186,44 @@ unlock:
return err;
}
+int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val)
+{
+ /* Currently we support only version 2 and 3 */
+ switch (val) {
+ case 2:
+ case 3:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&br->multicast_lock);
+ br->multicast_igmp_version = val;
+ spin_unlock_bh(&br->multicast_lock);
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val)
+{
+ /* Currently we support version 1 and 2 */
+ switch (val) {
+ case 1:
+ case 2:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&br->multicast_lock);
+ br->multicast_mld_version = val;
+ spin_unlock_bh(&br->multicast_lock);
+
+ return 0;
+}
+#endif
+
/**
* br_multicast_list_adjacent - Returns snooped multicast addresses
* @dev: The bridge port adjacent to which to retrieve addresses
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 83d937f4415e..b12501a77f18 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1008,10 +1008,10 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
struct nf_hook_state state;
int ret;
- elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
-
- while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF))
- elem = rcu_dereference(elem->next);
+ for (elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
+ elem && nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF;
+ elem = rcu_dereference(elem->next))
+ ;
if (!elem)
return okfn(net, sk, skb);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index e99037c6f7b7..71c7453268c1 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -858,6 +858,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
[IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 },
[IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
+ [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1069,6 +1071,26 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
br->multicast_stats_enabled = !!mcast_stats;
}
+
+ if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
+ __u8 igmp_version;
+
+ igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]);
+ err = br_multicast_set_igmp_version(br, igmp_version);
+ if (err)
+ return err;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (data[IFLA_BR_MCAST_MLD_VERSION]) {
+ __u8 mld_version;
+
+ mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]);
+ err = br_multicast_set_mld_version(br, mld_version);
+ if (err)
+ return err;
+ }
+#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (data[IFLA_BR_NF_CALL_IPTABLES]) {
@@ -1135,6 +1157,8 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */
@@ -1210,9 +1234,15 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
br->multicast_last_member_count) ||
nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT,
- br->multicast_startup_query_count))
+ br->multicast_startup_query_count) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION,
+ br->multicast_igmp_version))
return -EMSGSIZE;
-
+#if IS_ENABLED(CONFIG_IPV6)
+ if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION,
+ br->multicast_mld_version))
+ return -EMSGSIZE;
+#endif
clockval = jiffies_to_clock_t(br->multicast_last_member_interval);
if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval,
IFLA_BR_PAD))
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1b63177e0ccd..26aec2366bc3 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -333,6 +333,8 @@ struct net_bridge
u32 multicast_last_member_count;
u32 multicast_startup_query_count;
+ u8 multicast_igmp_version;
+
unsigned long multicast_last_member_interval;
unsigned long multicast_membership_interval;
unsigned long multicast_querier_interval;
@@ -353,6 +355,7 @@ struct net_bridge
struct bridge_mcast_other_query ip6_other_query;
struct bridge_mcast_own_query ip6_own_query;
struct bridge_mcast_querier ip6_querier;
+ u8 multicast_mld_version;
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif
@@ -582,6 +585,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val);
int br_multicast_toggle(struct net_bridge *br, unsigned long val);
int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val);
+#if IS_ENABLED(CONFIG_IPV6)
+int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val);
+#endif
struct net_bridge_mdb_entry *
br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst);
struct net_bridge_mdb_entry *
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index e120307c6e36..a18148213b08 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -440,6 +440,23 @@ static ssize_t hash_max_store(struct device *d, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(hash_max);
+static ssize_t multicast_igmp_version_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_igmp_version);
+}
+
+static ssize_t multicast_igmp_version_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_set_igmp_version);
+}
+static DEVICE_ATTR_RW(multicast_igmp_version);
+
static ssize_t multicast_last_member_count_show(struct device *d,
struct device_attribute *attr,
char *buf)
@@ -642,6 +659,25 @@ static ssize_t multicast_stats_enabled_store(struct device *d,
return store_bridge_parm(d, buf, len, set_stats_enabled);
}
static DEVICE_ATTR_RW(multicast_stats_enabled);
+
+#if IS_ENABLED(CONFIG_IPV6)
+static ssize_t multicast_mld_version_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ return sprintf(buf, "%u\n", br->multicast_mld_version);
+}
+
+static ssize_t multicast_mld_version_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_set_mld_version);
+}
+static DEVICE_ATTR_RW(multicast_mld_version);
+#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
static ssize_t nf_call_iptables_show(
@@ -809,6 +845,10 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_multicast_query_response_interval.attr,
&dev_attr_multicast_startup_query_interval.attr,
&dev_attr_multicast_stats_enabled.attr,
+ &dev_attr_multicast_igmp_version.attr,
+#if IS_ENABLED(CONFIG_IPV6)
+ &dev_attr_multicast_mld_version.attr,
+#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
&dev_attr_nf_call_iptables.attr,
@@ -898,6 +938,7 @@ int br_sysfs_addbr(struct net_device *dev)
if (!br->ifobj) {
pr_info("%s: can't add kobject (directory) %s/%s\n",
__func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR);
+ err = -ENOMEM;
goto out3;
}
return 0;
diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c
index c197b1f844ee..bd2b3c78f59b 100644
--- a/net/bridge/netfilter/nf_log_bridge.c
+++ b/net/bridge/netfilter/nf_log_bridge.c
@@ -24,7 +24,8 @@ static void nf_log_bridge_packet(struct net *net, u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
- nf_log_l2packet(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ nf_log_l2packet(net, pf, eth_hdr(skb)->h_proto, hooknum, skb,
+ in, out, loginfo, prefix);
}
static struct nf_logger nf_bridge_logger __read_mostly = {
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index aa209b1066c9..92cbbd2afddb 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -1107,10 +1107,7 @@ static struct net_proto_family caif_family_ops = {
static int __init caif_sktinit_module(void)
{
- int err = sock_register(&caif_family_ops);
- if (!err)
- return err;
- return 0;
+ return sock_register(&caif_family_ops);
}
static void __exit caif_sktexit_module(void)
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 8af9d25ff988..436a7537e6a9 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -77,7 +77,7 @@
(CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
(CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
-#define CAN_BCM_VERSION "20160617"
+#define CAN_BCM_VERSION "20161123"
MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
MODULE_LICENSE("Dual BSD/GPL");
@@ -109,8 +109,9 @@ struct bcm_op {
u32 count;
u32 nframes;
u32 currframe;
- struct canfd_frame *frames;
- struct canfd_frame *last_frames;
+ /* void pointers to arrays of struct can[fd]_frame */
+ void *frames;
+ void *last_frames;
struct canfd_frame sframe;
struct canfd_frame last_sframe;
struct sock *sk;
@@ -681,7 +682,7 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data)
if (op->flags & RX_FILTER_ID) {
/* the easiest case */
- bcm_rx_update_and_send(op, &op->last_frames[0], rxframe);
+ bcm_rx_update_and_send(op, op->last_frames, rxframe);
goto rx_starttimer;
}
@@ -1068,7 +1069,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (msg_head->nframes) {
/* update CAN frames content */
- err = memcpy_from_msg((u8 *)op->frames, msg,
+ err = memcpy_from_msg(op->frames, msg,
msg_head->nframes * op->cfsiz);
if (err < 0)
return err;
@@ -1118,7 +1119,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
}
if (msg_head->nframes) {
- err = memcpy_from_msg((u8 *)op->frames, msg,
+ err = memcpy_from_msg(op->frames, msg,
msg_head->nframes * op->cfsiz);
if (err < 0) {
if (op->frames != &op->sframe)
@@ -1163,6 +1164,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
/* check flags */
if (op->flags & RX_RTR_FRAME) {
+ struct canfd_frame *frame0 = op->frames;
/* no timers in RTR-mode */
hrtimer_cancel(&op->thrtimer);
@@ -1174,8 +1176,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
* prevent a full-load-loopback-test ... ;-]
*/
if ((op->flags & TX_CP_CAN_ID) ||
- (op->frames[0].can_id == op->can_id))
- op->frames[0].can_id = op->can_id & ~CAN_RTR_FLAG;
+ (frame0->can_id == op->can_id))
+ frame0->can_id = op->can_id & ~CAN_RTR_FLAG;
} else {
if (op->flags & SETTIMER) {
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2ddca5..f6761b6e3b29 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 49816af8586b..9482037a5c8c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -214,6 +214,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
if (error)
goto no_packet;
+ *peeked = 0;
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
@@ -227,22 +228,22 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
spin_lock_irqsave(&queue->lock, cpu_flags);
skb_queue_walk(queue, skb) {
*last = skb;
- *peeked = skb->peeked;
if (flags & MSG_PEEK) {
if (_off >= skb->len && (skb->len || _off ||
skb->peeked)) {
_off -= skb->len;
continue;
}
-
- skb = skb_set_peeked(skb);
- error = PTR_ERR(skb);
- if (IS_ERR(skb)) {
- spin_unlock_irqrestore(&queue->lock,
- cpu_flags);
- goto no_packet;
+ if (!skb->len) {
+ skb = skb_set_peeked(skb);
+ if (IS_ERR(skb)) {
+ error = PTR_ERR(skb);
+ spin_unlock_irqrestore(&queue->lock,
+ cpu_flags);
+ goto no_packet;
+ }
}
-
+ *peeked = 1;
atomic_inc(&skb->users);
} else {
__skb_unlink(skb, queue);
diff --git a/net/core/dev.c b/net/core/dev.c
index f71b34ab57a5..1d33ce03365f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3447,6 +3447,8 @@ EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
+struct static_key rfs_needed __read_mostly;
+EXPORT_SYMBOL(rfs_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
@@ -5260,7 +5262,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- return;
+ goto out;
break;
}
@@ -5278,7 +5280,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
}
}
- __kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
@@ -5288,6 +5289,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
+out:
+ __kfree_skb_flush();
}
struct netdev_adjacent {
@@ -6691,26 +6694,42 @@ EXPORT_SYMBOL(dev_change_proto_down);
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
* @fd: new program fd or negative value to clear
+ * @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
-int dev_change_xdp_fd(struct net_device *dev, int fd)
+int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
- struct netdev_xdp xdp = {};
+ struct netdev_xdp xdp;
int err;
+ ASSERT_RTNL();
+
if (!ops->ndo_xdp)
return -EOPNOTSUPP;
if (fd >= 0) {
+ if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+
+ err = ops->ndo_xdp(dev, &xdp);
+ if (err < 0)
+ return err;
+ if (xdp.prog_attached)
+ return -EBUSY;
+ }
+
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
+ memset(&xdp, 0, sizeof(xdp));
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
+
err = ops->ndo_xdp(dev, &xdp);
if (err < 0 && prog)
bpf_prog_put(prog);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index c14f8b661db9..2b5bf9efa720 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1394,26 +1394,45 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
enum devlink_command cmd, u32 portid,
- u32 seq, int flags, u16 mode)
+ u32 seq, int flags)
{
+ const struct devlink_ops *ops = devlink->ops;
void *hdr;
+ int err = 0;
+ u16 mode;
+ u8 inline_mode;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto out;
- if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode))
- goto nla_put_failure;
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ goto out;
+ err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+ if (err)
+ goto out;
+
+ if (ops->eswitch_inline_mode_get) {
+ err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
+ if (err)
+ goto out;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+ inline_mode);
+ if (err)
+ goto out;
+ }
genlmsg_end(msg, hdr);
return 0;
-nla_put_failure:
+out:
genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
+ return err;
}
static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
@@ -1422,22 +1441,17 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
struct sk_buff *msg;
- u16 mode;
int err;
if (!ops || !ops->eswitch_mode_get)
return -EOPNOTSUPP;
- err = ops->eswitch_mode_get(devlink, &mode);
- if (err)
- return err;
-
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
- info->snd_portid, info->snd_seq, 0, mode);
+ info->snd_portid, info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
@@ -1453,15 +1467,32 @@ static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
u16 mode;
+ u8 inline_mode;
+ int err = 0;
- if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE])
- return -EINVAL;
+ if (!ops)
+ return -EOPNOTSUPP;
- mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
+ if (!ops->eswitch_mode_set)
+ return -EOPNOTSUPP;
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = ops->eswitch_mode_set(devlink, mode);
+ if (err)
+ return err;
+ }
- if (ops && ops->eswitch_mode_set)
- return ops->eswitch_mode_set(devlink, mode);
- return -EOPNOTSUPP;
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
+ if (!ops->eswitch_inline_mode_set)
+ return -EOPNOTSUPP;
+ inline_mode = nla_get_u8(
+ info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
+ err = ops->eswitch_inline_mode_set(devlink, inline_mode);
+ if (err)
+ return err;
+ }
+
+ return 0;
}
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -1478,6 +1509,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
[DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
};
static const struct genl_ops devlink_nl_ops[] = {
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e9b4556751ff..e23766c7e3ba 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2466,7 +2466,9 @@ static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
data = kmalloc(tuna.len, GFP_USER);
if (!data)
return -ENOMEM;
+ mutex_lock(&phydev->lock);
ret = phydev->drv->get_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
if (ret)
goto out;
useraddr += sizeof(tuna);
@@ -2501,7 +2503,9 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
ret = -EFAULT;
if (copy_from_user(data, useraddr, tuna.len))
goto out;
+ mutex_lock(&phydev->lock);
ret = phydev->drv->set_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
out:
kfree(data);
@@ -2566,6 +2570,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GEEE:
case ETHTOOL_GTUNABLE:
case ETHTOOL_PHY_GTUNABLE:
+ case ETHTOOL_GLINKSETTINGS:
break;
default:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
diff --git a/net/core/filter.c b/net/core/filter.c
index dece94fef005..b1461708a977 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -30,6 +30,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
+#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -78,6 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
return -ENOMEM;
+ err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+ if (err)
+ return err;
+
err = security_sock_rcv_skb(sk, skb);
if (err)
return err;
@@ -1684,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
+ /* Verify that a link layer header is carried */
+ if (unlikely(skb->mac_header >= skb->network_header)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
bpf_push_mac_rcsum(skb);
return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
@@ -1692,17 +1703,10 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
- switch (dev->type) {
- case ARPHRD_TUNNEL:
- case ARPHRD_TUNNEL6:
- case ARPHRD_SIT:
- case ARPHRD_IPGRE:
- case ARPHRD_VOID:
- case ARPHRD_NONE:
- return __bpf_redirect_no_mac(skb, dev, flags);
- default:
+ if (dev_is_mac_header_xmit(dev))
return __bpf_redirect_common(skb, dev, flags);
- }
+ else
+ return __bpf_redirect_no_mac(skb, dev, flags);
}
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2190,16 +2194,79 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
.arg3_type = ARG_ANYTHING,
};
-bool bpf_helper_changes_skb_data(void *func)
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ u32 max_len = __bpf_skb_max_len(skb);
+ u32 new_len = skb->len + head_room;
+ int ret;
+
+ if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+ new_len < skb->len))
+ return -EINVAL;
+
+ ret = skb_cow(skb, head_room);
+ if (likely(!ret)) {
+ /* Idea for this helper is that we currently only
+ * allow to expand on mac header. This means that
+ * skb->protocol network header, etc, stay as is.
+ * Compared to bpf_skb_change_tail(), we're more
+ * flexible due to not needing to linearize or
+ * reset GSO. Intention for this helper is to be
+ * used by an L3 skb that needs to push mac header
+ * for redirection into L2 device.
+ */
+ __skb_push(skb, head_room);
+ memset(skb->data, 0, head_room);
+ skb_reset_mac_header(skb);
+ }
+
+ bpf_compute_data_end(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+ .func = bpf_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+ void *data = xdp->data + offset;
+
+ if (unlikely(data < xdp->data_hard_start ||
+ data > xdp->data_end - ETH_HLEN))
+ return -EINVAL;
+
+ xdp->data = data;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+ .func = bpf_xdp_adjust_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
{
if (func == bpf_skb_vlan_push ||
func == bpf_skb_vlan_pop ||
func == bpf_skb_store_bytes ||
func == bpf_skb_change_proto ||
+ func == bpf_skb_change_head ||
func == bpf_skb_change_tail ||
func == bpf_skb_pull_data ||
func == bpf_l3_csum_replace ||
- func == bpf_l4_csum_replace)
+ func == bpf_l4_csum_replace ||
+ func == bpf_xdp_adjust_head)
return true;
return false;
@@ -2625,12 +2692,87 @@ xdp_func_proto(enum bpf_func_id func_id)
return &bpf_xdp_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_xdp_adjust_head:
+ return &bpf_xdp_adjust_head_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
default:
return sk_filter_func_proto(func_id);
}
}
-static bool __is_valid_access(int off, int size, enum bpf_access_type type)
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ default:
+ return lwt_inout_func_proto(func_id);
+ }
+}
+
+static bool __is_valid_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
@@ -2664,7 +2806,64 @@ static bool sk_filter_is_valid_access(int off, int size,
}
}
- return __is_valid_access(off, size, type);
+ return __is_valid_access(off, size);
+}
+
+static bool lwt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ switch (off) {
+ case offsetof(struct __sk_buff, tc_classid):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, mark):
+ case offsetof(struct __sk_buff, priority):
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct __sk_buff, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct __sk_buff, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_access(off, size);
+}
+
+static bool sock_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ if (off < 0 || off + size > sizeof(struct bpf_sock))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
}
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
@@ -2733,11 +2932,10 @@ static bool tc_cls_act_is_valid_access(int off, int size,
break;
}
- return __is_valid_access(off, size, type);
+ return __is_valid_access(off, size);
}
-static bool __is_valid_xdp_access(int off, int size,
- enum bpf_access_type type)
+static bool __is_valid_xdp_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct xdp_md))
return false;
@@ -2765,7 +2963,7 @@ static bool xdp_is_valid_access(int off, int size,
break;
}
- return __is_valid_xdp_access(off, size, type);
+ return __is_valid_xdp_access(off, size);
}
void bpf_warn_invalid_xdp_action(u32 act)
@@ -2925,6 +3123,51 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
return insn - insn_buf;
}
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+ int dst_reg, int src_reg,
+ int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ break;
+
+ case offsetof(struct bpf_sock, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sock, sk_family));
+ break;
+
+ case offsetof(struct bpf_sock, type):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,
@@ -2992,6 +3235,31 @@ static const struct bpf_verifier_ops xdp_ops = {
.convert_ctx_access = xdp_convert_ctx_access,
};
+static const struct bpf_verifier_ops cg_skb_ops = {
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_inout_ops = {
+ .get_func_proto = lwt_inout_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+ .get_func_proto = lwt_xmit_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops cg_sock_ops = {
+ .get_func_proto = sk_filter_func_proto,
+ .is_valid_access = sock_filter_is_valid_access,
+ .convert_ctx_access = sock_filter_convert_ctx_access,
+};
+
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3012,12 +3280,42 @@ static struct bpf_prog_type_list xdp_type __read_mostly = {
.type = BPF_PROG_TYPE_XDP,
};
+static struct bpf_prog_type_list cg_skb_type __read_mostly = {
+ .ops = &cg_skb_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SKB,
+};
+
+static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+ .ops = &lwt_xmit_ops,
+ .type = BPF_PROG_TYPE_LWT_XMIT,
+};
+
+static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+ .ops = &cg_sock_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SOCK
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
+ bpf_register_prog_type(&cg_skb_type);
+ bpf_register_prog_type(&cg_sock_type);
+ bpf_register_prog_type(&lwt_in_type);
+ bpf_register_prog_type(&lwt_out_type);
+ bpf_register_prog_type(&lwt_xmit_type);
return 0;
}
diff --git a/net/core/flow.c b/net/core/flow.c
index 3937b1b68d5b..18e8893d4be5 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -95,7 +95,6 @@ static void flow_cache_gc_task(struct work_struct *work)
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) {
flow_entry_kill(fce, xfrm);
atomic_dec(&xfrm->flow_cache_gc_count);
- WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0);
}
}
@@ -236,9 +235,8 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
if (fcp->hash_count > fc->high_watermark)
flow_cache_shrink(fc, fcp);
- if (fcp->hash_count > 2 * fc->high_watermark ||
- atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) {
- atomic_inc(&net->xfrm.flow_cache_genid);
+ if (atomic_read(&net->xfrm.flow_cache_gc_count) >
+ 2 * num_online_cpus() * fc->high_watermark) {
flo = ERR_PTR(-ENOBUFS);
goto ret_object;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index b481a4a6d3ec..d6447dc10371 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -58,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
EXPORT_SYMBOL(skb_flow_dissector_init);
/**
+ * skb_flow_get_be16 - extract be16 entity
+ * @skb: sk_buff to extract from
+ * @poff: offset to extract at
+ * @data: raw buffer pointer to the packet
+ * @hlen: packet header length
+ *
+ * The function will try to retrieve a be32 entity at
+ * offset poff
+ */
+__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data,
+ int hlen)
+{
+ __be16 *u, _u;
+
+ u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
+ if (u)
+ return *u;
+
+ return 0;
+}
+
+/**
* __skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
* @thoff: transport header offset
@@ -117,6 +139,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
struct flow_dissector_key_keyid *key_keyid;
@@ -546,6 +569,14 @@ ip_proto_again:
data, hlen);
}
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP)) {
+ key_icmp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP,
+ target_container);
+ key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
+ }
+
out_good:
ret = true;
@@ -1013,4 +1044,4 @@ static int __init init_default_flow_dissectors(void)
return 0;
}
-late_initcall_sync(init_default_flow_dissectors);
+core_initcall(init_default_flow_dissectors);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cad8e791f28e..101b5d0e2142 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,6 +7,7 @@
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Eric Dumazet <edumazet@google.com>
*
* Changes:
* Jamal Hadi Salim - moved it to net/core and reshulfed
@@ -30,165 +31,79 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <net/gen_stats.h>
-/*
- This code is NOT intended to be used for statistics collection,
- its purpose is to provide a base for statistical multiplexing
- for controlled load service.
- If you need only statistics, run a user level daemon which
- periodically reads byte counters.
-
- Unfortunately, rate estimation is not a very easy task.
- F.e. I did not find a simple way to estimate the current peak rate
- and even failed to formulate the problem 8)8)
-
- So I preferred not to built an estimator into the scheduler,
- but run this task separately.
- Ideally, it should be kernel thread(s), but for now it runs
- from timers, which puts apparent top bounds on the number of rated
- flows, has minimal overhead on small, but is enough
- to handle controlled load service, sets of aggregates.
-
- We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
- avrate = avrate*(1-W) + rate*W
-
- where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
- The resulting time constant is:
-
- T = A/(-ln(1-W))
-
-
- NOTES.
-
- * avbps and avpps are scaled by 2^5.
- * both values are reported as 32 bit unsigned values. bps can
- overflow for fast links : max speed being 34360Mbit/sec
- * Minimal interval is HZ/4=250msec (it is the greatest common divisor
- for HZ=100 and HZ=1024 8)), maximal interval
- is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
- are too expensive, longer ones can be implemented
- at user level painlessly.
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
*/
-#define EST_MAX_INTERVAL 5
-
-struct gen_estimator
-{
- struct list_head list;
+struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
- struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
seqcount_t *running;
- int ewma_log;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ u8 ewma_log;
+ u8 intvl_log; /* period : (250ms << intvl_log) */
+
+ seqcount_t seq;
u32 last_packets;
- unsigned long avpps;
u64 last_bytes;
+
+ u64 avpps;
u64 avbps;
- struct rcu_head e_rcu;
- struct rb_node node;
- struct gnet_stats_basic_cpu __percpu *cpu_bstats;
- struct rcu_head head;
-};
-struct gen_estimator_head
-{
- struct timer_list timer;
- struct list_head list;
+ unsigned long next_jiffies;
+ struct timer_list timer;
+ struct rcu_head rcu;
};
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+ struct gnet_stats_basic_packed *b)
{
- int idx = (int)arg;
- struct gen_estimator *e;
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(e, &elist[idx].list, list) {
- struct gnet_stats_basic_packed b = {0};
- unsigned long rate;
- u64 brate;
-
- if (e->stats_lock)
- spin_lock(e->stats_lock);
- read_lock(&est_lock);
- if (e->bstats == NULL)
- goto skip;
-
- __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
- brate = (b.bytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = b.bytes;
- e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
- rate = b.packets - e->last_packets;
- rate <<= (7 - idx);
- e->last_packets = b.packets;
- e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
- read_unlock(&est_lock);
- if (e->stats_lock)
- spin_unlock(e->stats_lock);
- }
+ __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
+
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
- if (!list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
- rcu_read_unlock();
}
-static void gen_add_node(struct gen_estimator *est)
+static void est_timer(unsigned long arg)
{
- struct rb_node **p = &est_root.rb_node, *parent = NULL;
+ struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+ struct gnet_stats_basic_packed b;
+ u64 rate, brate;
- while (*p) {
- struct gen_estimator *e;
+ est_fetch_counters(est, &b);
+ brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+ brate -= (est->avbps >> est->ewma_log);
- parent = *p;
- e = rb_entry(parent, struct gen_estimator, node);
+ rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+ rate -= (est->avpps >> est->ewma_log);
- if (est->bstats > e->bstats)
- p = &parent->rb_right;
- else
- p = &parent->rb_left;
- }
- rb_link_node(&est->node, parent, p);
- rb_insert_color(&est->node, &est_root);
-}
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+ est->avpps += rate;
+ write_seqcount_end(&est->seq);
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
-{
- struct rb_node *p = est_root.rb_node;
-
- while (p) {
- struct gen_estimator *e;
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
- e = rb_entry(p, struct gen_estimator, node);
+ est->next_jiffies += ((HZ/4) << est->intvl_log);
- if (bstats > e->bstats)
- p = p->rb_right;
- else if (bstats < e->bstats || rate_est != e->rate_est)
- p = p->rb_left;
- else
- return e;
+ if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+ /* Ouch... timer was delayed. */
+ est->next_jiffies = jiffies + 1;
}
- return NULL;
+ mod_timer(&est->timer, est->next_jiffies);
}
/**
@@ -211,83 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running,
struct nlattr *opt)
{
- struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
- struct gnet_stats_basic_packed b = {0};
- int idx;
+ struct net_rate_estimator *old, *est;
+ struct gnet_stats_basic_packed b;
+ int intvl_log;
if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
+ /* allowed timer periods are :
+ * -2 : 250ms, -1 : 500ms, 0 : 1 sec
+ * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
+ */
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
est = kzalloc(sizeof(*est), GFP_KERNEL);
- if (est == NULL)
+ if (!est)
return -ENOBUFS;
- __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
- idx = parm->interval + 2;
+ seqcount_init(&est->seq);
+ intvl_log = parm->interval + 2;
est->bstats = bstats;
- est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->running = running;
est->ewma_log = parm->ewma_log;
- est->last_bytes = b.bytes;
- est->avbps = rate_est->bps<<5;
- est->last_packets = b.packets;
- est->avpps = rate_est->pps<<10;
+ est->intvl_log = intvl_log;
est->cpu_bstats = cpu_bstats;
- spin_lock_bh(&est_tree_lock);
- if (!elist[idx].timer.function) {
- INIT_LIST_HEAD(&elist[idx].list);
- setup_timer(&elist[idx].timer, est_timer, idx);
+ est_fetch_counters(est, &b);
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
+ old = rcu_dereference_protected(*rate_est, 1);
+ if (old) {
+ del_timer_sync(&old->timer);
+ est->avbps = old->avbps;
+ est->avpps = old->avpps;
}
- if (list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
-
- list_add_rcu(&est->list, &elist[idx].list);
- gen_add_node(est);
- spin_unlock_bh(&est_tree_lock);
+ est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+ setup_timer(&est->timer, est_timer, (unsigned long)est);
+ mod_timer(&est->timer, est->next_jiffies);
+ rcu_assign_pointer(*rate_est, est);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
/**
* gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
*
- * Note : Caller should respect an RCU grace period before freeing stats_lock
*/
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
- struct gen_estimator *e;
-
- spin_lock_bh(&est_tree_lock);
- while ((e = gen_find_node(bstats, rate_est))) {
- rb_erase(&e->node, &est_root);
+ struct net_rate_estimator *est;
- write_lock(&est_lock);
- e->bstats = NULL;
- write_unlock(&est_lock);
-
- list_del_rcu(&e->list);
- kfree_rcu(e, e_rcu);
+ est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ if (est) {
+ del_timer_sync(&est->timer);
+ kfree_rcu(est, rcu);
}
- spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
@@ -307,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est,
+ stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
/**
* gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
* Returns true if estimator is active, and false if not.
*/
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
{
- bool res;
+ return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
- ASSERT_RTNL();
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+ struct gnet_stats_rate_est64 *sample)
+{
+ struct net_rate_estimator *est;
+ unsigned seq;
+
+ rcu_read_lock();
+ est = rcu_dereference(*rate_est);
+ if (!est) {
+ rcu_read_unlock();
+ return false;
+ }
- spin_lock_bh(&est_tree_lock);
- res = gen_find_node(bstats, rate_est) != NULL;
- spin_unlock_bh(&est_tree_lock);
+ do {
+ seq = read_seqcount_begin(&est->seq);
+ sample->bps = est->avbps >> 8;
+ sample->pps = est->avpps >> 8;
+ } while (read_seqcount_retry(&est->seq, seq));
- return res;
+ rcu_read_unlock();
+ return true;
}
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb..87f28557b329 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
- * @b: basic statistics
- * @r: rate estimator statistics
+ * @rate_est: rate estimator
*
* Appends the rate estimator statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
- const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est64 *r)
+ struct net_rate_estimator __rcu **rate_est)
{
+ struct gnet_stats_rate_est64 sample;
struct gnet_stats_rate_est est;
int res;
- if (b && !gen_estimator_active(b, r))
+ if (!gen_estimator_read(rate_est, &sample))
return 0;
-
- est.bps = min_t(u64, UINT_MAX, r->bps);
+ est.bps = min_t(u64, UINT_MAX, sample.bps);
/* we have some time before reaching 2^32 packets per second */
- est.pps = r->pps;
+ est.pps = sample.pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
@@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
TCA_STATS_PAD);
- if (res < 0 || est.bps == r->bps)
+ if (res < 0 || est.bps == sample.bps)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
- TCA_STATS_PAD);
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+ sizeof(sample), TCA_STATS_PAD);
}
return 0;
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 000000000000..71bb3e2eca08
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,396 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+ struct bpf_prog *prog;
+ char *name;
+};
+
+struct bpf_lwt {
+ struct bpf_lwt_prog in;
+ struct bpf_lwt_prog out;
+ struct bpf_lwt_prog xmit;
+ int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+ struct dst_entry *dst, bool can_redirect)
+{
+ int ret;
+
+ /* Preempt disable is needed to protect per-cpu redirect_info between
+ * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+ * access to maps strictly require a rcu_read_lock() for protection,
+ * mixing with BH RCU lock doesn't work.
+ */
+ preempt_disable();
+ rcu_read_lock();
+ bpf_compute_data_end(skb);
+ ret = bpf_prog_run_save_cb(lwt->prog, skb);
+ rcu_read_unlock();
+
+ switch (ret) {
+ case BPF_OK:
+ break;
+
+ case BPF_REDIRECT:
+ if (unlikely(!can_redirect)) {
+ pr_warn_once("Illegal redirect return code in prog %s\n",
+ lwt->name ? : "<unknown>");
+ ret = BPF_OK;
+ } else {
+ ret = skb_do_redirect(skb);
+ if (ret == 0)
+ ret = BPF_REDIRECT;
+ }
+ break;
+
+ case BPF_DROP:
+ kfree_skb(skb);
+ ret = -EPERM;
+ break;
+
+ default:
+ pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
+ kfree_skb(skb);
+ ret = -EINVAL;
+ break;
+ }
+
+ preempt_enable();
+
+ return ret;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->in.prog) {
+ ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_input)) {
+ pr_warn_once("orig_input not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->out.prog) {
+ ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_output)) {
+ pr_warn_once("orig_output not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+ int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->xmit.prog) {
+ int ret;
+
+ ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+ switch (ret) {
+ case BPF_OK:
+ /* If the header was expanded, headroom might be too
+ * small for L2 header to come, expand as needed.
+ */
+ ret = xmit_check_hhlen(skb);
+ if (unlikely(ret))
+ return ret;
+
+ return LWTUNNEL_XMIT_CONTINUE;
+ case BPF_REDIRECT:
+ return LWTUNNEL_XMIT_DONE;
+ default:
+ return ret;
+ }
+ }
+
+ return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+ if (prog->prog)
+ bpf_prog_put(prog->prog);
+
+ kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ bpf_lwt_prog_destroy(&bpf->in);
+ bpf_lwt_prog_destroy(&bpf->out);
+ bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+ [LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+ [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+ enum bpf_prog_type type)
+{
+ struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+ return -EINVAL;
+
+ prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+ if (!prog->name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+ p = bpf_prog_get_type(fd, type);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ prog->prog = p;
+
+ return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+ [LWT_BPF_IN] = { .type = NLA_NESTED, },
+ [LWT_BPF_OUT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct nlattr *tb[LWT_BPF_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct bpf_lwt *bpf;
+ int ret;
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(sizeof(*bpf));
+ if (!newts)
+ return -ENOMEM;
+
+ newts->type = LWTUNNEL_ENCAP_BPF;
+ bpf = bpf_lwt_lwtunnel(newts);
+
+ if (tb[LWT_BPF_IN]) {
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+ BPF_PROG_TYPE_LWT_IN);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_OUT]) {
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+ BPF_PROG_TYPE_LWT_OUT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT]) {
+ newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+ BPF_PROG_TYPE_LWT_XMIT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT_HEADROOM]) {
+ u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+ if (headroom > LWT_BPF_MAX_HEADROOM) {
+ ret = -ERANGE;
+ goto errout;
+ }
+
+ newts->headroom = headroom;
+ }
+
+ bpf->family = family;
+ *ts = newts;
+
+ return 0;
+
+errout:
+ bpf_destroy_state(newts);
+ kfree(newts);
+ return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+ struct bpf_lwt_prog *prog)
+{
+ struct nlattr *nest;
+
+ if (!prog->prog)
+ return 0;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (prog->name &&
+ nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ int nest_len = nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+ 0;
+
+ return nest_len + /* LWT_BPF_IN */
+ nest_len + /* LWT_BPF_OUT */
+ nest_len + /* LWT_BPF_XMIT */
+ 0;
+}
+
+int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+ /* FIXME:
+ * The LWT state is currently rebuilt for delete requests which
+ * results in a new bpf_prog instance. Comparing names for now.
+ */
+ if (!a->name && !b->name)
+ return 0;
+
+ if (!a->name || !b->name)
+ return 1;
+
+ return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+ struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+ return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+ bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+ bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+ .build_state = bpf_build_state,
+ .destroy_state = bpf_destroy_state,
+ .input = bpf_input,
+ .output = bpf_output,
+ .xmit = bpf_xmit,
+ .fill_encap = bpf_fill_encap_info,
+ .get_encap_size = bpf_encap_nlsize,
+ .cmp_encap = bpf_encap_cmp,
+};
+
+static int __init bpf_lwt_init(void)
+{
+ return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 03976e939818..a5d4e866ce88 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "ILA";
case LWTUNNEL_ENCAP_SEG6:
return "SEG6";
+ case LWTUNNEL_ENCAP_BPF:
+ return "BPF";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 2ae929f9bd06..782dd8663665 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2291,13 +2291,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
n != NULL;
n = rcu_dereference_bh(n->next)) {
- if (!net_eq(dev_net(n->dev), net))
- continue;
- if (neigh_ifindex_filtered(n->dev, filter_idx))
- continue;
- if (neigh_master_filtered(n->dev, filter_master_idx))
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || !net_eq(dev_net(n->dev), net))
+ goto next;
+ if (neigh_ifindex_filtered(n->dev, filter_idx) ||
+ neigh_master_filtered(n->dev, filter_master_idx))
goto next;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2332,9 +2329,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (h > s_h)
s_idx = 0;
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
- if (pneigh_net(n) != net)
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || pneigh_net(n) != net)
goto next;
if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 35d37b196e67..50fdc1b59777 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -39,6 +39,9 @@ EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
+#define MIN_PERNET_OPS_ID \
+ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
+
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
@@ -46,11 +49,11 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
- size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+ unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
ng = kzalloc(generic_size, GFP_KERNEL);
if (ng)
- ng->len = max_gen_ptrs;
+ ng->s.len = max_gen_ptrs;
return ng;
}
@@ -60,13 +63,14 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
struct net_generic *ng, *old_ng;
BUG_ON(!mutex_is_locked(&net_mutex));
- BUG_ON(id == 0);
+ BUG_ON(id < MIN_PERNET_OPS_ID);
old_ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&net_mutex));
- ng = old_ng;
- if (old_ng->len >= id)
- goto assign;
+ if (old_ng->s.len > id) {
+ old_ng->ptr[id] = data;
+ return 0;
+ }
ng = net_alloc_generic();
if (ng == NULL)
@@ -83,12 +87,12 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
* the old copy for kfree after a grace period.
*/
- memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
+ (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
+ ng->ptr[id] = data;
rcu_assign_pointer(net->gen, ng);
- kfree_rcu(old_ng, rcu);
-assign:
- ng->ptr[id - 1] = data;
+ kfree_rcu(old_ng, s.rcu);
return 0;
}
@@ -218,6 +222,8 @@ int peernet2id_alloc(struct net *net, struct net *peer)
bool alloc;
int id;
+ if (atomic_read(&net->count) == 0)
+ return NETNSA_NSID_NOT_ASSIGNED;
spin_lock_irqsave(&net->nsid_lock, flags);
alloc = atomic_read(&peer->count) == 0 ? false : true;
id = __peernet2id_alloc(net, peer, &alloc);
@@ -872,7 +878,7 @@ static int register_pernet_operations(struct list_head *list,
if (ops->id) {
again:
- error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+ error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id);
if (error < 0) {
if (error == -EAGAIN) {
ida_pre_get(&net_generic_ids, GFP_KERNEL);
@@ -880,7 +886,7 @@ again:
}
return error;
}
- max_gen_ptrs = max(max_gen_ptrs, *ops->id);
+ max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
}
error = __register_pernet_operations(list, ops);
if (error) {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index db313ec7af32..c482491a63d8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -840,18 +840,20 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
(ext_filter_mask & RTEXT_FILTER_VF)) {
int num_vfs = dev_num_vf(dev->dev.parent);
- size_t size = nla_total_size(sizeof(struct nlattr));
- size += nla_total_size(num_vfs * sizeof(struct nlattr));
+ size_t size = nla_total_size(0);
size += num_vfs *
- (nla_total_size(sizeof(struct ifla_vf_mac)) +
- nla_total_size(MAX_VLAN_LIST_LEN *
- sizeof(struct nlattr)) +
+ (nla_total_size(0) +
+ nla_total_size(sizeof(struct ifla_vf_mac)) +
+ nla_total_size(sizeof(struct ifla_vf_vlan)) +
+ nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
nla_total_size(MAX_VLAN_LIST_LEN *
sizeof(struct ifla_vf_vlan_info)) +
nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
nla_total_size(sizeof(struct ifla_vf_rate)) +
nla_total_size(sizeof(struct ifla_vf_link_state)) +
nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
+ nla_total_size(0) + /* nest IFLA_VF_STATS */
/* IFLA_VF_STATS_RX_PACKETS */
nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_TX_PACKETS */
@@ -899,7 +901,8 @@ static size_t rtnl_port_size(const struct net_device *dev,
static size_t rtnl_xdp_size(const struct net_device *dev)
{
- size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */
+ size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
+ nla_total_size(1); /* XDP_ATTACHED */
if (!dev->netdev_ops->ndo_xdp)
return 0;
@@ -928,8 +931,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
- + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */
- + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -1502,6 +1505,7 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
[IFLA_XDP_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
+ [IFLA_XDP_FLAGS] = { .type = NLA_U32 },
};
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
@@ -1606,7 +1610,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
head = &net->dev_index_head[h];
hlist_for_each_entry(dev, head, index_hlist) {
if (link_dump_filtered(dev, master_idx, kind_ops))
- continue;
+ goto cont;
if (idx < s_idx)
goto cont;
err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
@@ -2161,6 +2165,7 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_XDP]) {
struct nlattr *xdp[IFLA_XDP_MAX + 1];
+ u32 xdp_flags = 0;
err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
ifla_xdp_policy);
@@ -2171,9 +2176,19 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
+
+ if (xdp[IFLA_XDP_FLAGS]) {
+ xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
+ if (xdp_flags & ~XDP_FLAGS_MASK) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
if (xdp[IFLA_XDP_FD]) {
err = dev_change_xdp_fd(dev,
- nla_get_s32(xdp[IFLA_XDP_FD]));
+ nla_get_s32(xdp[IFLA_XDP_FD]),
+ xdp_flags);
if (err)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -2734,7 +2749,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
ext_filter_mask));
}
- return min_ifinfo_dump_size;
+ return nlmsg_total_size(min_ifinfo_dump_size);
}
static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
@@ -2849,7 +2864,10 @@ nla_put_failure:
static inline size_t rtnl_fdb_nlmsg_size(void)
{
- return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN);
+ return NLMSG_ALIGN(sizeof(struct ndmsg)) +
+ nla_total_size(ETH_ALEN) + /* NDA_LLADDR */
+ nla_total_size(sizeof(u16)) + /* NDA_VLAN */
+ 0;
}
static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
@@ -3159,7 +3177,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
if (err)
goto out;
- nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
out:
netif_addr_unlock_bh(dev);
return err;
@@ -3665,7 +3683,7 @@ static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
if (!size)
continue;
- if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
continue;
attr = nla_reserve_64bit(skb, attr_id, size,
@@ -3706,7 +3724,7 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev)
for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
- if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
continue;
size = rtnl_get_offload_stats_attr_size(attr_id);
nla_size += nla_total_size_64bit(size);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce461fbe6..88a8e429fc3e 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -12,6 +12,7 @@
#include <net/secure_seq.h>
#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#include <net/tcp.h>
#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
@@ -40,8 +41,8 @@ static u32 seq_scale(u32 seq)
#endif
#if IS_ENABLED(CONFIG_IPV6)
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
- __be16 sport, __be16 dport)
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
{
u32 secret[MD5_MESSAGE_BYTES / 4];
u32 hash[MD5_DIGEST_WORDS];
@@ -58,6 +59,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
md5_transform(hash, secret);
+ *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
return seq_scale(hash[0]);
}
EXPORT_SYMBOL(secure_tcpv6_sequence_number);
@@ -86,8 +88,8 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#ifdef CONFIG_INET
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
{
u32 hash[MD5_DIGEST_WORDS];
@@ -99,6 +101,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
md5_transform(hash, net_secret);
+ *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
return seq_scale(hash[0]);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4c96cb18c214..84151cf40aeb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2656,7 +2656,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
struct skb_frag_struct *fragfrom, *fragto;
BUG_ON(shiftlen > skb->len);
- BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
+
+ if (skb_headlen(skb))
+ return 0;
todo = shiftlen;
from = 0;
@@ -3712,20 +3714,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);
+static bool is_icmp_err_skb(const struct sk_buff *skb)
+{
+ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
+}
+
struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
struct sk_buff_head *q = &sk->sk_error_queue;
- struct sk_buff *skb, *skb_next;
+ struct sk_buff *skb, *skb_next = NULL;
+ bool icmp_next = false;
unsigned long flags;
- int err = 0;
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
if (skb && (skb_next = skb_peek(q)))
- err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ icmp_next = is_icmp_err_skb(skb_next);
spin_unlock_irqrestore(&q->lock, flags);
- if (err)
+ if (is_icmp_err_skb(skb) && !icmp_next)
+ sk->sk_err = 0;
+
+ if (skb_next)
sk->sk_error_report(sk);
return skb;
@@ -3837,10 +3848,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!skb_may_tx_timestamp(sk, tsonly))
return;
- if (tsonly)
- skb = alloc_skb(0, GFP_ATOMIC);
- else
+ if (tsonly) {
+#ifdef CONFIG_INET
+ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
+ skb = tcp_get_timestamping_opt_stats(sk);
+ else
+#endif
+ skb = alloc_skb(0, GFP_ATOMIC);
+ } else {
skb = skb_clone(orig_skb, GFP_ATOMIC);
+ }
if (!skb)
return;
@@ -4912,3 +4931,31 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
return clone;
}
EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ * We do not reallocate skb->head thus can not fail.
+ * Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+ if (!skb->data_len ||
+ skb->data_len > skb->end - skb->tail ||
+ skb_cloned(skb))
+ return;
+
+ /* Nice, we can free page frag(s) right now */
+ __pskb_pull_tail(skb, skb->data_len);
+
+ /* Now adjust skb->truesize, since __pskb_pull_tail() does
+ * not do this.
+ */
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index 14e6145be33b..9fa46b956bdc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -715,7 +715,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
+ sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
@@ -751,7 +751,7 @@ set_rcvbuf:
* returning the value we actually used in getsockopt
* is the most desirable behavior.
*/
- sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
+ sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
break;
case SO_RCVBUFFORCE:
@@ -854,6 +854,13 @@ set_rcvbuf:
sk->sk_tskey = 0;
}
}
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
+ ret = -EINVAL;
+ break;
+ }
+
sk->sk_tsflags = val;
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0df2aa652530..2a46e4009f62 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
- if (sock_table)
+ if (sock_table) {
static_key_slow_inc(&rps_needed);
+ static_key_slow_inc(&rfs_needed);
+ }
if (orig_sock_table) {
static_key_slow_dec(&rps_needed);
+ static_key_slow_dec(&rfs_needed);
synchronize_rcu();
vfree(orig_sock_table);
}
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 4f6c1862dfd2..3202d75329b5 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1353,6 +1353,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
dcb_unlock:
spin_unlock_bh(&dcb_lock);
nla_put_failure:
+ err = -EMSGSIZE;
return err;
}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index fda321d814d6..d859a5c36e70 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -694,6 +694,7 @@ int dccp_invalid_packet(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
unsigned int cscov;
+ u8 dccph_doff;
if (skb->pkt_type != PACKET_HOST)
return 1;
@@ -715,18 +716,19 @@ int dccp_invalid_packet(struct sk_buff *skb)
/*
* If P.Data Offset is too small for packet type, drop packet and return
*/
- if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
- DCCP_WARN("P.Data Offset(%u) too small\n", dh->dccph_doff);
+ dccph_doff = dh->dccph_doff;
+ if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
+ DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff);
return 1;
}
/*
* If P.Data Offset is too too large for packet, drop packet and return
*/
- if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
- DCCP_WARN("P.Data Offset(%u) too large\n", dh->dccph_doff);
+ if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) {
+ DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff);
return 1;
}
-
+ dh = dccp_hdr(skb);
/*
* If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
* has short sequence numbers), drop packet and return
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index a6902c1e2f28..7899919cd9f0 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -233,6 +233,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
genphy_read_status(phydev);
if (ds->ops->adjust_link)
ds->ops->adjust_link(ds, port, phydev);
+
+ put_device(&phydev->mdio.dev);
}
return 0;
@@ -504,15 +506,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
void dsa_cpu_dsa_destroy(struct device_node *port_dn)
{
- struct phy_device *phydev;
-
- if (of_phy_is_fixed_link(port_dn)) {
- phydev = of_phy_find_device(port_dn);
- if (phydev) {
- phy_device_free(phydev);
- fixed_phy_unregister(phydev);
- }
- }
+ if (of_phy_is_fixed_link(port_dn))
+ of_phy_deregister_fixed_link(port_dn);
}
static void dsa_switch_destroy(struct dsa_switch *ds)
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index f8a7d9aab437..5fff951a0a49 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -28,8 +28,10 @@ static struct dsa_switch_tree *dsa_get_dst(u32 tree)
struct dsa_switch_tree *dst;
list_for_each_entry(dst, &dsa_switch_trees, list)
- if (dst->tree == tree)
+ if (dst->tree == tree) {
+ kref_get(&dst->refcount);
return dst;
+ }
return NULL;
}
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index d0c7bce88743..68c9eea00518 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1127,7 +1127,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
p->phy_interface = mode;
phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
- if (of_phy_is_fixed_link(port_dn)) {
+ if (!phy_dn && of_phy_is_fixed_link(port_dn)) {
/* In the case of a fixed PHY, the DT node associated
* to the fixed PHY is the Port DT node
*/
@@ -1137,7 +1137,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
return ret;
}
phy_is_fixed = true;
- phy_dn = port_dn;
+ phy_dn = of_node_get(port_dn);
}
if (ds->ops->get_phy_flags)
@@ -1156,6 +1156,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
ret = dsa_slave_phy_connect(p, slave_dev, phy_id);
if (ret) {
netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret);
+ of_node_put(phy_dn);
return ret;
}
} else {
@@ -1164,6 +1165,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
phy_flags,
p->phy_interface);
}
+
+ of_node_put(phy_dn);
}
if (p->phy && phy_is_fixed)
@@ -1176,6 +1179,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
ret = dsa_slave_phy_connect(p, slave_dev, p->port);
if (ret) {
netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret);
+ if (phy_is_fixed)
+ of_phy_deregister_fixed_link(port_dn);
return ret;
}
}
@@ -1293,10 +1298,18 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
void dsa_slave_destroy(struct net_device *slave_dev)
{
struct dsa_slave_priv *p = netdev_priv(slave_dev);
+ struct dsa_switch *ds = p->parent;
+ struct device_node *port_dn;
+
+ port_dn = ds->ports[p->port].dn;
netif_carrier_off(slave_dev);
- if (p->phy)
+ if (p->phy) {
phy_disconnect(p->phy);
+
+ if (of_phy_is_fixed_link(port_dn))
+ of_phy_deregister_fixed_link(port_dn);
+ }
unregister_netdev(slave_dev);
free_netdev(slave_dev);
}
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 77d73014bde3..dc2960be51e0 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -286,9 +286,12 @@ int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info)
if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0')
return -EINVAL; /* name should be null-terminated */
+ rc = -ENODEV;
dev = dev_get_by_name(genl_info_net(info), name);
if (!dev)
- return -ENODEV;
+ return rc;
+ if (dev->type != ARPHRD_IEEE802154)
+ goto out;
phy = dev->ieee802154_ptr->wpan_phy;
BUG_ON(!phy);
@@ -342,6 +345,7 @@ nla_put_failure:
nlmsg_free(msg);
out_dev:
wpan_phy_put(phy);
+out:
if (dev)
dev_put(dev);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 28e051a8e847..6e7baaf814c6 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -723,6 +723,7 @@ config DEFAULT_TCP_CONG
default "reno" if DEFAULT_RENO
default "dctcp" if DEFAULT_DCTCP
default "cdg" if DEFAULT_CDG
+ default "bbr" if DEFAULT_BBR
default "cubic"
config TCP_MD5SIG
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5ddf5cda07f4..1830e6f0e9cc 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -374,8 +374,18 @@ lookup_protocol:
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
- if (err)
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
+ }
+
+ if (!kern) {
+ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+ if (err) {
sk_common_release(sk);
+ goto out;
+ }
}
out:
return err;
@@ -1233,7 +1243,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
/* fixed ID is invalid if DF bit is not set */
- if (fixedid && !(iph->frag_off & htons(IP_DF)))
+ if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
goto out;
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index d95631d09248..20fb25e3027b 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -476,7 +476,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
esph = (void *)skb_push(skb, 4);
*seqhi = esph->spi;
esph->spi = esph->seq_no;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
aead_request_set_callback(req, 0, esp_input_done_esn, skb);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d93eea8e2409..dbad5a1c161a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -151,7 +151,7 @@ static void fib_replace_table(struct net *net, struct fib_table *old,
int fib_unmerge(struct net *net)
{
- struct fib_table *old, *new;
+ struct fib_table *old, *new, *main_table;
/* attempt to fetch local table if it has been allocated */
old = fib_get_table(net, RT_TABLE_LOCAL);
@@ -162,11 +162,21 @@ int fib_unmerge(struct net *net)
if (!new)
return -ENOMEM;
+ /* table is already unmerged */
+ if (new == old)
+ return 0;
+
/* replace merged table with clean table */
- if (new != old) {
- fib_replace_table(net, old, new);
- fib_free_table(old);
- }
+ fib_replace_table(net, old, new);
+ fib_free_table(old);
+
+ /* attempt to fetch main table if it has been allocated */
+ main_table = fib_get_table(net, RT_TABLE_MAIN);
+ if (!main_table)
+ return 0;
+
+ /* flush local entries from main table */
+ fib_table_flush_external(main_table);
return 0;
}
@@ -1209,6 +1219,8 @@ static int __net_init ip_fib_net_init(struct net *net)
int err;
size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+ net->ipv4.fib_seq = 0;
+
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 388d3e21629b..c1bc1e92de0e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi)
#endif
call_rcu(&fi->rcu, free_fib_info_rcu);
}
+EXPORT_SYMBOL_GPL(free_fib_info);
void fib_release_info(struct fib_info *fi)
{
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4cff74d4133f..1b0e7d1f5217 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -84,25 +84,114 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
-static BLOCKING_NOTIFIER_HEAD(fib_chain);
+static unsigned int fib_seq_sum(void)
+{
+ unsigned int fib_seq = 0;
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net)
+ fib_seq += net->ipv4.fib_seq;
+ rtnl_unlock();
+
+ return fib_seq;
+}
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
+
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ struct fib_notifier_info info;
+
+ if (net->ipv4.fib_has_custom_rules)
+ call_fib_notifier(nb, net, event_type, &info);
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ .nlflags = nlflags,
+ };
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
-int register_fib_notifier(struct notifier_block *nb)
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
{
- return blocking_notifier_chain_register(&fib_chain, nb);
+ atomic_notifier_chain_register(&fib_chain, nb);
+ if (fib_seq == fib_seq_sum())
+ return true;
+ atomic_notifier_chain_unregister(&fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb))
+{
+ int retries = 0;
+
+ do {
+ unsigned int fib_seq = fib_seq_sum();
+ struct net *net;
+
+ /* Mutex semantics guarantee that every change done to
+ * FIB tries before we read the change sequence counter
+ * is now visible to us.
+ */
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+ fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+ }
+ rcu_read_unlock();
+
+ if (fib_dump_is_consistent(nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
}
EXPORT_SYMBOL(register_fib_notifier);
int unregister_fib_notifier(struct notifier_block *nb)
{
- return blocking_notifier_chain_unregister(&fib_chain, nb);
+ return atomic_notifier_chain_unregister(&fib_chain, nb);
}
EXPORT_SYMBOL(unregister_fib_notifier);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info)
{
+ net->ipv4.fib_seq++;
info->net = net;
- return blocking_notifier_call_chain(&fib_chain, event_type, info);
+ return atomic_notifier_call_chain(&fib_chain, event_type, info);
}
static int call_fib_entry_notifiers(struct net *net,
@@ -719,6 +808,13 @@ static unsigned char update_suffix(struct key_vector *tn)
{
unsigned char slen = tn->pos;
unsigned long stride, i;
+ unsigned char slen_max;
+
+ /* only vector 0 can have a suffix length greater than or equal to
+ * tn->pos + tn->bits, the second highest node will have a suffix
+ * length at most of tn->pos + tn->bits - 1
+ */
+ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
/* search though the list of children looking for nodes that might
* have a suffix greater than the one we currently have. This is
@@ -736,12 +832,8 @@ static unsigned char update_suffix(struct key_vector *tn)
slen = n->slen;
i &= ~(stride - 1);
- /* if slen covers all but the last bit we can stop here
- * there will be nothing longer than that since only node
- * 0 and 1 << (bits - 1) could have that as their suffix
- * length.
- */
- if ((slen + 1) >= (tn->pos + tn->bits))
+ /* stop searching if we have hit the maximum possible value */
+ if (slen >= slen_max)
break;
}
@@ -913,39 +1005,27 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn)
return collapse(t, tn);
/* update parent in case halve failed */
- tp = node_parent(tn);
-
- /* Return if at least one deflate was run */
- if (max_work != MAX_WORK)
- return tp;
-
- /* push the suffix length to the parent node */
- if (tn->slen > tn->pos) {
- unsigned char slen = update_suffix(tn);
-
- if (slen > tp->slen)
- tp->slen = slen;
- }
-
- return tp;
+ return node_parent(tn);
}
-static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
+static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
- while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
- if (update_suffix(tp) > l->slen)
+ unsigned char node_slen = tn->slen;
+
+ while ((node_slen > tn->pos) && (node_slen > slen)) {
+ slen = update_suffix(tn);
+ if (node_slen == slen)
break;
- tp = node_parent(tp);
+
+ tn = node_parent(tn);
+ node_slen = tn->slen;
}
}
-static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
+static void node_push_suffix(struct key_vector *tn, unsigned char slen)
{
- /* if this is a new leaf then tn will be NULL and we can sort
- * out parent suffix lengths as a part of trie_rebalance
- */
- while (tn->slen < l->slen) {
- tn->slen = l->slen;
+ while (tn->slen < slen) {
+ tn->slen = slen;
tn = node_parent(tn);
}
}
@@ -1066,6 +1146,7 @@ static int fib_insert_node(struct trie *t, struct key_vector *tp,
}
/* Case 3: n is NULL, and will just insert a new leaf */
+ node_push_suffix(tp, new->fa_slen);
NODE_INIT_PARENT(l, tp);
put_child_root(tp, key, l);
trie_rebalance(t, tp);
@@ -1107,7 +1188,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
/* if we added to the tail node then we need to update slen */
if (l->slen < new->fa_slen) {
l->slen = new->fa_slen;
- leaf_push_suffix(tp, l);
+ node_push_suffix(tp, new->fa_slen);
}
return 0;
@@ -1499,6 +1580,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
* out parent suffix lengths as a part of trie_rebalance
*/
if (hlist_empty(&l->leaf)) {
+ if (tp->slen == l->slen)
+ node_pull_suffix(tp, tp->pos);
put_child_root(tp, l->key, NULL);
node_free(l);
trie_rebalance(t, tp);
@@ -1511,7 +1594,7 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
/* update the trie with the latest suffix length */
l->slen = fa->fa_slen;
- leaf_pull_suffix(tp, l);
+ node_pull_suffix(tp, fa->fa_slen);
}
/* Caller must hold RTNL. */
@@ -1743,8 +1826,10 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
local_l = fib_find_node(lt, &local_tp, l->key);
if (fib_insert_alias(lt, local_tp, local_l, new_fa,
- NULL, l->key))
+ NULL, l->key)) {
+ kmem_cache_free(fn_alias_kmem, new_fa);
goto out;
+ }
}
/* stop loop if key wrapped back to 0 */
@@ -1760,6 +1845,75 @@ out:
return NULL;
}
+/* Caller must hold RTNL */
+void fib_table_flush_external(struct fib_table *tb)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
+
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
+
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
+
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
+
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ /* if alias was cloned to local then we just
+ * need to remove the local copy from main
+ */
+ if (tb->tb_id != fa->tb_id) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ continue;
+ }
+
+ /* record local slen */
+ slen = fa->fa_slen;
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ }
+ }
+}
+
/* Caller must hold RTNL. */
int fib_table_flush(struct net *net, struct fib_table *tb)
{
@@ -1782,6 +1936,10 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
if (IS_TRIE(pn))
break;
+ /* update the suffix to address pulled leaves */
+ if (pn->slen > pn->pos)
+ update_suffix(pn);
+
/* resize completed node */
pn = resize(t, pn);
cindex = get_index(pkey, pn);
@@ -1834,6 +1992,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
return found;
}
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+ struct fib_table *tb, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi)
+ continue;
+
+ /* local and main table can share the same trie,
+ * so don't notify twice for the same entry.
+ */
+ if (tb->tb_id != fa->tb_id)
+ continue;
+
+ call_fib_entry_notifier(nb, net, event_type, l->key,
+ KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
+ fa->fa_type, fa->tb_id, 0);
+ }
+}
+
+static void fib_table_notify(struct net *net, struct fib_table *tb,
+ struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ t_key key = 0;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ fib_leaf_notify(net, l, tb, nb, event_type);
+
+ key = l->key + 1;
+ /* stop in case of wrap around */
+ if (key < l->key)
+ break;
+ }
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist)
+ fib_table_notify(net, tb, nb, event_type);
+ }
+}
+
static void __trie_free_rcu(struct rcu_head *head)
{
struct fib_table *tb = container_of(head, struct fib_table, rcu);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 691146abde2d..f79d7a8ab1c6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1047,12 +1047,12 @@ int icmp_rcv(struct sk_buff *skb)
if (success) {
consume_skb(skb);
- return 0;
+ return NET_RX_SUCCESS;
}
drop:
kfree_skb(skb);
- return 0;
+ return NET_RX_DROP;
csum_error:
__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
error:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 606cc3e85d2b..15db786d50ed 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -162,7 +162,7 @@ static int unsolicited_report_interval(struct in_device *in_dev)
}
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_clear_delrec(struct in_device *in_dev);
static int sf_setstate(struct ip_mc_list *pmc);
static void sf_markstate(struct ip_mc_list *pmc);
@@ -1130,10 +1130,15 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
spin_unlock_bh(&in_dev->mc_tomb_lock);
}
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
+/*
+ * restore ip_mc_list deleted records
+ */
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
{
struct ip_mc_list *pmc, *pmc_prev;
- struct ip_sf_list *psf, *psf_next;
+ struct ip_sf_list *psf;
+ struct net *net = dev_net(in_dev->dev);
+ __be32 multiaddr = im->multiaddr;
spin_lock_bh(&in_dev->mc_tomb_lock);
pmc_prev = NULL;
@@ -1149,16 +1154,26 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
in_dev->mc_tomb = pmc->next;
}
spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ spin_lock_bh(&im->lock);
if (pmc) {
- for (psf = pmc->tomb; psf; psf = psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
+ im->interface = pmc->interface;
+ im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ im->sfmode = pmc->sfmode;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ im->tomb = pmc->tomb;
+ im->sources = pmc->sources;
+ for (psf = im->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = im->crcount;
}
in_dev_put(pmc->interface);
- kfree(pmc);
}
+ spin_unlock_bh(&im->lock);
}
+/*
+ * flush ip_mc_list deleted records
+ */
static void igmpv3_clear_delrec(struct in_device *in_dev)
{
struct ip_mc_list *pmc, *nextpmc;
@@ -1366,7 +1381,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
ip_mc_hash_add(in_dev, im);
#ifdef CONFIG_IP_MULTICAST
- igmpv3_del_delrec(in_dev, im->multiaddr);
+ igmpv3_del_delrec(in_dev, im);
#endif
igmp_group_added(im);
if (!in_dev->dead)
@@ -1626,8 +1641,12 @@ void ip_mc_remap(struct in_device *in_dev)
ASSERT_RTNL();
- for_each_pmc_rtnl(in_dev, pmc)
+ for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, pmc);
+#endif
igmp_group_added(pmc);
+ }
}
/* Device going down */
@@ -1648,7 +1667,6 @@ void ip_mc_down(struct in_device *in_dev)
in_dev->mr_gq_running = 0;
if (del_timer(&in_dev->mr_gq_timer))
__in_dev_put(in_dev);
- igmpv3_clear_delrec(in_dev);
#endif
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
@@ -1688,8 +1706,12 @@ void ip_mc_up(struct in_device *in_dev)
#endif
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
- for_each_pmc_rtnl(in_dev, pmc)
+ for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, pmc);
+#endif
igmp_group_added(pmc);
+ }
}
/*
@@ -1704,13 +1726,13 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
/* Deactivate timers */
ip_mc_down(in_dev);
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_clear_delrec(in_dev);
+#endif
while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
in_dev->mc_list = i->next_rcu;
in_dev->mc_count--;
-
- /* We've dropped the groups in ip_mc_down already */
- ip_mc_clear_src(i);
ip_ma_put(i);
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 358f2c82b030..9ffc2625cddd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -74,6 +74,7 @@
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
@@ -107,6 +108,8 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
if (unlikely(!skb))
return 0;
+ skb->protocol = htons(ETH_P_IP);
+
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
@@ -285,6 +288,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -303,6 +313,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return ip_finish_output2(net, sk, skb);
}
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ return dev_loopback_xmit(net, sk, skb);
+}
+
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
@@ -340,7 +364,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
/* Multicasts with ttl 0 must not go beyond the host */
@@ -356,7 +380,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3776ff6749f..b3cc1335adbc 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -24,10 +24,11 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
struct flowi4 fl4 = {};
__be32 saddr = iph->saddr;
__u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+ struct net_device *dev = skb_dst(skb)->dev;
unsigned int hh_len;
if (addr_type == RTN_UNSPEC)
- addr_type = inet_addr_type(net, saddr);
+ addr_type = inet_addr_type_dev_table(net, dev, saddr);
if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
flags |= FLOWI_FLAG_ANYSRC;
else
@@ -40,6 +41,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
fl4.saddr = saddr;
fl4.flowi4_tos = RT_TOS(iph->tos);
fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+ if (!fl4.flowi4_oif)
+ fl4.flowi4_oif = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_flags = flags;
rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 39004da318e2..1258a9ab62ef 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -411,17 +411,15 @@ static inline int check_target(struct arpt_entry *e, const char *name)
}
static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
- unsigned long pcnt;
int ret;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
@@ -439,7 +437,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
err:
module_put(t->u.kernel.target->me);
out:
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -519,7 +517,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -528,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
static int translate_table(struct xt_table_info *newinfo, void *entry0,
const struct arpt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct arpt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -590,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, repl->name, repl->size);
+ ret = find_check_entry(iter, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
@@ -1197,8 +1197,8 @@ static int translate_compat_table(struct xt_table_info **pinfo,
newinfo->number = compatr->num_entries;
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
- newinfo->hook_entry[i] = info->hook_entry[i];
- newinfo->underflow[i] = info->underflow[i];
+ newinfo->hook_entry[i] = compatr->hook_entry[i];
+ newinfo->underflow[i] = compatr->underflow[i];
}
entry1 = newinfo->entries;
pos = entry1;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 46815c8a60d7..308b456723f0 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -531,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
static int
find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
- unsigned int size)
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
@@ -539,12 +540,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- unsigned long pcnt;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -582,7 +580,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
cleanup_match(ematch, net);
}
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -670,7 +668,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -679,6 +677,7 @@ static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ipt_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct ipt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -738,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, net, repl->name, repl->size);
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index e6e206fa86c8..21db00d0362b 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -419,7 +419,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
}
cipinfo->config = config;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -444,7 +444,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
clusterip_config_put(cipinfo->config);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_get(par->net, par->family);
}
#ifdef CONFIG_COMPAT
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 34cfb9b0bc0a..a03e4e7ef5f9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static unsigned int
@@ -59,6 +59,11 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
xt_out(par));
}
+static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
+}
+
static struct xt_target masquerade_tg_reg __read_mostly = {
.name = "MASQUERADE",
.family = NFPROTO_IPV4,
@@ -67,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg_check,
+ .destroy = masquerade_tg_destroy,
.me = THIS_MODULE,
};
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 361411688221..30c0de53e254 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -418,12 +418,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par)
e->ip.invflags & XT_INV_PROTO)
return -EINVAL;
- return nf_ct_l3proto_try_module_get(par->family);
+ return nf_ct_netns_get(par->net, par->family);
}
static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target synproxy_tg4_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 59b49945b481..f273098e48fd 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -83,10 +83,12 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
return true ^ invert;
iph = ip_hdr(skb);
- if (ipv4_is_multicast(iph->daddr)) {
- if (ipv4_is_zeronet(iph->saddr))
- return ipv4_is_local_multicast(iph->daddr) ^ invert;
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr))
+ return true ^ invert;
}
+
flow.flowi4_iif = LOOPBACK_IFINDEX;
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 7130ed5dc1fa..fcfd071f4705 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -31,6 +31,13 @@
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#include <net/netfilter/nf_log.h>
+static int conntrack4_net_id __read_mostly;
+static DEFINE_MUTEX(register_ipv4_hooks);
+
+struct conntrack4_net {
+ unsigned int users;
+};
+
static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
struct nf_conntrack_tuple *tuple)
{
@@ -307,9 +314,42 @@ static struct nf_sockopt_ops so_getorigdst = {
.owner = THIS_MODULE,
};
-static int ipv4_init_net(struct net *net)
+static int ipv4_hooks_register(struct net *net)
{
- return 0;
+ struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
+ int err = 0;
+
+ mutex_lock(&register_ipv4_hooks);
+
+ cnet->users++;
+ if (cnet->users > 1)
+ goto out_unlock;
+
+ err = nf_defrag_ipv4_enable(net);
+ if (err) {
+ cnet->users = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+
+ if (err)
+ cnet->users = 0;
+ out_unlock:
+ mutex_unlock(&register_ipv4_hooks);
+ return err;
+}
+
+static void ipv4_hooks_unregister(struct net *net)
+{
+ struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
+
+ mutex_lock(&register_ipv4_hooks);
+ if (cnet->users && (--cnet->users == 0))
+ nf_unregister_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ mutex_unlock(&register_ipv4_hooks);
}
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
@@ -325,7 +365,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.nlattr_to_tuple = ipv4_nlattr_to_tuple,
.nla_policy = ipv4_nla_policy,
#endif
- .init_net = ipv4_init_net,
+ .net_ns_get = ipv4_hooks_register,
+ .net_ns_put = ipv4_hooks_unregister,
.me = THIS_MODULE,
};
@@ -340,6 +381,15 @@ static struct nf_conntrack_l4proto *builtin_l4proto4[] = {
&nf_conntrack_l4proto_tcp4,
&nf_conntrack_l4proto_udp4,
&nf_conntrack_l4proto_icmp,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite4,
+#endif
};
static int ipv4_net_init(struct net *net)
@@ -369,6 +419,8 @@ static void ipv4_net_exit(struct net *net)
static struct pernet_operations ipv4_net_ops = {
.init = ipv4_net_init,
.exit = ipv4_net_exit,
+ .id = &conntrack4_net_id,
+ .size = sizeof(struct conntrack4_net),
};
static int __init nf_conntrack_l3proto_ipv4_init(void)
@@ -376,7 +428,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
int ret = 0;
need_conntrack();
- nf_defrag_ipv4_enable();
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
@@ -390,17 +441,10 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
goto cleanup_sockopt;
}
- ret = nf_register_hooks(ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register hooks.\n");
- goto cleanup_pernet;
- }
-
ret = nf_ct_l4proto_register(builtin_l4proto4,
ARRAY_SIZE(builtin_l4proto4));
if (ret < 0)
- goto cleanup_hooks;
+ goto cleanup_pernet;
ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
if (ret < 0) {
@@ -412,8 +456,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
cleanup_l4proto:
nf_ct_l4proto_unregister(builtin_l4proto4,
ARRAY_SIZE(builtin_l4proto4));
- cleanup_hooks:
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
cleanup_pernet:
unregister_pernet_subsys(&ipv4_net_ops);
cleanup_sockopt:
@@ -427,7 +469,6 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
nf_ct_l4proto_unregister(builtin_l4proto4,
ARRAY_SIZE(builtin_l4proto4));
- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
unregister_pernet_subsys(&ipv4_net_ops);
nf_unregister_sockopt(&so_getorigdst);
}
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index d88da36b383c..49bd6a54404f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -11,6 +11,7 @@
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <net/netns/generic.h>
#include <net/route.h>
#include <net/ip.h>
@@ -22,6 +23,8 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
+static DEFINE_MUTEX(defrag4_mutex);
+
static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
u_int32_t user)
{
@@ -102,18 +105,50 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
},
};
+static void __net_exit defrag4_net_exit(struct net *net)
+{
+ if (net->nf.defrag_ipv4) {
+ nf_unregister_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ net->nf.defrag_ipv4 = false;
+ }
+}
+
+static struct pernet_operations defrag4_net_ops = {
+ .exit = defrag4_net_exit,
+};
+
static int __init nf_defrag_init(void)
{
- return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+ return register_pernet_subsys(&defrag4_net_ops);
}
static void __exit nf_defrag_fini(void)
{
- nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+ unregister_pernet_subsys(&defrag4_net_ops);
}
-void nf_defrag_ipv4_enable(void)
+int nf_defrag_ipv4_enable(struct net *net)
{
+ int err = 0;
+
+ might_sleep();
+
+ if (net->nf.defrag_ipv4)
+ return 0;
+
+ mutex_lock(&defrag4_mutex);
+ if (net->nf.defrag_ipv4)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ if (err == 0)
+ net->nf.defrag_ipv4 = true;
+
+ out_unlock:
+ mutex_unlock(&defrag4_mutex);
+ return err;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 1b49966484b3..965b1a161369 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -101,12 +101,13 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
}
iph = ip_hdr(pkt->skb);
- if (ipv4_is_multicast(iph->daddr) &&
- ipv4_is_zeronet(iph->saddr) &&
- ipv4_is_local_multicast(iph->daddr)) {
- nft_fib_store_result(dest, priv->result, pkt,
- get_ifindex(pkt->skb->dev));
- return;
+ if (ipv4_is_zeronet(iph->saddr)) {
+ if (ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_local_multicast(iph->daddr)) {
+ nft_fib_store_result(dest, priv->result, pkt,
+ get_ifindex(pkt->skb->dev));
+ return;
+ }
}
if (priv->flags & NFTA_FIB_F_MARK)
@@ -122,6 +123,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
fl4.saddr = get_saddr(iph->daddr);
}
+ *dest = 0;
+
if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
return;
@@ -198,7 +201,7 @@ nft_fib4_select_ops(const struct nft_ctx *ctx,
if (!tb[NFTA_FIB_RESULT])
return ERR_PTR(-EINVAL);
- result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+ result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
switch (result) {
case NFT_FIB_RESULT_OIF:
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index 4f697e431811..a0ea8aad1bf1 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -35,12 +35,19 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
&range, nft_out(pkt));
}
+static void
+nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
+}
+
static struct nft_expr_type nft_masq_ipv4_type;
static const struct nft_expr_ops nft_masq_ipv4_ops = {
.type = &nft_masq_ipv4_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
.eval = nft_masq_ipv4_eval,
.init = nft_masq_init,
+ .destroy = nft_masq_ipv4_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
};
@@ -77,5 +84,5 @@ module_init(nft_masq_ipv4_module_init);
module_exit(nft_masq_ipv4_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
index 16df0493c5ce..1650ed23c15d 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -38,12 +38,19 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
}
+static void
+nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
+}
+
static struct nft_expr_type nft_redir_ipv4_type;
static const struct nft_expr_ops nft_redir_ipv4_ops = {
.type = &nft_redir_ipv4_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
.eval = nft_redir_ipv4_eval,
.init = nft_redir_init,
+ .destroy = nft_redir_ipv4_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
};
@@ -71,5 +78,5 @@ module_init(nft_redir_ipv4_module_init);
module_exit(nft_redir_ipv4_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index d11129f1178d..5b2635e69a92 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -657,6 +657,10 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
if (len > 0xFFFF)
return -EMSGSIZE;
+ /* Must have at least a full ICMP header. */
+ if (len < icmph_len)
+ return -EINVAL;
+
/*
* Check the flags.
*/
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d37fc6f7e679..fa5c037227cb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -531,13 +531,14 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
const struct sock *sk)
{
+ const struct net *net = dev_net(skb->dev);
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
- __build_flow_key(sock_net(sk), fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
@@ -1602,6 +1603,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
spin_unlock_bh(&fnhe_lock);
}
+static void set_lwt_redirect(struct rtable *rth)
+{
+ if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_output = rth->dst.output;
+ rth->dst.output = lwtunnel_output;
+ }
+
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+}
+
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1691,14 +1705,7 @@ rt_cache:
rth->dst.input = ip_forward;
rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
- if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_output = rth->dst.output;
- rth->dst.output = lwtunnel_output;
- }
- if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
- rth->dst.lwtstate->orig_input = rth->dst.input;
- rth->dst.input = lwtunnel_input;
- }
+ set_lwt_redirect(rth);
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1925,8 +1932,18 @@ local_input:
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
+
if (do_cache) {
- if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+ if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+ WARN_ON(rth->dst.input == lwtunnel_input);
+ rth->dst.lwtstate->orig_input = rth->dst.input;
+ rth->dst.input = lwtunnel_input;
+ }
+
+ if (unlikely(!rt_cache_route(nh, rth))) {
rth->dst.flags |= DST_NOCACHE;
rt_add_uncached_list(rth);
}
@@ -2154,8 +2171,7 @@ add:
}
rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
- if (lwtunnel_output_redirect(rth->dst.lwtstate))
- rth->dst.output = lwtunnel_output;
+ set_lwt_redirect(rth);
return rth;
}
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 0dc6286272aa..3e88467d70ee 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -334,6 +334,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
treq = tcp_rsk(req);
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
+ treq->ts_off = 0;
req->mss = mss;
ireq->ir_num = ntohs(th->dest);
ireq->ir_rmt_port = th->source;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 913f9bbfc030..1ef3165114ba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -663,9 +663,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
if (tcp_should_autocork(sk, skb, size_goal)) {
/* avoid atomic op if TSQ_THROTTLED bit is already set */
- if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
- set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
}
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
@@ -996,8 +996,11 @@ do_error:
goto out;
out_err:
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+ err == -EAGAIN)) {
sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
return sk_stream_error(sk, flags, err);
}
@@ -1331,8 +1334,11 @@ do_error:
out_err:
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+ err == -EAGAIN)) {
sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
release_sock(sk);
return err;
}
@@ -2702,6 +2708,25 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
+static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
+ struct tcp_info *info)
+{
+ u64 stats[__TCP_CHRONO_MAX], total = 0;
+ enum tcp_chrono i;
+
+ for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
+ stats[i] = tp->chrono_stat[i - 1];
+ if (i == tp->chrono_type)
+ stats[i] += tcp_time_stamp - tp->chrono_start;
+ stats[i] *= USEC_PER_SEC / HZ;
+ total += stats[i];
+ }
+
+ info->tcpi_busy_time = total;
+ info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
+ info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
+}
+
/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
@@ -2794,6 +2819,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_bytes_acked = tp->bytes_acked;
info->tcpi_bytes_received = tp->bytes_received;
info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+ tcp_get_info_chrono_stats(tp, info);
unlock_sock_fast(sk, slow);
@@ -2815,6 +2841,26 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
}
EXPORT_SYMBOL_GPL(tcp_get_info);
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *stats;
+ struct tcp_info info;
+
+ stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+ if (!stats)
+ return NULL;
+
+ tcp_get_info_chrono_stats(tp, &info);
+ nla_put_u64_64bit(stats, TCP_NLA_BUSY,
+ info.tcpi_busy_time, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
+ info.tcpi_rwnd_limited, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
+ info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+ return stats;
+}
+
static int do_tcp_getsockopt(struct sock *sk, int level,
int optname, char __user *optval, int __user *optlen)
{
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 1294af4e0127..79c4817abc94 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -68,8 +68,9 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
int ret = 0;
- /* all algorithms must implement ssthresh and cong_avoid ops */
- if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) {
+ /* all algorithms must implement these */
+ if (!ca->ssthresh || !ca->undo_cwnd ||
+ !(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
@@ -200,8 +201,10 @@ static void tcp_reinit_congestion_control(struct sock *sk,
icsk->icsk_ca_ops = ca;
icsk->icsk_ca_setsockopt = 1;
- if (sk->sk_state != TCP_CLOSE)
+ if (sk->sk_state != TCP_CLOSE) {
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
tcp_init_congestion_control(sk);
+ }
}
/* Manage refcounts on socket close. */
@@ -441,10 +444,19 @@ u32 tcp_reno_ssthresh(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+u32 tcp_reno_undo_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
+
struct tcp_congestion_ops tcp_reno = {
.flags = TCP_CONG_NON_RESTRICTED,
.name = "reno",
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
};
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 51139175bf61..5f5e5936760e 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -188,8 +188,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
struct dctcp *ca = inet_csk_ca(sk);
- struct tcp_sock *tp = tcp_sk(sk);
u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
/* If ack did not advance snd_una, count dupack as MSS size.
@@ -229,13 +229,6 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
WRITE_ONCE(ca->dctcp_alpha, alpha);
dctcp_reset(tp, ca);
}
-
- if (flags & CA_ACK_ECE) {
- unsigned int cwnd = dctcp_ssthresh(sk);
-
- if (cwnd != tp->snd_cwnd)
- tp->snd_cwnd = cwnd;
- }
}
static void dctcp_state(struct sock *sk, u8 new_state)
@@ -342,6 +335,7 @@ static struct tcp_congestion_ops dctcp __read_mostly = {
static struct tcp_congestion_ops dctcp_reno __read_mostly = {
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.get_info = dctcp_get_info,
.owner = THIS_MODULE,
.name = "dctcp-reno",
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index db7842495a64..6d9879e93648 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -94,6 +94,7 @@ static const struct hstcp_aimd_val {
struct hstcp {
u32 ai;
+ u32 loss_cwnd;
};
static void hstcp_init(struct sock *sk)
@@ -150,16 +151,24 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 hstcp_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- const struct hstcp *ca = inet_csk_ca(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+ ca->loss_cwnd = tp->snd_cwnd;
/* Do multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
}
+static u32 hstcp_cwnd_undo(struct sock *sk)
+{
+ const struct hstcp *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
+ .undo_cwnd = hstcp_cwnd_undo,
.cong_avoid = hstcp_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 083831e359df..0f7175c3338e 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -166,6 +166,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
.init = hybla_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = hybla_cong_avoid,
.set_state = hybla_state,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index c8e6d86be114..60352ff4f5a8 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -48,6 +48,7 @@ struct illinois {
u32 end_seq; /* right edge of current RTT */
u32 alpha; /* Additive increase */
u32 beta; /* Muliplicative decrease */
+ u32 loss_cwnd; /* cwnd on loss */
u16 acked; /* # packets acked by current ACK */
u8 rtt_above; /* average rtt has gone above threshold */
u8 rtt_low; /* # of rtts measurements below threshold */
@@ -296,10 +297,18 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
+ ca->loss_cwnd = tp->snd_cwnd;
/* Multiplicative decrease */
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
}
+static u32 tcp_illinois_cwnd_undo(struct sock *sk)
+{
+ const struct illinois *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
/* Extract info for Tcp socket info provided via netlink. */
static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info)
@@ -327,6 +336,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
+ .undo_cwnd = tcp_illinois_cwnd_undo,
.cong_avoid = tcp_illinois_cong_avoid,
.set_state = tcp_illinois_state,
.get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a70046fea0e8..6c790754ae3e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -85,6 +85,7 @@ int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
/* rfc5961 challenge ack rate limiting */
int sysctl_tcp_challenge_ack_limit = 1000;
@@ -128,6 +129,23 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define REXMIT_LOST 1 /* retransmit packets marked lost */
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
+{
+ static bool __once __read_mostly;
+
+ if (!__once) {
+ struct net_device *dev;
+
+ __once = true;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
+ pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+ dev ? dev->name : "Unknown driver");
+ rcu_read_unlock();
+ }
+}
+
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
@@ -144,7 +162,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
- icsk->icsk_ack.rcv_mss = len;
+ icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
+ tcp_sk(sk)->advmss);
+ if (unlikely(icsk->icsk_ack.rcv_mss != len))
+ tcp_gro_dev_warn(sk, skb);
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
@@ -2394,10 +2415,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_ca_ops->undo_cwnd)
- tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
- else
- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+ tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
@@ -3181,6 +3199,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->lost_skb_hint = NULL;
}
+ if (!skb)
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
@@ -5059,8 +5080,11 @@ static void tcp_check_space(struct sock *sk)
/* pairs with tcp_poll() */
smp_mb__after_atomic();
if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
}
}
@@ -6304,6 +6328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
tcp_rsk(req)->af_specific = af_ops;
+ tcp_rsk(req)->ts_off = 0;
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6325,6 +6350,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
+ if (isn && tmp_opt.tstamp_ok)
+ af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+
if (!want_cookie && !isn) {
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
@@ -6365,7 +6393,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_release;
}
- isn = af_ops->init_seq(skb);
+ isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
}
if (!dst) {
dst = af_ops->route_req(sk, &fl, req, NULL);
@@ -6377,6 +6405,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ tcp_rsk(req)->ts_off = 0;
req->cookie_ts = tmp_opt.tstamp_ok;
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5555eb86e549..30d81f533ada 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -95,12 +95,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr,
tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source);
+ tcp_hdr(skb)->source, tsoff);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -237,7 +237,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
- usin->sin_port);
+ usin->sin_port,
+ &tp->tsoffset);
inet->inet_id = tp->write_seq ^ jiffies;
@@ -442,7 +443,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
- if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
goto out;
@@ -824,7 +825,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp,
+ tcp_time_stamp + tcp_rsk(req)->ts_off,
req->ts_recent,
0,
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index c67ece1390c2..046fd3910873 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -316,6 +316,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
static struct tcp_congestion_ops tcp_lp __read_mostly = {
.init = tcp_lp_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_lp_cong_avoid,
.pkts_acked = tcp_lp_pkts_acked,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6234ebaa7db1..28ce5ee831f5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -532,7 +532,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
- newtp->tsoffset = 0;
+ newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -581,6 +581,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
+ if (tmp_opt.rcv_tsecr)
+ tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
/* We do not store true stamp, but it is not required,
* it can be estimated (approximately)
* from another data.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f57b5aa51b59..b45101f3d2bd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -640,7 +640,7 @@ static unsigned int tcp_synack_options(struct request_sock *req,
}
if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = tcp_skb_timestamp(skb);
+ opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
opts->tsecr = req->ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -769,25 +769,26 @@ static void tcp_tasklet_func(unsigned long data)
list_del(&tp->tsq_node);
sk = (struct sock *)tp;
- bh_lock_sock(sk);
-
- if (!sock_owned_by_user(sk)) {
- tcp_tsq_handler(sk);
- } else {
- /* defer the work to tcp_release_cb() */
- set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
+ if (!sk->sk_lock.owned &&
+ test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+ tcp_tsq_handler(sk);
+ }
+ bh_unlock_sock(sk);
}
- bh_unlock_sock(sk);
- clear_bit(TSQ_QUEUED, &tp->tsq_flags);
sk_free(sk);
}
}
-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
- (1UL << TCP_WRITE_TIMER_DEFERRED) | \
- (1UL << TCP_DELACK_TIMER_DEFERRED) | \
- (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED | \
+ TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_MTU_REDUCED_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -797,18 +798,17 @@ static void tcp_tasklet_func(unsigned long data)
*/
void tcp_release_cb(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nflags;
/* perform an atomic operation only if at least one flag is set */
do {
- flags = tp->tsq_flags;
+ flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
- } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
- if (flags & (1UL << TCP_TSQ_DEFERRED))
+ if (flags & TCPF_TSQ_DEFERRED)
tcp_tsq_handler(sk);
/* Here begins the tricky part :
@@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk)
*/
sock_release_ownership(sk);
- if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
tcp_delack_timer_handler(sk);
__sock_put(sk);
}
- if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ if (flags & TCPF_MTU_REDUCED_DEFERRED) {
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
@@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nval, oval;
int wmem;
/* Keep one reference on sk_wmem_alloc.
@@ -877,16 +878,25 @@ void tcp_wfree(struct sk_buff *skb)
if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
- if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
- !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
- unsigned long flags;
+ for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
+ bool empty;
+
+ if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+ goto out;
+
+ nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+ nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+ if (nval != oval)
+ continue;
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
+ empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
- tasklet_schedule(&tsq->tasklet);
+ if (empty)
+ tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
return;
}
@@ -1514,6 +1524,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
if (sysctl_tcp_slow_start_after_idle &&
(s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
tcp_cwnd_application_limited(sk);
+
+ /* The following conditions together indicate the starvation
+ * is caused by insufficient sender buffer:
+ * 1) just sent some data (see tcp_write_xmit)
+ * 2) not cwnd limited (this else condition)
+ * 3) no more data to send (null tcp_send_head )
+ * 4) application is hitting buffer limit (SOCK_NOSPACE)
+ */
+ if (!tcp_send_head(sk) && sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
+ (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
@@ -1910,26 +1932,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
*/
static int tcp_mtu_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
- int len;
int probe_size;
int size_needed;
- int copy;
+ int copy, len;
int mss_now;
int interval;
/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
- * not SACKing (the variable headers throw things off) */
- if (!icsk->icsk_mtup.enabled ||
- icsk->icsk_mtup.probe_size ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
- tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+ * not SACKing (the variable headers throw things off)
+ */
+ if (likely(!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
/* Use binary search for probe_size between tcp_mss_base,
@@ -2069,7 +2091,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit <<= factor;
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
- set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+ /* Always send the 1st or 2nd skb in write queue.
+ * No need to wait for TX completion to call us back,
+ * after softirq/tasklet schedule.
+ * This helps when TX completions are delayed too much.
+ */
+ if (skb == sk->sk_write_queue.next ||
+ skb->prev == sk->sk_write_queue.next)
+ return false;
+
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
* test again the condition.
@@ -2081,6 +2112,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
return false;
}
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+ const u32 now = tcp_time_stamp;
+
+ if (tp->chrono_type > TCP_CHRONO_UNSPEC)
+ tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
+ tp->chrono_start = now;
+ tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If there are multiple conditions worthy of tracking in a
+ * chronograph then the highest priority enum takes precedence
+ * over the other conditions. So that if something "more interesting"
+ * starts happening, stop the previous chrono and start a new one.
+ */
+ if (type > tp->chrono_type)
+ tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+
+ /* There are multiple conditions worthy of tracking in a
+ * chronograph, so that the highest priority enum takes
+ * precedence over the other conditions (see tcp_chrono_start).
+ * If a condition stops, we only stop chrono tracking if
+ * it's the "most interesting" or current chrono we are
+ * tracking and starts busy chrono if we have pending data.
+ */
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+ else if (type == tp->chrono_type)
+ tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2103,7 +2175,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
- bool is_cwnd_limited = false;
+ bool is_cwnd_limited = false, is_rwnd_limited = false;
u32 max_segs;
sent_pkts = 0;
@@ -2140,8 +2212,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
- if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+ is_rwnd_limited = true;
break;
+ }
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
@@ -2167,6 +2241,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
+ if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
@@ -2186,6 +2262,11 @@ repair:
break;
}
+ if (is_rwnd_limited)
+ tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
+ else
+ tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
@@ -2514,7 +2595,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
}
/* Collapses two adjacent SKB's during retransmission. */
-static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
@@ -2525,14 +2606,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+ if (next_skb_size) {
+ if (next_skb_size <= skb_availroom(skb))
+ skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
+ next_skb_size);
+ else if (!skb_shift(skb, next_skb, next_skb_size))
+ return false;
+ }
tcp_highest_sack_combine(sk, next_skb, skb);
tcp_unlink_write_queue(next_skb, sk);
- if (next_skb_size)
- skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
- next_skb_size);
-
if (next_skb->ip_summed == CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -2561,6 +2645,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
tcp_skb_collapse_tstamp(skb, next_skb);
sk_wmem_free_skb(sk, next_skb);
+ return true;
}
/* Check if coalescing SKBs is legal. */
@@ -2610,16 +2695,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (space < 0)
break;
- /* Punt if not enough space exists in the first SKB for
- * the data in the second
- */
- if (skb->len > skb_availroom(to))
- break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
break;
- tcp_collapse_retrans(sk, to);
+ if (!tcp_collapse_retrans(sk, to))
+ break;
}
}
@@ -3298,6 +3379,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
fo->copied = space;
tcp_connect_queue_skb(sk, syn_data);
+ if (syn_data->len)
+ tcp_chrono_start(sk, TCP_CHRONO_BUSY);
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
@@ -3462,8 +3545,6 @@ void tcp_send_ack(struct sock *sk)
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
- * We also avoid tcp_wfree() overhead (cache line miss accessing
- * tp->tsq_flags) by using regular sock_wfree()
*/
skb_set_tcp_pure_ack(buff);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index bf5ea9e9bbc1..f2123075ce6e 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,6 +15,10 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
+struct scalable {
+ u32 loss_cwnd;
+};
+
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -32,12 +36,23 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ struct scalable *ca = inet_csk_ca(sk);
+
+ ca->loss_cwnd = tp->snd_cwnd;
return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
}
+static u32 tcp_scalable_cwnd_undo(struct sock *sk)
+{
+ const struct scalable *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
+ .undo_cwnd = tcp_scalable_cwnd_undo,
.cong_avoid = tcp_scalable_cong_avoid,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3ea1cf804748..3705075f42c3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -310,7 +310,7 @@ static void tcp_delack_timer(unsigned long data)
inet_csk(sk)->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
@@ -592,7 +592,7 @@ static void tcp_write_timer(unsigned long data)
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 4c4bac1b5eab..218cfcc77650 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
static struct tcp_congestion_ops tcp_vegas __read_mostly = {
.init = tcp_vegas_init,
.ssthresh = tcp_reno_ssthresh,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = tcp_vegas_cong_avoid,
.pkts_acked = tcp_vegas_pkts_acked,
.set_state = tcp_vegas_state,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 40171e163cff..76005d4b8dfc 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -30,6 +30,7 @@ struct veno {
u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
u32 inc; /* decide whether to increase cwnd */
u32 diff; /* calculate the diff rate */
+ u32 loss_cwnd; /* cwnd when loss occured */
};
/* There are several situations when we must "re-start" Veno:
@@ -193,6 +194,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
+ veno->loss_cwnd = tp->snd_cwnd;
if (veno->diff < beta)
/* in "non-congestive state", cut cwnd by 1/5 */
return max(tp->snd_cwnd * 4 / 5, 2U);
@@ -201,9 +203,17 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
return max(tp->snd_cwnd >> 1U, 2U);
}
+static u32 tcp_veno_cwnd_undo(struct sock *sk)
+{
+ const struct veno *veno = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_veno __read_mostly = {
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
+ .undo_cwnd = tcp_veno_cwnd_undo,
.cong_avoid = tcp_veno_cong_avoid,
.pkts_acked = tcp_veno_pkts_acked,
.set_state = tcp_veno_state,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 4b03a2e2a050..fed66dc0e0f5 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -278,6 +278,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.init = tcp_westwood_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
+ .undo_cwnd = tcp_reno_undo_cwnd,
.cwnd_event = tcp_westwood_event,
.in_ack_event = tcp_westwood_ack,
.get_info = tcp_westwood_info,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9c5fc973267f..e6ff99c4bd3b 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -37,6 +37,7 @@ struct yeah {
u32 fast_count;
u32 pkts_acked;
+ u32 loss_cwnd;
};
static void tcp_yeah_init(struct sock *sk)
@@ -219,13 +220,22 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
+ yeah->loss_cwnd = tp->snd_cwnd;
return max_t(int, tp->snd_cwnd - reduction, 2);
}
+static u32 tcp_yeah_cwnd_undo(struct sock *sk)
+{
+ const struct yeah *yeah = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd);
+}
+
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh,
+ .undo_cwnd = tcp_yeah_cwnd_undo,
.cong_avoid = tcp_yeah_cong_avoid,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e1fc0116e8d5..9ca279b130d5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1177,42 +1177,102 @@ out:
/* fully reclaim rmem/fwd memory allocated for skb */
static void udp_rmem_release(struct sock *sk, int size, int partial)
{
+ struct udp_sock *up = udp_sk(sk);
int amt;
- atomic_sub(size, &sk->sk_rmem_alloc);
+ if (likely(partial)) {
+ up->forward_deficit += size;
+ size = up->forward_deficit;
+ if (size < (sk->sk_rcvbuf >> 2) &&
+ !skb_queue_empty(&sk->sk_receive_queue))
+ return;
+ } else {
+ size += up->forward_deficit;
+ }
+ up->forward_deficit = 0;
+
sk->sk_forward_alloc += size;
amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
sk->sk_forward_alloc -= amt;
if (amt)
__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
+
+ atomic_sub(size, &sk->sk_rmem_alloc);
}
-/* Note: called with sk_receive_queue.lock held */
+/* Note: called with sk_receive_queue.lock held.
+ * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
+ * This avoids a cache line miss while receive_queue lock is held.
+ * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
+ */
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
{
- udp_rmem_release(sk, skb->truesize, 1);
+ udp_rmem_release(sk, skb->dev_scratch, 1);
}
EXPORT_SYMBOL(udp_skb_destructor);
+/* Idea of busylocks is to let producers grab an extra spinlock
+ * to relieve pressure on the receive_queue spinlock shared by consumer.
+ * Under flood, this means that only one producer can be in line
+ * trying to acquire the receive_queue spinlock.
+ * These busylock can be allocated on a per cpu manner, instead of a
+ * per socket one (that would consume a cache line per socket)
+ */
+static int udp_busylocks_log __read_mostly;
+static spinlock_t *udp_busylocks __read_mostly;
+
+static spinlock_t *busylock_acquire(void *ptr)
+{
+ spinlock_t *busy;
+
+ busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
+ spin_lock(busy);
+ return busy;
+}
+
+static void busylock_release(spinlock_t *busy)
+{
+ if (busy)
+ spin_unlock(busy);
+}
+
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
struct sk_buff_head *list = &sk->sk_receive_queue;
int rmem, delta, amt, err = -ENOMEM;
- int size = skb->truesize;
+ spinlock_t *busy = NULL;
+ int size;
/* try to avoid the costly atomic add/sub pair when the receive
* queue is full; always allow at least a packet
*/
rmem = atomic_read(&sk->sk_rmem_alloc);
- if (rmem && (rmem + size > sk->sk_rcvbuf))
+ if (rmem > sk->sk_rcvbuf)
goto drop;
+ /* Under mem pressure, it might be helpful to help udp_recvmsg()
+ * having linear skbs :
+ * - Reduce memory overhead and thus increase receive queue capacity
+ * - Less cache line misses at copyout() time
+ * - Less work at consume_skb() (less alien page frag freeing)
+ */
+ if (rmem > (sk->sk_rcvbuf >> 1)) {
+ skb_condense(skb);
+
+ busy = busylock_acquire(sk);
+ }
+ size = skb->truesize;
+ /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
+ * in udp_skb_destructor()
+ */
+ skb->dev_scratch = size;
+
/* we drop only if the receive buf is full and the receive
* queue contains some other skb
*/
rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
- if ((rmem > sk->sk_rcvbuf) && (rmem > size))
+ if (rmem > (size + sk->sk_rcvbuf))
goto uncharge_drop;
spin_lock(&list->lock);
@@ -1233,7 +1293,6 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
/* no need to setup a destructor, we will explicitly release the
* forward allocated memory on dequeue
*/
- skb->dev = NULL;
sock_skb_set_dropcount(sk, skb);
__skb_queue_tail(list, skb);
@@ -1242,6 +1301,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk);
+ busylock_release(busy);
return 0;
uncharge_drop:
@@ -1249,6 +1309,7 @@ uncharge_drop:
drop:
atomic_inc(&sk->sk_drops);
+ busylock_release(busy);
return err;
}
EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
@@ -1389,7 +1450,8 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
+ if (copied < ulen || peeking ||
+ (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
@@ -1561,7 +1623,7 @@ static void udp_v4_rehash(struct sock *sk)
udp_lib_rehash(sk, new_hash);
}
-static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -1742,10 +1804,10 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
if (use_hash2) {
hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
- udp_table.mask;
- hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+ udptable->mask;
+ hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
- hslot = &udp_table.hash2[hash2];
+ hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
@@ -2602,6 +2664,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd);
void __init udp_init(void)
{
unsigned long limit;
+ unsigned int i;
udp_table_init(&udp_table, "UDP");
limit = nr_free_buffer_pages() / 8;
@@ -2612,4 +2675,13 @@ void __init udp_init(void)
sysctl_udp_rmem_min = SK_MEM_QUANTUM;
sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+
+ /* 16 spinlocks per cpu */
+ udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
+ udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
+ GFP_KERNEL);
+ if (!udp_busylocks)
+ panic("UDP: failed to alloc udp_busylocks\n");
+ for (i = 0; i < (1U << udp_busylocks_log); i++)
+ spin_lock_init(udp_busylocks + i);
}
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 7e0fe4bdd967..feb50a16398d 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -25,7 +25,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
-int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 86219c0a0104..c1e124bc8e1e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -183,7 +183,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
static void addrconf_dad_start(struct inet6_ifaddr *ifp);
static void addrconf_dad_work(struct work_struct *w);
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id);
static void addrconf_dad_run(struct inet6_dev *idev);
static void addrconf_rs_timer(unsigned long data);
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
@@ -242,6 +242,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
#ifdef CONFIG_IPV6_SEG6_HMAC
.seg6_require_hmac = 0,
#endif
+ .enhanced_dad = 1,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -292,6 +293,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
#ifdef CONFIG_IPV6_SEG6_HMAC
.seg6_require_hmac = 0,
#endif
+ .enhanced_dad = 1,
};
/* Check if a valid qdisc is available */
@@ -2906,6 +2908,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
spin_lock_bh(&ifp->lock);
ifp->flags &= ~IFA_F_TENTATIVE;
spin_unlock_bh(&ifp->lock);
+ rt_genid_bump_ipv6(dev_net(idev->dev));
ipv6_ifa_notify(RTM_NEWADDR, ifp);
in6_ifa_put(ifp);
}
@@ -3734,12 +3737,21 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
{
unsigned long rand_num;
struct inet6_dev *idev = ifp->idev;
+ u64 nonce;
if (ifp->flags & IFA_F_OPTIMISTIC)
rand_num = 0;
else
rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
+ nonce = 0;
+ if (idev->cnf.enhanced_dad ||
+ dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) {
+ do
+ get_random_bytes(&nonce, 6);
+ while (nonce == 0);
+ }
+ ifp->dad_nonce = nonce;
ifp->dad_probes = idev->cnf.dad_transmits;
addrconf_mod_dad_work(ifp, rand_num);
}
@@ -3748,7 +3760,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
- bool notify = false;
+ bool bump_id, notify = false;
addrconf_join_solict(dev, &ifp->addr);
@@ -3763,11 +3775,12 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
idev->cnf.accept_dad < 1 ||
!(ifp->flags&IFA_F_TENTATIVE) ||
ifp->flags & IFA_F_NODAD) {
+ bump_id = ifp->flags & IFA_F_TENTATIVE;
ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
spin_unlock(&ifp->lock);
read_unlock_bh(&idev->lock);
- addrconf_dad_completed(ifp);
+ addrconf_dad_completed(ifp, bump_id);
return;
}
@@ -3827,8 +3840,8 @@ static void addrconf_dad_work(struct work_struct *w)
struct inet6_ifaddr,
dad_work);
struct inet6_dev *idev = ifp->idev;
+ bool bump_id, disable_ipv6 = false;
struct in6_addr mcaddr;
- bool disable_ipv6 = false;
enum {
DAD_PROCESS,
@@ -3898,11 +3911,12 @@ static void addrconf_dad_work(struct work_struct *w)
* DAD was successful
*/
+ bump_id = ifp->flags & IFA_F_TENTATIVE;
ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
spin_unlock(&ifp->lock);
write_unlock_bh(&idev->lock);
- addrconf_dad_completed(ifp);
+ addrconf_dad_completed(ifp, bump_id);
goto out;
}
@@ -3915,7 +3929,8 @@ static void addrconf_dad_work(struct work_struct *w)
/* send a neighbour solicitation for our addr */
addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
- ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any);
+ ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any,
+ ifp->dad_nonce);
out:
in6_ifa_put(ifp);
rtnl_unlock();
@@ -3939,7 +3954,7 @@ static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp)
return true;
}
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id)
{
struct net_device *dev = ifp->idev->dev;
struct in6_addr lladdr;
@@ -3991,6 +4006,9 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
spin_unlock(&ifp->lock);
write_unlock_bh(&ifp->idev->lock);
}
+
+ if (bump_id)
+ rt_genid_bump_ipv6(dev_net(dev));
}
static void addrconf_dad_run(struct inet6_dev *idev)
@@ -4956,6 +4974,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
#ifdef CONFIG_IPV6_SEG6_HMAC
array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
#endif
+ array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
}
static inline size_t inet6_ifla6_size(void)
@@ -6064,6 +6083,13 @@ static const struct ctl_table addrconf_sysctl[] = {
},
#endif
{
+ .procname = "enhanced_dad",
+ .data = &ipv6_devconf.enhanced_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
/* sentinel */
}
};
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d424f3a3737a..237e654ba717 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -258,6 +258,14 @@ lookup_protocol:
goto out;
}
}
+
+ if (!kern) {
+ err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
+ }
out:
return err;
out_rcu_unlock:
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index c5d76d2edd26..0489e19258ad 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -140,7 +140,8 @@ void ip6_datagram_release_cb(struct sock *sk)
}
EXPORT_SYMBOL_GPL(ip6_datagram_release_cb);
-static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
struct inet_sock *inet = inet_sk(sk);
@@ -253,6 +254,7 @@ ipv4_connected:
out:
return err;
}
+EXPORT_SYMBOL_GPL(__ip6_datagram_connect);
int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 218f0cba231c..cbcdd5db31f4 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -418,7 +418,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
esph = (void *)skb_push(skb, 4);
*seqhi = esph->spi;
esph->spi = esph->seq_no;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
aead_request_set_callback(req, 0, esp_input_done_esn, skb);
}
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index eb948ffd734b..17fa28f7a0ff 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -448,8 +448,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if (__ipv6_addr_needs_scope_id(addr_type))
iif = skb->dev->ifindex;
- else
- iif = l3mdev_master_ifindex(skb_dst(skb)->dev);
+ else {
+ dst = skb_dst(skb);
+ iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
+ }
/*
* Must not send error if the source does not uniquely
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 1fcf61f1cbc3..89c59e656f44 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -99,7 +99,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
segs = ops->callbacks.gso_segment(skb, features);
}
- if (IS_ERR(segs))
+ if (IS_ERR_OR_NULL(segs))
goto out;
gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 312cbd0e5038..70d0de404197 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
@@ -131,6 +132,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
dst_allfrag(skb_dst(skb)) ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index d3c619eda051..8b186b56183a 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1034,6 +1034,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
int mtu;
unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
unsigned int max_headroom = psh_hlen;
+ bool use_cache = false;
u8 hop_limit;
int err = -1;
@@ -1066,7 +1067,15 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
neigh_release(neigh);
- } else if (!fl6->flowi6_mark)
+ } else if (!(t->parms.flags &
+ (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) {
+ /* enable the cache only only if the routing decision does
+ * not depend on the current inner header value
+ */
+ use_cache = true;
+ }
+
+ if (use_cache)
dst = dst_cache_get(&t->dst_cache);
if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
@@ -1150,7 +1159,7 @@ route_lookup:
if (t->encap.type != TUNNEL_ENCAP_NONE)
goto tx_err_dst_release;
} else {
- if (!fl6->flowi6_mark && ndst)
+ if (use_cache && ndst)
dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
}
skb_dst_set(skb, dst);
@@ -1172,7 +1181,6 @@ route_lookup:
if (err)
return err;
- skb->protocol = htons(ETH_P_IPV6);
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index c476bb8e9cdb..f4b4a4a5f4ba 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1122,6 +1122,33 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
.priority = 100,
};
+static bool is_vti6_tunnel(const struct net_device *dev)
+{
+ return dev->netdev_ops == &vti6_netdev_ops;
+}
+
+static int vti6_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ if (!is_vti6_tunnel(dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_DOWN:
+ if (!net_eq(t->net, dev_net(dev)))
+ xfrm_garbage_collect(t->net);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vti6_notifier_block __read_mostly = {
+ .notifier_call = vti6_device_event,
+};
+
/**
* vti6_tunnel_init - register protocol and reserve needed resources
*
@@ -1132,6 +1159,8 @@ static int __init vti6_tunnel_init(void)
const char *msg;
int err;
+ register_netdevice_notifier(&vti6_notifier_block);
+
msg = "tunnel device";
err = register_pernet_device(&vti6_net_ops);
if (err < 0)
@@ -1164,6 +1193,7 @@ xfrm_proto_ah_failed:
xfrm_proto_esp_failed:
unregister_pernet_device(&vti6_net_ops);
pernet_dev_failed:
+ unregister_netdevice_notifier(&vti6_notifier_block);
pr_err("vti6 init: failed to register %s\n", msg);
return err;
}
@@ -1178,6 +1208,7 @@ static void __exit vti6_tunnel_cleanup(void)
xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti6_net_ops);
+ unregister_netdevice_notifier(&vti6_notifier_block);
}
module_init(vti6_tunnel_init);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d8e671457d10..7ebac630d3c6 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -233,6 +233,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
case ND_OPT_SOURCE_LL_ADDR:
case ND_OPT_TARGET_LL_ADDR:
case ND_OPT_MTU:
+ case ND_OPT_NONCE:
case ND_OPT_REDIRECT_HDR:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
ND_PRINTK(2, warn,
@@ -568,7 +569,8 @@ static void ndisc_send_unsol_na(struct net_device *dev)
}
void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
- const struct in6_addr *daddr, const struct in6_addr *saddr)
+ const struct in6_addr *daddr, const struct in6_addr *saddr,
+ u64 nonce)
{
struct sk_buff *skb;
struct in6_addr addr_buf;
@@ -588,6 +590,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
if (inc_opt)
optlen += ndisc_opt_addr_space(dev,
NDISC_NEIGHBOUR_SOLICITATION);
+ if (nonce != 0)
+ optlen += 8;
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
@@ -605,6 +609,13 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
dev->dev_addr,
NDISC_NEIGHBOUR_SOLICITATION);
+ if (nonce != 0) {
+ u8 *opt = skb_put(skb, 8);
+
+ opt[0] = ND_OPT_NONCE;
+ opt[1] = 8 >> 3;
+ memcpy(opt + 2, &nonce, 6);
+ }
ndisc_send_skb(skb, daddr, saddr);
}
@@ -693,12 +704,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
"%s: trying to ucast probe in NUD_INVALID: %pI6\n",
__func__, target);
}
- ndisc_send_ns(dev, target, target, saddr);
+ ndisc_send_ns(dev, target, target, saddr, 0);
} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
neigh_app_ns(neigh);
} else {
addrconf_addr_solict_mult(target, &mcaddr);
- ndisc_send_ns(dev, target, &mcaddr, saddr);
+ ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
}
}
@@ -742,6 +753,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
int dad = ipv6_addr_any(saddr);
bool inc;
int is_router = -1;
+ u64 nonce = 0;
if (skb->len < sizeof(struct nd_msg)) {
ND_PRINTK(2, warn, "NS: packet too short\n");
@@ -786,6 +798,8 @@ static void ndisc_recv_ns(struct sk_buff *skb)
return;
}
}
+ if (ndopts.nd_opts_nonce)
+ memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
inc = ipv6_addr_is_multicast(daddr);
@@ -794,6 +808,15 @@ static void ndisc_recv_ns(struct sk_buff *skb)
have_ifp:
if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
if (dad) {
+ if (nonce != 0 && ifp->dad_nonce == nonce) {
+ u8 *np = (u8 *)&nonce;
+ /* Matching nonce if looped back */
+ ND_PRINTK(2, notice,
+ "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
+ ifp->idev->dev->name,
+ &ifp->addr, np);
+ goto out;
+ }
/*
* We are colliding with another node
* who is doing DAD
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 6ff42b8301cc..d56d8ac09a94 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -562,7 +562,8 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
static int
find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
- unsigned int size)
+ unsigned int size,
+ struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
struct xt_target *target;
@@ -570,12 +571,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- unsigned long pcnt;
- pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(pcnt))
+ if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
- e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -612,7 +610,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
cleanup_match(ematch, net);
}
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
return ret;
}
@@ -699,8 +697,7 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
-
- xt_percpu_counter_free(e->counters.pcnt);
+ xt_percpu_counter_free(&e->counters);
}
/* Checks and translates the user-supplied table segment (held in
@@ -709,6 +706,7 @@ static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ip6t_replace *repl)
{
+ struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct ip6t_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -768,7 +766,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, net, repl->name, repl->size);
+ ret = find_check_entry(iter, net, repl->name, repl->size,
+ &alloc_state);
if (ret != 0)
break;
++i;
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 99a1216287c8..98c8dd38575a 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -440,12 +440,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par)
e->ipv6.invflags & XT_INV_PROTO)
return -EINVAL;
- return nf_ct_l3proto_try_module_get(par->family);
+ return nf_ct_netns_get(par->net, par->family);
}
static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target synproxy_tg6_reg __read_mostly = {
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 500be28ff563..4e3402486833 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -34,6 +34,13 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_log.h>
+static int conntrack6_net_id;
+static DEFINE_MUTEX(register_ipv6_hooks);
+
+struct conntrack6_net {
+ unsigned int users;
+};
+
static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
struct nf_conntrack_tuple *tuple)
{
@@ -308,6 +315,42 @@ static int ipv6_nlattr_tuple_size(void)
}
#endif
+static int ipv6_hooks_register(struct net *net)
+{
+ struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
+ int err = 0;
+
+ mutex_lock(&register_ipv6_hooks);
+ cnet->users++;
+ if (cnet->users > 1)
+ goto out_unlock;
+
+ err = nf_defrag_ipv6_enable(net);
+ if (err < 0) {
+ cnet->users = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ if (err)
+ cnet->users = 0;
+ out_unlock:
+ mutex_unlock(&register_ipv6_hooks);
+ return err;
+}
+
+static void ipv6_hooks_unregister(struct net *net)
+{
+ struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
+
+ mutex_lock(&register_ipv6_hooks);
+ if (cnet->users && (--cnet->users == 0))
+ nf_unregister_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ mutex_unlock(&register_ipv6_hooks);
+}
+
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
.l3proto = PF_INET6,
.name = "ipv6",
@@ -321,6 +364,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
.nlattr_to_tuple = ipv6_nlattr_to_tuple,
.nla_policy = ipv6_nla_policy,
#endif
+ .net_ns_get = ipv6_hooks_register,
+ .net_ns_put = ipv6_hooks_unregister,
.me = THIS_MODULE,
};
@@ -340,6 +385,15 @@ static struct nf_conntrack_l4proto *builtin_l4proto6[] = {
&nf_conntrack_l4proto_tcp6,
&nf_conntrack_l4proto_udp6,
&nf_conntrack_l4proto_icmpv6,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite6,
+#endif
};
static int ipv6_net_init(struct net *net)
@@ -370,6 +424,8 @@ static void ipv6_net_exit(struct net *net)
static struct pernet_operations ipv6_net_ops = {
.init = ipv6_net_init,
.exit = ipv6_net_exit,
+ .id = &conntrack6_net_id,
+ .size = sizeof(struct conntrack6_net),
};
static int __init nf_conntrack_l3proto_ipv6_init(void)
@@ -377,7 +433,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
int ret = 0;
need_conntrack();
- nf_defrag_ipv6_enable();
ret = nf_register_sockopt(&so_getorigdst6);
if (ret < 0) {
@@ -389,18 +444,10 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
if (ret < 0)
goto cleanup_sockopt;
- ret = nf_register_hooks(ipv6_conntrack_ops,
- ARRAY_SIZE(ipv6_conntrack_ops));
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register pre-routing defrag "
- "hook.\n");
- goto cleanup_pernet;
- }
-
ret = nf_ct_l4proto_register(builtin_l4proto6,
ARRAY_SIZE(builtin_l4proto6));
if (ret < 0)
- goto cleanup_hooks;
+ goto cleanup_pernet;
ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6);
if (ret < 0) {
@@ -411,8 +458,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
cleanup_l4proto:
nf_ct_l4proto_unregister(builtin_l4proto6,
ARRAY_SIZE(builtin_l4proto6));
- cleanup_hooks:
- nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
cleanup_pernet:
unregister_pernet_subsys(&ipv6_net_ops);
cleanup_sockopt:
@@ -426,7 +471,6 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
nf_ct_l4proto_unregister(builtin_l4proto6,
ARRAY_SIZE(builtin_l4proto6));
- nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
unregister_pernet_subsys(&ipv6_net_ops);
nf_unregister_sockopt(&so_getorigdst6);
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index e4347aeb2e65..9948b5ce52da 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -576,11 +576,11 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
/* Jumbo payload inhibits frag. header */
if (ipv6_hdr(skb)->payload_len == 0) {
pr_debug("payload len = 0\n");
- return -EINVAL;
+ return 0;
}
if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
- return -EINVAL;
+ return 0;
if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr)))
return -ENOMEM;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index f7aab5ab93a5..8e0bdd058787 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -30,6 +30,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+static DEFINE_MUTEX(defrag6_mutex);
+
static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
struct sk_buff *skb)
{
@@ -69,7 +71,7 @@ static unsigned int ipv6_defrag(void *priv,
if (err == -EINPROGRESS)
return NF_STOLEN;
- return NF_ACCEPT;
+ return err == 0 ? NF_ACCEPT : NF_DROP;
}
static struct nf_hook_ops ipv6_defrag_ops[] = {
@@ -87,6 +89,19 @@ static struct nf_hook_ops ipv6_defrag_ops[] = {
},
};
+static void __net_exit defrag6_net_exit(struct net *net)
+{
+ if (net->nf.defrag_ipv6) {
+ nf_unregister_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ net->nf.defrag_ipv6 = false;
+ }
+}
+
+static struct pernet_operations defrag6_net_ops = {
+ .exit = defrag6_net_exit,
+};
+
static int __init nf_defrag_init(void)
{
int ret = 0;
@@ -96,9 +111,9 @@ static int __init nf_defrag_init(void)
pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
return ret;
}
- ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ ret = register_pernet_subsys(&defrag6_net_ops);
if (ret < 0) {
- pr_err("nf_defrag_ipv6: can't register hooks\n");
+ pr_err("nf_defrag_ipv6: can't register pernet ops\n");
goto cleanup_frag6;
}
return ret;
@@ -111,12 +126,31 @@ cleanup_frag6:
static void __exit nf_defrag_fini(void)
{
- nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ unregister_pernet_subsys(&defrag6_net_ops);
nf_ct_frag6_cleanup();
}
-void nf_defrag_ipv6_enable(void)
+int nf_defrag_ipv6_enable(struct net *net)
{
+ int err = 0;
+
+ might_sleep();
+
+ if (net->nf.defrag_ipv6)
+ return 0;
+
+ mutex_lock(&defrag6_mutex);
+ if (net->nf.defrag_ipv6)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ if (err == 0)
+ net->nf.defrag_ipv6 = true;
+
+ out_unlock:
+ mutex_unlock(&defrag6_mutex);
+ return err;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index a5400223fd74..10090400c72f 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -156,6 +156,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
fl6.daddr = oip6h->saddr;
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
+ fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index d526bb594956..c947aad8bcc6 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -235,7 +235,7 @@ nft_fib6_select_ops(const struct nft_ctx *ctx,
if (!tb[NFTA_FIB_RESULT])
return ERR_PTR(-EINVAL);
- result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+ result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
switch (result) {
case NFT_FIB_RESULT_OIF:
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index a2aff1277b40..6c5b5b1830a7 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -36,12 +36,19 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
nft_out(pkt));
}
+static void
+nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
+}
+
static struct nft_expr_type nft_masq_ipv6_type;
static const struct nft_expr_ops nft_masq_ipv6_ops = {
.type = &nft_masq_ipv6_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
.eval = nft_masq_ipv6_eval,
.init = nft_masq_init,
+ .destroy = nft_masq_ipv6_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
};
@@ -78,5 +85,5 @@ module_init(nft_masq_ipv6_module_init);
module_exit(nft_masq_ipv6_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq");
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
index bfcd5af6bc15..f5ac080fc084 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -39,12 +39,19 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
}
+static void
+nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
+}
+
static struct nft_expr_type nft_redir_ipv6_type;
static const struct nft_expr_ops nft_redir_ipv6_ops = {
.type = &nft_redir_ipv6_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
.eval = nft_redir_ipv6_eval,
.init = nft_redir_init,
+ .destroy = nft_redir_ipv6_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
};
@@ -72,5 +79,5 @@ module_init(nft_redir_ipv6_module_init);
module_exit(nft_redir_ipv6_module_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir");
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 7cca8ac66fe9..cd4252346a32 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -155,6 +155,8 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
if (unlikely(!skb))
return 0;
+ skb->protocol = htons(ETH_P_IPV6);
+
return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b317bb135ed4..2413a0637d99 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -527,7 +527,7 @@ static void rt6_probe_deferred(struct work_struct *w)
container_of(w, struct __rt6_probe_work, work);
addrconf_addr_solict_mult(&work->target, &mcaddr);
- ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
+ ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
dev_put(work->dev);
kfree(work);
}
@@ -2000,8 +2000,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
It is very good, but in some (rare!) circumstances
(SIT, PtP, NBMA NOARP links) it is handy to allow
some exceptions. --ANK
+ We allow IPv4-mapped nexthops to support RFC4798-type
+ addressing
*/
- if (!(gwa_type & IPV6_ADDR_UNICAST))
+ if (!(gwa_type & (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_MAPPED)))
goto out;
if (cfg->fc_table) {
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 97830a6a9cbb..a4d49760bf43 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -209,6 +209,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
treq->snt_synack.v64 = 0;
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
+ treq->ts_off = 0;
/*
* We need to lookup the dst_entry to get the correct window size.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 28ec0a2e7b72..73bc8fc68acd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -101,12 +101,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
}
}
-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v6_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
ipv6_hdr(skb)->saddr.s6_addr32,
tcp_hdr(skb)->dest,
- tcp_hdr(skb)->source);
+ tcp_hdr(skb)->source, tsoff);
}
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
@@ -283,7 +283,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
sk->sk_v6_daddr.s6_addr32,
inet->inet_sport,
- inet->inet_dport);
+ inet->inet_dport,
+ &tp->tsoffset);
err = tcp_connect(sk);
if (err)
@@ -398,7 +399,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
- &tp->tsq_flags))
+ &sk->sk_tsq_flags))
sock_hold(sk);
goto out;
}
@@ -956,7 +957,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
+ tcp_time_stamp + tcp_rsk(req)->ts_off,
+ req->ts_recent, sk->sk_bound_dev_if,
tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
0, 0);
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4f99417d9b40..649efc26a252 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -363,7 +363,8 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
+ if (copied < ulen || peeking ||
+ (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
@@ -511,7 +512,7 @@ out:
return;
}
-static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -690,10 +691,10 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
if (use_hash2) {
hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
- udp_table.mask;
- hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+ udptable->mask;
+ hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
- hslot = &udp_table.hash2[hash2];
+ hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index f6eb1ab34f4b..e78bdc76dcc3 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -26,7 +26,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
-int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
void udpv6_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index fce25afb652a..8938b6ba57a0 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -61,7 +61,8 @@ static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif
if ((l2tp->conn_id == tunnel_id) &&
net_eq(sock_net(sk), net) &&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ (!sk->sk_bound_dev_if || !dif ||
+ sk->sk_bound_dev_if == dif))
goto found;
}
@@ -182,15 +183,17 @@ pass_up:
struct iphdr *iph = (struct iphdr *) skb_network_header(skb);
read_lock_bh(&l2tp_ip_lock);
- sk = __l2tp_ip_bind_lookup(net, iph->daddr, 0, tunnel_id);
+ sk = __l2tp_ip_bind_lookup(net, iph->daddr, inet_iif(skb),
+ tunnel_id);
+ if (!sk) {
+ read_unlock_bh(&l2tp_ip_lock);
+ goto discard;
+ }
+
+ sock_hold(sk);
read_unlock_bh(&l2tp_ip_lock);
}
- if (sk == NULL)
- goto discard;
-
- sock_hold(sk);
-
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_put;
@@ -251,22 +254,17 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
int ret;
int chk_addr_ret;
- if (!sock_flag(sk, SOCK_ZAPPED))
- return -EINVAL;
if (addr_len < sizeof(struct sockaddr_l2tpip))
return -EINVAL;
if (addr->l2tp_family != AF_INET)
return -EINVAL;
- ret = -EADDRINUSE;
- read_lock_bh(&l2tp_ip_lock);
- if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr,
- sk->sk_bound_dev_if, addr->l2tp_conn_id))
- goto out_in_use;
+ lock_sock(sk);
- read_unlock_bh(&l2tp_ip_lock);
+ ret = -EINVAL;
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ goto out;
- lock_sock(sk);
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
goto out;
@@ -280,14 +278,22 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */
- sk_dst_reset(sk);
+ write_lock_bh(&l2tp_ip_lock);
+ if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr,
+ sk->sk_bound_dev_if, addr->l2tp_conn_id)) {
+ write_unlock_bh(&l2tp_ip_lock);
+ ret = -EADDRINUSE;
+ goto out;
+ }
+
+ sk_dst_reset(sk);
l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id;
- write_lock_bh(&l2tp_ip_lock);
sk_add_bind_node(sk, &l2tp_ip_bind_table);
sk_del_node_init(sk);
write_unlock_bh(&l2tp_ip_lock);
+
ret = 0;
sock_reset_flag(sk, SOCK_ZAPPED);
@@ -295,11 +301,6 @@ out:
release_sock(sk);
return ret;
-
-out_in_use:
- read_unlock_bh(&l2tp_ip_lock);
-
- return ret;
}
static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
@@ -307,21 +308,24 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr;
int rc;
- if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
- return -EINVAL;
-
if (addr_len < sizeof(*lsa))
return -EINVAL;
if (ipv4_is_multicast(lsa->l2tp_addr.s_addr))
return -EINVAL;
- rc = ip4_datagram_connect(sk, uaddr, addr_len);
- if (rc < 0)
- return rc;
-
lock_sock(sk);
+ /* Must bind first - autobinding does not work */
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ rc = -EINVAL;
+ goto out_sk;
+ }
+
+ rc = __ip4_datagram_connect(sk, uaddr, addr_len);
+ if (rc < 0)
+ goto out_sk;
+
l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id;
write_lock_bh(&l2tp_ip_lock);
@@ -329,7 +333,9 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
sk_add_bind_node(sk, &l2tp_ip_bind_table);
write_unlock_bh(&l2tp_ip_lock);
+out_sk:
release_sock(sk);
+
return rc;
}
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 1cea54feab27..f092ac441fdd 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -72,8 +72,9 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
if ((l2tp->conn_id == tunnel_id) &&
net_eq(sock_net(sk), net) &&
- !(addr && ipv6_addr_equal(addr, laddr)) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ (!addr || ipv6_addr_equal(addr, laddr)) &&
+ (!sk->sk_bound_dev_if || !dif ||
+ sk->sk_bound_dev_if == dif))
goto found;
}
@@ -196,16 +197,17 @@ pass_up:
struct ipv6hdr *iph = ipv6_hdr(skb);
read_lock_bh(&l2tp_ip6_lock);
- sk = __l2tp_ip6_bind_lookup(net, &iph->daddr,
- 0, tunnel_id);
+ sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, inet6_iif(skb),
+ tunnel_id);
+ if (!sk) {
+ read_unlock_bh(&l2tp_ip6_lock);
+ goto discard;
+ }
+
+ sock_hold(sk);
read_unlock_bh(&l2tp_ip6_lock);
}
- if (sk == NULL)
- goto discard;
-
- sock_hold(sk);
-
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_put;
@@ -266,11 +268,10 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr;
struct net *net = sock_net(sk);
__be32 v4addr = 0;
+ int bound_dev_if;
int addr_type;
int err;
- if (!sock_flag(sk, SOCK_ZAPPED))
- return -EINVAL;
if (addr->l2tp_family != AF_INET6)
return -EINVAL;
if (addr_len < sizeof(*addr))
@@ -286,41 +287,34 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (addr_type & IPV6_ADDR_MULTICAST)
return -EADDRNOTAVAIL;
- err = -EADDRINUSE;
- read_lock_bh(&l2tp_ip6_lock);
- if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr,
- sk->sk_bound_dev_if, addr->l2tp_conn_id))
- goto out_in_use;
- read_unlock_bh(&l2tp_ip6_lock);
-
lock_sock(sk);
err = -EINVAL;
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ goto out_unlock;
+
if (sk->sk_state != TCP_CLOSE)
goto out_unlock;
+ bound_dev_if = sk->sk_bound_dev_if;
+
/* Check if the address belongs to the host. */
rcu_read_lock();
if (addr_type != IPV6_ADDR_ANY) {
struct net_device *dev = NULL;
if (addr_type & IPV6_ADDR_LINKLOCAL) {
- if (addr_len >= sizeof(struct sockaddr_in6) &&
- addr->l2tp_scope_id) {
- /* Override any existing binding, if another
- * one is supplied by user.
- */
- sk->sk_bound_dev_if = addr->l2tp_scope_id;
- }
+ if (addr->l2tp_scope_id)
+ bound_dev_if = addr->l2tp_scope_id;
/* Binding to link-local address requires an
- interface */
- if (!sk->sk_bound_dev_if)
+ * interface.
+ */
+ if (!bound_dev_if)
goto out_unlock_rcu;
err = -ENODEV;
- dev = dev_get_by_index_rcu(sock_net(sk),
- sk->sk_bound_dev_if);
+ dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if);
if (!dev)
goto out_unlock_rcu;
}
@@ -335,13 +329,22 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
rcu_read_unlock();
- inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
+ write_lock_bh(&l2tp_ip6_lock);
+ if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, bound_dev_if,
+ addr->l2tp_conn_id)) {
+ write_unlock_bh(&l2tp_ip6_lock);
+ err = -EADDRINUSE;
+ goto out_unlock;
+ }
+
+ inet->inet_saddr = v4addr;
+ inet->inet_rcv_saddr = v4addr;
+ sk->sk_bound_dev_if = bound_dev_if;
sk->sk_v6_rcv_saddr = addr->l2tp_addr;
np->saddr = addr->l2tp_addr;
l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id;
- write_lock_bh(&l2tp_ip6_lock);
sk_add_bind_node(sk, &l2tp_ip6_bind_table);
sk_del_node_init(sk);
write_unlock_bh(&l2tp_ip6_lock);
@@ -354,10 +357,7 @@ out_unlock_rcu:
rcu_read_unlock();
out_unlock:
release_sock(sk);
- return err;
-out_in_use:
- read_unlock_bh(&l2tp_ip6_lock);
return err;
}
@@ -370,9 +370,6 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_type;
int rc;
- if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
- return -EINVAL;
-
if (addr_len < sizeof(*lsa))
return -EINVAL;
@@ -389,10 +386,18 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
return -EINVAL;
}
- rc = ip6_datagram_connect(sk, uaddr, addr_len);
-
lock_sock(sk);
+ /* Must bind first - autobinding does not work */
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ rc = -EINVAL;
+ goto out_sk;
+ }
+
+ rc = __ip6_datagram_connect(sk, uaddr, addr_len);
+ if (rc < 0)
+ goto out_sk;
+
l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id;
write_lock_bh(&l2tp_ip6_lock);
@@ -400,6 +405,7 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
sk_add_bind_node(sk, &l2tp_ip6_bind_table);
write_unlock_bh(&l2tp_ip6_lock);
+out_sk:
release_sock(sk);
return rc;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 236d47e76ced..1711bae4abf2 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -688,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
}
/* No need to do anything if the driver does all */
- if (!local->ops->set_tim)
+ if (ieee80211_hw_check(&local->hw, AP_LINK_PS))
return;
if (sta->dead)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 62ccaf6f585d..2c21b7039136 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1500,7 +1500,6 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local,
struct sta_info *sta,
struct sk_buff *skb)
{
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
struct fq *fq = &local->fq;
struct ieee80211_vif *vif;
struct txq_info *txqi;
@@ -1525,8 +1524,6 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local,
if (!txqi)
return false;
- info->control.vif = vif;
-
spin_lock_bh(&fq->lock);
ieee80211_txq_enqueue(local, txqi, skb);
spin_unlock_bh(&fq->lock);
@@ -3238,7 +3235,6 @@ static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
- *ieee80211_get_qos_ctl(hdr) = tid;
hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid);
} else {
info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
@@ -3363,6 +3359,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
(tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT;
+ if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+ tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+ *ieee80211_get_qos_ctl(hdr) = tid;
+ }
+
__skb_queue_head_init(&tx.skbs);
tx.flags = IEEE80211_TX_UNICAST;
@@ -3451,6 +3452,11 @@ begin:
goto begin;
}
+ if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
+ info->flags |= IEEE80211_TX_CTL_AMPDU;
+ else
+ info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+
if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
struct sta_info *sta = container_of(txq->sta, struct sta_info,
sta);
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index ee715764a828..6832bf6ab69f 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -270,6 +270,22 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2);
}
+ /*
+ * This is a workaround for VHT-enabled STAs which break the spec
+ * and have the VHT-MCS Rx map filled in with value 3 for all eight
+ * spacial streams, an example is AR9462.
+ *
+ * As per spec, in section 22.1.1 Introduction to the VHT PHY
+ * A VHT STA shall support at least single spactial stream VHT-MCSs
+ * 0 to 7 (transmit and receive) in all supported channel widths.
+ */
+ if (vht_cap->vht_mcs.rx_mcs_map == cpu_to_le16(0xFFFF)) {
+ vht_cap->vht_supported = false;
+ sdata_info(sdata, "Ignoring VHT IE from %pM due to invalid rx_mcs_map\n",
+ sta->addr);
+ return;
+ }
+
/* finally set up the bandwidth */
switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) {
case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ:
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 0e4334cbde17..15fe97644ffe 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1252,7 +1252,7 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!nla)
continue;
- switch(index) {
+ switch (index) {
case RTA_OIF:
cfg->rc_ifindex = nla_get_u32(nla);
break;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 44410d30d461..63729b489c2c 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -146,38 +146,38 @@ config NF_CONNTRACK_LABELS
to connection tracking entries. It selected by the connlabel match.
config NF_CT_PROTO_DCCP
- tristate 'DCCP protocol connection tracking support'
+ bool 'DCCP protocol connection tracking support'
depends on NETFILTER_ADVANCED
- default IP_DCCP
+ default y
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on DCCP connections.
- If unsure, say 'N'.
+ If unsure, say Y.
config NF_CT_PROTO_GRE
tristate
config NF_CT_PROTO_SCTP
- tristate 'SCTP protocol connection tracking support'
+ bool 'SCTP protocol connection tracking support'
depends on NETFILTER_ADVANCED
- default IP_SCTP
+ default y
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on SCTP connections.
- If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ If unsure, say Y.
config NF_CT_PROTO_UDPLITE
- tristate 'UDP-Lite protocol connection tracking support'
+ bool 'UDP-Lite protocol connection tracking support'
depends on NETFILTER_ADVANCED
+ default y
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on UDP-Lite
connections.
- To compile it as a module, choose M here. If unsure, say N.
+ If unsure, say Y.
config NF_CONNTRACK_AMANDA
tristate "Amanda backup protocol support"
@@ -384,17 +384,17 @@ config NF_NAT_NEEDED
default y
config NF_NAT_PROTO_DCCP
- tristate
+ bool
depends on NF_NAT && NF_CT_PROTO_DCCP
default NF_NAT && NF_CT_PROTO_DCCP
config NF_NAT_PROTO_UDPLITE
- tristate
+ bool
depends on NF_NAT && NF_CT_PROTO_UDPLITE
default NF_NAT && NF_CT_PROTO_UDPLITE
config NF_NAT_PROTO_SCTP
- tristate
+ bool
default NF_NAT && NF_CT_PROTO_SCTP
depends on NF_NAT && NF_CT_PROTO_SCTP
select LIBCRC32C
@@ -551,6 +551,12 @@ config NFT_NAT
This option adds the "nat" expression that you can use to perform
typical Network Address Translation (NAT) packet transformations.
+config NFT_OBJREF
+ tristate "Netfilter nf_tables stateful object reference module"
+ help
+ This option adds the "objref" expression that allows you to refer to
+ stateful objects, such as counters and quotas.
+
config NFT_QUEUE
depends on NETFILTER_NETLINK_QUEUE
tristate "Netfilter nf_tables queue module"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5bbf767672ec..ca30d1960f1d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -5,6 +5,9 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
obj-$(CONFIG_NETFILTER) = netfilter.o
@@ -16,11 +19,7 @@ obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
# connection tracking
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
-# SCTP protocol connection tracking
-obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
-obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
-obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
# netlink interface for nf_conntrack
obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
@@ -45,6 +44,11 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
+# NAT protocols (nf_nat)
+nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
+nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+nf_nat-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
+
# generic transport layer logging
obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
@@ -54,11 +58,6 @@ obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
obj-$(CONFIG_NF_NAT) += nf_nat.o
obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
-# NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
-obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
-obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
-
# NAT helpers
obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
@@ -89,6 +88,7 @@ obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
obj-$(CONFIG_NFT_CT) += nft_ct.o
obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
obj-$(CONFIG_NFT_NAT) += nft_nat.o
+obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index de30e08d58f2..ce6adfae521a 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -102,17 +102,14 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
if (!entry)
return -ENOMEM;
- entry->orig_ops = reg;
- entry->ops = *reg;
- entry->next = NULL;
+ nf_hook_entry_init(entry, reg);
mutex_lock(&nf_hook_mutex);
/* Find the spot in the list */
- while ((p = nf_entry_dereference(*pp)) != NULL) {
- if (reg->priority < p->orig_ops->priority)
+ for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
+ if (reg->priority < nf_hook_entry_priority(p))
break;
- pp = &p->next;
}
rcu_assign_pointer(entry->next, p);
rcu_assign_pointer(*pp, entry);
@@ -139,12 +136,11 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
return;
mutex_lock(&nf_hook_mutex);
- while ((p = nf_entry_dereference(*pp)) != NULL) {
- if (p->orig_ops == reg) {
+ for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
+ if (nf_hook_entry_ops(p) == reg) {
rcu_assign_pointer(*pp, p->next);
break;
}
- pp = &p->next;
}
mutex_unlock(&nf_hook_mutex);
if (!p) {
@@ -311,7 +307,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
int ret;
do {
- verdict = entry->ops.hook(entry->ops.priv, skb, state);
+ verdict = nf_hook_entry_hookfn(entry, skb, state);
switch (verdict & NF_VERDICT_MASK) {
case NF_ACCEPT:
entry = rcu_dereference(entry->next);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 038c2ba0ae0f..3d02b0c13547 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3260,7 +3260,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
- if (IS_ERR(svc) || svc == NULL)
+ if (IS_ERR_OR_NULL(svc))
goto out_err;
/* Dump the destinations */
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 01d3d894de46..4e1a98fcc8c3 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -254,6 +254,54 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
return true;
}
+static inline bool decrement_ttl(struct netns_ipvs *ipvs,
+ int skb_af,
+ struct sk_buff *skb)
+{
+ struct net *net = ipvs->net;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (skb_af == AF_INET6) {
+ struct dst_entry *dst = skb_dst(skb);
+
+ /* check and decrement ttl */
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ /* Force OUTPUT device used as source address */
+ skb->dev = dst->dev;
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INHDRERRORS);
+
+ return false;
+ }
+
+ /* don't propagate ttl change to cloned packets */
+ if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ return false;
+
+ ipv6_hdr(skb)->hop_limit--;
+ } else
+#endif
+ {
+ if (ip_hdr(skb)->ttl <= 1) {
+ /* Tell the sender its packet died... */
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+ return false;
+ }
+
+ /* don't propagate ttl change to cloned packets */
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return false;
+
+ /* Decrease ttl */
+ ip_decrease_ttl(ip_hdr(skb));
+ }
+
+ return true;
+}
+
/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
@@ -326,6 +374,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
return local;
}
+ if (!decrement_ttl(ipvs, skb_af, skb))
+ goto err_put;
+
if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
mtu = dst_mtu(&rt->dst);
} else {
@@ -473,6 +524,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
return local;
}
+ if (!decrement_ttl(ipvs, skb_af, skb))
+ goto err_put;
+
/* MTU checking */
if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
mtu = dst_mtu(&rt->dst);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 9bd34647225a..2d6ee1803415 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -125,6 +125,54 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+ const struct nf_conntrack_l3proto *l3proto;
+ int ret;
+
+ might_sleep();
+
+ ret = nf_ct_l3proto_try_module_get(nfproto);
+ if (ret < 0)
+ return ret;
+
+ /* we already have a reference, can't fail */
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(nfproto);
+ rcu_read_unlock();
+
+ if (!l3proto->net_ns_get)
+ return 0;
+
+ ret = l3proto->net_ns_get(net);
+ if (ret < 0)
+ nf_ct_l3proto_module_put(nfproto);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_get);
+
+void nf_ct_netns_put(struct net *net, u8 nfproto)
+{
+ const struct nf_conntrack_l3proto *l3proto;
+
+ might_sleep();
+
+ /* same as nf_conntrack_netns_get(), reference assumed */
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(nfproto);
+ rcu_read_unlock();
+
+ if (WARN_ON(!l3proto))
+ return;
+
+ if (l3proto->net_ns_put)
+ l3proto->net_ns_put(net);
+
+ nf_ct_l3proto_module_put(nfproto);
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_put);
+
struct nf_conntrack_l4proto *
nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
{
@@ -190,20 +238,19 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
+#ifdef CONFIG_SYSCTL
+extern unsigned int nf_conntrack_default_on;
+
int nf_ct_l3proto_pernet_register(struct net *net,
struct nf_conntrack_l3proto *proto)
{
- int ret;
-
- if (proto->init_net) {
- ret = proto->init_net(net);
- if (ret < 0)
- return ret;
- }
+ if (nf_conntrack_default_on == 0)
+ return 0;
- return 0;
+ return proto->net_ns_get ? proto->net_ns_get(net) : 0;
}
EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
+#endif
void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto)
{
@@ -224,6 +271,16 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
void nf_ct_l3proto_pernet_unregister(struct net *net,
struct nf_conntrack_l3proto *proto)
{
+ /*
+ * nf_conntrack_default_on *might* have registered hooks.
+ * ->net_ns_put must cope with more puts() than get(), i.e.
+ * if nf_conntrack_default_on was 0 at time of
+ * nf_ct_l3proto_pernet_register invocation this net_ns_put()
+ * should be a noop.
+ */
+ if (proto->net_ns_put)
+ proto->net_ns_put(net);
+
/* Remove all contrack entries for this protocol */
nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
}
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 073b047314dc..b68ce6ac13b3 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -9,7 +9,6 @@
*
*/
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/spinlock.h>
@@ -384,17 +383,9 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
},
};
-/* this module per-net specifics */
-static unsigned int dccp_net_id __read_mostly;
-struct dccp_net {
- struct nf_proto_net pn;
- int dccp_loose;
- unsigned int dccp_timeout[CT_DCCP_MAX + 1];
-};
-
-static inline struct dccp_net *dccp_pernet(struct net *net)
+static inline struct nf_dccp_net *dccp_pernet(struct net *net)
{
- return net_generic(net, dccp_net_id);
+ return &net->ct.nf_ct_proto.dccp;
}
static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -424,7 +415,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff, unsigned int *timeouts)
{
struct net *net = nf_ct_net(ct);
- struct dccp_net *dn;
+ struct nf_dccp_net *dn;
struct dccp_hdr _dh, *dh;
const char *msg;
u_int8_t state;
@@ -719,7 +710,7 @@ static int dccp_nlattr_size(void)
static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
- struct dccp_net *dn = dccp_pernet(net);
+ struct nf_dccp_net *dn = dccp_pernet(net);
unsigned int *timeouts = data;
int i;
@@ -820,7 +811,7 @@ static struct ctl_table dccp_sysctl_table[] = {
#endif /* CONFIG_SYSCTL */
static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
- struct dccp_net *dn)
+ struct nf_dccp_net *dn)
{
#ifdef CONFIG_SYSCTL
if (pn->ctl_table)
@@ -850,7 +841,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
static int dccp_init_net(struct net *net, u_int16_t proto)
{
- struct dccp_net *dn = dccp_pernet(net);
+ struct nf_dccp_net *dn = dccp_pernet(net);
struct nf_proto_net *pn = &dn->pn;
if (!pn->users) {
@@ -868,7 +859,7 @@ static int dccp_init_net(struct net *net, u_int16_t proto)
return dccp_kmemdup_sysctl_table(net, pn, dn);
}
-static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = {
.l3proto = AF_INET,
.l4proto = IPPROTO_DCCP,
.name = "dccp",
@@ -898,11 +889,11 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
.nla_policy = dccp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &dccp_net_id,
.init_net = dccp_init_net,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
-static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = {
.l3proto = AF_INET6,
.l4proto = IPPROTO_DCCP,
.name = "dccp",
@@ -932,56 +923,6 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
.nla_policy = dccp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &dccp_net_id,
.init_net = dccp_init_net,
};
-
-static struct nf_conntrack_l4proto *dccp_proto[] = {
- &dccp_proto4,
- &dccp_proto6,
-};
-
-static __net_init int dccp_net_init(struct net *net)
-{
- return nf_ct_l4proto_pernet_register(net, dccp_proto,
- ARRAY_SIZE(dccp_proto));
-}
-
-static __net_exit void dccp_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, dccp_proto,
- ARRAY_SIZE(dccp_proto));
-}
-
-static struct pernet_operations dccp_net_ops = {
- .init = dccp_net_init,
- .exit = dccp_net_exit,
- .id = &dccp_net_id,
- .size = sizeof(struct dccp_net),
-};
-
-static int __init nf_conntrack_proto_dccp_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&dccp_net_ops);
- if (ret < 0)
- return ret;
- ret = nf_ct_l4proto_register(dccp_proto, ARRAY_SIZE(dccp_proto));
- if (ret < 0)
- unregister_pernet_subsys(&dccp_net_ops);
- return ret;
-}
-
-static void __exit nf_conntrack_proto_dccp_fini(void)
-{
- nf_ct_l4proto_unregister(dccp_proto, ARRAY_SIZE(dccp_proto));
- unregister_pernet_subsys(&dccp_net_ops);
-}
-
-module_init(nf_conntrack_proto_dccp_init);
-module_exit(nf_conntrack_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("DCCP connection tracking protocol helper");
-MODULE_LICENSE("GPL");
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6);
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index d096c2d6b87b..a0efde38da44 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -15,7 +15,6 @@
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/netfilter.h>
-#include <linux/module.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/sctp.h>
@@ -144,15 +143,9 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
}
};
-static unsigned int sctp_net_id __read_mostly;
-struct sctp_net {
- struct nf_proto_net pn;
- unsigned int timeouts[SCTP_CONNTRACK_MAX];
-};
-
-static inline struct sctp_net *sctp_pernet(struct net *net)
+static inline struct nf_sctp_net *sctp_pernet(struct net *net)
{
- return net_generic(net, sctp_net_id);
+ return &net->ct.nf_ct_proto.sctp;
}
static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -600,7 +593,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
unsigned int *timeouts = data;
- struct sctp_net *sn = sctp_pernet(net);
+ struct nf_sctp_net *sn = sctp_pernet(net);
int i;
/* set default SCTP timeouts. */
@@ -708,7 +701,7 @@ static struct ctl_table sctp_sysctl_table[] = {
#endif
static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct sctp_net *sn)
+ struct nf_sctp_net *sn)
{
#ifdef CONFIG_SYSCTL
if (pn->ctl_table)
@@ -735,7 +728,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
static int sctp_init_net(struct net *net, u_int16_t proto)
{
- struct sctp_net *sn = sctp_pernet(net);
+ struct nf_sctp_net *sn = sctp_pernet(net);
struct nf_proto_net *pn = &sn->pn;
if (!pn->users) {
@@ -748,7 +741,7 @@ static int sctp_init_net(struct net *net, u_int16_t proto)
return sctp_kmemdup_sysctl_table(pn, sn);
}
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.l3proto = PF_INET,
.l4proto = IPPROTO_SCTP,
.name = "sctp",
@@ -778,11 +771,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
.nla_policy = sctp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &sctp_net_id,
.init_net = sctp_init_net,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
.l3proto = PF_INET6,
.l4proto = IPPROTO_SCTP,
.name = "sctp",
@@ -812,57 +805,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
#endif
- .net_id = &sctp_net_id,
.init_net = sctp_init_net,
};
-
-static struct nf_conntrack_l4proto *sctp_proto[] = {
- &nf_conntrack_l4proto_sctp4,
- &nf_conntrack_l4proto_sctp6,
-};
-
-static int sctp_net_init(struct net *net)
-{
- return nf_ct_l4proto_pernet_register(net, sctp_proto,
- ARRAY_SIZE(sctp_proto));
-}
-
-static void sctp_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, sctp_proto,
- ARRAY_SIZE(sctp_proto));
-}
-
-static struct pernet_operations sctp_net_ops = {
- .init = sctp_net_init,
- .exit = sctp_net_exit,
- .id = &sctp_net_id,
- .size = sizeof(struct sctp_net),
-};
-
-static int __init nf_conntrack_proto_sctp_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&sctp_net_ops);
- if (ret < 0)
- return ret;
- ret = nf_ct_l4proto_register(sctp_proto, ARRAY_SIZE(sctp_proto));
- if (ret < 0)
- unregister_pernet_subsys(&sctp_net_ops);
- return ret;
-}
-
-static void __exit nf_conntrack_proto_sctp_fini(void)
-{
- nf_ct_l4proto_unregister(sctp_proto, ARRAY_SIZE(sctp_proto));
- unregister_pernet_subsys(&sctp_net_ops);
-}
-
-module_init(nf_conntrack_proto_sctp_init);
-module_exit(nf_conntrack_proto_sctp_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kiran Kumar Immidi");
-MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
-MODULE_ALIAS("ip_conntrack_proto_sctp");
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6);
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 7808604c70a2..c35f7bf05d8c 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -9,7 +9,6 @@
#include <linux/types.h>
#include <linux/timer.h>
-#include <linux/module.h>
#include <linux/udp.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
@@ -24,26 +23,14 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_log.h>
-enum udplite_conntrack {
- UDPLITE_CT_UNREPLIED,
- UDPLITE_CT_REPLIED,
- UDPLITE_CT_MAX
-};
-
static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
[UDPLITE_CT_UNREPLIED] = 30*HZ,
[UDPLITE_CT_REPLIED] = 180*HZ,
};
-static unsigned int udplite_net_id __read_mostly;
-struct udplite_net {
- struct nf_proto_net pn;
- unsigned int timeouts[UDPLITE_CT_MAX];
-};
-
-static inline struct udplite_net *udplite_pernet(struct net *net)
+static inline struct nf_udplite_net *udplite_pernet(struct net *net)
{
- return net_generic(net, udplite_net_id);
+ return &net->ct.nf_ct_proto.udplite;
}
static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
@@ -178,7 +165,7 @@ static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
unsigned int *timeouts = data;
- struct udplite_net *un = udplite_pernet(net);
+ struct nf_udplite_net *un = udplite_pernet(net);
/* set default timeouts for UDPlite. */
timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
@@ -237,7 +224,7 @@ static struct ctl_table udplite_sysctl_table[] = {
#endif /* CONFIG_SYSCTL */
static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct udplite_net *un)
+ struct nf_udplite_net *un)
{
#ifdef CONFIG_SYSCTL
if (pn->ctl_table)
@@ -257,7 +244,7 @@ static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
static int udplite_init_net(struct net *net, u_int16_t proto)
{
- struct udplite_net *un = udplite_pernet(net);
+ struct nf_udplite_net *un = udplite_pernet(net);
struct nf_proto_net *pn = &un->pn;
if (!pn->users) {
@@ -270,7 +257,7 @@ static int udplite_init_net(struct net *net, u_int16_t proto)
return udplite_kmemdup_sysctl_table(pn, un);
}
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
{
.l3proto = PF_INET,
.l4proto = IPPROTO_UDPLITE,
@@ -299,11 +286,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
.nla_policy = udplite_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &udplite_net_id,
.init_net = udplite_init_net,
};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
-static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
{
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDPLITE,
@@ -332,54 +319,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
.nla_policy = udplite_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .net_id = &udplite_net_id,
.init_net = udplite_init_net,
};
-
-static struct nf_conntrack_l4proto *udplite_proto[] = {
- &nf_conntrack_l4proto_udplite4,
- &nf_conntrack_l4proto_udplite6,
-};
-
-static int udplite_net_init(struct net *net)
-{
- return nf_ct_l4proto_pernet_register(net, udplite_proto,
- ARRAY_SIZE(udplite_proto));
-}
-
-static void udplite_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, udplite_proto,
- ARRAY_SIZE(udplite_proto));
-}
-
-static struct pernet_operations udplite_net_ops = {
- .init = udplite_net_init,
- .exit = udplite_net_exit,
- .id = &udplite_net_id,
- .size = sizeof(struct udplite_net),
-};
-
-static int __init nf_conntrack_proto_udplite_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&udplite_net_ops);
- if (ret < 0)
- return ret;
- ret = nf_ct_l4proto_register(udplite_proto, ARRAY_SIZE(udplite_proto));
- if (ret < 0)
- unregister_pernet_subsys(&udplite_net_ops);
- return ret;
-}
-
-static void __exit nf_conntrack_proto_udplite_exit(void)
-{
- nf_ct_l4proto_unregister(udplite_proto, ARRAY_SIZE(udplite_proto));
- unregister_pernet_subsys(&udplite_net_ops);
-}
-
-module_init(nf_conntrack_proto_udplite_init);
-module_exit(nf_conntrack_proto_udplite_exit);
-
-MODULE_LICENSE("GPL");
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 5f446cd9f3fd..d009ae663453 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -452,6 +452,9 @@ static int log_invalid_proto_max __read_mostly = 255;
/* size the user *wants to set */
static unsigned int nf_conntrack_htable_size_user __read_mostly;
+extern unsigned int nf_conntrack_default_on;
+unsigned int nf_conntrack_default_on __read_mostly = 1;
+
static int
nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -517,6 +520,13 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "nf_conntrack_default_on",
+ .data = &nf_conntrack_default_on,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 44ae986c383f..c9d7f95768ab 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -14,6 +14,29 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+{
+ if (skb_mac_header_was_set(skb))
+ skb_push(skb, skb->mac_len);
+
+ skb->dev = dev;
+ dev_queue_xmit(skb);
+}
+
+void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+ if (!dev) {
+ kfree_skb(pkt->skb);
+ return;
+ }
+
+ nf_do_netdev_egress(pkt->skb, dev);
+}
+EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
+
void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
{
struct net_device *dev;
@@ -24,14 +47,8 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
return;
skb = skb_clone(pkt->skb, GFP_ATOMIC);
- if (skb == NULL)
- return;
-
- if (skb_mac_header_was_set(skb))
- skb_push(skb, skb->mac_len);
-
- skb->dev = dev;
- dev_queue_xmit(skb);
+ if (skb)
+ nf_do_netdev_egress(skb, dev);
}
EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index ed9b80815fa0..dc61399e30be 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -177,6 +177,7 @@ EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
/* bridge and netdev logging families share this code. */
void nf_log_l2packet(struct net *net, u_int8_t pf,
+ __be16 protocol,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -184,7 +185,7 @@ void nf_log_l2packet(struct net *net, u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
- switch (eth_hdr(skb)->h_proto) {
+ switch (protocol) {
case htons(ETH_P_IP):
nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
loginfo, "%s", prefix);
diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c
index 1f645949f3d8..350eb147754d 100644
--- a/net/netfilter/nf_log_netdev.c
+++ b/net/netfilter/nf_log_netdev.c
@@ -23,7 +23,8 @@ static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
- nf_log_l2packet(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out,
+ loginfo, prefix);
}
static struct nf_logger nf_netdev_logger __read_mostly = {
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index bbb8f3df79f7..94b14c5a8b17 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -42,7 +42,7 @@ struct nf_nat_conn_key {
const struct nf_conntrack_zone *zone;
};
-static struct rhashtable nf_nat_bysource_table;
+static struct rhltable nf_nat_bysource_table;
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
@@ -193,9 +193,12 @@ static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
const struct nf_nat_conn_key *key = arg->key;
const struct nf_conn *ct = obj;
- return same_src(ct, key->tuple) &&
- net_eq(nf_ct_net(ct), key->net) &&
- nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL);
+ if (!same_src(ct, key->tuple) ||
+ !net_eq(nf_ct_net(ct), key->net) ||
+ !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
+ return 1;
+
+ return 0;
}
static struct rhashtable_params nf_nat_bysource_params = {
@@ -204,7 +207,6 @@ static struct rhashtable_params nf_nat_bysource_params = {
.obj_cmpfn = nf_nat_bysource_cmp,
.nelem_hint = 256,
.min_size = 1024,
- .nulls_base = (1U << RHT_BASE_SHIFT),
};
/* Only called for SRC manip */
@@ -223,12 +225,15 @@ find_appropriate_src(struct net *net,
.tuple = tuple,
.zone = zone
};
+ struct rhlist_head *hl;
- ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key,
- nf_nat_bysource_params);
- if (!ct)
+ hl = rhltable_lookup(&nf_nat_bysource_table, &key,
+ nf_nat_bysource_params);
+ if (!hl)
return 0;
+ ct = container_of(hl, typeof(*ct), nat_bysource);
+
nf_ct_invert_tuplepr(result,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
result->dst = tuple->dst;
@@ -446,11 +451,17 @@ nf_nat_setup_info(struct nf_conn *ct,
}
if (maniptype == NF_NAT_MANIP_SRC) {
+ struct nf_nat_conn_key key = {
+ .net = nf_ct_net(ct),
+ .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ .zone = nf_ct_zone(ct),
+ };
int err;
- err = rhashtable_insert_fast(&nf_nat_bysource_table,
- &ct->nat_bysource,
- nf_nat_bysource_params);
+ err = rhltable_insert_key(&nf_nat_bysource_table,
+ &key,
+ &ct->nat_bysource,
+ nf_nat_bysource_params);
if (err)
return NF_DROP;
}
@@ -567,8 +578,8 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* will delete entry from already-freed table.
*/
ct->status &= ~IPS_NAT_DONE_MASK;
- rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
+ rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
+ nf_nat_bysource_params);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -671,6 +682,18 @@ int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
&nf_nat_l4proto_tcp);
RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
&nf_nat_l4proto_udp);
+#ifdef CONFIG_NF_NAT_PROTO_DCCP
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP],
+ &nf_nat_l4proto_dccp);
+#endif
+#ifdef CONFIG_NF_NAT_PROTO_SCTP
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP],
+ &nf_nat_l4proto_sctp);
+#endif
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDPLITE],
+ &nf_nat_l4proto_udplite);
+#endif
mutex_unlock(&nf_nat_proto_mutex);
RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
@@ -698,8 +721,8 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
if (!nat)
return;
- rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
+ rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
+ nf_nat_bysource_params);
}
static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -834,13 +857,13 @@ static int __init nf_nat_init(void)
{
int ret;
- ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
+ ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
if (ret)
return ret;
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
- rhashtable_destroy(&nf_nat_bysource_table);
+ rhltable_destroy(&nf_nat_bysource_table);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
@@ -864,7 +887,7 @@ static int __init nf_nat_init(void)
return 0;
cleanup_extend:
- rhashtable_destroy(&nf_nat_bysource_table);
+ rhltable_destroy(&nf_nat_bysource_table);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
@@ -883,7 +906,7 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
- rhashtable_destroy(&nf_nat_bysource_table);
+ rhltable_destroy(&nf_nat_bysource_table);
}
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index 15c47b246d0d..269fcd5dc34c 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -10,8 +10,6 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/dccp.h>
@@ -73,7 +71,7 @@ dccp_manip_pkt(struct sk_buff *skb,
return true;
}
-static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
+const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
.l4proto = IPPROTO_DCCP,
.manip_pkt = dccp_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
@@ -82,35 +80,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
-
-static int __init nf_nat_proto_dccp_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
- if (err < 0)
- goto err1;
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
-err1:
- return err;
-}
-
-static void __exit nf_nat_proto_dccp_fini(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
-
-}
-
-module_init(nf_nat_proto_dccp_init);
-module_exit(nf_nat_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("DCCP NAT protocol helper");
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index cbc7ade1487b..31d358691af0 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -7,9 +7,7 @@
*/
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/sctp.h>
-#include <linux/module.h>
#include <net/sctp/checksum.h>
#include <net/netfilter/nf_nat_l4proto.h>
@@ -49,12 +47,15 @@ sctp_manip_pkt(struct sk_buff *skb,
hdr->dest = tuple->dst.u.sctp.port;
}
- hdr->checksum = sctp_compute_cksum(skb, hdroff);
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ hdr->checksum = sctp_compute_cksum(skb, hdroff);
+ skb->ip_summed = CHECKSUM_NONE;
+ }
return true;
}
-static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
+const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
.l4proto = IPPROTO_SCTP,
.manip_pkt = sctp_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
@@ -63,34 +64,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
-
-static int __init nf_nat_proto_sctp_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
- if (err < 0)
- goto err1;
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
-err1:
- return err;
-}
-
-static void __exit nf_nat_proto_sctp_exit(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
-}
-
-module_init(nf_nat_proto_sctp_init);
-module_exit(nf_nat_proto_sctp_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SCTP NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
index 58340c97bd83..366bfbfd82a1 100644
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ b/net/netfilter/nf_nat_proto_udplite.c
@@ -8,11 +8,9 @@
*/
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/udp.h>
#include <linux/netfilter.h>
-#include <linux/module.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/netfilter/nf_nat_l4proto.h>
@@ -64,7 +62,7 @@ udplite_manip_pkt(struct sk_buff *skb,
return true;
}
-static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
+const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
.l4proto = IPPROTO_UDPLITE,
.manip_pkt = udplite_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
@@ -73,34 +71,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
-
-static int __init nf_nat_proto_udplite_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
- if (err < 0)
- goto err1;
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
-err1:
- return err;
-}
-
-static void __exit nf_nat_proto_udplite_fini(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
-}
-
-module_init(nf_nat_proto_udplite_init);
-module_exit(nf_nat_proto_udplite_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 77cba9f6ccb6..4a7662486f44 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -185,7 +185,7 @@ static unsigned int nf_iterate(struct sk_buff *skb,
do {
repeat:
- verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state);
+ verdict = nf_hook_entry_hookfn((*entryp), skb, state);
if (verdict != NF_ACCEPT) {
if (verdict != NF_REPEAT)
return verdict;
@@ -200,7 +200,6 @@ repeat:
void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
struct nf_hook_entry *hook_entry = entry->hook;
- struct nf_hook_ops *elem = &hook_entry->ops;
struct sk_buff *skb = entry->skb;
const struct nf_afinfo *afinfo;
int err;
@@ -209,7 +208,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
/* Continue traversal iff userspace said ok... */
if (verdict == NF_REPEAT)
- verdict = elem->hook(elem->priv, skb, &entry->state);
+ verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
if (verdict == NF_ACCEPT) {
afinfo = nf_get_afinfo(entry->state.pf);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 026581b04ea8..a019a87e58ee 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -22,6 +22,7 @@
#include <net/sock.h>
static LIST_HEAD(nf_tables_expressions);
+static LIST_HEAD(nf_tables_objects);
/**
* nft_register_afinfo - register nf_tables address family info
@@ -110,12 +111,12 @@ static void nft_ctx_init(struct nft_ctx *ctx,
ctx->seq = nlh->nlmsg_seq;
}
-static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
- u32 size)
+static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
+ int msg_type, u32 size, gfp_t gfp)
{
struct nft_trans *trans;
- trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL);
+ trans = kzalloc(sizeof(struct nft_trans) + size, gfp);
if (trans == NULL)
return NULL;
@@ -125,6 +126,12 @@ static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
return trans;
}
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+ int msg_type, u32 size)
+{
+ return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
+}
+
static void nft_trans_destroy(struct nft_trans *trans)
{
list_del(&trans->list);
@@ -304,6 +311,38 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
return err;
}
+static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_object *obj)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWOBJ)
+ nft_activate_next(ctx->net, obj);
+
+ nft_trans_obj(trans) = obj;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+}
+
+static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
+{
+ int err;
+
+ err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj);
+ if (err < 0)
+ return err;
+
+ nft_deactivate_next(ctx->net, obj);
+ ctx->table->use--;
+
+ return err;
+}
+
/*
* Tables
*/
@@ -688,6 +727,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
INIT_LIST_HEAD(&table->chains);
INIT_LIST_HEAD(&table->sets);
+ INIT_LIST_HEAD(&table->objects);
table->flags = flags;
nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
@@ -709,6 +749,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
{
int err;
struct nft_chain *chain, *nc;
+ struct nft_object *obj, *ne;
struct nft_set *set, *ns;
list_for_each_entry(chain, &ctx->table->chains, list) {
@@ -735,6 +776,12 @@ static int nft_flush_table(struct nft_ctx *ctx)
goto out;
}
+ list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) {
+ err = nft_delobj(ctx, obj);
+ if (err < 0)
+ goto out;
+ }
+
list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) {
if (!nft_is_active_next(ctx->net, chain))
continue;
@@ -2411,6 +2458,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 },
[NFTA_SET_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
+ [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -2462,6 +2510,7 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
}
return ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup);
struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
const struct nlattr *nla,
@@ -2480,6 +2529,7 @@ struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
}
return ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid);
static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
const char *name)
@@ -2568,9 +2618,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
goto nla_put_failure;
}
+ if (set->flags & NFT_SET_OBJECT &&
+ nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype)))
+ goto nla_put_failure;
if (set->timeout &&
- nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout),
+ nla_put_be64(skb, NFTA_SET_TIMEOUT,
+ cpu_to_be64(jiffies_to_msecs(set->timeout)),
NFTA_SET_PAD))
goto nla_put_failure;
if (set->gc_int &&
@@ -2796,7 +2850,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
unsigned int size;
bool create;
u64 timeout;
- u32 ktype, dtype, flags, policy, gc_int;
+ u32 ktype, dtype, flags, policy, gc_int, objtype;
struct nft_set_desc desc;
unsigned char *udata;
u16 udlen;
@@ -2826,11 +2880,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
- NFT_SET_MAP | NFT_SET_EVAL))
+ NFT_SET_MAP | NFT_SET_EVAL |
+ NFT_SET_OBJECT))
return -EINVAL;
- /* Only one of both operations is supported */
- if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) ==
- (NFT_SET_MAP | NFT_SET_EVAL))
+ /* Only one of these operations is supported */
+ if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) ==
+ (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT))
return -EOPNOTSUPP;
}
@@ -2855,11 +2910,25 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
} else if (flags & NFT_SET_MAP)
return -EINVAL;
+ if (nla[NFTA_SET_OBJ_TYPE] != NULL) {
+ if (!(flags & NFT_SET_OBJECT))
+ return -EINVAL;
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
+ if (objtype == NFT_OBJECT_UNSPEC ||
+ objtype > NFT_OBJECT_MAX)
+ return -EINVAL;
+ } else if (flags & NFT_SET_OBJECT)
+ return -EINVAL;
+ else
+ objtype = NFT_OBJECT_UNSPEC;
+
timeout = 0;
if (nla[NFTA_SET_TIMEOUT] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
- timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_TIMEOUT]));
+ timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
+ nla[NFTA_SET_TIMEOUT])));
}
gc_int = 0;
if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
@@ -2941,6 +3010,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
set->ktype = ktype;
set->klen = desc.klen;
set->dtype = dtype;
+ set->objtype = objtype;
set->dlen = desc.dlen;
set->flags = flags;
set->size = desc.size;
@@ -3062,6 +3132,7 @@ bind:
list_add_tail_rcu(&binding->list, &set->bindings);
return 0;
}
+EXPORT_SYMBOL_GPL(nf_tables_bind_set);
void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding)
@@ -3072,6 +3143,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
nft_is_active(ctx->net, set))
nf_tables_set_destroy(ctx, set);
}
+EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_KEY] = {
@@ -3083,6 +3155,10 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_EXPR] = {
.align = __alignof__(struct nft_expr),
},
+ [NFT_SET_EXT_OBJREF] = {
+ .len = sizeof(struct nft_object *),
+ .align = __alignof__(struct nft_object *),
+ },
[NFT_SET_EXT_FLAGS] = {
.len = sizeof(u8),
.align = __alignof__(u8),
@@ -3171,6 +3247,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
goto nla_put_failure;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
+ nla_put_string(skb, NFTA_SET_ELEM_OBJREF,
+ (*nft_set_ext_obj(ext))->name) < 0)
+ goto nla_put_failure;
+
if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
nla_put_be32(skb, NFTA_SET_ELEM_FLAGS,
htonl(*nft_set_ext_flags(ext))))
@@ -3178,7 +3259,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
- cpu_to_be64(*nft_set_ext_timeout(ext)),
+ cpu_to_be64(jiffies_to_msecs(
+ *nft_set_ext_timeout(ext))),
NFTA_SET_ELEM_PAD))
goto nla_put_failure;
@@ -3447,7 +3529,7 @@ void *nft_set_elem_init(const struct nft_set *set,
memcpy(nft_set_ext_data(ext), data, set->dlen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
*nft_set_ext_expiration(ext) =
- jiffies + msecs_to_jiffies(timeout);
+ jiffies + timeout;
if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
*nft_set_ext_timeout(ext) = timeout;
@@ -3464,7 +3546,8 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
nft_data_uninit(nft_set_ext_data(ext), set->dtype);
if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
-
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+ (*nft_set_ext_obj(ext))->use--;
kfree(elem);
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
@@ -3489,11 +3572,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ u8 genmask = nft_genmask_next(ctx->net);
struct nft_data_desc d1, d2;
struct nft_set_ext_tmpl tmpl;
struct nft_set_ext *ext, *ext2;
struct nft_set_elem elem;
struct nft_set_binding *binding;
+ struct nft_object *obj = NULL;
struct nft_userdata *udata;
struct nft_data data;
enum nft_registers dreg;
@@ -3535,7 +3620,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
return -EINVAL;
- timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_ELEM_TIMEOUT]));
+ timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
+ nla[NFTA_SET_ELEM_TIMEOUT])));
} else if (set->flags & NFT_SET_TIMEOUT) {
timeout = set->timeout;
}
@@ -3555,6 +3641,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
}
+ if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
+ if (!(set->flags & NFT_SET_OBJECT)) {
+ err = -EINVAL;
+ goto err2;
+ }
+ obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
+ set->objtype, genmask);
+ if (IS_ERR(obj)) {
+ err = PTR_ERR(obj);
+ goto err2;
+ }
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+ }
+
if (nla[NFTA_SET_ELEM_DATA] != NULL) {
err = nft_data_init(ctx, &data, sizeof(data), &d2,
nla[NFTA_SET_ELEM_DATA]);
@@ -3613,6 +3713,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
udata->len = ulen - 1;
nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
}
+ if (obj) {
+ *nft_set_ext_obj(ext) = obj;
+ obj->use++;
+ }
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
if (trans == NULL)
@@ -3622,10 +3726,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = set->ops->insert(ctx->net, set, &elem, &ext2);
if (err) {
if (err == -EEXIST) {
- if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
- nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
- memcmp(nft_set_ext_data(ext),
- nft_set_ext_data(ext2), set->dlen) != 0)
+ if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
+ memcmp(nft_set_ext_data(ext),
+ nft_set_ext_data(ext2), set->dlen) != 0) ||
+ (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
+ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
+ *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
err = -EBUSY;
else if (!(nlmsg_flags & NLM_F_EXCL))
err = 0;
@@ -3775,6 +3882,34 @@ err1:
return err;
}
+static int nft_flush_set(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_iter *iter,
+ const struct nft_set_elem *elem)
+{
+ struct nft_trans *trans;
+ int err;
+
+ trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
+ sizeof(struct nft_trans_elem), GFP_ATOMIC);
+ if (!trans)
+ return -ENOMEM;
+
+ if (!set->ops->deactivate_one(ctx->net, set, elem->priv)) {
+ err = -ENOENT;
+ goto err1;
+ }
+
+ nft_trans_elem_set(trans) = (struct nft_set *)set;
+ nft_trans_elem(trans) = *((struct nft_set_elem *)elem);
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+err1:
+ kfree(trans);
+ return err;
+}
+
static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
@@ -3785,9 +3920,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
int rem, err = 0;
- if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
- return -EINVAL;
-
err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
if (err < 0)
return err;
@@ -3799,6 +3931,18 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
return -EBUSY;
+ if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
+ struct nft_set_dump_args args = {
+ .iter = {
+ .genmask = genmask,
+ .fn = nft_flush_set,
+ },
+ };
+ set->ops->walk(&ctx, set, &args.iter);
+
+ return args.iter.err;
+ }
+
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_del_setelem(&ctx, set, attr);
if (err < 0)
@@ -3834,6 +3978,500 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
}
EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
+/*
+ * Stateful objects
+ */
+
+/**
+ * nft_register_obj- register nf_tables stateful object type
+ * @obj: object type
+ *
+ * Registers the object type for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_obj(struct nft_object_type *obj_type)
+{
+ if (obj_type->type == NFT_OBJECT_UNSPEC)
+ return -EINVAL;
+
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_add_rcu(&obj_type->list, &nf_tables_objects);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_obj);
+
+/**
+ * nft_unregister_obj - unregister nf_tables object type
+ * @obj: object type
+ *
+ * Unregisters the object type for use with nf_tables.
+ */
+void nft_unregister_obj(struct nft_object_type *obj_type)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&obj_type->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_obj);
+
+struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
+ const struct nlattr *nla,
+ u32 objtype, u8 genmask)
+{
+ struct nft_object *obj;
+
+ list_for_each_entry(obj, &table->objects, list) {
+ if (!nla_strcmp(nla, obj->name) &&
+ objtype == obj->type->type &&
+ nft_active_genmask(obj, genmask))
+ return obj;
+ }
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
+
+static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
+ [NFTA_OBJ_TABLE] = { .type = NLA_STRING },
+ [NFTA_OBJ_NAME] = { .type = NLA_STRING },
+ [NFTA_OBJ_TYPE] = { .type = NLA_U32 },
+ [NFTA_OBJ_DATA] = { .type = NLA_NESTED },
+};
+
+static struct nft_object *nft_obj_init(const struct nft_object_type *type,
+ const struct nlattr *attr)
+{
+ struct nlattr *tb[type->maxattr + 1];
+ struct nft_object *obj;
+ int err;
+
+ if (attr) {
+ err = nla_parse_nested(tb, type->maxattr, attr, type->policy);
+ if (err < 0)
+ goto err1;
+ } else {
+ memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1));
+ }
+
+ err = -ENOMEM;
+ obj = kzalloc(sizeof(struct nft_object) + type->size, GFP_KERNEL);
+ if (obj == NULL)
+ goto err1;
+
+ err = type->init((const struct nlattr * const *)tb, obj);
+ if (err < 0)
+ goto err2;
+
+ obj->type = type;
+ return obj;
+err2:
+ kfree(obj);
+err1:
+ return ERR_PTR(err);
+}
+
+static int nft_object_dump(struct sk_buff *skb, unsigned int attr,
+ struct nft_object *obj, bool reset)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ goto nla_put_failure;
+ if (obj->type->dump(skb, obj, reset) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
+{
+ const struct nft_object_type *type;
+
+ list_for_each_entry(type, &nf_tables_objects, list) {
+ if (objtype == type->type)
+ return type;
+ }
+ return NULL;
+}
+
+static const struct nft_object_type *nft_obj_type_get(u32 objtype)
+{
+ const struct nft_object_type *type;
+
+ type = __nft_obj_type_get(objtype);
+ if (type != NULL && try_module_get(type->owner))
+ return type;
+
+#ifdef CONFIG_MODULES
+ if (type == NULL) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-obj-%u", objtype);
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (__nft_obj_type_get(objtype))
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_newobj(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_object_type *type;
+ u8 genmask = nft_genmask_next(net);
+ int family = nfmsg->nfgen_family;
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_object *obj;
+ struct nft_ctx ctx;
+ u32 objtype;
+ int err;
+
+ if (!nla[NFTA_OBJ_TYPE] ||
+ !nla[NFTA_OBJ_NAME] ||
+ !nla[NFTA_OBJ_DATA])
+ return -EINVAL;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj)) {
+ err = PTR_ERR(obj);
+ if (err != -ENOENT)
+ return err;
+
+ obj = NULL;
+ }
+
+ if (obj != NULL) {
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ return 0;
+ }
+
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+ type = nft_obj_type_get(objtype);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+
+ obj = nft_obj_init(type, nla[NFTA_OBJ_DATA]);
+ if (IS_ERR(obj)) {
+ err = PTR_ERR(obj);
+ goto err1;
+ }
+ obj->table = table;
+ nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN);
+
+ err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
+ if (err < 0)
+ goto err2;
+
+ list_add_tail_rcu(&obj->list, &table->objects);
+ table->use++;
+ return 0;
+err2:
+ if (obj->type->destroy)
+ obj->type->destroy(obj);
+ kfree(obj);
+err1:
+ module_put(type->owner);
+ return err;
+}
+
+static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
+ u32 portid, u32 seq, int event, u32 flags,
+ int family, const struct nft_table *table,
+ struct nft_object *obj, bool reset)
+{
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
+
+ if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
+ nla_put_string(skb, NFTA_OBJ_NAME, obj->name) ||
+ nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->type->type)) ||
+ nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
+ nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+struct nft_obj_filter {
+ char table[NFT_OBJ_MAXNAMELEN];
+ u32 type;
+};
+
+static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct nft_obj_filter *filter = cb->data;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ struct nft_object *obj;
+ bool reset = false;
+
+ if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
+ reset = true;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ list_for_each_entry_rcu(obj, &table->objects, list) {
+ if (!nft_is_active(net, obj))
+ goto cont;
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (filter->table[0] &&
+ strcmp(filter->table, table->name))
+ goto cont;
+ if (filter->type != NFT_OBJECT_UNSPEC &&
+ obj->type->type != filter->type)
+ goto cont;
+
+ if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWOBJ,
+ NLM_F_MULTI | NLM_F_APPEND,
+ afi->family, table, obj, reset) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+ }
+done:
+ rcu_read_unlock();
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nf_tables_dump_obj_done(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+
+ return 0;
+}
+
+static struct nft_obj_filter *
+nft_obj_filter_alloc(const struct nlattr * const nla[])
+{
+ struct nft_obj_filter *filter;
+
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return ERR_PTR(-ENOMEM);
+
+ if (nla[NFTA_OBJ_TABLE])
+ nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE],
+ NFT_TABLE_MAXNAMELEN);
+ if (nla[NFTA_OBJ_TYPE])
+ filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+
+ return filter;
+}
+
+static int nf_tables_getobj(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_cur(net);
+ int family = nfmsg->nfgen_family;
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ struct nft_object *obj;
+ struct sk_buff *skb2;
+ bool reset = false;
+ u32 objtype;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_obj,
+ .done = nf_tables_dump_obj_done,
+ };
+
+ if (nla[NFTA_OBJ_TABLE] ||
+ nla[NFTA_OBJ_TYPE]) {
+ struct nft_obj_filter *filter;
+
+ filter = nft_obj_filter_alloc(nla);
+ if (IS_ERR(filter))
+ return -ENOMEM;
+
+ c.data = filter;
+ }
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ if (!nla[NFTA_OBJ_NAME] ||
+ !nla[NFTA_OBJ_TYPE])
+ return -EINVAL;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
+ reset = true;
+
+ err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+ family, table, obj, reset);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+err:
+ kfree_skb(skb2);
+ return err;
+
+ return 0;
+}
+
+static void nft_obj_destroy(struct nft_object *obj)
+{
+ if (obj->type->destroy)
+ obj->type->destroy(obj);
+
+ module_put(obj->type->owner);
+ kfree(obj);
+}
+
+static int nf_tables_delobj(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u8 genmask = nft_genmask_next(net);
+ int family = nfmsg->nfgen_family;
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_object *obj;
+ struct nft_ctx ctx;
+ u32 objtype;
+
+ if (!nla[NFTA_OBJ_TYPE] ||
+ !nla[NFTA_OBJ_NAME])
+ return -EINVAL;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+ if (obj->use > 0)
+ return -EBUSY;
+
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+ return nft_delobj(&ctx, obj);
+}
+
+int nft_obj_notify(struct net *net, struct nft_table *table,
+ struct nft_object *obj, u32 portid, u32 seq, int event,
+ int family, int report, gfp_t gfp)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!report &&
+ !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family,
+ table, obj, false);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(nft_obj_notify);
+
+static int nf_tables_obj_notify(const struct nft_ctx *ctx,
+ struct nft_object *obj, int event)
+{
+ return nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid,
+ ctx->seq, event, ctx->afi->family, ctx->report,
+ GFP_KERNEL);
+}
+
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq)
{
@@ -3994,6 +4632,26 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
[NFT_MSG_GETGEN] = {
.call = nf_tables_getgen,
},
+ [NFT_MSG_NEWOBJ] = {
+ .call_batch = nf_tables_newobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_GETOBJ] = {
+ .call = nf_tables_getobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_DELOBJ] = {
+ .call_batch = nf_tables_delobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_GETOBJ_RESET] = {
+ .call = nf_tables_getobj,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
};
static void nft_chain_commit_update(struct nft_trans *trans)
@@ -4036,6 +4694,9 @@ static void nf_tables_commit_release(struct nft_trans *trans)
nft_set_elem_destroy(nft_trans_elem_set(trans),
nft_trans_elem(trans).priv, true);
break;
+ case NFT_MSG_DELOBJ:
+ nft_obj_destroy(nft_trans_obj(trans));
+ break;
}
kfree(trans);
}
@@ -4143,6 +4804,17 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
atomic_dec(&te->set->nelems);
te->set->ndeact--;
break;
+ case NFT_MSG_NEWOBJ:
+ nft_clear(net, nft_trans_obj(trans));
+ nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
+ NFT_MSG_NEWOBJ);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELOBJ:
+ list_del_rcu(&nft_trans_obj(trans)->list);
+ nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
+ NFT_MSG_DELOBJ);
+ break;
}
}
@@ -4177,6 +4849,9 @@ static void nf_tables_abort_release(struct nft_trans *trans)
nft_set_elem_destroy(nft_trans_elem_set(trans),
nft_trans_elem(trans).priv, true);
break;
+ case NFT_MSG_NEWOBJ:
+ nft_obj_destroy(nft_trans_obj(trans));
+ break;
}
kfree(trans);
}
@@ -4257,6 +4932,15 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
nft_trans_destroy(trans);
break;
+ case NFT_MSG_NEWOBJ:
+ trans->ctx.table->use--;
+ list_del_rcu(&nft_trans_obj(trans)->list);
+ break;
+ case NFT_MSG_DELOBJ:
+ trans->ctx.table->use++;
+ nft_clear(trans->ctx.net, nft_trans_obj(trans));
+ nft_trans_destroy(trans);
+ break;
}
}
@@ -4803,6 +5487,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
{
struct nft_table *table, *nt;
struct nft_chain *chain, *nc;
+ struct nft_object *obj, *ne;
struct nft_rule *rule, *nr;
struct nft_set *set, *ns;
struct nft_ctx ctx = {
@@ -4829,6 +5514,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
table->use--;
nft_set_destroy(set);
}
+ list_for_each_entry_safe(obj, ne, &table->objects, list) {
+ list_del(&obj->list);
+ table->use--;
+ nft_obj_destroy(obj);
+ }
list_for_each_entry_safe(chain, nc, &table->chains, list) {
list_del(&chain->list);
table->use--;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 763cb4d54e8d..200922bb2036 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1152,6 +1152,7 @@ MODULE_ALIAS_NF_LOGGER(AF_INET, 1);
MODULE_ALIAS_NF_LOGGER(AF_INET6, 1);
MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1);
MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */
+MODULE_ALIAS_NF_LOGGER(5, 1); /* NFPROTO_NETDEV */
module_init(nfnetlink_log_init);
module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index 77db8358ab14..f6a02c5071c2 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -31,11 +31,10 @@ struct nft_counter_percpu_priv {
struct nft_counter_percpu __percpu *counter;
};
-static void nft_counter_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
struct nft_counter_percpu *this_cpu;
local_bh_disable();
@@ -47,10 +46,64 @@ static void nft_counter_eval(const struct nft_expr *expr,
local_bh_enable();
}
-static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter,
+static inline void nft_counter_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ nft_counter_do_eval(priv, regs, pkt);
+}
+
+static int nft_counter_do_init(const struct nlattr * const tb[],
+ struct nft_counter_percpu_priv *priv)
+{
+ struct nft_counter_percpu __percpu *cpu_stats;
+ struct nft_counter_percpu *this_cpu;
+
+ cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
+ if (cpu_stats == NULL)
+ return -ENOMEM;
+
+ preempt_disable();
+ this_cpu = this_cpu_ptr(cpu_stats);
+ if (tb[NFTA_COUNTER_PACKETS]) {
+ this_cpu->counter.packets =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ }
+ if (tb[NFTA_COUNTER_BYTES]) {
+ this_cpu->counter.bytes =
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ }
+ preempt_enable();
+ priv->counter = cpu_stats;
+ return 0;
+}
+
+static int nft_counter_obj_init(const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ return nft_counter_do_init(tb, priv);
+}
+
+static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv)
+{
+ free_percpu(priv->counter);
+}
+
+static void nft_counter_obj_destroy(struct nft_object *obj)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ nft_counter_do_destroy(priv);
+}
+
+static void nft_counter_fetch(struct nft_counter_percpu __percpu *counter,
struct nft_counter *total)
{
- const struct nft_counter_percpu *cpu_stats;
+ struct nft_counter_percpu *cpu_stats;
u64 bytes, packets;
unsigned int seq;
int cpu;
@@ -69,12 +122,52 @@ static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter,
}
}
-static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static u64 __nft_counter_reset(u64 *counter)
+{
+ u64 ret, old;
+
+ do {
+ old = *counter;
+ ret = cmpxchg64(counter, old, 0);
+ } while (ret != old);
+
+ return ret;
+}
+
+static void nft_counter_reset(struct nft_counter_percpu __percpu *counter,
+ struct nft_counter *total)
+{
+ struct nft_counter_percpu *cpu_stats;
+ u64 bytes, packets;
+ unsigned int seq;
+ int cpu;
+
+ memset(total, 0, sizeof(*total));
+ for_each_possible_cpu(cpu) {
+ bytes = packets = 0;
+
+ cpu_stats = per_cpu_ptr(counter, cpu);
+ do {
+ seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ packets += __nft_counter_reset(&cpu_stats->counter.packets);
+ bytes += __nft_counter_reset(&cpu_stats->counter.bytes);
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+
+ total->packets += packets;
+ total->bytes += bytes;
+ }
+}
+
+static int nft_counter_do_dump(struct sk_buff *skb,
+ const struct nft_counter_percpu_priv *priv,
+ bool reset)
{
- struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
struct nft_counter total;
- nft_counter_fetch(priv->counter, &total);
+ if (reset)
+ nft_counter_reset(priv->counter, &total);
+ else
+ nft_counter_fetch(priv->counter, &total);
if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes),
NFTA_COUNTER_PAD) ||
@@ -87,36 +180,54 @@ nla_put_failure:
return -1;
}
+static int nft_counter_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
+
+ return nft_counter_do_dump(skb, priv, reset);
+}
+
static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
[NFTA_COUNTER_PACKETS] = { .type = NLA_U64 },
[NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
};
+static struct nft_object_type nft_counter_obj __read_mostly = {
+ .type = NFT_OBJECT_COUNTER,
+ .size = sizeof(struct nft_counter_percpu_priv),
+ .maxattr = NFTA_COUNTER_MAX,
+ .policy = nft_counter_policy,
+ .eval = nft_counter_obj_eval,
+ .init = nft_counter_obj_init,
+ .destroy = nft_counter_obj_destroy,
+ .dump = nft_counter_obj_dump,
+ .owner = THIS_MODULE,
+};
+
+static void nft_counter_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+
+ nft_counter_do_eval(priv, regs, pkt);
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+
+ return nft_counter_do_dump(skb, priv, false);
+}
+
static int nft_counter_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- struct nft_counter_percpu __percpu *cpu_stats;
- struct nft_counter_percpu *this_cpu;
- cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
- if (cpu_stats == NULL)
- return -ENOMEM;
-
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
- if (tb[NFTA_COUNTER_PACKETS]) {
- this_cpu->counter.packets =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
- }
- if (tb[NFTA_COUNTER_BYTES]) {
- this_cpu->counter.bytes =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
- }
- preempt_enable();
- priv->counter = cpu_stats;
- return 0;
+ return nft_counter_do_init(tb, priv);
}
static void nft_counter_destroy(const struct nft_ctx *ctx,
@@ -124,7 +235,7 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- free_percpu(priv->counter);
+ nft_counter_do_destroy(priv);
}
static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
@@ -174,12 +285,26 @@ static struct nft_expr_type nft_counter_type __read_mostly = {
static int __init nft_counter_module_init(void)
{
- return nft_register_expr(&nft_counter_type);
+ int err;
+
+ err = nft_register_obj(&nft_counter_obj);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_counter_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_obj(&nft_counter_obj);
+ return err;
}
static void __exit nft_counter_module_exit(void)
{
nft_unregister_expr(&nft_counter_type);
+ nft_unregister_obj(&nft_counter_obj);
}
module_init(nft_counter_module_init);
@@ -188,3 +313,4 @@ module_exit(nft_counter_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("counter");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 6837348c8993..e6baeaebe653 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -208,37 +208,37 @@ static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
[NFTA_CT_SREG] = { .type = NLA_U32 },
};
-static int nft_ct_l3proto_try_module_get(uint8_t family)
+static int nft_ct_netns_get(struct net *net, uint8_t family)
{
int err;
if (family == NFPROTO_INET) {
- err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4);
+ err = nf_ct_netns_get(net, NFPROTO_IPV4);
if (err < 0)
goto err1;
- err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6);
+ err = nf_ct_netns_get(net, NFPROTO_IPV6);
if (err < 0)
goto err2;
} else {
- err = nf_ct_l3proto_try_module_get(family);
+ err = nf_ct_netns_get(net, family);
if (err < 0)
goto err1;
}
return 0;
err2:
- nf_ct_l3proto_module_put(NFPROTO_IPV4);
+ nf_ct_netns_put(net, NFPROTO_IPV4);
err1:
return err;
}
-static void nft_ct_l3proto_module_put(uint8_t family)
+static void nft_ct_netns_put(struct net *net, uint8_t family)
{
if (family == NFPROTO_INET) {
- nf_ct_l3proto_module_put(NFPROTO_IPV4);
- nf_ct_l3proto_module_put(NFPROTO_IPV6);
+ nf_ct_netns_put(net, NFPROTO_IPV4);
+ nf_ct_netns_put(net, NFPROTO_IPV6);
} else
- nf_ct_l3proto_module_put(family);
+ nf_ct_netns_put(net, family);
}
static int nft_ct_get_init(const struct nft_ctx *ctx,
@@ -342,7 +342,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ err = nft_ct_netns_get(ctx->net, ctx->afi->family);
if (err < 0)
return err;
@@ -390,7 +390,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
if (err < 0)
goto err1;
- err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ err = nft_ct_netns_get(ctx->net, ctx->afi->family);
if (err < 0)
goto err1;
@@ -405,7 +405,7 @@ err1:
static void nft_ct_get_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
- nft_ct_l3proto_module_put(ctx->afi->family);
+ nf_ct_netns_put(ctx->net, ctx->afi->family);
}
static void nft_ct_set_destroy(const struct nft_ctx *ctx,
@@ -423,7 +423,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
break;
}
- nft_ct_l3proto_module_put(ctx->afi->family);
+ nft_ct_netns_put(ctx->net, ctx->afi->family);
}
static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index 249c9b80c150..29a4906adc27 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -86,7 +86,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == 0)
return -EINVAL;
- priv->result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+ priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]);
switch (priv->result) {
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 763ebc3e0b2b..ce13a50b9189 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -26,8 +26,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
- nf_dup_netdev_egress(pkt, oif);
- regs->verdict.code = NF_DROP;
+ nf_fwd_netdev_egress(pkt, oif);
+ regs->verdict.code = NF_STOLEN;
}
static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 97ad8e30e4b4..eb2721af898d 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -53,6 +53,7 @@ static int nft_hash_init(const struct nft_ctx *ctx,
{
struct nft_hash *priv = nft_expr_priv(expr);
u32 len;
+ int err;
if (!tb[NFTA_HASH_SREG] ||
!tb[NFTA_HASH_DREG] ||
@@ -66,8 +67,10 @@ static int nft_hash_init(const struct nft_ctx *ctx,
priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
- len = ntohl(nla_get_be32(tb[NFTA_HASH_LEN]));
- if (len == 0 || len > U8_MAX)
+ err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len);
+ if (err < 0)
+ return err;
+ if (len == 0)
return -ERANGE;
priv->len = len;
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 81b5ad6165ac..11ce016cd479 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -77,7 +77,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
}
}
- return 0;
+ return nf_ct_netns_get(ctx->net, ctx->afi->family);
}
EXPORT_SYMBOL_GPL(nft_masq_init);
@@ -105,4 +105,4 @@ nla_put_failure:
EXPORT_SYMBOL_GPL(nft_masq_dump);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index ee2d71753746..19a7bf3236f9 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -209,7 +209,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(ctx->net, family);
}
static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -257,12 +257,21 @@ nla_put_failure:
return -1;
}
+static void
+nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ const struct nft_nat *priv = nft_expr_priv(expr);
+
+ nf_ct_netns_put(ctx->net, priv->family);
+}
+
static struct nft_expr_type nft_nat_type;
static const struct nft_expr_ops nft_nat_ops = {
.type = &nft_nat_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
.eval = nft_nat_eval,
.init = nft_nat_init,
+ .destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
};
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
new file mode 100644
index 000000000000..415a65ba2b85
--- /dev/null
+++ b/net/netfilter/nft_objref.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2012-2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr))
+
+static void nft_objref_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ obj->type->eval(obj, regs, pkt);
+}
+
+static int nft_objref_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+ u32 objtype;
+
+ if (!tb[NFTA_OBJREF_IMM_NAME] ||
+ !tb[NFTA_OBJREF_IMM_TYPE])
+ return -EINVAL;
+
+ objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
+ obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
+ genmask);
+ if (IS_ERR(obj))
+ return -ENOENT;
+
+ nft_objref_priv(expr) = obj;
+ obj->use++;
+
+ return 0;
+}
+
+static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_object *obj = nft_objref_priv(expr);
+
+ if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) ||
+ nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->type->type)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_objref_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ obj->use--;
+}
+
+static struct nft_expr_type nft_objref_type;
+static const struct nft_expr_ops nft_objref_ops = {
+ .type = &nft_objref_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)),
+ .eval = nft_objref_eval,
+ .init = nft_objref_init,
+ .destroy = nft_objref_destroy,
+ .dump = nft_objref_dump,
+};
+
+struct nft_objref_map {
+ struct nft_set *set;
+ enum nft_registers sreg:8;
+ struct nft_set_binding binding;
+};
+
+static void nft_objref_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+ const struct nft_set *set = priv->set;
+ const struct nft_set_ext *ext;
+ struct nft_object *obj;
+ bool found;
+
+ found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
+ &ext);
+ if (!found) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ obj = *nft_set_ext_obj(ext);
+ obj->type->eval(obj, regs, pkt);
+}
+
+static int nft_objref_map_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set *set;
+ int err;
+
+ set = nf_tables_set_lookup(ctx->table, tb[NFTA_OBJREF_SET_NAME], genmask);
+ if (IS_ERR(set)) {
+ if (tb[NFTA_OBJREF_SET_ID]) {
+ set = nf_tables_set_lookup_byid(ctx->net,
+ tb[NFTA_OBJREF_SET_ID],
+ genmask);
+ }
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ }
+
+ if (!(set->flags & NFT_SET_OBJECT))
+ return -EINVAL;
+
+ priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]);
+ err = nft_validate_register_load(priv->sreg, set->klen);
+ if (err < 0)
+ return err;
+
+ priv->binding.flags = set->flags & NFT_SET_OBJECT;
+
+ err = nf_tables_bind_set(ctx, set, &priv->binding);
+ if (err < 0)
+ return err;
+
+ priv->set = set;
+ return 0;
+}
+
+static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) ||
+ nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_objref_map_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
+static struct nft_expr_type nft_objref_type;
+static const struct nft_expr_ops nft_objref_map_ops = {
+ .type = &nft_objref_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
+ .eval = nft_objref_map_eval,
+ .init = nft_objref_map_init,
+ .destroy = nft_objref_map_destroy,
+ .dump = nft_objref_map_dump,
+};
+
+static const struct nft_expr_ops *
+nft_objref_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_OBJREF_SET_SREG] &&
+ (tb[NFTA_OBJREF_SET_NAME] ||
+ tb[NFTA_OBJREF_SET_ID]))
+ return &nft_objref_map_ops;
+ else if (tb[NFTA_OBJREF_IMM_NAME] &&
+ tb[NFTA_OBJREF_IMM_TYPE])
+ return &nft_objref_ops;
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = {
+ [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING },
+ [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 },
+ [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 },
+ [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING },
+ [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 },
+};
+
+static struct nft_expr_type nft_objref_type __read_mostly = {
+ .name = "objref",
+ .select_ops = nft_objref_select_ops,
+ .policy = nft_objref_policy,
+ .maxattr = NFTA_OBJREF_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_objref_module_init(void)
+{
+ return nft_register_expr(&nft_objref_type);
+}
+
+static void __exit nft_objref_module_exit(void)
+{
+ nft_unregister_expr(&nft_objref_type);
+}
+
+module_init(nft_objref_module_init);
+module_exit(nft_objref_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("objref");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 98fb5d7b8087..36d2b1096546 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -17,6 +18,10 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
/* add vlan header into the user buffer for if tag was removed by offloads */
static bool
@@ -164,6 +169,87 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.dump = nft_payload_dump,
};
+static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
+{
+ *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+ if (*sum == 0)
+ *sum = CSUM_MANGLED_0;
+}
+
+static bool nft_payload_udp_checksum(struct sk_buff *skb, unsigned int thoff)
+{
+ struct udphdr *uh, _uh;
+
+ uh = skb_header_pointer(skb, thoff, sizeof(_uh), &_uh);
+ if (!uh)
+ return false;
+
+ return uh->check;
+}
+
+static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ unsigned int *l4csum_offset)
+{
+ switch (pkt->tprot) {
+ case IPPROTO_TCP:
+ *l4csum_offset = offsetof(struct tcphdr, check);
+ break;
+ case IPPROTO_UDP:
+ if (!nft_payload_udp_checksum(skb, pkt->xt.thoff))
+ return -1;
+ /* Fall through. */
+ case IPPROTO_UDPLITE:
+ *l4csum_offset = offsetof(struct udphdr, check);
+ break;
+ case IPPROTO_ICMPV6:
+ *l4csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+ break;
+ default:
+ return -1;
+ }
+
+ *l4csum_offset += pkt->xt.thoff;
+ return 0;
+}
+
+static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt,
+ struct sk_buff *skb,
+ __wsum fsum, __wsum tsum)
+{
+ int l4csum_offset;
+ __sum16 sum;
+
+ /* If we cannot determine layer 4 checksum offset or this packet doesn't
+ * require layer 4 checksum recalculation, skip this packet.
+ */
+ if (nft_payload_l4csum_offset(pkt, skb, &l4csum_offset) < 0)
+ return 0;
+
+ if (skb_copy_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ /* Checksum mangling for an arbitrary amount of bytes, based on
+ * inet_proto_csum_replace*() functions.
+ */
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ nft_csum_replace(&sum, fsum, tsum);
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ skb->csum = ~csum_add(csum_sub(~(skb->csum), fsum),
+ tsum);
+ }
+ } else {
+ sum = ~csum_fold(csum_add(csum_sub(csum_unfold(sum), fsum),
+ tsum));
+ }
+
+ if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) ||
+ skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
+ return -1;
+
+ return 0;
+}
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -204,14 +290,15 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
fsum = skb_checksum(skb, offset, priv->len, 0);
tsum = csum_partial(src, priv->len, 0);
- sum = csum_fold(csum_add(csum_sub(~csum_unfold(sum), fsum),
- tsum));
- if (sum == 0)
- sum = CSUM_MANGLED_0;
+ nft_csum_replace(&sum, fsum, tsum);
if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
goto err;
+
+ if (priv->csum_flags &&
+ nft_payload_l4csum_update(pkt, skb, fsum, tsum) < 0)
+ goto err;
}
if (!skb_make_writable(skb, max(offset + priv->len, 0)) ||
@@ -240,6 +327,15 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_CSUM_OFFSET])
priv->csum_offset =
ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET]));
+ if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) {
+ u32 flags;
+
+ flags = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_FLAGS]));
+ if (flags & ~NFT_PAYLOAD_L4CSUM_PSEUDOHDR)
+ return -EINVAL;
+
+ priv->csum_flags = flags;
+ }
switch (priv->csum_type) {
case NFT_PAYLOAD_CSUM_NONE:
@@ -262,7 +358,8 @@ static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr
nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) ||
nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) ||
nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET,
- htonl(priv->csum_offset)))
+ htonl(priv->csum_offset)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_CSUM_FLAGS, htonl(priv->csum_flags)))
goto nla_put_failure;
return 0;
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index c00104c07095..bd6efc53f26d 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -17,38 +17,59 @@
struct nft_quota {
u64 quota;
- bool invert;
- atomic64_t remain;
+ unsigned long flags;
+ atomic64_t consumed;
};
static inline bool nft_overquota(struct nft_quota *priv,
- const struct nft_pktinfo *pkt)
+ const struct sk_buff *skb)
{
- return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0;
+ return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota;
}
-static void nft_quota_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static inline bool nft_quota_invert(struct nft_quota *priv)
{
- struct nft_quota *priv = nft_expr_priv(expr);
+ return priv->flags & NFT_QUOTA_F_INV;
+}
- if (nft_overquota(priv, pkt) ^ priv->invert)
+static inline void nft_quota_do_eval(struct nft_quota *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
}
static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
[NFTA_QUOTA_BYTES] = { .type = NLA_U64 },
[NFTA_QUOTA_FLAGS] = { .type = NLA_U32 },
+ [NFTA_QUOTA_CONSUMED] = { .type = NLA_U64 },
};
-static int nft_quota_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+#define NFT_QUOTA_DEPLETED_BIT 1 /* From NFT_QUOTA_F_DEPLETED. */
+
+static void nft_quota_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_quota *priv = nft_expr_priv(expr);
- u32 flags = 0;
- u64 quota;
+ struct nft_quota *priv = nft_obj_data(obj);
+ bool overquota;
+
+ overquota = nft_overquota(priv, pkt->skb);
+ if (overquota ^ nft_quota_invert(priv))
+ regs->verdict.code = NFT_BREAK;
+
+ if (overquota &&
+ !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
+ nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0,
+ NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC);
+}
+
+static int nft_quota_do_init(const struct nlattr * const tb[],
+ struct nft_quota *priv)
+{
+ unsigned long flags = 0;
+ u64 quota, consumed = 0;
if (!tb[NFTA_QUOTA_BYTES])
return -EINVAL;
@@ -57,26 +78,60 @@ static int nft_quota_init(const struct nft_ctx *ctx,
if (quota > S64_MAX)
return -EOVERFLOW;
+ if (tb[NFTA_QUOTA_CONSUMED]) {
+ consumed = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_CONSUMED]));
+ if (consumed > quota)
+ return -EINVAL;
+ }
+
if (tb[NFTA_QUOTA_FLAGS]) {
flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
if (flags & ~NFT_QUOTA_F_INV)
return -EINVAL;
+ if (flags & NFT_QUOTA_F_DEPLETED)
+ return -EOPNOTSUPP;
}
priv->quota = quota;
- priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false;
- atomic64_set(&priv->remain, quota);
+ priv->flags = flags;
+ atomic64_set(&priv->consumed, consumed);
return 0;
}
-static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_quota_obj_init(const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+
+ return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
+ bool reset)
{
- const struct nft_quota *priv = nft_expr_priv(expr);
- u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0;
+ u32 flags = priv->flags;
+ u64 consumed;
+
+ if (reset) {
+ consumed = atomic64_xchg(&priv->consumed, 0);
+ if (test_and_clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
+ flags |= NFT_QUOTA_F_DEPLETED;
+ } else {
+ consumed = atomic64_read(&priv->consumed);
+ }
+
+ /* Since we inconditionally increment consumed quota for each packet
+ * that we see, don't go over the quota boundary in what we send to
+ * userspace.
+ */
+ if (consumed > priv->quota)
+ consumed = priv->quota;
if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
NFTA_QUOTA_PAD) ||
+ nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed),
+ NFTA_QUOTA_PAD) ||
nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags)))
goto nla_put_failure;
return 0;
@@ -85,6 +140,50 @@ nla_put_failure:
return -1;
}
+static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj,
+ bool reset)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+
+ return nft_quota_do_dump(skb, priv, reset);
+}
+
+static struct nft_object_type nft_quota_obj __read_mostly = {
+ .type = NFT_OBJECT_QUOTA,
+ .size = sizeof(struct nft_quota),
+ .maxattr = NFTA_QUOTA_MAX,
+ .policy = nft_quota_policy,
+ .init = nft_quota_obj_init,
+ .eval = nft_quota_obj_eval,
+ .dump = nft_quota_obj_dump,
+ .owner = THIS_MODULE,
+};
+
+static void nft_quota_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ nft_quota_do_eval(priv, regs, pkt);
+}
+
+static int nft_quota_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ return nft_quota_do_dump(skb, priv, false);
+}
+
static struct nft_expr_type nft_quota_type;
static const struct nft_expr_ops nft_quota_ops = {
.type = &nft_quota_type,
@@ -105,12 +204,26 @@ static struct nft_expr_type nft_quota_type __read_mostly = {
static int __init nft_quota_module_init(void)
{
- return nft_register_expr(&nft_quota_type);
+ int err;
+
+ err = nft_register_obj(&nft_quota_obj);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_quota_type);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+err1:
+ nft_unregister_obj(&nft_quota_obj);
+ return err;
}
static void __exit nft_quota_module_exit(void)
{
- nft_unregister_expr(&nft_quota_type);
+ nft_unregister_expr(&nft_quota_type);
+ nft_unregister_obj(&nft_quota_obj);
}
module_init(nft_quota_module_init);
@@ -119,3 +232,4 @@ module_exit(nft_quota_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("quota");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA);
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 009062606697..9edc74eedc10 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -59,6 +59,12 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
int err;
u32 op;
+ if (!tb[NFTA_RANGE_SREG] ||
+ !tb[NFTA_RANGE_OP] ||
+ !tb[NFTA_RANGE_FROM_DATA] ||
+ !tb[NFTA_RANGE_TO_DATA])
+ return -EINVAL;
+
err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
&desc_from, tb[NFTA_RANGE_FROM_DATA]);
if (err < 0)
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 03f7bf40ae75..40dcd05146d5 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -79,7 +79,7 @@ int nft_redir_init(const struct nft_ctx *ctx,
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(ctx->net, ctx->afi->family);
}
EXPORT_SYMBOL_GPL(nft_redir_init);
@@ -108,4 +108,4 @@ nla_put_failure:
EXPORT_SYMBOL_GPL(nft_redir_dump);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index a3dface3e6e6..1e20e2bbb6d9 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -167,6 +167,19 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set,
nft_set_elem_clear_busy(&he->ext);
}
+static bool nft_hash_deactivate_one(const struct net *net,
+ const struct nft_set *set, void *priv)
+{
+ struct nft_hash_elem *he = priv;
+
+ if (!nft_set_elem_mark_busy(&he->ext) ||
+ !nft_is_active(net, &he->ext)) {
+ nft_set_elem_change_active(net, set, &he->ext);
+ return true;
+ }
+ return false;
+}
+
static void *nft_hash_deactivate(const struct net *net,
const struct nft_set *set,
const struct nft_set_elem *elem)
@@ -181,13 +194,10 @@ static void *nft_hash_deactivate(const struct net *net,
rcu_read_lock();
he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
- if (he != NULL) {
- if (!nft_set_elem_mark_busy(&he->ext) ||
- !nft_is_active(net, &he->ext))
- nft_set_elem_change_active(net, set, &he->ext);
- else
- he = NULL;
- }
+ if (he != NULL &&
+ !nft_hash_deactivate_one(net, set, he))
+ he = NULL;
+
rcu_read_unlock();
return he;
@@ -387,6 +397,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = {
.insert = nft_hash_insert,
.activate = nft_hash_activate,
.deactivate = nft_hash_deactivate,
+ .deactivate_one = nft_hash_deactivate_one,
.remove = nft_hash_remove,
.lookup = nft_hash_lookup,
.update = nft_hash_update,
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 36493a7cae88..08376e50f6cd 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -171,6 +171,15 @@ static void nft_rbtree_activate(const struct net *net,
nft_set_elem_change_active(net, set, &rbe->ext);
}
+static bool nft_rbtree_deactivate_one(const struct net *net,
+ const struct nft_set *set, void *priv)
+{
+ struct nft_rbtree_elem *rbe = priv;
+
+ nft_set_elem_change_active(net, set, &rbe->ext);
+ return true;
+}
+
static void *nft_rbtree_deactivate(const struct net *net,
const struct nft_set *set,
const struct nft_set_elem *elem)
@@ -204,7 +213,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
parent = parent->rb_right;
continue;
}
- nft_set_elem_change_active(net, set, &rbe->ext);
+ nft_rbtree_deactivate_one(net, set, rbe);
return rbe;
}
}
@@ -295,6 +304,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = {
.insert = nft_rbtree_insert,
.remove = nft_rbtree_remove,
.deactivate = nft_rbtree_deactivate,
+ .deactivate_one = nft_rbtree_deactivate_one,
.activate = nft_rbtree_activate,
.lookup = nft_rbtree_lookup,
.walk = nft_rbtree_walk,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index ad818e52859b..2ff499680cc6 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -40,6 +40,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+#define XT_PCPU_BLOCK_SIZE 4096
struct compat_delta {
unsigned int offset; /* offset in kernel */
@@ -958,7 +959,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
if (!info) {
- info = vmalloc(sz);
+ info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_HIGHMEM,
+ PAGE_KERNEL);
if (!info)
return NULL;
}
@@ -1615,6 +1618,59 @@ void xt_proto_fini(struct net *net, u_int8_t af)
}
EXPORT_SYMBOL_GPL(xt_proto_fini);
+/**
+ * xt_percpu_counter_alloc - allocate x_tables rule counter
+ *
+ * @state: pointer to xt_percpu allocation state
+ * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct
+ *
+ * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then
+ * contain the address of the real (percpu) counter.
+ *
+ * Rule evaluation needs to use xt_get_this_cpu_counter() helper
+ * to fetch the real percpu counter.
+ *
+ * To speed up allocation and improve data locality, a 4kb block is
+ * allocated.
+ *
+ * xt_percpu_counter_alloc_state contains the base address of the
+ * allocated page and the current sub-offset.
+ *
+ * returns false on error.
+ */
+bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
+ struct xt_counters *counter)
+{
+ BUILD_BUG_ON(XT_PCPU_BLOCK_SIZE < (sizeof(*counter) * 2));
+
+ if (nr_cpu_ids <= 1)
+ return true;
+
+ if (!state->mem) {
+ state->mem = __alloc_percpu(XT_PCPU_BLOCK_SIZE,
+ XT_PCPU_BLOCK_SIZE);
+ if (!state->mem)
+ return false;
+ }
+ counter->pcnt = (__force unsigned long)(state->mem + state->off);
+ state->off += sizeof(*counter);
+ if (state->off > (XT_PCPU_BLOCK_SIZE - sizeof(*counter))) {
+ state->mem = NULL;
+ state->off = 0;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc);
+
+void xt_percpu_counter_free(struct xt_counters *counters)
+{
+ unsigned long pcnt = counters->pcnt;
+
+ if (nr_cpu_ids > 1 && (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0)
+ free_percpu((void __percpu *)pcnt);
+}
+EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
+
static int __net_init xt_net_init(struct net *net)
{
int i;
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index e04dc282e3bb..da56c06a443c 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -106,7 +106,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -115,7 +115,7 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target connsecmark_tg_reg __read_mostly = {
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 6669e68d589e..95c750358747 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -216,7 +216,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err1;
#endif
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
goto err1;
@@ -260,7 +260,7 @@ out:
err3:
nf_ct_tmpl_free(ct);
err2:
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
err1:
return ret;
}
@@ -341,7 +341,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
if (help)
module_put(help->helper->me);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
xt_ct_destroy_timeout(ct);
nf_ct_put(info->ct);
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index 94d0b5411192..e45a01255e70 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -60,7 +60,12 @@ static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
return -EINVAL;
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static void netmap_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static unsigned int
@@ -111,7 +116,7 @@ static int netmap_tg4_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u.\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static struct xt_target netmap_tg_reg[] __read_mostly = {
@@ -127,6 +132,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg6_checkentry,
+ .destroy = netmap_tg_destroy,
.me = THIS_MODULE,
},
{
@@ -141,6 +147,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg4_check,
+ .destroy = netmap_tg_destroy,
.me = THIS_MODULE,
},
};
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index dbd6c4a12b97..91a373a3f534 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -63,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est)
mutex_lock(&xt_rateest_mutex);
if (--est->refcnt == 0) {
hlist_del(&est->list);
- gen_kill_estimator(&est->bstats, &est->rstats);
+ gen_kill_estimator(&est->rate_est);
/*
* gen_estimator est_timer() might access est->lock or bstats,
* wait a RCU grace period before freeing 'est'
@@ -132,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
cfg.est.interval = info->interval;
cfg.est.ewma_log = info->ewma_log;
- ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
+ ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
&est->lock, NULL, &cfg.opt);
if (ret < 0)
goto err2;
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 651dce65a30b..98a4c6d4f1cb 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -40,7 +40,13 @@ static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
if (range->flags & NF_NAT_RANGE_MAP_IPS)
return -EINVAL;
- return 0;
+
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
/* FIXME: Take multiple ranges --RR */
@@ -56,7 +62,7 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
pr_debug("bad rangesize %u.\n", mr->rangesize);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
}
static unsigned int
@@ -72,6 +78,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = {
.revision = 0,
.table = "nat",
.checkentry = redirect_tg6_checkentry,
+ .destroy = redirect_tg_destroy,
.target = redirect_tg6,
.targetsize = sizeof(struct nf_nat_range),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -85,6 +92,7 @@ static struct xt_target redirect_tg_reg[] __read_mostly = {
.table = "nat",
.target = redirect_tg4,
.checkentry = redirect_tg4_check,
+ .destroy = redirect_tg_destroy,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_OUT),
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index dbd72cc40e42..80cb7babeb64 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -531,6 +531,11 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
static int tproxy_tg6_check(const struct xt_tgchk_param *par)
{
const struct ip6t_ip6 *i = par->entryinfo;
+ int err;
+
+ err = nf_defrag_ipv6_enable(par->net);
+ if (err)
+ return err;
if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) &&
!(i->invflags & IP6T_INV_PROTO))
@@ -545,6 +550,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par)
static int tproxy_tg4_check(const struct xt_tgchk_param *par)
{
const struct ipt_ip *i = par->entryinfo;
+ int err;
+
+ err = nf_defrag_ipv4_enable(par->net);
+ if (err)
+ return err;
if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
&& !(i->invflags & IPT_INV_PROTO))
@@ -596,11 +606,6 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
static int __init tproxy_tg_init(void)
{
- nf_defrag_ipv4_enable();
-#ifdef XT_TPROXY_HAVE_IPV6
- nf_defrag_ipv6_enable();
-#endif
-
return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
}
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index dffee9d47ec4..2dedaa23ab0a 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
+#include <linux/bpf.h>
#include <linux/netfilter/xt_bpf.h>
#include <linux/netfilter/x_tables.h>
@@ -20,15 +21,15 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_bpf");
MODULE_ALIAS("ip6t_bpf");
-static int bpf_mt_check(const struct xt_mtchk_param *par)
+static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len,
+ struct bpf_prog **ret)
{
- struct xt_bpf_info *info = par->matchinfo;
struct sock_fprog_kern program;
- program.len = info->bpf_program_num_elem;
- program.filter = info->bpf_program;
+ program.len = len;
+ program.filter = insns;
- if (bpf_prog_create(&info->filter, &program)) {
+ if (bpf_prog_create(ret, &program)) {
pr_info("bpf: check failed: parse error\n");
return -EINVAL;
}
@@ -36,6 +37,42 @@ static int bpf_mt_check(const struct xt_mtchk_param *par)
return 0;
}
+static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
+{
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ *ret = prog;
+ return 0;
+}
+
+static int bpf_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info *info = par->matchinfo;
+
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+}
+
+static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ if (info->mode == XT_BPF_MODE_BYTECODE)
+ return __bpf_mt_check_bytecode(info->bpf_program,
+ info->bpf_program_num_elem,
+ &info->filter);
+ else if (info->mode == XT_BPF_MODE_FD_PINNED ||
+ info->mode == XT_BPF_MODE_FD_ELF)
+ return __bpf_mt_check_fd(info->fd, &info->filter);
+ else
+ return -EINVAL;
+}
+
static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
@@ -43,31 +80,58 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
return BPF_PROG_RUN(info->filter, skb);
}
+static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
+ return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb);
+}
+
static void bpf_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
+
+ bpf_prog_destroy(info->filter);
+}
+
+static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par)
+{
+ const struct xt_bpf_info_v1 *info = par->matchinfo;
+
bpf_prog_destroy(info->filter);
}
-static struct xt_match bpf_mt_reg __read_mostly = {
- .name = "bpf",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = bpf_mt_check,
- .match = bpf_mt,
- .destroy = bpf_mt_destroy,
- .matchsize = sizeof(struct xt_bpf_info),
- .me = THIS_MODULE,
+static struct xt_match bpf_mt_reg[] __read_mostly = {
+ {
+ .name = "bpf",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check,
+ .match = bpf_mt,
+ .destroy = bpf_mt_destroy,
+ .matchsize = sizeof(struct xt_bpf_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "bpf",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check_v1,
+ .match = bpf_mt_v1,
+ .destroy = bpf_mt_destroy_v1,
+ .matchsize = sizeof(struct xt_bpf_info_v1),
+ .me = THIS_MODULE,
+ },
};
static int __init bpf_mt_init(void)
{
- return xt_register_match(&bpf_mt_reg);
+ return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
static void __exit bpf_mt_exit(void)
{
- xt_unregister_match(&bpf_mt_reg);
+ xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg));
}
module_init(bpf_mt_init);
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index d4bec261e74e..cad0b7b5eb35 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -110,7 +110,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
sinfo->direction != XT_CONNBYTES_DIR_BOTH)
return -EINVAL;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -129,7 +129,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
static void connbytes_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match connbytes_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index 03d66f1c5e69..7827128d5a95 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -61,7 +61,7 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
}
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -70,14 +70,14 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
ret = nf_connlabels_get(par->net, info->bit);
if (ret < 0)
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
return ret;
}
static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
{
nf_connlabels_put(par->net);
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match connlabels_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index bb3845339efd..2aff2b7c4689 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -368,7 +368,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd));
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for "
"address family %u\n", par->family);
@@ -378,7 +378,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
/* init private data */
info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
if (info->data == NULL) {
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
return -ENOMEM;
}
@@ -414,7 +414,7 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
const struct xt_connlimit_info *info = par->matchinfo;
unsigned int i;
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
destroy_tree(&info->data->climit_root4[i]);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index b83e158e116a..9935d5029b0e 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -77,7 +77,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -86,7 +86,7 @@ static int connmark_tg_check(const struct xt_tgchk_param *par)
static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static bool
@@ -107,7 +107,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -116,7 +116,7 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_target connmark_tg_reg __read_mostly = {
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 2dea15ebc55b..c0fb217bc649 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -271,7 +271,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -280,7 +280,7 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match conntrack_mt_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index f679dd4c272a..38a78151c0e9 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -59,7 +59,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
struct xt_helper_info *info = par->matchinfo;
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -71,7 +71,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
static void helper_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match helper_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index ec06fb1cb16f..1cde0e4985b7 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -44,12 +44,18 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
switch (minfo->flags) {
case XT_MULTIPORT_SOURCE:
- return (src >= s && src <= e) ^ minfo->invert;
+ if (src >= s && src <= e)
+ return true ^ minfo->invert;
+ break;
case XT_MULTIPORT_DESTINATION:
- return (dst >= s && dst <= e) ^ minfo->invert;
+ if (dst >= s && dst <= e)
+ return true ^ minfo->invert;
+ break;
case XT_MULTIPORT_EITHER:
- return ((dst >= s && dst <= e) ||
- (src >= s && src <= e)) ^ minfo->invert;
+ if ((dst >= s && dst <= e) ||
+ (src >= s && src <= e))
+ return true ^ minfo->invert;
+ break;
default:
break;
}
@@ -59,11 +65,17 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
switch (minfo->flags) {
case XT_MULTIPORT_SOURCE:
- return (src == s) ^ minfo->invert;
+ if (src == s)
+ return true ^ minfo->invert;
+ break;
case XT_MULTIPORT_DESTINATION:
- return (dst == s) ^ minfo->invert;
+ if (dst == s)
+ return true ^ minfo->invert;
+ break;
case XT_MULTIPORT_EITHER:
- return (src == s || dst == s) ^ minfo->invert;
+ if (src == s || dst == s)
+ return true ^ minfo->invert;
+ break;
default:
break;
}
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index bea7464cc43f..8107b3eb865f 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -23,7 +23,17 @@ static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par)
par->target->name);
return -EINVAL;
}
- return 0;
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static int xt_nat_checkentry(const struct xt_tgchk_param *par)
+{
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static void xt_nat_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static void xt_nat_convert_range(struct nf_nat_range *dst,
@@ -106,6 +116,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
.name = "SNAT",
.revision = 0,
.checkentry = xt_nat_checkentry_v0,
+ .destroy = xt_nat_destroy,
.target = xt_snat_target_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.family = NFPROTO_IPV4,
@@ -118,6 +129,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
.name = "DNAT",
.revision = 0,
.checkentry = xt_nat_checkentry_v0,
+ .destroy = xt_nat_destroy,
.target = xt_dnat_target_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.family = NFPROTO_IPV4,
@@ -129,6 +141,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name = "SNAT",
.revision = 1,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
.target = xt_snat_target_v1,
.targetsize = sizeof(struct nf_nat_range),
.table = "nat",
@@ -139,6 +153,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name = "DNAT",
.revision = 1,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
.target = xt_dnat_target_v1,
.targetsize = sizeof(struct nf_nat_range),
.table = "nat",
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 7720b036d76a..1db02f6fca54 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,35 +18,33 @@ static bool
xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateest_match_info *info = par->matchinfo;
- struct gnet_stats_rate_est64 *r;
+ struct gnet_stats_rate_est64 sample = {0};
u_int32_t bps1, bps2, pps1, pps2;
bool ret = true;
- spin_lock_bh(&info->est1->lock);
- r = &info->est1->rstats;
+ gen_estimator_read(&info->est1->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0;
- pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0;
+ bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0;
+ pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0;
} else {
- bps1 = r->bps;
- pps1 = r->pps;
+ bps1 = sample.bps;
+ pps1 = sample.pps;
}
- spin_unlock_bh(&info->est1->lock);
if (info->flags & XT_RATEEST_MATCH_ABS) {
bps2 = info->bps2;
pps2 = info->pps2;
} else {
- spin_lock_bh(&info->est2->lock);
- r = &info->est2->rstats;
+ gen_estimator_read(&info->est2->rate_est, &sample);
+
if (info->flags & XT_RATEEST_MATCH_DELTA) {
- bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0;
- pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0;
+ bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0;
+ pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0;
} else {
- bps2 = r->bps;
- pps2 = r->pps;
+ bps2 = sample.bps;
+ pps2 = sample.pps;
}
- spin_unlock_bh(&info->est2->lock);
}
switch (info->mode) {
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 2198914707f5..770bbec878f1 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -147,9 +147,28 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
}
#endif
+static int socket_mt_enable_defrag(struct net *net, int family)
+{
+ switch (family) {
+ case NFPROTO_IPV4:
+ return nf_defrag_ipv4_enable(net);
+#ifdef XT_SOCKET_HAVE_IPV6
+ case NFPROTO_IPV6:
+ return nf_defrag_ipv6_enable(net);
+#endif
+ }
+ WARN_ONCE(1, "Unknown family %d\n", family);
+ return 0;
+}
+
static int socket_mt_v1_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+ int err;
+
+ err = socket_mt_enable_defrag(par->net, par->family);
+ if (err)
+ return err;
if (info->flags & ~XT_SOCKET_FLAGS_V1) {
pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
@@ -161,6 +180,11 @@ static int socket_mt_v1_check(const struct xt_mtchk_param *par)
static int socket_mt_v2_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo;
+ int err;
+
+ err = socket_mt_enable_defrag(par->net, par->family);
+ if (err)
+ return err;
if (info->flags & ~XT_SOCKET_FLAGS_V2) {
pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
@@ -173,7 +197,11 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par)
{
const struct xt_socket_mtinfo3 *info =
(struct xt_socket_mtinfo3 *)par->matchinfo;
+ int err;
+ err = socket_mt_enable_defrag(par->net, par->family);
+ if (err)
+ return err;
if (info->flags & ~XT_SOCKET_FLAGS_V3) {
pr_info("unknown flags 0x%x\n",
info->flags & ~XT_SOCKET_FLAGS_V3);
@@ -268,11 +296,6 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
static int __init socket_mt_init(void)
{
- nf_defrag_ipv4_enable();
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
- nf_defrag_ipv6_enable();
-#endif
-
return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
}
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index a507922d80cd..5746a33789a5 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -43,7 +43,7 @@ static int state_mt_check(const struct xt_mtchk_param *par)
{
int ret;
- ret = nf_ct_l3proto_try_module_get(par->family);
+ ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -52,7 +52,7 @@ static int state_mt_check(const struct xt_mtchk_param *par)
static void state_mt_destroy(const struct xt_mtdtor_param *par)
{
- nf_ct_l3proto_module_put(par->family);
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match state_mt_reg __read_mostly = {
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 62bea4591054..246f29d365c0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -329,7 +329,6 @@ static void netlink_sock_destruct(struct sock *sk)
if (nlk->cb_running) {
if (nlk->cb.done)
nlk->cb.done(&nlk->cb);
-
module_put(nlk->cb.module);
kfree_skb(nlk->cb.skb);
}
@@ -346,6 +345,14 @@ static void netlink_sock_destruct(struct sock *sk)
WARN_ON(nlk_sk(sk)->groups);
}
+static void netlink_sock_destruct_work(struct work_struct *work)
+{
+ struct netlink_sock *nlk = container_of(work, struct netlink_sock,
+ work);
+
+ sk_free(&nlk->sk);
+}
+
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
* SMP. Look, when several writers sleep and reader wakes them up, all but one
* immediately hit write lock and grab all the cpus. Exclusive sleep solves
@@ -648,8 +655,18 @@ out_module:
static void deferred_put_nlk_sk(struct rcu_head *head)
{
struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
+ struct sock *sk = &nlk->sk;
+
+ if (!atomic_dec_and_test(&sk->sk_refcnt))
+ return;
+
+ if (nlk->cb_running && nlk->cb.done) {
+ INIT_WORK(&nlk->work, netlink_sock_destruct_work);
+ schedule_work(&nlk->work);
+ return;
+ }
- sock_put(&nlk->sk);
+ sk_free(sk);
}
static int netlink_release(struct socket *sock)
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 3cfd6cc60504..4fdb38318977 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -3,6 +3,7 @@
#include <linux/rhashtable.h>
#include <linux/atomic.h>
+#include <linux/workqueue.h>
#include <net/sock.h>
#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
@@ -33,6 +34,7 @@ struct netlink_sock {
struct rhash_head node;
struct rcu_head rcu;
+ struct work_struct work;
};
static inline struct netlink_sock *nlk_sk(struct sock *sk)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 9b8a028b7dad..6b78bab27755 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -370,8 +370,11 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
skb_orphan(skb);
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
err = nf_ct_frag6_gather(net, skb, user);
- if (err)
+ if (err) {
+ if (err != -EINPROGRESS)
+ kfree_skb(skb);
return err;
+ }
key->ip.proto = ipv6_hdr(skb)->nexthdr;
ovs_cb.mru = IP6CB(skb)->frag_max_size;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index fab9bbfdead5..89f2e8c1f4dc 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3593,19 +3593,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
if (optlen != sizeof(val))
return -EINVAL;
- if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
- return -EBUSY;
if (copy_from_user(&val, optval, sizeof(val)))
return -EFAULT;
switch (val) {
case TPACKET_V1:
case TPACKET_V2:
case TPACKET_V3:
- po->tp_version = val;
- return 0;
+ break;
default:
return -EINVAL;
}
+ lock_sock(sk);
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+ ret = -EBUSY;
+ } else {
+ po->tp_version = val;
+ ret = 0;
+ }
+ release_sock(sk);
+ return ret;
}
case PACKET_RESERVE:
{
@@ -4109,6 +4115,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
/* Added to avoid minimal code churn */
struct tpacket_req *req = &req_u->req;
+ lock_sock(sk);
/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
net_warn_ratelimited("Tx-ring is not supported.\n");
@@ -4190,7 +4197,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
goto out;
}
- lock_sock(sk);
/* Detach socket from network */
spin_lock(&po->bind_lock);
@@ -4239,11 +4245,11 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (!tx_ring)
prb_shutdown_retire_blk_timer(po, rb_queue);
}
- release_sock(sk);
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
+ release_sock(sk);
return err;
}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 1a0399dea764..57bb52361e0f 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -665,6 +665,8 @@ out_recv:
out_pernet:
unregister_pernet_subsys(&rds_tcp_net_ops);
out_slab:
+ if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
+ pr_warn("could not unregister rds_tcp_dev_notifier\n");
kmem_cache_destroy(rds_tcp_conn_slab);
out:
return ret;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f893d180da1c..2095c83ce773 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -41,8 +41,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
spin_lock_bh(&hinfo->lock);
hlist_del(&p->tcfa_head);
spin_unlock_bh(&hinfo->lock);
- gen_kill_estimator(&p->tcfa_bstats,
- &p->tcfa_rate_est);
+ gen_kill_estimator(&p->tcfa_rate_est);
/*
* gen_estimator est_timer() might access p->tcfa_lock
* or bstats, wait a RCU grace period before freeing p
@@ -237,8 +236,7 @@ EXPORT_SYMBOL(tcf_hash_check);
void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
{
if (est)
- gen_kill_estimator(&a->tcfa_bstats,
- &a->tcfa_rate_est);
+ gen_kill_estimator(&a->tcfa_rate_est);
call_rcu(&a->tcfa_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
@@ -670,8 +668,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
goto errout;
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
- &p->tcfa_rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfa_qstats,
p->tcfa_qstats.qlen) < 0)
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 1aa4ecf41baf..1c60317f0121 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -28,7 +28,6 @@ struct tcf_bpf_cfg {
struct bpf_prog *filter;
struct sock_filter *bpf_ops;
const char *bpf_name;
- u32 bpf_fd;
u16 bpf_num_ops;
bool is_ebpf;
};
@@ -118,13 +117,19 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
struct sk_buff *skb)
{
- if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd))
- return -EMSGSIZE;
+ struct nlattr *nla;
if (prog->bpf_name &&
nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
return -EMSGSIZE;
+ nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST,
+ sizeof(prog->filter->digest));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->filter->digest, nla_len(nla));
+
return 0;
}
@@ -233,7 +238,6 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
}
}
- cfg->bpf_fd = bpf_fd;
cfg->bpf_name = name;
cfg->filter = fp;
cfg->is_ebpf = true;
@@ -332,8 +336,6 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
if (cfg.bpf_num_ops)
prog->bpf_num_ops = cfg.bpf_num_ops;
- if (cfg.bpf_fd)
- prog->bpf_fd = cfg.bpf_fd;
prog->tcf_action = parm->action;
rcu_assign_pointer(prog->filter, cfg.filter);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index b2d417b8f46c..2d9fa6e0a1b4 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
+#include <linux/if_arp.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
@@ -73,20 +74,6 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
static unsigned int mirred_net_id;
static struct tc_action_ops act_mirred_ops;
-static bool dev_is_mac_header_xmit(const struct net_device *dev)
-{
- switch (dev->type) {
- case ARPHRD_TUNNEL:
- case ARPHRD_TUNNEL6:
- case ARPHRD_SIT:
- case ARPHRD_IPGRE:
- case ARPHRD_VOID:
- case ARPHRD_NONE:
- return false;
- }
- return true;
-}
-
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind)
@@ -328,6 +315,17 @@ static struct notifier_block mirred_device_notifier = {
.notifier_call = mirred_device_event,
};
+static int tcf_mirred_device(const struct tc_action *a, struct net *net,
+ struct net_device **mirred_dev)
+{
+ int ifindex = tcf_mirred_ifindex(a);
+
+ *mirred_dev = __dev_get_by_index(net, ifindex);
+ if (!*mirred_dev)
+ return -EINVAL;
+ return 0;
+}
+
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
.type = TCA_ACT_MIRRED,
@@ -340,6 +338,7 @@ static struct tc_action_ops act_mirred_ops = {
.walk = tcf_mirred_walker,
.lookup = tcf_mirred_search,
.size = sizeof(struct tcf_mirred),
+ .get_dev = tcf_mirred_device,
};
static __net_init int mirred_init_net(struct net *net)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index eda322045e75..b27c4daec88f 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -108,6 +108,17 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind)
kfree(keys);
}
+static bool offset_valid(struct sk_buff *skb, int offset)
+{
+ if (offset > 0 && offset > skb->len)
+ return false;
+
+ if (offset < 0 && -offset > skb_headroom(skb))
+ return false;
+
+ return true;
+}
+
static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
@@ -134,6 +145,11 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
if (tkey->offmask) {
char *d, _d;
+ if (!offset_valid(skb, off + tkey->at)) {
+ pr_info("tc filter pedit 'at' offset %d out of bounds\n",
+ off + tkey->at);
+ goto bad;
+ }
d = skb_header_pointer(skb, off + tkey->at, 1,
&_d);
if (!d)
@@ -146,10 +162,10 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
" offset must be on 32 bit boundaries\n");
goto bad;
}
- if (offset > 0 && offset > skb->len) {
- pr_info("tc filter pedit"
- " offset %d can't exceed pkt length %d\n",
- offset, skb->len);
+
+ if (!offset_valid(skb, off + offset)) {
+ pr_info("tc filter pedit offset %d out of bounds\n",
+ offset);
goto bad;
}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index c990b73a6c85..0ba91d1ce994 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
- !gen_estimator_active(&police->tcf_bstats,
- &police->tcf_rate_est))) {
+ !gen_estimator_active(&police->tcf_rate_est))) {
err = -EINVAL;
goto failure_unlock;
}
@@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
bstats_update(&police->tcf_bstats, skb);
tcf_lastuse_update(&police->tcf_tm);
- if (police->tcfp_ewma_rate &&
- police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
- police->tcf_qstats.overlimits++;
- if (police->tcf_action == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcf_action;
+ if (police->tcfp_ewma_rate) {
+ struct gnet_stats_rate_est64 sample;
+
+ if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
+ sample.bps >= police->tcfp_ewma_rate) {
+ police->tcf_qstats.overlimits++;
+ if (police->tcf_action == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcf_action;
+ }
}
if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2b2a7974e4bb..3fbba79a4ef0 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -430,7 +430,8 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
if (!skb)
return -ENOBUFS;
- if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) {
+ if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
+ n->nlmsg_flags, event) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
@@ -681,6 +682,30 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
+int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
+ struct net_device **hw_dev)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ const struct tc_action *a;
+ LIST_HEAD(actions);
+
+ if (tc_no_actions(exts))
+ return -EINVAL;
+
+ tcf_exts_to_list(exts, &actions);
+ list_for_each_entry(a, &actions, list) {
+ if (a->ops->get_dev) {
+ a->ops->get_dev(a, dev_net(dev), hw_dev);
+ break;
+ }
+ }
+ if (*hw_dev)
+ return 0;
+#endif
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(tcf_exts_get_dev);
+
static int __init tc_filter_init(void)
{
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index eb219b78cd49..5877f6061b57 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -62,9 +62,6 @@ static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f;
- if (head == NULL)
- return 0UL;
-
list_for_each_entry(f, &head->flist, link) {
if (f->handle == handle) {
l = (unsigned long) f;
@@ -109,7 +106,6 @@ static bool basic_destroy(struct tcf_proto *tp, bool force)
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, basic_delete_filter);
}
- RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
return true;
}
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 52dc85acca7d..adc776048d1a 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -45,10 +45,7 @@ struct cls_bpf_prog {
u32 gen_flags;
struct tcf_exts exts;
u32 handle;
- union {
- u32 bpf_fd;
- u16 bpf_num_ops;
- };
+ u16 bpf_num_ops;
struct sock_filter *bpf_ops;
const char *bpf_name;
struct tcf_proto *tp;
@@ -244,7 +241,7 @@ static int cls_bpf_init(struct tcf_proto *tp)
return 0;
}
-static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
{
tcf_exts_destroy(&prog->exts);
@@ -258,22 +255,22 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
kfree(prog);
}
-static void __cls_bpf_delete_prog(struct rcu_head *rcu)
+static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
{
- struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
-
- cls_bpf_delete_prog(prog->tp, prog);
+ __cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu));
}
-static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
{
- struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
-
cls_bpf_stop_offload(tp, prog);
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
- call_rcu(&prog->rcu, __cls_bpf_delete_prog);
+ call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
+}
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ __cls_bpf_delete(tp, (struct cls_bpf_prog *) arg);
return 0;
}
@@ -285,14 +282,9 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
if (!force && !list_empty(&head->plist))
return false;
- list_for_each_entry_safe(prog, tmp, &head->plist, link) {
- cls_bpf_stop_offload(tp, prog);
- list_del_rcu(&prog->link);
- tcf_unbind_filter(tp, &prog->res);
- call_rcu(&prog->rcu, __cls_bpf_delete_prog);
- }
+ list_for_each_entry_safe(prog, tmp, &head->plist, link)
+ __cls_bpf_delete(tp, prog);
- RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
return true;
}
@@ -303,9 +295,6 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
struct cls_bpf_prog *prog;
unsigned long ret = 0UL;
- if (head == NULL)
- return 0UL;
-
list_for_each_entry(prog, &head->plist, link) {
if (prog->handle == handle) {
ret = (unsigned long) prog;
@@ -377,7 +366,6 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
}
prog->bpf_ops = NULL;
- prog->bpf_fd = bpf_fd;
prog->bpf_name = name;
prog->filter = fp;
@@ -519,14 +507,14 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
ret = cls_bpf_offload(tp, prog, oldprog);
if (ret) {
- cls_bpf_delete_prog(tp, prog);
+ __cls_bpf_delete_prog(prog);
return ret;
}
if (oldprog) {
list_replace_rcu(&oldprog->link, &prog->link);
tcf_unbind_filter(tp, &oldprog->res);
- call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
+ call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
} else {
list_add_rcu(&prog->link, &head->plist);
}
@@ -561,13 +549,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
struct sk_buff *skb)
{
- if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd))
- return -EMSGSIZE;
+ struct nlattr *nla;
if (prog->bpf_name &&
nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
return -EMSGSIZE;
+ nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->filter->digest, nla_len(nla));
+
return 0;
}
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 85233c470035..c1f20077837f 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -137,11 +137,10 @@ static bool cls_cgroup_destroy(struct tcf_proto *tp, bool force)
if (!force)
return false;
-
- if (head) {
- RCU_INIT_POINTER(tp->root, NULL);
+ /* Head can still be NULL due to cls_cgroup_init(). */
+ if (head)
call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
- }
+
return true;
}
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index e39672394c7b..6575aba87630 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -596,7 +596,6 @@ static bool flow_destroy(struct tcf_proto *tp, bool force)
list_del_rcu(&f->list);
call_rcu(&f->rcu, flow_destroy_filter);
}
- RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
return true;
}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index e8dd09af0d0c..e040c5140f61 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/rhashtable.h>
+#include <linux/workqueue.h>
#include <linux/if_ether.h>
#include <linux/in6.h>
@@ -38,6 +39,7 @@ struct fl_flow_key {
struct flow_dissector_key_ipv6_addrs ipv6;
};
struct flow_dissector_key_ports tp;
+ struct flow_dissector_key_icmp icmp;
struct flow_dissector_key_keyid enc_key_id;
union {
struct flow_dissector_key_ipv4_addrs enc_ipv4;
@@ -65,7 +67,10 @@ struct cls_fl_head {
bool mask_assigned;
struct list_head filters;
struct rhashtable_params ht_params;
- struct rcu_head rcu;
+ union {
+ struct work_struct work;
+ struct rcu_head rcu;
+ };
};
struct cls_fl_filter {
@@ -78,6 +83,8 @@ struct cls_fl_filter {
u32 handle;
u32 flags;
struct rcu_head rcu;
+ struct tc_to_netdev tc;
+ struct net_device *hw_dev;
};
static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
@@ -201,85 +208,110 @@ static void fl_destroy_filter(struct rcu_head *head)
kfree(f);
}
-static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
+static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
{
- struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct net_device *dev = f->hw_dev;
+ struct tc_to_netdev *tc = &f->tc;
- if (!tc_should_offload(dev, tp, 0))
+ if (!tc_can_offload(dev, tp))
return;
offload.command = TC_CLSFLOWER_DESTROY;
- offload.cookie = cookie;
+ offload.cookie = (unsigned long)f;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
struct flow_dissector *dissector,
struct fl_flow_key *mask,
- struct fl_flow_key *key,
- struct tcf_exts *actions,
- unsigned long cookie, u32 flags)
+ struct cls_fl_filter *f)
{
struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct tc_to_netdev *tc = &f->tc;
int err;
- if (!tc_should_offload(dev, tp, flags))
- return tc_skip_sw(flags) ? -EINVAL : 0;
+ if (!tc_can_offload(dev, tp)) {
+ if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) ||
+ (f->hw_dev && !tc_can_offload(f->hw_dev, tp))) {
+ f->hw_dev = dev;
+ return tc_skip_sw(f->flags) ? -EINVAL : 0;
+ }
+ dev = f->hw_dev;
+ tc->egress_dev = true;
+ } else {
+ f->hw_dev = dev;
+ }
offload.command = TC_CLSFLOWER_REPLACE;
- offload.cookie = cookie;
+ offload.cookie = (unsigned long)f;
offload.dissector = dissector;
offload.mask = mask;
- offload.key = key;
- offload.exts = actions;
+ offload.key = &f->key;
+ offload.exts = &f->exts;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
- &tc);
+ tc);
- if (tc_skip_sw(flags))
+ if (tc_skip_sw(f->flags))
return err;
-
return 0;
}
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
{
- struct net_device *dev = tp->q->dev_queue->dev;
struct tc_cls_flower_offload offload = {0};
- struct tc_to_netdev tc;
+ struct net_device *dev = f->hw_dev;
+ struct tc_to_netdev *tc = &f->tc;
- if (!tc_should_offload(dev, tp, 0))
+ if (!tc_can_offload(dev, tp))
return;
offload.command = TC_CLSFLOWER_STATS;
offload.cookie = (unsigned long)f;
offload.exts = &f->exts;
- tc.type = TC_SETUP_CLSFLOWER;
- tc.cls_flower = &offload;
+ tc->type = TC_SETUP_CLSFLOWER;
+ tc->cls_flower = &offload;
- dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
}
static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
{
list_del_rcu(&f->list);
- fl_hw_destroy_filter(tp, (unsigned long)f);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_destroy_filter(tp, f);
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, fl_destroy_filter);
}
+static void fl_destroy_sleepable(struct work_struct *work)
+{
+ struct cls_fl_head *head = container_of(work, struct cls_fl_head,
+ work);
+ if (head->mask_assigned)
+ rhashtable_destroy(&head->ht);
+ kfree(head);
+ module_put(THIS_MODULE);
+}
+
+static void fl_destroy_rcu(struct rcu_head *rcu)
+{
+ struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu);
+
+ INIT_WORK(&head->work, fl_destroy_sleepable);
+ schedule_work(&head->work);
+}
+
static bool fl_destroy(struct tcf_proto *tp, bool force)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -290,10 +322,10 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
list_for_each_entry_safe(f, next, &head->filters, list)
__fl_delete(tp, f);
- RCU_INIT_POINTER(tp->root, NULL);
- if (head->mask_assigned)
- rhashtable_destroy(&head->ht);
- kfree_rcu(head, rcu);
+
+ __module_get(THIS_MODULE);
+ call_rcu(&head->rcu, fl_destroy_rcu);
+
return true;
}
@@ -355,6 +387,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV4_CODE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 },
};
static void fl_set_key_val(struct nlattr **tb,
@@ -389,6 +431,39 @@ static void fl_set_key_vlan(struct nlattr **tb,
}
}
+static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
+ u32 *dissector_key, u32 *dissector_mask,
+ u32 flower_flag_bit, u32 dissector_flag_bit)
+{
+ if (flower_mask & flower_flag_bit) {
+ *dissector_mask |= dissector_flag_bit;
+ if (flower_key & flower_flag_bit)
+ *dissector_key |= dissector_flag_bit;
+ }
+}
+
+static void fl_set_key_flags(struct nlattr **tb,
+ u32 *flags_key, u32 *flags_mask)
+{
+ u32 key, mask;
+
+ if (!tb[TCA_FLOWER_KEY_FLAGS])
+ return;
+
+ key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
+
+ if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
+ mask = ~0;
+ else
+ mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
+
+ *flags_key = 0;
+ *flags_mask = 0;
+
+ fl_set_key_flag(key, mask, flags_key, flags_mask,
+ TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+}
+
static int fl_set_key(struct net *net, struct nlattr **tb,
struct fl_flow_key *key, struct fl_flow_key *mask)
{
@@ -471,6 +546,26 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST,
&mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
sizeof(key->tp.dst));
+ } else if (key->basic.n_proto == htons(ETH_P_IP) &&
+ key->basic.ip_proto == IPPROTO_ICMP) {
+ fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE,
+ &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
+ sizeof(key->icmp.type));
+ fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE,
+ &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
+ sizeof(key->icmp.code));
+ } else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
+ key->basic.ip_proto == IPPROTO_ICMPV6) {
+ fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE,
+ &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
+ sizeof(key->icmp.type));
+ fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE,
+ &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
+ sizeof(key->icmp.code));
}
if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
@@ -515,6 +610,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
&mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
sizeof(key->enc_tp.dst));
+ fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
+
return 0;
}
@@ -581,6 +678,8 @@ static void fl_init_dissector(struct cls_fl_head *head,
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_PORTS, tp);
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ICMP, icmp);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_VLAN, vlan);
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
@@ -743,20 +842,21 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
goto errout;
}
- err = fl_hw_replace_filter(tp,
- &head->dissector,
- &mask.key,
- &fnew->key,
- &fnew->exts,
- (unsigned long)fnew,
- fnew->flags);
- if (err)
- goto errout;
+ if (!tc_skip_hw(fnew->flags)) {
+ err = fl_hw_replace_filter(tp,
+ &head->dissector,
+ &mask.key,
+ fnew);
+ if (err)
+ goto errout;
+ }
if (fold) {
- rhashtable_remove_fast(&head->ht, &fold->ht_node,
- head->ht_params);
- fl_hw_destroy_filter(tp, (unsigned long)fold);
+ if (!tc_skip_sw(fold->flags))
+ rhashtable_remove_fast(&head->ht, &fold->ht_node,
+ head->ht_params);
+ if (!tc_skip_hw(fold->flags))
+ fl_hw_destroy_filter(tp, fold);
}
*arg = (unsigned long) fnew;
@@ -782,8 +882,9 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f = (struct cls_fl_filter *) arg;
- rhashtable_remove_fast(&head->ht, &f->ht_node,
- head->ht_params);
+ if (!tc_skip_sw(f->flags))
+ rhashtable_remove_fast(&head->ht, &f->ht_node,
+ head->ht_params);
__fl_delete(tp, f);
return 0;
}
@@ -847,6 +948,42 @@ static int fl_dump_key_vlan(struct sk_buff *skb,
return 0;
}
+static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask,
+ u32 *flower_key, u32 *flower_mask,
+ u32 flower_flag_bit, u32 dissector_flag_bit)
+{
+ if (dissector_mask & dissector_flag_bit) {
+ *flower_mask |= flower_flag_bit;
+ if (dissector_key & dissector_flag_bit)
+ *flower_key |= flower_flag_bit;
+ }
+}
+
+static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
+{
+ u32 key, mask;
+ __be32 _key, _mask;
+ int err;
+
+ if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask)))
+ return 0;
+
+ key = 0;
+ mask = 0;
+
+ fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+ TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+
+ _key = cpu_to_be32(key);
+ _mask = cpu_to_be32(mask);
+
+ err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key);
+ if (err)
+ return err;
+
+ return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
+}
+
static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
@@ -879,7 +1016,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
}
- fl_hw_update_stats(tp, f);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_update_stats(tp, f);
if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
@@ -943,6 +1081,28 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
&mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK,
sizeof(key->tp.dst))))
goto nla_put_failure;
+ else if (key->basic.n_proto == htons(ETH_P_IP) &&
+ key->basic.ip_proto == IPPROTO_ICMP &&
+ (fl_dump_key_val(skb, &key->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,
+ sizeof(key->icmp.type)) ||
+ fl_dump_key_val(skb, &key->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV4_CODE_MASK,
+ sizeof(key->icmp.code))))
+ goto nla_put_failure;
+ else if (key->basic.n_proto == htons(ETH_P_IPV6) &&
+ key->basic.ip_proto == IPPROTO_ICMPV6 &&
+ (fl_dump_key_val(skb, &key->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type,
+ TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,
+ sizeof(key->icmp.type)) ||
+ fl_dump_key_val(skb, &key->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code,
+ TCA_FLOWER_KEY_ICMPV6_CODE_MASK,
+ sizeof(key->icmp.code))))
+ goto nla_put_failure;
if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
(fl_dump_key_val(skb, &key->enc_ipv4.src,
@@ -981,6 +1141,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
sizeof(key->enc_tp.dst)))
goto nla_put_failure;
+ if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
+ goto nla_put_failure;
+
nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
if (tcf_exts_dump(skb, &f->exts))
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 25927b6c4436..f935429bd5ef 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -114,7 +114,6 @@ static bool mall_destroy(struct tcf_proto *tp, bool force)
call_rcu(&f->rcu, mall_destroy_filter);
}
- RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
return true;
}
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 4f05a19fb073..322438fb3ffc 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -152,7 +152,8 @@ static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
return -1;
nhptr = ip_hdr(skb);
#endif
-
+ if (unlikely(!head))
+ return -1;
restart:
#if RSVP_DST_LEN == 4
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 96144bdf30db..0751245a6ace 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -543,7 +543,6 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force)
walker.fn = tcindex_destroy_element;
tcindex_walk(tp, &walker);
- RCU_INIT_POINTER(tp->root, NULL);
call_rcu(&p->rcu, __tcindex_destroy);
return true;
}
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f337f1bdd1d4..d7b93429f0cc 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1395,7 +1395,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
&d, cpu_bstats, &q->bstats) < 0 ||
- gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
goto nla_put_failure;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index beb554aa8cfb..9ffe1c220b02 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -122,7 +122,7 @@ struct cbq_class {
psched_time_t penalized;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto __rcu *filter_list;
@@ -1346,7 +1346,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
return -1;
@@ -1405,7 +1405,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->q);
qdisc_put_rtab(cl->R_tab);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->link)
kfree(cl);
}
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8af5c59eef84..bb4cbdf75004 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct list_head alist;
struct Qdisc *qdisc;
@@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
return -1;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6cfb6e9038c2..6eb9c8e88519 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -709,7 +709,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
- gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+ gen_kill_estimator(&qdisc->rate_est);
if (ops->reset)
ops->reset(qdisc);
if (ops->destroy)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 000f1d36128e..3ffaa6fb0990 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct tcf_proto __rcu *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
unsigned int level; /* class level in hierarchy */
@@ -1091,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->qdisc);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
if (cl != &q->root)
kfree(cl);
}
@@ -1348,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
xstats.rtwork = cl->cl_cumul;
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
return -1;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9926fe4f3b6f..760f39e7caee 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -111,7 +111,7 @@ struct htb_class {
unsigned int children;
struct htb_class *parent; /* parent class */
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
/*
* Written often fields
@@ -1145,7 +1145,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
return -1;
@@ -1228,7 +1228,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
WARN_ON(!cl->un.leaf.q);
qdisc_destroy(cl->un.leaf.q);
}
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
tcf_destroy_chain(&cl->filter_list);
kfree(cl);
}
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index ca0516e6f743..f9e712ce2d15 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -137,7 +137,7 @@ struct qfq_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
- struct gnet_stats_rate_est64 rate_est;
+ struct net_rate_estimator __rcu *rate_est;
struct Qdisc *qdisc;
struct list_head alist; /* Link for active-classes list. */
struct qfq_aggregate *agg; /* Parent aggregate. */
@@ -508,7 +508,7 @@ set_change_agg:
new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
if (new_agg == NULL) {
err = -ENOBUFS;
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
goto destroy_class;
}
sch_tree_lock(sch);
@@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
struct qfq_sched *q = qdisc_priv(sch);
qfq_rm_from_agg(q, cl);
- gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
@@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
- gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL,
&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
return -1;
diff --git a/net/socket.c b/net/socket.c
index f9e26c68c3cf..58353835497d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1,3 +1,4 @@
+
/*
* NET An implementation of the SOCKET network access protocol.
*
@@ -341,8 +342,23 @@ static const struct xattr_handler sockfs_xattr_handler = {
.get = sockfs_xattr_get,
};
+static int sockfs_security_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *suffix, const void *value,
+ size_t size, int flags)
+{
+ /* Handled by LSM. */
+ return -EAGAIN;
+}
+
+static const struct xattr_handler sockfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .set = sockfs_security_xattr_set,
+};
+
static const struct xattr_handler *sockfs_xattr_handlers[] = {
&sockfs_xattr_handler,
+ &sockfs_security_xattr_handler,
NULL
};
@@ -678,9 +694,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
(sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
empty = 0;
- if (!empty)
+ if (!empty) {
put_cmsg(msg, SOL_SOCKET,
SCM_TIMESTAMPING, sizeof(tss), &tss);
+
+ if (skb->len && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS))
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
+ skb->len, skb->data);
+ }
}
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
@@ -1898,7 +1919,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
struct sockaddr_storage address;
struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
unsigned char ctl[sizeof(struct cmsghdr) + 20]
- __attribute__ ((aligned(sizeof(__kernel_size_t))));
+ __aligned(sizeof(__kernel_size_t));
/* 20 is size of ipv6_pktinfo */
unsigned char *ctl_buf = ctl;
int ctl_len;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c3f652395a80..3bc1d61694cb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -1002,14 +1002,8 @@ static void svc_age_temp_xprts(unsigned long closure)
void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
{
struct svc_xprt *xprt;
- struct svc_sock *svsk;
- struct socket *sock;
struct list_head *le, *next;
LIST_HEAD(to_be_closed);
- struct linger no_linger = {
- .l_onoff = 1,
- .l_linger = 0,
- };
spin_lock_bh(&serv->sv_lock);
list_for_each_safe(le, next, &serv->sv_tempsocks) {
@@ -1027,10 +1021,7 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
list_del_init(le);
xprt = list_entry(le, struct svc_xprt, xpt_list);
dprintk("svc_age_temp_xprts_now: closing %p\n", xprt);
- svsk = container_of(xprt, struct svc_sock, sk_xprt);
- sock = svsk->sk_sock;
- kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
- (char *)&no_linger, sizeof(no_linger));
+ xprt->xpt_ops->xpo_kill_temp_xprt(xprt);
svc_close_xprt(xprt);
}
}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 78da4aee3543..135ec2c11b3b 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -451,6 +451,21 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt)
return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
}
+static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk;
+ struct socket *sock;
+ struct linger no_linger = {
+ .l_onoff = 1,
+ .l_linger = 0,
+ };
+
+ svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ sock = svsk->sk_sock;
+ kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+ (char *)&no_linger, sizeof(no_linger));
+}
+
/*
* See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
*/
@@ -660,6 +675,10 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
return NULL;
}
+static void svc_udp_kill_temp_xprt(struct svc_xprt *xprt)
+{
+}
+
static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
@@ -679,6 +698,7 @@ static struct svc_xprt_ops svc_udp_ops = {
.xpo_has_wspace = svc_udp_has_wspace,
.xpo_accept = svc_udp_accept,
.xpo_secure_port = svc_sock_secure_port,
+ .xpo_kill_temp_xprt = svc_udp_kill_temp_xprt,
};
static struct svc_xprt_class svc_udp_class = {
@@ -1254,6 +1274,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
.xpo_secure_port = svc_sock_secure_port,
+ .xpo_kill_temp_xprt = svc_tcp_kill_temp_xprt,
};
static struct svc_xprt_class svc_tcp_class = {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 6864fb967038..1334de2715c2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -67,6 +67,7 @@ static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
static int svc_rdma_secure_port(struct svc_rqst *);
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *);
static struct svc_xprt_ops svc_rdma_ops = {
.xpo_create = svc_rdma_create,
@@ -79,6 +80,7 @@ static struct svc_xprt_ops svc_rdma_ops = {
.xpo_has_wspace = svc_rdma_has_wspace,
.xpo_accept = svc_rdma_accept,
.xpo_secure_port = svc_rdma_secure_port,
+ .xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt,
};
struct svc_xprt_class svc_rdma_class = {
@@ -1317,6 +1319,10 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp)
return 1;
}
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt)
+{
+}
+
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
struct ib_send_wr *bad_wr, *n_wr;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 975dbeb60ab0..52d74760fb68 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -421,6 +421,10 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
dev = dev_get_by_name(net, driver_name);
if (!dev)
return -ENODEV;
+ if (tipc_mtu_bad(dev, 0)) {
+ dev_put(dev);
+ return -EINVAL;
+ }
/* Associate TIPC bearer with L2 bearer */
rcu_assign_pointer(b->media_ptr, dev);
@@ -610,8 +614,6 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
if (!b)
return NOTIFY_DONE;
- b->mtu = dev->mtu;
-
switch (evt) {
case NETDEV_CHANGE:
if (netif_carrier_ok(dev))
@@ -624,6 +626,11 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
tipc_reset_bearer(net, b);
break;
case NETDEV_CHANGEMTU:
+ if (tipc_mtu_bad(dev, 0)) {
+ bearer_disable(net, b);
+ break;
+ }
+ b->mtu = dev->mtu;
tipc_reset_bearer(net, b);
break;
case NETDEV_CHANGEADDR:
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 78892e2f53e3..278ff7f616f9 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -39,6 +39,7 @@
#include "netlink.h"
#include "core.h"
+#include "msg.h"
#include <net/genetlink.h>
#define MAX_MEDIA 3
@@ -59,6 +60,9 @@
#define TIPC_MEDIA_TYPE_IB 2
#define TIPC_MEDIA_TYPE_UDP 3
+/* minimum bearer MTU */
+#define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE)
+
/**
* struct tipc_media_addr - destination address used by TIPC bearers
* @value: address info (format defined by media)
@@ -215,4 +219,13 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id,
void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
struct sk_buff_head *xmitq);
+/* check if device MTU is too low for tipc headers */
+static inline bool tipc_mtu_bad(struct net_device *dev, unsigned int reserve)
+{
+ if (dev->mtu >= TIPC_MIN_BEARER_MTU + reserve)
+ return false;
+ netdev_warn(dev, "MTU too low for tipc bearer\n");
+ return true;
+}
+
#endif /* _TIPC_BEARER_H */
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 1055164c6232..bda89bf9f4ff 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -47,8 +47,8 @@
#include <linux/pkt_sched.h>
struct tipc_stats {
- u32 sent_info; /* used in counting # sent packets */
- u32 recv_info; /* used in counting # recv'd packets */
+ u32 sent_pkts;
+ u32 recv_pkts;
u32 sent_states;
u32 recv_states;
u32 sent_probes;
@@ -857,7 +857,6 @@ void tipc_link_reset(struct tipc_link *l)
l->acked = 0;
l->silent_intv_cnt = 0;
l->rst_cnt = 0;
- l->stats.recv_info = 0;
l->stale_count = 0;
l->bc_peer_is_up = false;
memset(&l->mon_state, 0, sizeof(l->mon_state));
@@ -888,6 +887,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
struct sk_buff_head *transmq = &l->transmq;
struct sk_buff_head *backlogq = &l->backlogq;
struct sk_buff *skb, *_skb, *bskb;
+ int pkt_cnt = skb_queue_len(list);
/* Match msg importance against this and all higher backlog limits: */
if (!skb_queue_empty(backlogq)) {
@@ -901,6 +901,11 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
return -EMSGSIZE;
}
+ if (pkt_cnt > 1) {
+ l->stats.sent_fragmented++;
+ l->stats.sent_fragments += pkt_cnt;
+ }
+
/* Prepare each packet for sending, and add to relevant queue: */
while (skb_queue_len(list)) {
skb = skb_peek(list);
@@ -920,6 +925,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
__skb_queue_tail(xmitq, _skb);
TIPC_SKB_CB(skb)->ackers = l->ackers;
l->rcv_unacked = 0;
+ l->stats.sent_pkts++;
seqno++;
continue;
}
@@ -968,6 +974,7 @@ void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
msg_set_ack(hdr, ack);
msg_set_bcast_ack(hdr, bc_ack);
l->rcv_unacked = 0;
+ l->stats.sent_pkts++;
seqno++;
}
l->snd_nxt = seqno;
@@ -1260,7 +1267,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
/* Deliver packet */
l->rcv_nxt++;
- l->stats.recv_info++;
+ l->stats.recv_pkts++;
if (!tipc_data_input(l, skb, l->inputq))
rc |= tipc_link_input(l, skb, l->inputq);
if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN))
@@ -1492,8 +1499,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
l->tolerance = peers_tol;
- if (peers_prio && in_range(peers_prio, TIPC_MIN_LINK_PRI,
- TIPC_MAX_LINK_PRI)) {
+ /* Update own prio if peer indicates a different value */
+ if ((peers_prio != l->priority) &&
+ in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) {
l->priority = peers_prio;
rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
}
@@ -1799,10 +1807,6 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
void tipc_link_reset_stats(struct tipc_link *l)
{
memset(&l->stats, 0, sizeof(l->stats));
- if (!link_is_bc_sndlink(l)) {
- l->stats.sent_info = l->snd_nxt;
- l->stats.recv_info = l->rcv_nxt;
- }
}
static void link_print(struct tipc_link *l, const char *str)
@@ -1866,12 +1870,12 @@ static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s)
};
struct nla_map map[] = {
- {TIPC_NLA_STATS_RX_INFO, s->recv_info},
+ {TIPC_NLA_STATS_RX_INFO, 0},
{TIPC_NLA_STATS_RX_FRAGMENTS, s->recv_fragments},
{TIPC_NLA_STATS_RX_FRAGMENTED, s->recv_fragmented},
{TIPC_NLA_STATS_RX_BUNDLES, s->recv_bundles},
{TIPC_NLA_STATS_RX_BUNDLED, s->recv_bundled},
- {TIPC_NLA_STATS_TX_INFO, s->sent_info},
+ {TIPC_NLA_STATS_TX_INFO, 0},
{TIPC_NLA_STATS_TX_FRAGMENTS, s->sent_fragments},
{TIPC_NLA_STATS_TX_FRAGMENTED, s->sent_fragmented},
{TIPC_NLA_STATS_TX_BUNDLES, s->sent_bundles},
@@ -1946,9 +1950,9 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
goto attr_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->rcv_nxt))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->stats.recv_pkts))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->snd_nxt))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->stats.sent_pkts))
goto attr_msg_full;
if (tipc_link_is_up(link))
@@ -2003,12 +2007,12 @@ static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb,
};
struct nla_map map[] = {
- {TIPC_NLA_STATS_RX_INFO, stats->recv_info},
+ {TIPC_NLA_STATS_RX_INFO, stats->recv_pkts},
{TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments},
{TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented},
{TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles},
{TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled},
- {TIPC_NLA_STATS_TX_INFO, stats->sent_info},
+ {TIPC_NLA_STATS_TX_INFO, stats->sent_pkts},
{TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments},
{TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented},
{TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles},
@@ -2075,9 +2079,9 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
goto attr_msg_full;
if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, 0))
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt))
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, 0))
goto attr_msg_full;
prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP);
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index ed97a5876ebe..9e109bb1a207 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -455,14 +455,14 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr,
int i, applied_bef;
state->probing = false;
- if (!dlen)
- return;
/* Sanity check received domain record */
- if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) {
- pr_warn_ratelimited("Received illegal domain record\n");
+ if (dlen < dom_rec_len(arrv_dom, 0))
+ return;
+ if (dlen != dom_rec_len(arrv_dom, new_member_cnt))
+ return;
+ if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen)
return;
- }
/* Synch generation numbers with peer if link just came up */
if (!state->synched) {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 22d92f0ec5ac..333c5dae0072 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1,7 +1,7 @@
/*
* net/tipc/socket.c: TIPC socket API
*
- * Copyright (c) 2001-2007, 2012-2015, Ericsson AB
+ * Copyright (c) 2001-2007, 2012-2016, Ericsson AB
* Copyright (c) 2004-2008, 2010-2013, Wind River Systems
* All rights reserved.
*
@@ -127,54 +127,8 @@ static const struct proto_ops packet_ops;
static const struct proto_ops stream_ops;
static const struct proto_ops msg_ops;
static struct proto tipc_proto;
-
static const struct rhashtable_params tsk_rht_params;
-/*
- * Revised TIPC socket locking policy:
- *
- * Most socket operations take the standard socket lock when they start
- * and hold it until they finish (or until they need to sleep). Acquiring
- * this lock grants the owner exclusive access to the fields of the socket
- * data structures, with the exception of the backlog queue. A few socket
- * operations can be done without taking the socket lock because they only
- * read socket information that never changes during the life of the socket.
- *
- * Socket operations may acquire the lock for the associated TIPC port if they
- * need to perform an operation on the port. If any routine needs to acquire
- * both the socket lock and the port lock it must take the socket lock first
- * to avoid the risk of deadlock.
- *
- * The dispatcher handling incoming messages cannot grab the socket lock in
- * the standard fashion, since invoked it runs at the BH level and cannot block.
- * Instead, it checks to see if the socket lock is currently owned by someone,
- * and either handles the message itself or adds it to the socket's backlog
- * queue; in the latter case the queued message is processed once the process
- * owning the socket lock releases it.
- *
- * NOTE: Releasing the socket lock while an operation is sleeping overcomes
- * the problem of a blocked socket operation preventing any other operations
- * from occurring. However, applications must be careful if they have
- * multiple threads trying to send (or receive) on the same socket, as these
- * operations might interfere with each other. For example, doing a connect
- * and a receive at the same time might allow the receive to consume the
- * ACK message meant for the connect. While additional work could be done
- * to try and overcome this, it doesn't seem to be worthwhile at the present.
- *
- * NOTE: Releasing the socket lock while an operation is sleeping also ensures
- * that another operation that must be performed in a non-blocking manner is
- * not delayed for very long because the lock has already been taken.
- *
- * NOTE: This code assumes that certain fields of a port/socket pair are
- * constant over its lifetime; such fields can be examined without taking
- * the socket lock and/or port lock, and do not need to be re-read even
- * after resuming processing after waiting. These fields include:
- * - socket type
- * - pointer to socket sk structure (aka tipc_sock structure)
- * - pointer to port structure
- * - port reference
- */
-
static u32 tsk_own_node(struct tipc_sock *tsk)
{
return msg_prevnode(&tsk->phdr);
@@ -230,7 +184,7 @@ static struct tipc_sock *tipc_sk(const struct sock *sk)
static bool tsk_conn_cong(struct tipc_sock *tsk)
{
- return tsk->snt_unacked >= tsk->snd_win;
+ return tsk->snt_unacked > tsk->snd_win;
}
/* tsk_blocks(): translate a buffer size in bytes to number of
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 78cab9c5a445..b58dc95f3d35 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -697,6 +697,11 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
udp_conf.use_udp_checksums = false;
ub->ifindex = dev->ifindex;
+ if (tipc_mtu_bad(dev, sizeof(struct iphdr) +
+ sizeof(struct udphdr))) {
+ err = -EINVAL;
+ goto err;
+ }
b->mtu = dev->mtu - sizeof(struct iphdr)
- sizeof(struct udphdr);
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 6a705d0ff889..1752d6b10ac4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2199,7 +2199,8 @@ out:
* Sleep until more data has arrived. But check for races..
*/
static long unix_stream_data_wait(struct sock *sk, long timeo,
- struct sk_buff *last, unsigned int last_len)
+ struct sk_buff *last, unsigned int last_len,
+ bool freezable)
{
struct sk_buff *tail;
DEFINE_WAIT(wait);
@@ -2220,7 +2221,10 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
unix_state_unlock(sk);
- timeo = freezable_schedule_timeout(timeo);
+ if (freezable)
+ timeo = freezable_schedule_timeout(timeo);
+ else
+ timeo = schedule_timeout(timeo);
unix_state_lock(sk);
if (sock_flag(sk, SOCK_DEAD))
@@ -2250,7 +2254,8 @@ struct unix_stream_read_state {
unsigned int splice_flags;
};
-static int unix_stream_read_generic(struct unix_stream_read_state *state)
+static int unix_stream_read_generic(struct unix_stream_read_state *state,
+ bool freezable)
{
struct scm_cookie scm;
struct socket *sock = state->socket;
@@ -2330,7 +2335,7 @@ again:
mutex_unlock(&u->iolock);
timeo = unix_stream_data_wait(sk, timeo, last,
- last_len);
+ last_len, freezable);
if (signal_pending(current)) {
err = sock_intr_errno(timeo);
@@ -2472,7 +2477,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
.flags = flags
};
- return unix_stream_read_generic(&state);
+ return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
@@ -2503,7 +2508,7 @@ static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
flags & SPLICE_F_NONBLOCK)
state.flags = MSG_DONTWAIT;
- return unix_stream_read_generic(&state);
+ return unix_stream_read_generic(&state, false);
}
static int unix_shutdown(struct socket *sock, int mode)
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 936d7eee62d0..2e47f9f06b96 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -44,6 +44,10 @@ struct virtio_vsock {
spinlock_t send_pkt_list_lock;
struct list_head send_pkt_list;
+ struct work_struct loopback_work;
+ spinlock_t loopback_list_lock; /* protects loopback_list */
+ struct list_head loopback_list;
+
atomic_t queued_replies;
/* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
@@ -74,6 +78,42 @@ static u32 virtio_transport_get_local_cid(void)
return vsock->guest_cid;
}
+static void virtio_transport_loopback_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, loopback_work);
+ LIST_HEAD(pkts);
+
+ spin_lock_bh(&vsock->loopback_list_lock);
+ list_splice_init(&vsock->loopback_list, &pkts);
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
+ mutex_lock(&vsock->rx_lock);
+ while (!list_empty(&pkts)) {
+ struct virtio_vsock_pkt *pkt;
+
+ pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+
+ virtio_transport_recv_pkt(pkt);
+ }
+ mutex_unlock(&vsock->rx_lock);
+}
+
+static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
+ struct virtio_vsock_pkt *pkt)
+{
+ int len = pkt->len;
+
+ spin_lock_bh(&vsock->loopback_list_lock);
+ list_add_tail(&pkt->list, &vsock->loopback_list);
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
+ queue_work(virtio_vsock_workqueue, &vsock->loopback_work);
+
+ return len;
+}
+
static void
virtio_transport_send_pkt_work(struct work_struct *work)
{
@@ -159,6 +199,9 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
return -ENODEV;
}
+ if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid)
+ return virtio_transport_send_pkt_loopback(vsock, pkt);
+
if (pkt->reply)
atomic_inc(&vsock->queued_replies);
@@ -510,10 +553,13 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
mutex_init(&vsock->event_lock);
spin_lock_init(&vsock->send_pkt_list_lock);
INIT_LIST_HEAD(&vsock->send_pkt_list);
+ spin_lock_init(&vsock->loopback_list_lock);
+ INIT_LIST_HEAD(&vsock->loopback_list);
INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
INIT_WORK(&vsock->event_work, virtio_transport_event_work);
INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
+ INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work);
mutex_lock(&vsock->rx_lock);
virtio_vsock_rx_fill(vsock);
@@ -539,6 +585,7 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
struct virtio_vsock *vsock = vdev->priv;
struct virtio_vsock_pkt *pkt;
+ flush_work(&vsock->loopback_work);
flush_work(&vsock->rx_work);
flush_work(&vsock->tx_work);
flush_work(&vsock->event_work);
@@ -565,6 +612,15 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
}
spin_unlock_bh(&vsock->send_pkt_list_lock);
+ spin_lock_bh(&vsock->loopback_list_lock);
+ while (!list_empty(&vsock->loopback_list)) {
+ pkt = list_first_entry(&vsock->loopback_list,
+ struct virtio_vsock_pkt, list);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
mutex_lock(&the_virtio_vsock_mutex);
the_virtio_vsock = NULL;
vsock_core_exit();
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 9820fa251daa..af6e023020b1 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -71,6 +71,7 @@ struct cfg80211_registered_device {
struct list_head bss_list;
struct rb_root bss_tree;
u32 bss_generation;
+ u32 bss_entries;
struct cfg80211_scan_request *scan_req; /* protected by RTNL */
struct sk_buff *scan_msg;
struct cfg80211_sched_scan_request __rcu *sched_scan_req;
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 71447cf86306..ba0a1f398ce5 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -556,7 +556,7 @@ static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */
break;
- case 0:
+ default:
memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
break;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index b5bd58d0f731..35ad69fd0838 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -57,6 +57,19 @@
* also linked into the probe response struct.
*/
+/*
+ * Limit the number of BSS entries stored in mac80211. Each one is
+ * a bit over 4k at most, so this limits to roughly 4-5M of memory.
+ * If somebody wants to really attack this though, they'd likely
+ * use small beacons, and only one type of frame, limiting each of
+ * the entries to a much smaller size (in order to generate more
+ * entries in total, so overhead is bigger.)
+ */
+static int bss_entries_limit = 1000;
+module_param(bss_entries_limit, int, 0644);
+MODULE_PARM_DESC(bss_entries_limit,
+ "limit to number of scan BSS entries (per wiphy, default 1000)");
+
#define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ)
static void bss_free(struct cfg80211_internal_bss *bss)
@@ -137,6 +150,10 @@ static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev,
list_del_init(&bss->list);
rb_erase(&bss->rbn, &rdev->bss_tree);
+ rdev->bss_entries--;
+ WARN_ONCE((rdev->bss_entries == 0) ^ list_empty(&rdev->bss_list),
+ "rdev bss entries[%d]/list[empty:%d] corruption\n",
+ rdev->bss_entries, list_empty(&rdev->bss_list));
bss_ref_put(rdev, bss);
return true;
}
@@ -163,6 +180,40 @@ static void __cfg80211_bss_expire(struct cfg80211_registered_device *rdev,
rdev->bss_generation++;
}
+static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev)
+{
+ struct cfg80211_internal_bss *bss, *oldest = NULL;
+ bool ret;
+
+ lockdep_assert_held(&rdev->bss_lock);
+
+ list_for_each_entry(bss, &rdev->bss_list, list) {
+ if (atomic_read(&bss->hold))
+ continue;
+
+ if (!list_empty(&bss->hidden_list) &&
+ !bss->pub.hidden_beacon_bss)
+ continue;
+
+ if (oldest && time_before(oldest->ts, bss->ts))
+ continue;
+ oldest = bss;
+ }
+
+ if (WARN_ON(!oldest))
+ return false;
+
+ /*
+ * The callers make sure to increase rdev->bss_generation if anything
+ * gets removed (and a new entry added), so there's no need to also do
+ * it here.
+ */
+
+ ret = __cfg80211_unlink_bss(rdev, oldest);
+ WARN_ON(!ret);
+ return ret;
+}
+
void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
bool send_message)
{
@@ -689,6 +740,7 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev,
const u8 *ie;
int i, ssidlen;
u8 fold = 0;
+ u32 n_entries = 0;
ies = rcu_access_pointer(new->pub.beacon_ies);
if (WARN_ON(!ies))
@@ -712,6 +764,12 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev,
/* This is the bad part ... */
list_for_each_entry(bss, &rdev->bss_list, list) {
+ /*
+ * we're iterating all the entries anyway, so take the
+ * opportunity to validate the list length accounting
+ */
+ n_entries++;
+
if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid))
continue;
if (bss->pub.channel != new->pub.channel)
@@ -740,6 +798,10 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev,
new->pub.beacon_ies);
}
+ WARN_ONCE(n_entries != rdev->bss_entries,
+ "rdev bss entries[%d]/list[len:%d] corruption\n",
+ rdev->bss_entries, n_entries);
+
return true;
}
@@ -894,7 +956,14 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
}
}
+ if (rdev->bss_entries >= bss_entries_limit &&
+ !cfg80211_bss_expire_oldest(rdev)) {
+ kfree(new);
+ goto drop;
+ }
+
list_add_tail(&new->list, &rdev->bss_list);
+ rdev->bss_entries++;
rb_insert_bss(rdev, new);
found = new;
}
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 88725f8eefad..e9d040d29846 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1159,7 +1159,8 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
58500000,
65000000,
78000000,
- 0,
+ /* not in the spec, but some devices use this: */
+ 86500000,
},
{ 13500000,
27000000,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index fd6986634e6f..5bf7e1bfeac7 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1268,12 +1268,14 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
err = security_xfrm_policy_lookup(pol->security,
fl->flowi_secid,
policy_to_flow_dir(dir));
- if (!err && !xfrm_pol_hold_rcu(pol))
- goto again;
- else if (err == -ESRCH)
+ if (!err) {
+ if (!xfrm_pol_hold_rcu(pol))
+ goto again;
+ } else if (err == -ESRCH) {
pol = NULL;
- else
+ } else {
pol = ERR_PTR(err);
+ }
} else
pol = NULL;
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 08892091cfe3..671a1d0333f0 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2450,7 +2450,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
#endif
type = nlh->nlmsg_type;