Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig          47
-rw-r--r--  net/sched/Makefile          3
-rw-r--r--  net/sched/act_api.c         9
-rw-r--r--  net/sched/act_ct.c        984
-rw-r--r--  net/sched/act_ctinfo.c    407
-rw-r--r--  net/sched/act_mirred.c     23
-rw-r--r--  net/sched/act_mpls.c      406
-rw-r--r--  net/sched/cls_api.c       216
-rw-r--r--  net/sched/cls_flower.c    195
-rw-r--r--  net/sched/cls_fw.c         13
-rw-r--r--  net/sched/cls_matchall.c    9
-rw-r--r--  net/sched/cls_u32.c        15
-rw-r--r--  net/sched/em_ipt.c         48
-rw-r--r--  net/sched/sch_etf.c        10
-rw-r--r--  net/sched/sch_ingress.c     8
-rw-r--r--  net/sched/sch_taprio.c    421
16 files changed, 2578 insertions, 236 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2c72d95c3050..dd55b9ac3a66 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -842,6 +842,17 @@ config NET_ACT_CSUM
To compile this code as a module, choose M here: the
module will be called act_csum.
+config NET_ACT_MPLS
+ tristate "MPLS manipulation"
+ depends on NET_CLS_ACT
+ help
+ Say Y here to push or pop MPLS headers.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_mpls.
+
config NET_ACT_VLAN
tristate "Vlan manipulation"
depends on NET_CLS_ACT
@@ -877,6 +888,23 @@ config NET_ACT_CONNMARK
To compile this code as a module, choose M here: the
module will be called act_connmark.
+config NET_ACT_CTINFO
+ tristate "Netfilter Connection Mark Actions"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ help
+ Say Y here to allow transfer of connmark-stored information.
+ The current actions transfer connmark-stored DSCP into the
+ ipv4/v6 diffserv field and/or copy the connmark to the packet
+ mark. Both are useful for restoring egress-based marks back
+ onto ingress connections for qdisc priority mapping purposes.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ctinfo.
+
config NET_ACT_SKBMOD
tristate "skb data modification action"
depends on NET_CLS_ACT
@@ -912,6 +940,17 @@ config NET_ACT_TUNNEL_KEY
To compile this code as a module, choose M here: the
module will be called act_tunnel_key.
+config NET_ACT_CT
+ tristate "connection tracking tc action"
+ depends on NET_CLS_ACT && NF_CONNTRACK
+ help
+ Say Y here to allow sending packets to the conntrack module.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ct.
+
config NET_IFE_SKBMARK
tristate "Support to encoding decoding skb mark on IFE action"
depends on NET_ACT_IFE
@@ -924,14 +963,6 @@ config NET_IFE_SKBTCINDEX
tristate "Support to encoding decoding skb tcindex on IFE action"
depends on NET_ACT_IFE
-config NET_CLS_IND
- bool "Incoming device classification"
- depends on NET_CLS_U32 || NET_CLS_FW
- ---help---
- Say Y here to extend the u32 and fw classifier to support
- classification based on the incoming device. This option is
- likely to disappear in favour of the metadata ematch.
-
endif # NET_SCHED
config NET_SCH_FIFO
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431d7b5c..415d1e1f237e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -18,15 +18,18 @@ obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
+obj-$(CONFIG_NET_ACT_MPLS) += act_mpls.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o
obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o
obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
+obj-$(CONFIG_NET_ACT_CT) += act_ct.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4e5d2e9ace5d..339712296164 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -221,12 +221,13 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct idr *idr = &idrinfo->action_idr;
struct tc_action *p;
unsigned long id = 1;
+ unsigned long tmp;
mutex_lock(&idrinfo->lock);
s_i = cb->args[0];
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
index++;
if (index < s_i)
continue;
@@ -292,6 +293,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct idr *idr = &idrinfo->action_idr;
struct tc_action *p;
unsigned long id = 1;
+ unsigned long tmp;
nest = nla_nest_start_noflag(skb, 0);
if (nest == NULL)
@@ -300,7 +302,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
goto nla_put_failure;
mutex_lock(&idrinfo->lock);
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
ret = tcf_idr_release_unsafe(p);
if (ret == ACT_P_DELETED) {
module_put(ops->owner);
@@ -533,8 +535,9 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
struct tc_action *p;
int ret;
unsigned long id = 1;
+ unsigned long tmp;
- idr_for_each_entry_ul(idr, p, id) {
+ idr_for_each_entry_ul(idr, p, tmp, id) {
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED)
module_put(ops->owner);
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
new file mode 100644
index 000000000000..b501ce0cf116
--- /dev/null
+++ b/net/sched/act_ct.c
@@ -0,0 +1,984 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* -
+ * net/sched/act_ct.c Connection Tracking action
+ *
+ * Authors: Paul Blakey <paulb@mellanox.com>
+ * Yossi Kuperman <yossiku@mellanox.com>
+ * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/act_api.h>
+#include <net/ip.h>
+#include <net/ipv6_frag.h>
+#include <uapi/linux/tc_act/tc_ct.h>
+#include <net/tc_act/tc_ct.h>
+
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static struct tc_action_ops act_ct_ops;
+static unsigned int ct_net_id;
+
+struct tc_ct_action_net {
+ struct tc_action_net tn; /* Must be first */
+ bool labels;
+};
+
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
+static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
+ u16 zone_id, bool force)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+ if (!net_eq(net, read_pnet(&ct->ct_net)))
+ return false;
+ if (nf_ct_zone(ct)->id != zone_id)
+ return false;
+
+ /* Force conntrack entry direction. */
+ if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ if (nf_ct_is_confirmed(ct))
+ nf_ct_kill(ct);
+
+ nf_conntrack_put(&ct->ct_general);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+ return false;
+ }
+
+ return true;
+}
+
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ */
+static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
+{
+ unsigned int len;
+ int err;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ len = ntohs(ip_hdr(skb)->tot_len);
+ break;
+ case NFPROTO_IPV6:
+ len = sizeof(struct ipv6hdr)
+ + ntohs(ipv6_hdr(skb)->payload_len);
+ break;
+ default:
+ len = skb->len;
+ }
+
+ err = pskb_trim_rcsum(skb, len);
+
+ return err;
+}
+
+static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
+{
+ u8 family = NFPROTO_UNSPEC;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ family = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ family = NFPROTO_IPV6;
+ break;
+ default:
+ break;
+ }
+
+ return family;
+}
+
+static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
+{
+ unsigned int len;
+
+ len = skb_network_offset(skb) + sizeof(struct iphdr);
+ if (unlikely(skb->len < len))
+ return -EINVAL;
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
+ *frag = ip_is_fragment(ip_hdr(skb));
+ return 0;
+}
+
+static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
+{
+ unsigned int flags = 0, len, payload_ofs = 0;
+ unsigned short frag_off;
+ int nexthdr;
+
+ len = skb_network_offset(skb) + sizeof(struct ipv6hdr);
+ if (unlikely(skb->len < len))
+ return -EINVAL;
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
+ nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
+ if (unlikely(nexthdr < 0))
+ return -EPROTO;
+
+ *frag = flags & IP6_FH_F_FRAG;
+ return 0;
+}
+
+static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+ u8 family, u16 zone)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ int err = 0;
+ bool frag;
+
+ /* Previously seen (loopback)? Ignore. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
+ return 0;
+
+ if (family == NFPROTO_IPV4)
+ err = tcf_ct_ipv4_is_fragment(skb, &frag);
+ else
+ err = tcf_ct_ipv6_is_fragment(skb, &frag);
+ if (err || !frag)
+ return err;
+
+ skb_get(skb);
+
+ if (family == NFPROTO_IPV4) {
+ enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(net, skb, user);
+ local_bh_enable();
+ if (err && err != -EINPROGRESS)
+ goto out_free;
+ } else { /* NFPROTO_IPV6 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+ err = nf_ct_frag6_gather(net, skb, user);
+ if (err && err != -EINPROGRESS)
+ goto out_free;
+#else
+ err = -EOPNOTSUPP;
+ goto out_free;
+#endif
+ }
+
+ skb_clear_hash(skb);
+ skb->ignore_df = 1;
+ return err;
+
+out_free:
+ kfree_skb(skb);
+ return err;
+}
+
+static void tcf_ct_params_free(struct rcu_head *head)
+{
+ struct tcf_ct_params *params = container_of(head,
+ struct tcf_ct_params, rcu);
+
+ if (params->tmpl)
+ nf_conntrack_put(&params->tmpl->ct_general);
+ kfree(params);
+}
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype)
+{
+ int hooknum, err = NF_ACCEPT;
+
+ /* See HOOK2MANIP(). */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+ else
+ hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (skb->protocol == htons(ETH_P_IP) &&
+ ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ hooknum))
+ err = NF_DROP;
+ goto out;
+ } else if (IS_ENABLED(CONFIG_IPV6) &&
+ skb->protocol == htons(ETH_P_IPV6)) {
+ __be16 frag_off;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int hdrlen = ipv6_skip_exthdr(skb,
+ sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct,
+ ctinfo,
+ hooknum,
+ hdrlen))
+ err = NF_DROP;
+ goto out;
+ }
+ }
+ /* Non-ICMP, fall thru to initialize if needed. */
+ /* fall through */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ /* Initialize according to the NAT action. */
+ err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+ /* Action is set up to establish a new
+ * mapping.
+ */
+ ? nf_nat_setup_info(ct, range, maniptype)
+ : nf_nat_alloc_null_binding(ct, hooknum);
+ if (err != NF_ACCEPT)
+ goto out;
+ }
+ break;
+
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+
+ default:
+ err = NF_DROP;
+ goto out;
+ }
+
+ err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+out:
+ return err;
+}
+#endif /* CONFIG_NF_NAT */
+
+static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ u32 new_mark;
+
+ if (!mask)
+ return;
+
+ new_mark = mark | (ct->mark & ~(mask));
+ if (ct->mark != new_mark) {
+ ct->mark = new_mark;
+ if (nf_ct_is_confirmed(ct))
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+#endif
+}
+
+static void tcf_ct_act_set_labels(struct nf_conn *ct,
+ u32 *labels,
+ u32 *labels_m)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
+ size_t labels_sz = FIELD_SIZEOF(struct tcf_ct_params, labels);
+
+ if (!memchr_inv(labels_m, 0, labels_sz))
+ return;
+
+ nf_connlabels_replace(ct, labels, labels_m, 4);
+#endif
+}
+
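+/* Choose and apply the NAT manipulation for this packet: reuse the
+ * direction of an already established mapping where one exists,
+ * otherwise fall back to the SNAT/DNAT flag configured on the action.
+ */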
+static int tcf_ct_act_nat(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ int ct_action,
+ struct nf_nat_range2 *range,
+ bool commit)
+{
+#if IS_ENABLED(CONFIG_NF_NAT)
+ enum nf_nat_manip_type maniptype;
+
+ if (!(ct_action & TCA_CT_ACT_NAT))
+ return NF_ACCEPT;
+
+ /* Add NAT extension if not confirmed yet. */
+ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+ return NF_DROP; /* Can't NAT. */
+
+ if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
+ (ctinfo != IP_CT_RELATED || commit)) {
+ /* NAT an established or related connection like before. */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+ /* This is the REPLY direction for a connection
+ * for which NAT was applied in the forward
+ * direction. Do the reverse NAT.
+ */
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+ else
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+ } else if (ct_action & TCA_CT_ACT_NAT_SRC) {
+ maniptype = NF_NAT_MANIP_SRC;
+ } else if (ct_action & TCA_CT_ACT_NAT_DST) {
+ maniptype = NF_NAT_MANIP_DST;
+ } else {
+ return NF_ACCEPT;
+ }
+
+ return ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+#else
+ return NF_ACCEPT;
+#endif
+}
+
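+/* Main act_ct datapath: clear existing conntrack state if requested,
+ * otherwise pull to the network header, reassemble fragments, run the
+ * packet through conntrack (unless a matching entry for this zone is
+ * already attached), apply NAT/mark/label updates and, on commit,
+ * confirm the connection.
+ */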
+static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct net *net = dev_net(skb->dev);
+ bool cached, commit, clear, force;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_ct *c = to_ct(a);
+ struct nf_conn *tmpl = NULL;
+ struct nf_hook_state state;
+ int nh_ofs, err, retval;
+ struct tcf_ct_params *p;
+ struct nf_conn *ct;
+ u8 family;
+
+ p = rcu_dereference_bh(c->params);
+
+ retval = READ_ONCE(c->tcf_action);
+ commit = p->ct_action & TCA_CT_ACT_COMMIT;
+ clear = p->ct_action & TCA_CT_ACT_CLEAR;
+ force = p->ct_action & TCA_CT_ACT_FORCE;
+ tmpl = p->tmpl;
+
+ if (clear) {
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ nf_conntrack_put(&ct->ct_general);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+ }
+
+ goto out;
+ }
+
+ family = tcf_ct_skb_nf_family(skb);
+ if (family == NFPROTO_UNSPEC)
+ goto drop;
+
+ /* The conntrack module expects to be working at L3.
+ * We also try to pull the IPv4/6 header into the linear area.
+ */
+ nh_ofs = skb_network_offset(skb);
+ skb_pull_rcsum(skb, nh_ofs);
+ err = tcf_ct_handle_fragments(net, skb, family, p->zone);
+ if (err == -EINPROGRESS) {
+ retval = TC_ACT_STOLEN;
+ goto out;
+ }
+ if (err)
+ goto drop;
+
+ err = tcf_ct_skb_network_trim(skb, family);
+ if (err)
+ goto drop;
+
+ /* If we are recirculating packets to match on ct fields and
+ * committing with a separate ct action, then we don't need to
+ * actually run the packet through conntrack twice unless it's for a
+ * different zone.
+ */
+ cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
+ if (!cached) {
+ /* Associate skb with specified zone. */
+ if (tmpl) {
+ ct = nf_ct_get(skb, &ctinfo);
+ if (skb_nfct(skb))
+ nf_conntrack_put(skb_nfct(skb));
+ nf_conntrack_get(&tmpl->ct_general);
+ nf_ct_set(skb, tmpl, IP_CT_NEW);
+ }
+
+ state.hook = NF_INET_PRE_ROUTING;
+ state.net = net;
+ state.pf = family;
+ err = nf_conntrack_in(skb, &state);
+ if (err != NF_ACCEPT)
+ goto out_push;
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ goto out_push;
+ nf_ct_deliver_cached_events(ct);
+
+ err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
+ if (err != NF_ACCEPT)
+ goto drop;
+
+ if (commit) {
+ tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
+ tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
+
+ /* This will take care of sending queued events
+ * even if the connection is already confirmed.
+ */
+ nf_conntrack_confirm(skb);
+ }
+
+out_push:
+ skb_push_rcsum(skb, nh_ofs);
+
+out:
+ bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+ return retval;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+ return TC_ACT_SHOT;
+}
+
+static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
+ [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 },
+ [TCA_CT_ACTION] = { .type = NLA_U16 },
+ [TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
+ [TCA_CT_ZONE] = { .type = NLA_U16 },
+ [TCA_CT_MARK] = { .type = NLA_U32 },
+ [TCA_CT_MARK_MASK] = { .type = NLA_U32 },
+ [TCA_CT_LABELS] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
+ [TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
+ [TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct in6_addr) },
+ [TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct in6_addr) },
+ [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
+ [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
+};
+
+static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
+ struct tc_ct *parm,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct nf_nat_range2 *range;
+
+ if (!(p->ct_action & TCA_CT_ACT_NAT))
+ return 0;
+
+ if (!IS_ENABLED(CONFIG_NF_NAT)) {
+ NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
+ return -EOPNOTSUPP;
+ }
+
+ if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+ return 0;
+
+ if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
+ (p->ct_action & TCA_CT_ACT_NAT_DST)) {
+ NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
+ return -EOPNOTSUPP;
+ }
+
+ range = &p->range;
+ if (tb[TCA_CT_NAT_IPV4_MIN]) {
+ struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];
+
+ p->ipv4_range = true;
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ range->min_addr.ip =
+ nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);
+
+ range->max_addr.ip = max_attr ?
+ nla_get_in_addr(max_attr) :
+ range->min_addr.ip;
+ } else if (tb[TCA_CT_NAT_IPV6_MIN]) {
+ struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];
+
+ p->ipv4_range = false;
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ range->min_addr.in6 =
+ nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);
+
+ range->max_addr.in6 = max_attr ?
+ nla_get_in6_addr(max_attr) :
+ range->min_addr.in6;
+ }
+
+ if (tb[TCA_CT_NAT_PORT_MIN]) {
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);
+
+ range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
+ nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
+ range->min_proto.all;
+ }
+
+ return 0;
+}
+
+static void tcf_ct_set_key_val(struct nlattr **tb,
+ void *val, int val_type,
+ void *mask, int mask_type,
+ int len)
+{
+ if (!tb[val_type])
+ return;
+ nla_memcpy(val, tb[val_type], len);
+
+ if (!mask)
+ return;
+
+ if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
+ memset(mask, 0xff, len);
+ else
+ nla_memcpy(mask, tb[mask_type], len);
+}
+
+static int tcf_ct_fill_params(struct net *net,
+ struct tcf_ct_params *p,
+ struct tc_ct *parm,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+ struct nf_conntrack_zone zone;
+ struct nf_conn *tmpl;
+ int err;
+
+ p->zone = NF_CT_DEFAULT_ZONE_ID;
+
+ tcf_ct_set_key_val(tb,
+ &p->ct_action, TCA_CT_ACTION,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->ct_action));
+
+ if (p->ct_action & TCA_CT_ACT_CLEAR)
+ return 0;
+
+ err = tcf_ct_fill_params_nat(p, parm, tb, extack);
+ if (err)
+ return err;
+
+ if (tb[TCA_CT_MARK]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+ tcf_ct_set_key_val(tb,
+ &p->mark, TCA_CT_MARK,
+ &p->mark_mask, TCA_CT_MARK_MASK,
+ sizeof(p->mark));
+ }
+
+ if (tb[TCA_CT_LABELS]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+
+ if (!tn->labels) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
+ return -EOPNOTSUPP;
+ }
+ tcf_ct_set_key_val(tb,
+ p->labels, TCA_CT_LABELS,
+ p->labels_mask, TCA_CT_LABELS_MASK,
+ sizeof(p->labels));
+ }
+
+ if (tb[TCA_CT_ZONE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+ NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
+ return -EOPNOTSUPP;
+ }
+
+ tcf_ct_set_key_val(tb,
+ &p->zone, TCA_CT_ZONE,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->zone));
+ }
+
+ if (p->zone == NF_CT_DEFAULT_ZONE_ID)
+ return 0;
+
+ nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
+ tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
+ if (!tmpl) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
+ return -ENOMEM;
+ }
+ __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+ nf_conntrack_get(&tmpl->ct_general);
+ p->tmpl = tmpl;
+
+ return 0;
+}
+
+static int tcf_ct_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int replace, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ct_net_id);
+ struct tcf_ct_params *params = NULL;
+ struct nlattr *tb[TCA_CT_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ struct tc_ct *parm;
+ struct tcf_ct *c;
+ int err, res = 0;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CT_PARMS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
+ return -EINVAL;
+ }
+ parm = nla_data(tb[TCA_CT_PARMS]);
+
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+
+ if (!err) {
+ err = tcf_idr_create(tn, parm->index, est, a,
+ &act_ct_ops, bind, true);
+ if (err) {
+ tcf_idr_cleanup(tn, parm->index);
+ return err;
+ }
+ res = ACT_P_CREATED;
+ } else {
+ if (bind)
+ return 0;
+
+ if (!replace) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ }
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto cleanup;
+
+ c = to_ct(*a);
+
+ params = kzalloc(sizeof(*params), GFP_KERNEL);
+ if (unlikely(!params)) {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+
+ err = tcf_ct_fill_params(net, params, parm, tb, extack);
+ if (err)
+ goto cleanup;
+
+ spin_lock_bh(&c->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock));
+ spin_unlock_bh(&c->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (params)
+ kfree_rcu(params, rcu);
+ if (res == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return res;
+
+cleanup:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ kfree(params);
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static void tcf_ct_cleanup(struct tc_action *a)
+{
+ struct tcf_ct_params *params;
+ struct tcf_ct *c = to_ct(a);
+
+ params = rcu_dereference_protected(c->params, 1);
+ if (params)
+ call_rcu(&params->rcu, tcf_ct_params_free);
+}
+
+static int tcf_ct_dump_key_val(struct sk_buff *skb,
+ void *val, int val_type,
+ void *mask, int mask_type,
+ int len)
+{
+ int err;
+
+ if (mask && !memchr_inv(mask, 0, len))
+ return 0;
+
+ err = nla_put(skb, val_type, len, val);
+ if (err)
+ return err;
+
+ if (mask_type != TCA_CT_UNSPEC) {
+ err = nla_put(skb, mask_type, len, mask);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
+{
+ struct nf_nat_range2 *range = &p->range;
+
+ if (!(p->ct_action & TCA_CT_ACT_NAT))
+ return 0;
+
+ if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
+ return 0;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS) {
+ if (p->ipv4_range) {
+ if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
+ range->min_addr.ip))
+ return -1;
+ if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
+ range->max_addr.ip))
+ return -1;
+ } else {
+ if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
+ &range->min_addr.in6))
+ return -1;
+ if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
+ &range->max_addr.in6))
+ return -1;
+ }
+ }
+
+ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+ if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
+ range->min_proto.all))
+ return -1;
+ if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
+ range->max_proto.all))
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ct *c = to_ct(a);
+ struct tcf_ct_params *p;
+
+ struct tc_ct opt = {
+ .index = c->tcf_index,
+ .refcnt = refcount_read(&c->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&c->tcf_lock);
+ p = rcu_dereference_protected(c->params,
+ lockdep_is_held(&c->tcf_lock));
+ opt.action = c->tcf_action;
+
+ if (tcf_ct_dump_key_val(skb,
+ &p->ct_action, TCA_CT_ACTION,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->ct_action)))
+ goto nla_put_failure;
+
+ if (p->ct_action & TCA_CT_ACT_CLEAR)
+ goto skip_dump;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ tcf_ct_dump_key_val(skb,
+ &p->mark, TCA_CT_MARK,
+ &p->mark_mask, TCA_CT_MARK_MASK,
+ sizeof(p->mark)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ tcf_ct_dump_key_val(skb,
+ p->labels, TCA_CT_LABELS,
+ p->labels_mask, TCA_CT_LABELS_MASK,
+ sizeof(p->labels)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ tcf_ct_dump_key_val(skb,
+ &p->zone, TCA_CT_ZONE,
+ NULL, TCA_CT_UNSPEC,
+ sizeof(p->zone)))
+ goto nla_put_failure;
+
+ if (tcf_ct_dump_nat(skb, p))
+ goto nla_put_failure;
+
+skip_dump:
+ if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &c->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&c->tcf_lock);
+
+ return skb->len;
+nla_put_failure:
+ spin_unlock_bh(&c->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_ct_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse, bool hw)
+{
+ struct tcf_ct *c = to_ct(a);
+
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
+ c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
+}
+
+static struct tc_action_ops act_ct_ops = {
+ .kind = "ct",
+ .id = TCA_ID_CT,
+ .owner = THIS_MODULE,
+ .act = tcf_ct_act,
+ .dump = tcf_ct_dump,
+ .init = tcf_ct_init,
+ .cleanup = tcf_ct_cleanup,
+ .walk = tcf_ct_walker,
+ .lookup = tcf_ct_search,
+ .stats_update = tcf_stats_update,
+ .size = sizeof(struct tcf_ct),
+};
+
+static __net_init int ct_init_net(struct net *net)
+{
+ unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8;
+ struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+
+ if (nf_connlabels_get(net, n_bits - 1)) {
+ tn->labels = false;
+ pr_err("act_ct: Failed to set connlabels length");
+ } else {
+ tn->labels = true;
+ }
+
+ return tc_action_net_init(&tn->tn, &act_ct_ops);
+}
+
+static void __net_exit ct_exit_net(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
+
+ if (tn->labels)
+ nf_connlabels_put(net);
+ }
+ rtnl_unlock();
+
+ tc_action_net_exit(net_list, ct_net_id);
+}
+
+static struct pernet_operations ct_net_ops = {
+ .init = ct_init_net,
+ .exit_batch = ct_exit_net,
+ .id = &ct_net_id,
+ .size = sizeof(struct tc_ct_action_net),
+};
+
+static int __init ct_init_module(void)
+{
+ return tcf_register_action(&act_ct_ops, &ct_net_ops);
+}
+
+static void __exit ct_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+}
+
+module_init(ct_init_module);
+module_exit(ct_cleanup_module);
+MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
+MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
+MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
+MODULE_DESCRIPTION("Connection tracking action");
+MODULE_LICENSE("GPL v2");
+
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
new file mode 100644
index 000000000000..10eb2bb99861
--- /dev/null
+++ b/net/sched/act_ctinfo.c
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions
+ *
+ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <uapi/linux/tc_act/tc_ctinfo.h>
+#include <net/tc_act/tc_ctinfo.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static struct tc_action_ops act_ctinfo_ops;
+static unsigned int ctinfo_net_id;
+
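+/* Restore the DSCP value stored in the conntrack mark into the IPv4 or
+ * IPv6 dsfield, preserving the ECN bits, and count sets and errors.
+ */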
+static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+ struct tcf_ctinfo_params *cp,
+ struct sk_buff *skb, int wlen, int proto)
+{
+ u8 dscp, newdscp;
+
+ newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
+ ~INET_ECN_MASK;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK;
+ if (dscp != newdscp) {
+ if (likely(!skb_try_make_writable(skb, wlen))) {
+ ipv4_change_dsfield(ip_hdr(skb),
+ INET_ECN_MASK,
+ newdscp);
+ ca->stats_dscp_set++;
+ } else {
+ ca->stats_dscp_error++;
+ }
+ }
+ break;
+ case NFPROTO_IPV6:
+ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK;
+ if (dscp != newdscp) {
+ if (likely(!skb_try_make_writable(skb, wlen))) {
+ ipv6_change_dsfield(ipv6_hdr(skb),
+ INET_ECN_MASK,
+ newdscp);
+ ca->stats_dscp_set++;
+ } else {
+ ca->stats_dscp_error++;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
+ struct tcf_ctinfo_params *cp,
+ struct sk_buff *skb)
+{
+ ca->stats_cpmark_set++;
+ skb->mark = ct->mark & cp->cpmarkmask;
+}
+
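+/* act_ctinfo datapath: locate the conntrack entry for this packet (doing
+ * a direct tuple lookup if none is attached yet, as is common on ingress)
+ * and restore DSCP and/or the packet mark from the conntrack mark
+ * according to the configured mode.
+ */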
+static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ const struct nf_conntrack_tuple_hash *thash = NULL;
+ struct tcf_ctinfo *ca = to_ctinfo(a);
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_zone zone;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_ctinfo_params *cp;
+ struct nf_conn *ct;
+ int proto, wlen;
+ int action;
+
+ cp = rcu_dereference_bh(ca->params);
+
+ tcf_lastuse_update(&ca->tcf_tm);
+ bstats_update(&ca->tcf_bstats, skb);
+ action = READ_ONCE(ca->tcf_action);
+
+ wlen = skb_network_offset(skb);
+ if (tc_skb_protocol(skb) == htons(ETH_P_IP)) {
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV4;
+ } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) {
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto out;
+
+ proto = NFPROTO_IPV6;
+ } else {
+ goto out;
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct) { /* look harder, usually ingress */
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ proto, cp->net, &tuple))
+ goto out;
+ zone.id = cp->zone;
+ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+ thash = nf_conntrack_find_get(cp->net, &zone, &tuple);
+ if (!thash)
+ goto out;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+ }
+
+ if (cp->mode & CTINFO_MODE_DSCP)
+ if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask))
+ tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto);
+
+ if (cp->mode & CTINFO_MODE_CPMARK)
+ tcf_ctinfo_cpmark_set(ct, ca, cp, skb);
+
+ if (thash)
+ nf_ct_put(ct);
+out:
+ return action;
+}
+
+static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
+ [TCA_CTINFO_ACT] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct
+ tc_ctinfo) },
+ [TCA_CTINFO_ZONE] = { .type = NLA_U16 },
+ [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
+ [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
+ [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 },
+};
+
+static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+ struct nlattr *tb[TCA_CTINFO_MAX + 1];
+ struct tcf_ctinfo_params *cp_new;
+ struct tcf_chain *goto_ch = NULL;
+ u32 dscpmask = 0, dscpstatemask;
+ struct tc_ctinfo *actparm;
+ struct tcf_ctinfo *ci;
+ u8 dscpmaskshift;
+ int ret = 0, err;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_CTINFO_ACT]) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Missing required TCA_CTINFO_ACT attribute");
+ return -EINVAL;
+ }
+ actparm = nla_data(tb[TCA_CTINFO_ACT]);
+
+ /* Do some basic validation here before dynamically allocating things
+ * that we would otherwise have to clean up.
+ */
+ if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) {
+ dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]);
+ /* need contiguous 6 bit mask */
+ dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0;
+ if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_MASK],
+ "dscp mask must be 6 contiguous bits");
+ return -EINVAL;
+ }
+ dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ?
+ nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0;
+ /* mask & statemask must not overlap */
+ if (dscpmask & dscpstatemask) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_CTINFO_PARMS_DSCP_STATEMASK],
+ "dscp statemask must not overlap dscp mask");
+ return -EINVAL;
+ }
+ }
+
+ /* Validation done: now for the actual action allocation. */
+ err = tcf_idr_check_alloc(tn, &actparm->index, a, bind);
+ if (!err) {
+ ret = tcf_idr_create(tn, actparm->index, est, a,
+ &act_ctinfo_ops, bind, false);
+ if (ret) {
+ tcf_idr_cleanup(tn, actparm->index);
+ return ret;
+ }
+ ret = ACT_P_CREATED;
+ } else if (err > 0) {
+ if (bind) /* don't override defaults */
+ return 0;
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+ } else {
+ return err;
+ }
+
+ err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ ci = to_ctinfo(*a);
+
+ cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL);
+ if (unlikely(!cp_new)) {
+ err = -ENOMEM;
+ goto put_chain;
+ }
+
+ cp_new->net = net;
+ cp_new->zone = tb[TCA_CTINFO_ZONE] ?
+ nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0;
+ if (dscpmask) {
+ cp_new->dscpmask = dscpmask;
+ cp_new->dscpmaskshift = dscpmaskshift;
+ cp_new->dscpstatemask = dscpstatemask;
+ cp_new->mode |= CTINFO_MODE_DSCP;
+ }
+
+ if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) {
+ cp_new->cpmarkmask =
+ nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]);
+ cp_new->mode |= CTINFO_MODE_CPMARK;
+ }
+
+ spin_lock_bh(&ci->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
+ rcu_swap_protected(ci->params, cp_new,
+ lockdep_is_held(&ci->tcf_lock));
+ spin_unlock_bh(&ci->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (cp_new)
+ kfree_rcu(cp_new, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+
+ return ret;
+
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ struct tcf_ctinfo *ci = to_ctinfo(a);
+ struct tc_ctinfo opt = {
+ .index = ci->tcf_index,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+ };
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ctinfo_params *cp;
+ struct tcf_t t;
+
+ spin_lock_bh(&ci->tcf_lock);
+ cp = rcu_dereference_protected(ci->params,
+ lockdep_is_held(&ci->tcf_lock));
+
+ tcf_tm_dump(&t, &ci->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ opt.action = ci->tcf_action;
+ if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone))
+ goto nla_put_failure;
+
+ if (cp->mode & CTINFO_MODE_DSCP) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK,
+ cp->dscpmask))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK,
+ cp->dscpstatemask))
+ goto nla_put_failure;
+ }
+
+ if (cp->mode & CTINFO_MODE_CPMARK) {
+ if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK,
+ cp->cpmarkmask))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
+ ci->stats_dscp_set, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
+ ci->stats_dscp_error, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
+ ci->stats_cpmark_set, TCA_CTINFO_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&ci->tcf_lock);
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&ci->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ctinfo_ops = {
+ .kind = "ctinfo",
+ .id = TCA_ID_CTINFO,
+ .owner = THIS_MODULE,
+ .act = tcf_ctinfo_act,
+ .dump = tcf_ctinfo_dump,
+ .init = tcf_ctinfo_init,
+ .walk = tcf_ctinfo_walker,
+ .lookup = tcf_ctinfo_search,
+ .size = sizeof(struct tcf_ctinfo),
+};
+
+static __net_init int ctinfo_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+
+ return tc_action_net_init(tn, &act_ctinfo_ops);
+}
+
+static void __net_exit ctinfo_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, ctinfo_net_id);
+}
+
+static struct pernet_operations ctinfo_net_ops = {
+ .init = ctinfo_init_net,
+ .exit_batch = ctinfo_exit_net,
+ .id = &ctinfo_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init ctinfo_init_module(void)
+{
+ return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+static void __exit ctinfo_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops);
+}
+
+module_init(ctinfo_init_module);
+module_exit(ctinfo_cleanup_module);
+MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>");
+MODULE_DESCRIPTION("Connection tracking mark actions");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 58e7573dded4..055faa298c8e 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -27,6 +27,9 @@
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
+#define MIRRED_RECURSION_LIMIT 4
+static DEFINE_PER_CPU(unsigned int, mirred_rec_level);
+
static bool tcf_mirred_is_act_redirect(int action)
{
return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
@@ -210,6 +213,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
struct sk_buff *skb2 = skb;
bool m_mac_header_xmit;
struct net_device *dev;
+ unsigned int rec_level;
int retval, err = 0;
bool use_reinsert;
bool want_ingress;
@@ -217,6 +221,14 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
int m_eaction;
int mac_len;
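+ /* Per-CPU recursion guard: drop packets that have already been
+ * redirected or mirrored through too many nested mirred actions.
+ */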
+ rec_level = __this_cpu_inc_return(mirred_rec_level);
+ if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) {
+ net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
+ netdev_name(skb->dev));
+ __this_cpu_dec(mirred_rec_level);
+ return TC_ACT_SHOT;
+ }
+
tcf_lastuse_update(&m->tcf_tm);
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -277,7 +289,9 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
if (use_reinsert) {
res->ingress = want_ingress;
res->qstats = this_cpu_ptr(m->common.cpu_qstats);
- return TC_ACT_REINSERT;
+ skb_tc_reinsert(skb, res);
+ __this_cpu_dec(mirred_rec_level);
+ return TC_ACT_CONSUMED;
}
}
@@ -292,6 +306,7 @@ out:
if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
+ __this_cpu_dec(mirred_rec_level);
return retval;
}
@@ -411,6 +426,11 @@ static void tcf_mirred_put_dev(struct net_device *dev)
dev_put(dev);
}
+static size_t tcf_mirred_get_fill_size(const struct tc_action *act)
+{
+ return nla_total_size(sizeof(struct tc_mirred));
+}
+
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
.id = TCA_ID_MIRRED,
@@ -422,6 +442,7 @@ static struct tc_action_ops act_mirred_ops = {
.init = tcf_mirred_init,
.walk = tcf_mirred_walker,
.lookup = tcf_mirred_search,
+ .get_fill_size = tcf_mirred_get_fill_size,
.size = sizeof(struct tcf_mirred),
.get_dev = tcf_mirred_get_dev,
.put_dev = tcf_mirred_put_dev,
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
new file mode 100644
index 000000000000..ca2597ce4ac9
--- /dev/null
+++ b/net/sched/act_mpls.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/tc_act/tc_mpls.h>
+#include <net/mpls.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mpls.h>
+
+static unsigned int mpls_net_id;
+static struct tc_action_ops act_mpls_ops;
+
+#define ACT_MPLS_TTL_DEFAULT 255
+
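+/* Build the new label stack entry from the action parameters, starting
+ * from the existing LSE (or zero for a push) and overwriting only the
+ * fields that were explicitly configured; set_bos forces the
+ * bottom-of-stack bit when pushing onto a non-MPLS packet.
+ */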
+static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse,
+ struct tcf_mpls_params *p, bool set_bos)
+{
+ u32 new_lse = 0;
+
+ if (lse)
+ new_lse = be32_to_cpu(lse->label_stack_entry);
+
+ if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) {
+ new_lse &= ~MPLS_LS_LABEL_MASK;
+ new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT;
+ }
+ if (p->tcfm_ttl) {
+ new_lse &= ~MPLS_LS_TTL_MASK;
+ new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT;
+ }
+ if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) {
+ new_lse &= ~MPLS_LS_TC_MASK;
+ new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT;
+ }
+ if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) {
+ new_lse &= ~MPLS_LS_S_MASK;
+ new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT;
+ } else if (set_bos) {
+ new_lse |= 1 << MPLS_LS_S_SHIFT;
+ }
+
+ return cpu_to_be32(new_lse);
+}
+
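+/* act_mpls datapath: with skb->data positioned at the mac header, apply
+ * the configured push/pop/modify/dec_ttl operation and drop on failure.
+ */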
+static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+ __be32 new_lse;
+ int ret;
+
+ tcf_lastuse_update(&m->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+
+ /* Ensure 'data' points at the mac_header prior to calling MPLS manipulating
+ * functions.
+ */
+ if (skb_at_tc_ingress(skb))
+ skb_push_rcsum(skb, skb->mac_len);
+
+ ret = READ_ONCE(m->tcf_action);
+
+ p = rcu_dereference_bh(m->mpls_p);
+
+ switch (p->tcfm_action) {
+ case TCA_MPLS_ACT_POP:
+ if (skb_mpls_pop(skb, p->tcfm_proto))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_PUSH:
+ new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol));
+ if (skb_mpls_push(skb, new_lse, p->tcfm_proto))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false);
+ if (skb_mpls_update_lse(skb, new_lse))
+ goto drop;
+ break;
+ case TCA_MPLS_ACT_DEC_TTL:
+ if (skb_mpls_dec_ttl(skb))
+ goto drop;
+ break;
+ }
+
+ if (skb_at_tc_ingress(skb))
+ skb_pull_rcsum(skb, skb->mac_len);
+
+ return ret;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats));
+ return TC_ACT_SHOT;
+}
+
+static int valid_label(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u32 *label = nla_data(attr);
+
+ if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) {
+ NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
+ [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 },
+ [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
+ [TCA_MPLS_PROTO] = { .type = NLA_U16 },
+ [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label),
+ [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7),
+ [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
+ [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
+};
+
+static int tcf_mpls_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct tcf_proto *tp, struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+ struct nlattr *tb[TCA_MPLS_MAX + 1];
+ struct tcf_chain *goto_ch = NULL;
+ struct tcf_mpls_params *p;
+ struct tc_mpls *parm;
+ bool exists = false;
+ struct tcf_mpls *m;
+ int ret = 0, err;
+ u8 mpls_ttl = 0;
+
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_MPLS_PARMS]) {
+ NL_SET_ERR_MSG_MOD(extack, "No MPLS params");
+ return -EINVAL;
+ }
+ parm = nla_data(tb[TCA_MPLS_PARMS]);
+
+ /* Verify parameters against action type. */
+ switch (parm->m_action) {
+ case TCA_MPLS_ACT_POP:
+ if (!tb[TCA_MPLS_PROTO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop");
+ return -EINVAL;
+ }
+ if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop");
+ return -EINVAL;
+ }
+ if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] ||
+ tb[TCA_MPLS_BOS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop");
+ return -EINVAL;
+ }
+ break;
+ case TCA_MPLS_ACT_DEC_TTL:
+ if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] ||
+ tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl");
+ return -EINVAL;
+ }
+ break;
+ case TCA_MPLS_ACT_PUSH:
+ if (!tb[TCA_MPLS_LABEL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push");
+ return -EINVAL;
+ }
+ if (tb[TCA_MPLS_PROTO] &&
+ !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push");
+ return -EPROTONOSUPPORT;
+ }
+ /* Push needs a TTL - if not specified, set a default value. */
+ if (!tb[TCA_MPLS_TTL]) {
+#if IS_ENABLED(CONFIG_MPLS)
+ mpls_ttl = net->mpls.default_ttl ?
+ net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT;
+#else
+ mpls_ttl = ACT_MPLS_TTL_DEFAULT;
+#endif
+ }
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ if (tb[TCA_MPLS_PROTO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify");
+ return -EINVAL;
+ }
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action");
+ return -EINVAL;
+ }
+
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
+ if (exists && bind)
+ return 0;
+
+ if (!exists) {
+ ret = tcf_idr_create(tn, parm->index, est, a,
+ &act_mpls_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
+ return ret;
+ }
+
+ ret = ACT_P_CREATED;
+ } else if (!ovr) {
+ tcf_idr_release(*a, bind);
+ return -EEXIST;
+ }
+
+ err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+ if (err < 0)
+ goto release_idr;
+
+ m = to_mpls(*a);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p) {
+ err = -ENOMEM;
+ goto put_chain;
+ }
+
+ p->tcfm_action = parm->m_action;
+ p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) :
+ ACT_MPLS_LABEL_NOT_SET;
+ p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) :
+ ACT_MPLS_TC_NOT_SET;
+ p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) :
+ mpls_ttl;
+ p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) :
+ ACT_MPLS_BOS_NOT_SET;
+ p->tcfm_proto = tb[TCA_MPLS_PROTO] ? nla_get_be16(tb[TCA_MPLS_PROTO]) :
+ htons(ETH_P_MPLS_UC);
+
+ spin_lock_bh(&m->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+ rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
+ spin_unlock_bh(&m->tcf_lock);
+
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (p)
+ kfree_rcu(p, rcu);
+
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
+ return ret;
+put_chain:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+release_idr:
+ tcf_idr_release(*a, bind);
+ return err;
+}
+
+static void tcf_mpls_cleanup(struct tc_action *a)
+{
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+
+ p = rcu_dereference_protected(m->mpls_p, 1);
+ if (p)
+ kfree_rcu(p, rcu);
+}
+
+static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_mpls *m = to_mpls(a);
+ struct tcf_mpls_params *p;
+ struct tc_mpls opt = {
+ .index = m->tcf_index,
+ .refcnt = refcount_read(&m->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&m->tcf_lock);
+ opt.action = m->tcf_action;
+ p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock));
+ opt.m_action = p->tcfm_action;
+
+ if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET &&
+ nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label))
+ goto nla_put_failure;
+
+ if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET &&
+ nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc))
+ goto nla_put_failure;
+
+ if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl))
+ goto nla_put_failure;
+
+ if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET &&
+ nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos))
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &m->tcf_tm);
+
+ if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&m->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&m->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -EMSGSIZE;
+}
+
+static int tcf_mpls_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_mpls_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_mpls_ops = {
+ .kind = "mpls",
+ .id = TCA_ID_MPLS,
+ .owner = THIS_MODULE,
+ .act = tcf_mpls_act,
+ .dump = tcf_mpls_dump,
+ .init = tcf_mpls_init,
+ .cleanup = tcf_mpls_cleanup,
+ .walk = tcf_mpls_walker,
+ .lookup = tcf_mpls_search,
+ .size = sizeof(struct tcf_mpls),
+};
+
+static __net_init int mpls_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, mpls_net_id);
+
+ return tc_action_net_init(tn, &act_mpls_ops);
+}
+
+static void __net_exit mpls_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, mpls_net_id);
+}
+
+static struct pernet_operations mpls_net_ops = {
+ .init = mpls_init_net,
+ .exit_batch = mpls_exit_net,
+ .id = &mpls_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init mpls_init_module(void)
+{
+ return tcf_register_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+static void __exit mpls_cleanup_module(void)
+{
+ tcf_unregister_action(&act_mpls_ops, &mpls_net_ops);
+}
+
+module_init(mpls_init_module);
+module_exit(mpls_cleanup_module);
+
+MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPLS manipulation actions");
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ad36bbcc583e..638c1bc1ea1b 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -35,6 +35,7 @@
#include <net/tc_act/tc_police.h>
#include <net/tc_act/tc_sample.h>
#include <net/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_ct.h>
extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
@@ -672,21 +673,27 @@ static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
kfree(indr_block_cb);
}
+static int tcf_block_setup(struct tcf_block *block,
+ struct flow_block_offload *bo);
+
static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
struct tc_indr_block_cb *indr_block_cb,
- enum tc_block_command command)
+ enum flow_block_command command)
{
- struct tc_block_offload bo = {
+ struct flow_block_offload bo = {
.command = command,
- .binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
- .block = indr_dev->block,
+ .binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+ .net = dev_net(indr_dev->dev),
+ .block_shared = tcf_block_shared(indr_dev->block),
};
+ INIT_LIST_HEAD(&bo.cb_list);
if (!indr_dev->block)
return;
indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
&bo);
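+ /* Drivers add flow_block_cb entries to bo.cb_list for
+ * tcf_block_setup() to bind or unbind against the block.
+ */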
+ tcf_block_setup(indr_dev->block, &bo);
}
int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
@@ -705,7 +712,7 @@ int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
if (err)
goto err_dev_put;
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND);
+ tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_BIND);
return 0;
err_dev_put:
@@ -742,7 +749,7 @@ void __tc_indr_block_cb_unregister(struct net_device *dev,
return;
/* Send unbind message if required to free any block cbs. */
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND);
+ tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_UNBIND);
tc_indr_block_cb_del(indr_block_cb);
tc_indr_block_dev_put(indr_dev);
}
@@ -759,27 +766,31 @@ EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
struct tcf_block_ext_info *ei,
- enum tc_block_command command,
+ enum flow_block_command command,
struct netlink_ext_ack *extack)
{
struct tc_indr_block_cb *indr_block_cb;
struct tc_indr_block_dev *indr_dev;
- struct tc_block_offload bo = {
+ struct flow_block_offload bo = {
.command = command,
.binder_type = ei->binder_type,
- .block = block,
+ .net = dev_net(dev),
+ .block_shared = tcf_block_shared(block),
.extack = extack,
};
+ INIT_LIST_HEAD(&bo.cb_list);
indr_dev = tc_indr_block_dev_lookup(dev);
if (!indr_dev)
return;
- indr_dev->block = command == TC_BLOCK_BIND ? block : NULL;
+ indr_dev->block = command == FLOW_BLOCK_BIND ? block : NULL;
list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
&bo);
+
+ tcf_block_setup(block, &bo);
}
static bool tcf_block_offload_in_use(struct tcf_block *block)
@@ -790,16 +801,24 @@ static bool tcf_block_offload_in_use(struct tcf_block *block)
static int tcf_block_offload_cmd(struct tcf_block *block,
struct net_device *dev,
struct tcf_block_ext_info *ei,
- enum tc_block_command command,
+ enum flow_block_command command,
struct netlink_ext_ack *extack)
{
- struct tc_block_offload bo = {};
+ struct flow_block_offload bo = {};
+ int err;
+ bo.net = dev_net(dev);
bo.command = command;
bo.binder_type = ei->binder_type;
- bo.block = block;
+ bo.block_shared = tcf_block_shared(block);
bo.extack = extack;
- return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ INIT_LIST_HEAD(&bo.cb_list);
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ if (err < 0)
+ return err;
+
+ return tcf_block_setup(block, &bo);
}
static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
@@ -820,20 +839,20 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
return -EOPNOTSUPP;
}
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
+ err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
if (err)
return err;
- tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+ tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
return 0;
no_offload_dev_inc:
if (tcf_block_offload_in_use(block))
return -EOPNOTSUPP;
block->nooffloaddevcnt++;
- tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+ tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
return 0;
}
@@ -843,11 +862,11 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
- tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+ tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_dec;
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+ err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
if (err == -EOPNOTSUPP)
goto no_offload_dev_dec;
return;
@@ -1340,17 +1359,17 @@ static void tcf_block_release(struct Qdisc *q, struct tcf_block *block,
struct tcf_block_owner_item {
struct list_head list;
struct Qdisc *q;
- enum tcf_block_binder_type binder_type;
+ enum flow_block_binder_type binder_type;
};
static void
tcf_block_owner_netif_keep_dst(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
if (block->keep_dst &&
- binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
- binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+ binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
+ binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
netif_keep_dst(qdisc_dev(q));
}
@@ -1367,7 +1386,7 @@ EXPORT_SYMBOL(tcf_block_netif_keep_dst);
static int tcf_block_owner_add(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
struct tcf_block_owner_item *item;
@@ -1382,7 +1401,7 @@ static int tcf_block_owner_add(struct tcf_block *block,
static void tcf_block_owner_del(struct tcf_block *block,
struct Qdisc *q,
- enum tcf_block_binder_type binder_type)
+ enum flow_block_binder_type binder_type)
{
struct tcf_block_owner_item *item;
@@ -1494,43 +1513,6 @@ void tcf_block_put(struct tcf_block *block)
EXPORT_SYMBOL(tcf_block_put);
-struct tcf_block_cb {
- struct list_head list;
- tc_setup_cb_t *cb;
- void *cb_ident;
- void *cb_priv;
- unsigned int refcnt;
-};
-
-void *tcf_block_cb_priv(struct tcf_block_cb *block_cb)
-{
- return block_cb->cb_priv;
-}
-EXPORT_SYMBOL(tcf_block_cb_priv);
-
-struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident)
-{
- struct tcf_block_cb *block_cb;
-
- list_for_each_entry(block_cb, &block->cb_list, list)
- if (block_cb->cb == cb && block_cb->cb_ident == cb_ident)
- return block_cb;
- return NULL;
-}
-EXPORT_SYMBOL(tcf_block_cb_lookup);
-
-void tcf_block_cb_incref(struct tcf_block_cb *block_cb)
-{
- block_cb->refcnt++;
-}
-EXPORT_SYMBOL(tcf_block_cb_incref);
-
-unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
-{
- return --block_cb->refcnt;
-}
-EXPORT_SYMBOL(tcf_block_cb_decref);
-
static int
tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
void *cb_priv, bool add, bool offload_in_use,
@@ -1572,66 +1554,76 @@ err_playback_remove:
return err;
}
-struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv,
- struct netlink_ext_ack *extack)
+static int tcf_block_bind(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- struct tcf_block_cb *block_cb;
- int err;
+ struct flow_block_cb *block_cb, *next;
+ int err, i = 0;
- /* Replay any already present rules */
- err = tcf_block_playback_offloads(block, cb, cb_priv, true,
- tcf_block_offload_in_use(block),
- extack);
- if (err)
- return ERR_PTR(err);
+ list_for_each_entry(block_cb, &bo->cb_list, list) {
+ err = tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, true,
+ tcf_block_offload_in_use(block),
+ bo->extack);
+ if (err)
+ goto err_unroll;
- block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
- if (!block_cb)
- return ERR_PTR(-ENOMEM);
- block_cb->cb = cb;
- block_cb->cb_ident = cb_ident;
- block_cb->cb_priv = cb_priv;
- list_add(&block_cb->list, &block->cb_list);
- return block_cb;
-}
-EXPORT_SYMBOL(__tcf_block_cb_register);
+ i++;
+ }
+ list_splice(&bo->cb_list, &block->cb_list);
-int tcf_block_cb_register(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv, struct netlink_ext_ack *extack)
-{
- struct tcf_block_cb *block_cb;
+ return 0;
+
+err_unroll:
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ if (i-- > 0) {
+ list_del(&block_cb->list);
+ tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, false,
+ tcf_block_offload_in_use(block),
+ NULL);
+ }
+ flow_block_cb_free(block_cb);
+ }
- block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
- extack);
- return PTR_ERR_OR_ZERO(block_cb);
+ return err;
}
-EXPORT_SYMBOL(tcf_block_cb_register);
-void __tcf_block_cb_unregister(struct tcf_block *block,
- struct tcf_block_cb *block_cb)
+static void tcf_block_unbind(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
- false, tcf_block_offload_in_use(block),
- NULL);
- list_del(&block_cb->list);
- kfree(block_cb);
+ struct flow_block_cb *block_cb, *next;
+
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ tcf_block_playback_offloads(block, block_cb->cb,
+ block_cb->cb_priv, false,
+ tcf_block_offload_in_use(block),
+ NULL);
+ list_del(&block_cb->list);
+ flow_block_cb_free(block_cb);
+ }
}
-EXPORT_SYMBOL(__tcf_block_cb_unregister);
-void tcf_block_cb_unregister(struct tcf_block *block,
- tc_setup_cb_t *cb, void *cb_ident)
+static int tcf_block_setup(struct tcf_block *block,
+ struct flow_block_offload *bo)
{
- struct tcf_block_cb *block_cb;
+ int err;
- block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
- if (!block_cb)
- return;
- __tcf_block_cb_unregister(block, block_cb);
+ switch (bo->command) {
+ case FLOW_BLOCK_BIND:
+ err = tcf_block_bind(block, bo);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ err = 0;
+ tcf_block_unbind(block, bo);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EOPNOTSUPP;
+ }
+
+ return err;
}
-EXPORT_SYMBOL(tcf_block_cb_unregister);
/* Main classifier routine: scans classifier chain attached
* to this qdisc, (optionally) tests for protocol and asks
@@ -3155,7 +3147,7 @@ EXPORT_SYMBOL(tcf_exts_dump_stats);
int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
void *type_data, bool err_stop)
{
- struct tcf_block_cb *block_cb;
+ struct flow_block_cb *block_cb;
int ok_count = 0;
int err;
@@ -3266,6 +3258,10 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->police.burst = tcf_police_tcfp_burst(act);
entry->police.rate_bytes_ps =
tcf_police_rate_bytes_ps(act);
+ } else if (is_tcf_ct(act)) {
+ entry->id = FLOW_ACTION_CT;
+ entry->ct.action = tcf_ct_action(act);
+ entry->ct.zone = tcf_ct_zone(act);
} else {
goto err_out;
}
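
With the conversion above, drivers queue flow_block_cb entries on bo->cb_list during TC_SETUP_BLOCK and tcf_block_setup() consumes that list: tcf_block_bind() replays existing offloads for each callback, counting successes so that a failure only unwinds the entries that were actually bound. A minimal user-space sketch of that count-and-unroll pattern (names and the simulated failure are assumptions, not kernel code):

#include <stdio.h>

/* Bind callbacks in order, remember how many succeeded in 'i', and on
 * failure unbind only those first 'i' entries, mirroring tcf_block_bind().
 */
static int replay(int idx, int add)
{
	printf("%s offloads for callback %d\n", add ? "add" : "del", idx);
	return (add && idx == 2) ? -1 : 0;	/* pretend callback 2 fails */
}

int main(void)
{
	int n = 3, i = 0, err = 0, j;

	for (j = 0; j < n; j++) {
		err = replay(j, 1);
		if (err)
			break;
		i++;
	}
	if (err)
		for (j = 0; j < n; j++)
			if (i-- > 0)
				replay(j, 0);
	return err ? 1 : 0;
}
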
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index eedd5786c084..38d6e85693fc 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -26,8 +26,10 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <uapi/linux/netfilter/nf_conntrack_common.h>
+
struct fl_flow_key {
- int indev_ifindex;
+ struct flow_dissector_key_meta meta;
struct flow_dissector_key_control control;
struct flow_dissector_key_control enc_control;
struct flow_dissector_key_basic basic;
@@ -54,6 +56,7 @@ struct fl_flow_key {
struct flow_dissector_key_enc_opts enc_opts;
struct flow_dissector_key_ports tp_min;
struct flow_dissector_key_ports tp_max;
+ struct flow_dissector_key_ct ct;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -272,24 +275,40 @@ static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
return __fl_lookup(mask, mkey);
}
+static u16 fl_ct_info_to_flower_map[] = {
+ [IP_CT_ESTABLISHED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED,
+ [IP_CT_RELATED] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_RELATED,
+ [IP_CT_ESTABLISHED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED,
+ [IP_CT_RELATED_REPLY] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_RELATED,
+ [IP_CT_NEW] = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_NEW,
+};
+
static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_fl_head *head = rcu_dereference_bh(tp->root);
- struct cls_fl_filter *f;
- struct fl_flow_mask *mask;
- struct fl_flow_key skb_key;
struct fl_flow_key skb_mkey;
+ struct fl_flow_key skb_key;
+ struct fl_flow_mask *mask;
+ struct cls_fl_filter *f;
list_for_each_entry_rcu(mask, &head->masks, list) {
fl_clear_masked_range(&skb_key, mask);
- skb_key.indev_ifindex = skb->skb_iif;
+ skb_flow_dissect_meta(skb, &mask->dissector, &skb_key);
/* skb_flow_dissect() does not set n_proto in case an unknown
* protocol, so do it rather here.
*/
skb_key.basic.n_proto = skb->protocol;
skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
+ skb_flow_dissect_ct(skb, &mask->dissector, &skb_key,
+ fl_ct_info_to_flower_map,
+ ARRAY_SIZE(fl_ct_info_to_flower_map));
skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
fl_set_masked_key(&skb_mkey, &skb_key, mask);
@@ -390,14 +409,14 @@ static void fl_destroy_filter_work(struct work_struct *work)
static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
bool rtnl_held, struct netlink_ext_ack *extack)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
if (!rtnl_held)
rtnl_lock();
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
- cls_flower.command = TC_CLSFLOWER_DESTROY;
+ cls_flower.command = FLOW_CLS_DESTROY;
cls_flower.cookie = (unsigned long) f;
tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
@@ -415,8 +434,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct cls_fl_head *head = fl_head_dereference(tp);
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
bool skip_sw = tc_skip_sw(f->flags);
int err = 0;
@@ -430,7 +449,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
}
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
- cls_flower.command = TC_CLSFLOWER_REPLACE;
+ cls_flower.command = FLOW_CLS_REPLACE;
cls_flower.cookie = (unsigned long) f;
cls_flower.rule->match.dissector = &f->mask->dissector;
cls_flower.rule->match.mask = &f->mask->key;
@@ -479,14 +498,14 @@ errout:
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
bool rtnl_held)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
if (!rtnl_held)
rtnl_lock();
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
- cls_flower.command = TC_CLSFLOWER_STATS;
+ cls_flower.command = FLOW_CLS_STATS;
cls_flower.cookie = (unsigned long) f;
cls_flower.classid = f->res.classid;
@@ -524,24 +543,6 @@ static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle)
return f;
}
-static struct cls_fl_filter *fl_get_next_filter(struct tcf_proto *tp,
- unsigned long *handle)
-{
- struct cls_fl_head *head = fl_head_dereference(tp);
- struct cls_fl_filter *f;
-
- rcu_read_lock();
- while ((f = idr_get_next_ul(&head->handle_idr, handle))) {
- /* don't return filters that are being deleted */
- if (refcount_inc_not_zero(&f->refcnt))
- break;
- ++(*handle);
- }
- rcu_read_unlock();
-
- return f;
-}
-
static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
bool *last, bool rtnl_held,
struct netlink_ext_ack *extack)
@@ -704,6 +705,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED },
[TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_CT_STATE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_STATE_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_ZONE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_ZONE_MASK] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CT_MARK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_MARK_MASK] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_CT_LABELS] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
+ [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = 128 / BITS_PER_BYTE },
};
static const struct nla_policy
@@ -725,11 +736,11 @@ static void fl_set_key_val(struct nlattr **tb,
{
if (!tb[val_type])
return;
- memcpy(val, nla_data(tb[val_type]), len);
+ nla_memcpy(val, tb[val_type], len);
if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type])
memset(mask, 0xff, len);
else
- memcpy(mask, nla_data(tb[mask_type]), len);
+ nla_memcpy(mask, tb[mask_type], len);
}
static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
@@ -1015,21 +1026,65 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
return 0;
}
+static int fl_set_key_ct(struct nlattr **tb,
+ struct flow_dissector_key_ct *key,
+ struct flow_dissector_key_ct *mask,
+ struct netlink_ext_ack *extack)
+{
+ if (tb[TCA_FLOWER_KEY_CT_STATE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK)) {
+ NL_SET_ERR_MSG(extack, "Conntrack isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+ &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+ sizeof(key->ct_state));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_ZONE]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
+ NL_SET_ERR_MSG(extack, "Conntrack zones isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+ &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+ sizeof(key->ct_zone));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_MARK]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
+ NL_SET_ERR_MSG(extack, "Conntrack mark isn't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+ &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+ sizeof(key->ct_mark));
+ }
+ if (tb[TCA_FLOWER_KEY_CT_LABELS]) {
+ if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
+ NL_SET_ERR_MSG(extack, "Conntrack labels aren't enabled");
+ return -EOPNOTSUPP;
+ }
+ fl_set_key_val(tb, key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+ mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+ sizeof(key->ct_labels));
+ }
+
+ return 0;
+}
+
static int fl_set_key(struct net *net, struct nlattr **tb,
struct fl_flow_key *key, struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
{
__be16 ethertype;
int ret = 0;
-#ifdef CONFIG_NET_CLS_IND
+
if (tb[TCA_FLOWER_INDEV]) {
int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV], extack);
if (err < 0)
return err;
- key->indev_ifindex = err;
- mask->indev_ifindex = 0xffffffff;
+ key->meta.ingress_ifindex = err;
+ mask->meta.ingress_ifindex = 0xffffffff;
}
-#endif
fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
@@ -1225,6 +1280,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
return ret;
}
+ ret = fl_set_key_ct(tb, &key->ct, &mask->ct, extack);
+ if (ret)
+ return ret;
+
if (tb[TCA_FLOWER_KEY_FLAGS])
ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
@@ -1282,6 +1341,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
size_t cnt = 0;
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_META, meta);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1323,6 +1384,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_CT, ct);
skb_flow_dissector_init(dissector, keys, cnt);
}
@@ -1691,20 +1754,25 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
bool rtnl_held)
{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+ unsigned long id = arg->cookie, tmp;
struct cls_fl_filter *f;
arg->count = arg->skip;
- while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) {
+ idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) {
+ /* don't return filters that are being deleted */
+ if (!refcount_inc_not_zero(&f->refcnt))
+ continue;
if (arg->fn(tp, f, arg) < 0) {
__fl_put(f);
arg->stop = 1;
break;
}
__fl_put(f);
- arg->cookie++;
arg->count++;
}
+ arg->cookie = id;
}
static struct cls_fl_filter *
@@ -1735,8 +1803,8 @@ fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
- struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ struct flow_cls_offload cls_flower = {};
struct cls_fl_filter *f = NULL;
int err;
@@ -1757,7 +1825,7 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags,
extack);
cls_flower.command = add ?
- TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
+ FLOW_CLS_REPLACE : FLOW_CLS_DESTROY;
cls_flower.cookie = (unsigned long)f;
cls_flower.rule->match.dissector = &f->mask->dissector;
cls_flower.rule->match.mask = &f->mask->key;
@@ -1801,7 +1869,7 @@ next_flow:
static int fl_hw_create_tmplt(struct tcf_chain *chain,
struct fl_flow_tmplt *tmplt)
{
- struct tc_cls_flower_offload cls_flower = {};
+ struct flow_cls_offload cls_flower = {};
struct tcf_block *block = chain->block;
cls_flower.rule = flow_rule_alloc(0);
@@ -1809,7 +1877,7 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain,
return -ENOMEM;
cls_flower.common.chain_index = chain->index;
- cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
+ cls_flower.command = FLOW_CLS_TMPLT_CREATE;
cls_flower.cookie = (unsigned long) tmplt;
cls_flower.rule->match.dissector = &tmplt->dissector;
cls_flower.rule->match.mask = &tmplt->mask;
@@ -1827,11 +1895,11 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain,
static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
struct fl_flow_tmplt *tmplt)
{
- struct tc_cls_flower_offload cls_flower = {};
+ struct flow_cls_offload cls_flower = {};
struct tcf_block *block = chain->block;
cls_flower.common.chain_index = chain->index;
- cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
+ cls_flower.command = FLOW_CLS_TMPLT_DESTROY;
cls_flower.cookie = (unsigned long) tmplt;
tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
@@ -2077,6 +2145,40 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fl_dump_key_ct(struct sk_buff *skb,
+ struct flow_dissector_key_ct *key,
+ struct flow_dissector_key_ct *mask)
+{
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK) &&
+ fl_dump_key_val(skb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+ &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+ sizeof(key->ct_state)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
+ fl_dump_key_val(skb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+ &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+ sizeof(key->ct_zone)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
+ fl_dump_key_val(skb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+ &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+ sizeof(key->ct_mark)))
+ goto nla_put_failure;
+
+ if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
+ fl_dump_key_val(skb, &key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+ &mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+ sizeof(key->ct_labels)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
struct flow_dissector_key_enc_opts *enc_opts)
{
@@ -2123,10 +2225,10 @@ static int fl_dump_key_enc_opt(struct sk_buff *skb,
static int fl_dump_key(struct sk_buff *skb, struct net *net,
struct fl_flow_key *key, struct fl_flow_key *mask)
{
- if (mask->indev_ifindex) {
+ if (mask->meta.ingress_ifindex) {
struct net_device *dev;
- dev = __dev_get_by_index(net, key->indev_ifindex);
+ dev = __dev_get_by_index(net, key->meta.ingress_ifindex);
if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name))
goto nla_put_failure;
}
@@ -2310,6 +2412,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
goto nla_put_failure;
+ if (fl_dump_key_ct(skb, &key->ct, &mask->ct))
+ goto nla_put_failure;
+
if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
goto nla_put_failure;
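
The conntrack keys added above follow flower's usual key/mask semantics: skb_flow_dissect_ct() fills ct_state via fl_ct_info_to_flower_map, and a filter matches when the packet agrees with the key on every bit set in the mask. A user-space sketch of that masked comparison (the bit values shown are placeholders standing in for the TCA_FLOWER_KEY_CT_FLAGS_* constants):

#include <stdbool.h>
#include <stdint.h>

/* Only bits set in 'mask' take part in the comparison, e.g. matching
 * "tracked and established" while ignoring the remaining state bits.
 */
static bool ct_state_matches(uint16_t pkt_state, uint16_t key, uint16_t mask)
{
	return (pkt_state & mask) == (key & mask);
}

int main(void)
{
	uint16_t trk = 1 << 3, est = 1 << 1;	/* placeholder bit positions */

	return ct_state_matches(trk | est, trk | est, trk | est) ? 0 : 1;
}
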
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 4dab833f66cb..c9496c920d6f 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -8,9 +8,6 @@
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
* Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
- *
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
*/
#include <linux/module.h>
@@ -37,9 +34,7 @@ struct fw_filter {
struct fw_filter __rcu *next;
u32 id;
struct tcf_result res;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
struct tcf_proto *tp;
struct rcu_work rwork;
@@ -67,10 +62,8 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
f = rcu_dereference_bh(f->next)) {
if (f->id == id) {
*res = f->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, f->ifindex))
continue;
-#endif /* CONFIG_NET_CLS_IND */
r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
continue;
@@ -222,7 +215,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
tcf_bind_filter(tp, &f->res, base);
}
-#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FW_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack);
@@ -230,7 +222,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp,
return ret;
f->ifindex = ret;
}
-#endif /* CONFIG_NET_CLS_IND */
err = -EINVAL;
if (tb[TCA_FW_MASK]) {
@@ -276,9 +267,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
fnew->id = f->id;
fnew->res = f->res;
-#ifdef CONFIG_NET_CLS_IND
fnew->ifindex = f->ifindex;
-#endif /* CONFIG_NET_CLS_IND */
fnew->tp = f->tp;
err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT,
@@ -405,14 +394,12 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (f->res.classid &&
nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (f->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, f->ifindex);
if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
goto nla_put_failure;
}
-#endif /* CONFIG_NET_CLS_IND */
if (head->mask != 0xFFFFFFFF &&
nla_put_u32(skb, TCA_FW_MASK, head->mask))
goto nla_put_failure;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 38c0a9f0f296..a30d2f8feb32 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,6 +21,7 @@ struct cls_mall_head {
unsigned int in_hw_count;
struct tc_matchall_pcnt __percpu *pf;
struct rcu_work rwork;
+ bool deleting;
};
static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -258,7 +259,11 @@ err_exts_init:
static int mall_delete(struct tcf_proto *tp, void *arg, bool *last,
bool rtnl_held, struct netlink_ext_ack *extack)
{
- return -EOPNOTSUPP;
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+
+ head->deleting = true;
+ *last = true;
+ return 0;
}
static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg,
@@ -269,7 +274,7 @@ static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg,
if (arg->count < arg->skip)
goto skip;
- if (!head)
+ if (!head || head->deleting)
return;
if (arg->fn(tp, head, arg) < 0)
arg->stop = 1;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index c7727de5e073..be9e46c77e8b 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -20,9 +20,6 @@
* pure RSVP doesn't need such a general approach and can use
* much simpler (and faster) schemes, sort of cls_rsvp.c.
*
- * JHS: We should remove the CONFIG_NET_CLS_IND from here
- * eventually when the meta match extension is made available
- *
* nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
*/
@@ -48,9 +45,7 @@ struct tc_u_knode {
u32 handle;
struct tc_u_hnode __rcu *ht_up;
struct tcf_exts exts;
-#ifdef CONFIG_NET_CLS_IND
int ifindex;
-#endif
u8 fshift;
struct tcf_result res;
struct tc_u_hnode __rcu *ht_down;
@@ -176,12 +171,10 @@ check_terminal:
if (n->sel.flags & TC_U32_TERMINAL) {
*res = n->res;
-#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, n->ifindex)) {
n = rcu_dereference_bh(n->next);
goto next_knode;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
__this_cpu_inc(n->pf->rhit);
#endif
@@ -761,7 +754,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
tcf_bind_filter(tp, &n->res, base);
}
-#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_U32_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_U32_INDEV], extack);
@@ -769,7 +761,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
return -EINVAL;
n->ifindex = ret;
}
-#endif
return 0;
}
@@ -817,9 +808,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp,
new->handle = n->handle;
RCU_INIT_POINTER(new->ht_up, n->ht_up);
-#ifdef CONFIG_NET_CLS_IND
new->ifindex = n->ifindex;
-#endif
new->fshift = n->fshift;
new->res = n->res;
new->flags = n->flags;
@@ -1351,14 +1340,12 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (tcf_exts_dump(skb, &n->exts) < 0)
goto nla_put_failure;
-#ifdef CONFIG_NET_CLS_IND
if (n->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, n->ifindex);
if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
goto nla_put_failure;
}
-#endif
#ifdef CONFIG_CLS_U32_PERF
gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
n->sel.nkeys * sizeof(u64),
@@ -1422,9 +1409,7 @@ static int __init init_u32(void)
#ifdef CONFIG_CLS_U32_PERF
pr_info(" Performance counters on\n");
#endif
-#ifdef CONFIG_NET_CLS_IND
pr_info(" input device check on\n");
-#endif
#ifdef CONFIG_NET_CLS_ACT
pr_info(" Actions configured\n");
#endif
diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
index 243fd22f2248..9fff6480acc6 100644
--- a/net/sched/em_ipt.c
+++ b/net/sched/em_ipt.c
@@ -21,6 +21,7 @@
struct em_ipt_match {
const struct xt_match *match;
u32 hook;
+ u8 nfproto;
u8 match_data[0] __aligned(8);
};
@@ -71,11 +72,25 @@ static int policy_validate_match_data(struct nlattr **tb, u8 mrev)
return 0;
}
+static int addrtype_validate_match_data(struct nlattr **tb, u8 mrev)
+{
+ if (mrev != 1) {
+ pr_err("only addrtype match revision 1 supported");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static const struct em_ipt_xt_match em_ipt_xt_matches[] = {
{
.match_name = "policy",
.validate_match_data = policy_validate_match_data
},
+ {
+ .match_name = "addrtype",
+ .validate_match_data = addrtype_validate_match_data
+ },
{}
};
@@ -115,6 +130,7 @@ static int em_ipt_change(struct net *net, void *data, int data_len,
struct em_ipt_match *im = NULL;
struct xt_match *match;
int mdata_len, ret;
+ u8 nfproto;
ret = nla_parse_deprecated(tb, TCA_EM_IPT_MAX, data, data_len,
em_ipt_policy, NULL);
@@ -125,6 +141,15 @@ static int em_ipt_change(struct net *net, void *data, int data_len,
!tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO])
return -EINVAL;
+ nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]);
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ break;
+ default:
+ return -EINVAL;
+ }
+
match = get_xt_match(tb);
if (IS_ERR(match)) {
pr_err("unable to load match\n");
@@ -140,6 +165,7 @@ static int em_ipt_change(struct net *net, void *data, int data_len,
im->match = match;
im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]);
+ im->nfproto = nfproto;
nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len);
ret = check_match(net, im, mdata_len);
@@ -182,15 +208,33 @@ static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em,
const struct em_ipt_match *im = (const void *)em->data;
struct xt_action_param acpar = {};
struct net_device *indev = NULL;
+ u8 nfproto = im->match->family;
struct nf_hook_state state;
int ret;
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+ if (nfproto == NFPROTO_UNSPEC)
+ nfproto = NFPROTO_IPV4;
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+ if (nfproto == NFPROTO_UNSPEC)
+ nfproto = NFPROTO_IPV6;
+ break;
+ default:
+ return 0;
+ }
+
rcu_read_lock();
if (skb->skb_iif)
indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
- nf_hook_state_init(&state, im->hook, im->match->family,
+ nf_hook_state_init(&state, im->hook, nfproto,
indev ?: skb->dev, skb->dev, NULL, em->net, NULL);
acpar.match = im->match;
@@ -213,7 +257,7 @@ static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em)
return -EMSGSIZE;
if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0)
return -EMSGSIZE;
- if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0)
+ if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->nfproto) < 0)
return -EMSGSIZE;
if (nla_put(skb, TCA_EM_IPT_MATCH_DATA,
im->match->usersize ?: im->match->matchsize,
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index db0c2ba1d156..cebfb65d8556 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -22,10 +22,12 @@
#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
+#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK)
struct etf_sched_data {
bool offload;
bool deadline_mode;
+ bool skip_sock_check;
int clockid;
int queue;
s32 delta; /* in ns */
@@ -77,6 +79,9 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
struct sock *sk = nskb->sk;
ktime_t now;
+ if (q->skip_sock_check)
+ goto skip;
+
if (!sk)
return false;
@@ -92,6 +97,7 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
if (sk->sk_txtime_deadline_mode != q->deadline_mode)
return false;
+skip:
now = q->get_time();
if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
return false;
@@ -385,6 +391,7 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt,
q->clockid = qopt->clockid;
q->offload = OFFLOAD_IS_ON(qopt);
q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+ q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt);
switch (q->clockid) {
case CLOCK_REALTIME:
@@ -473,6 +480,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
if (q->deadline_mode)
opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+ if (q->skip_sock_check)
+ opt.flags |= TC_ETF_SKIP_SOCK_CHECK;
+
if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
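
The socket checks that TC_ETF_SKIP_SOCK_CHECK bypasses are the ones a cooperating sender normally satisfies by enabling SO_TXTIME with a clockid (and deadline mode) matching the qdisc. A minimal sender-side sketch, assuming 'fd' is an already-created UDP socket and recent kernel/glibc headers:

#include <linux/net_tstamp.h>	/* struct sock_txtime, SOF_TXTIME_* */
#include <sys/socket.h>
#include <time.h>

/* Opt the socket into SO_TXTIME so etf's per-socket validation passes;
 * with skip_sock_check the qdisc also accepts timestamped packets that
 * never went through this setup (e.g. taprio's txtime-assist mode).
 */
int enable_txtime(int fd)
{
	struct sock_txtime cfg = {
		.clockid = CLOCK_TAI,	/* must match the qdisc clockid */
		.flags = 0,		/* mirror the qdisc's deadline mode */
	};

	return setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
}
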
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 0f65f617756b..bf56aa519797 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -83,7 +83,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress);
- q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->block_info.chain_head_change = clsact_chain_head_change;
q->block_info.chain_head_change_priv = &q->miniqp;
@@ -114,6 +114,7 @@ nla_put_failure:
}
static const struct Qdisc_class_ops ingress_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = ingress_find,
.walk = ingress_walk,
@@ -216,7 +217,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress);
- q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->ingress_block_info.chain_head_change = clsact_chain_head_change;
q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress;
@@ -227,7 +228,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
- q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
+ q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
q->egress_block_info.chain_head_change = clsact_chain_head_change;
q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
@@ -246,6 +247,7 @@ static void clsact_destroy(struct Qdisc *sch)
}
static const struct Qdisc_class_ops clsact_class_ops = {
+ .flags = QDISC_CLASS_OPS_DOIT_UNLOCKED,
.leaf = ingress_leaf,
.find = clsact_find,
.walk = ingress_walk,
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 9ecfb8f5902a..388750ddc57a 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -21,12 +21,17 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/sch_generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
static LIST_HEAD(taprio_list);
static DEFINE_SPINLOCK(taprio_list_lock);
#define TAPRIO_ALL_GATES_OPEN -1
+#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST))
+#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
+
struct sched_entry {
struct list_head list;
@@ -35,6 +40,7 @@ struct sched_entry {
* packet leaves after this time.
*/
ktime_t close_time;
+ ktime_t next_txtime;
atomic_t budget;
int index;
u32 gate_mask;
@@ -55,6 +61,8 @@ struct sched_gate_list {
struct taprio_sched {
struct Qdisc **qdiscs;
struct Qdisc *root;
+ u32 flags;
+ enum tk_offsets tk_offset;
int clockid;
atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
* speeds it's sub-nanoseconds per byte
@@ -65,9 +73,9 @@ struct taprio_sched {
struct sched_entry __rcu *current_entry;
struct sched_gate_list __rcu *oper_sched;
struct sched_gate_list __rcu *admin_sched;
- ktime_t (*get_time)(void);
struct hrtimer advance_timer;
struct list_head taprio_list;
+ int txtime_delay;
};
static ktime_t sched_base_time(const struct sched_gate_list *sched)
@@ -78,6 +86,20 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched)
return ns_to_ktime(sched->base_time);
}
+static ktime_t taprio_get_time(struct taprio_sched *q)
+{
+ ktime_t mono = ktime_get();
+
+ switch (q->tk_offset) {
+ case TK_OFFS_MAX:
+ return mono;
+ default:
+ return ktime_mono_to_any(mono, q->tk_offset);
+ }
+
+ return KTIME_MAX;
+}
+
static void taprio_free_sched_cb(struct rcu_head *head)
{
struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
@@ -108,20 +130,263 @@ static void switch_schedules(struct taprio_sched *q,
*admin = NULL;
}
-static ktime_t get_cycle_time(struct sched_gate_list *sched)
+/* Get how much time has been already elapsed in the current cycle. */
+static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
+{
+ ktime_t time_since_sched_start;
+ s32 time_elapsed;
+
+ time_since_sched_start = ktime_sub(time, sched->base_time);
+ div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed);
+
+ return time_elapsed;
+}
+
+static ktime_t get_interval_end_time(struct sched_gate_list *sched,
+ struct sched_gate_list *admin,
+ struct sched_entry *entry,
+ ktime_t intv_start)
+{
+ s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start);
+ ktime_t intv_end, cycle_ext_end, cycle_end;
+
+ cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed);
+ intv_end = ktime_add_ns(intv_start, entry->interval);
+ cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension);
+
+ if (ktime_before(intv_end, cycle_end))
+ return intv_end;
+ else if (admin && admin != sched &&
+ ktime_after(admin->base_time, cycle_end) &&
+ ktime_before(admin->base_time, cycle_ext_end))
+ return admin->base_time;
+ else
+ return cycle_end;
+}
+
+static int length_to_duration(struct taprio_sched *q, int len)
+{
+ return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
+}
+
+/* Returns the entry corresponding to the next available interval. If
+ * validate_interval is set, it only validates whether the timestamp occurs
+ * when the gate corresponding to the skb's traffic class is open.
+ */
+static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb,
+ struct Qdisc *sch,
+ struct sched_gate_list *sched,
+ struct sched_gate_list *admin,
+ ktime_t time,
+ ktime_t *interval_start,
+ ktime_t *interval_end,
+ bool validate_interval)
+{
+ ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time;
+ ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time;
+ struct sched_entry *entry = NULL, *entry_found = NULL;
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ bool entry_available = false;
+ s32 cycle_elapsed;
+ int tc, n;
+
+ tc = netdev_get_prio_tc_map(dev, skb->priority);
+ packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb));
+
+ *interval_start = 0;
+ *interval_end = 0;
+
+ if (!sched)
+ return NULL;
+
+ cycle = sched->cycle_time;
+ cycle_elapsed = get_cycle_time_elapsed(sched, time);
+ curr_intv_end = ktime_sub_ns(time, cycle_elapsed);
+ cycle_end = ktime_add_ns(curr_intv_end, cycle);
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ curr_intv_start = curr_intv_end;
+ curr_intv_end = get_interval_end_time(sched, admin, entry,
+ curr_intv_start);
+
+ if (ktime_after(curr_intv_start, cycle_end))
+ break;
+
+ if (!(entry->gate_mask & BIT(tc)) ||
+ packet_transmit_time > entry->interval)
+ continue;
+
+ txtime = entry->next_txtime;
+
+ if (ktime_before(txtime, time) || validate_interval) {
+ transmit_end_time = ktime_add_ns(time, packet_transmit_time);
+ if ((ktime_before(curr_intv_start, time) &&
+ ktime_before(transmit_end_time, curr_intv_end)) ||
+ (ktime_after(curr_intv_start, time) && !validate_interval)) {
+ entry_found = entry;
+ *interval_start = curr_intv_start;
+ *interval_end = curr_intv_end;
+ break;
+ } else if (!entry_available && !validate_interval) {
+ /* Here, we are just trying to find out the
+ * first available interval in the next cycle.
+ */
+ entry_available = 1;
+ entry_found = entry;
+ *interval_start = ktime_add_ns(curr_intv_start, cycle);
+ *interval_end = ktime_add_ns(curr_intv_end, cycle);
+ }
+ } else if (ktime_before(txtime, earliest_txtime) &&
+ !entry_available) {
+ earliest_txtime = txtime;
+ entry_found = entry;
+ n = div_s64(ktime_sub(txtime, curr_intv_start), cycle);
+ *interval_start = ktime_add(curr_intv_start, n * cycle);
+ *interval_end = ktime_add(curr_intv_end, n * cycle);
+ }
+ }
+
+ return entry_found;
+}
+
+static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_gate_list *sched, *admin;
+ ktime_t interval_start, interval_end;
struct sched_entry *entry;
- ktime_t cycle = 0;
- if (sched->cycle_time != 0)
- return sched->cycle_time;
+ rcu_read_lock();
+ sched = rcu_dereference(q->oper_sched);
+ admin = rcu_dereference(q->admin_sched);
+
+ entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp,
+ &interval_start, &interval_end, true);
+ rcu_read_unlock();
- list_for_each_entry(entry, &sched->entries, list)
- cycle = ktime_add_ns(cycle, entry->interval);
+ return entry;
+}
- sched->cycle_time = cycle;
+/* This returns the tstamp value set by TCP in terms of the set clock. */
+static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
+{
+ unsigned int offset = skb_network_offset(skb);
+ const struct ipv6hdr *ipv6h;
+ const struct iphdr *iph;
+ struct ipv6hdr _ipv6h;
- return cycle;
+ ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
+ if (!ipv6h)
+ return 0;
+
+ if (ipv6h->version == 4) {
+ iph = (struct iphdr *)ipv6h;
+ offset += iph->ihl * 4;
+
+ /* special-case 6in4 tunnelling, as that is a common way to get
+ * v6 connectivity in the home
+ */
+ if (iph->protocol == IPPROTO_IPV6) {
+ ipv6h = skb_header_pointer(skb, offset,
+ sizeof(_ipv6h), &_ipv6h);
+
+ if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
+ return 0;
+ } else if (iph->protocol != IPPROTO_TCP) {
+ return 0;
+ }
+ } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) {
+ return 0;
+ }
+
+ return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset);
+}
+
+/* There are a few scenarios where we will have to modify the txtime from
+ * what is read from next_txtime in sched_entry. They are:
+ * 1. If txtime is in the past,
+ * a. The gate for the traffic class is currently open and packet can be
+ * transmitted before it closes, schedule the packet right away.
+ * b. If the gate corresponding to the traffic class is going to open later
+ * in the cycle, set the txtime of packet to the interval start.
+ * 2. If txtime is in the future, there are packets corresponding to the
+ * current traffic class waiting to be transmitted. So, the following
+ * possibilities exist:
+ * a. We can transmit the packet before the window containing the txtime
+ * closes.
+ * b. The window might close before the transmission can be completed
+ * successfully. So, schedule the packet in the next open window.
+ */
+static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
+{
+ ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp;
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_gate_list *sched, *admin;
+ ktime_t minimum_time, now, txtime;
+ int len, packet_transmit_time;
+ struct sched_entry *entry;
+ bool sched_changed;
+
+ now = taprio_get_time(q);
+ minimum_time = ktime_add_ns(now, q->txtime_delay);
+
+ tcp_tstamp = get_tcp_tstamp(q, skb);
+ minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp);
+
+ rcu_read_lock();
+ admin = rcu_dereference(q->admin_sched);
+ sched = rcu_dereference(q->oper_sched);
+ if (admin && ktime_after(minimum_time, admin->base_time))
+ switch_schedules(q, &admin, &sched);
+
+ /* Until the schedule starts, all the queues are open */
+ if (!sched || ktime_before(minimum_time, sched->base_time)) {
+ txtime = minimum_time;
+ goto done;
+ }
+
+ len = qdisc_pkt_len(skb);
+ packet_transmit_time = length_to_duration(q, len);
+
+ do {
+ sched_changed = 0;
+
+ entry = find_entry_to_transmit(skb, sch, sched, admin,
+ minimum_time,
+ &interval_start, &interval_end,
+ false);
+ if (!entry) {
+ txtime = 0;
+ goto done;
+ }
+
+ txtime = entry->next_txtime;
+ txtime = max_t(ktime_t, txtime, minimum_time);
+ txtime = max_t(ktime_t, txtime, interval_start);
+
+ if (admin && admin != sched &&
+ ktime_after(txtime, admin->base_time)) {
+ sched = admin;
+ sched_changed = 1;
+ continue;
+ }
+
+ transmit_end_time = ktime_add(txtime, packet_transmit_time);
+ minimum_time = transmit_end_time;
+
+ /* Update the txtime of current entry to the next time its
+ * interval starts.
+ */
+ if (ktime_after(transmit_end_time, interval_end))
+ entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
+ } while (sched_changed || ktime_after(transmit_end_time, interval_end));
+
+ entry->next_txtime = transmit_end_time;
+
+done:
+ rcu_read_unlock();
+ return txtime;
}
static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
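
get_interval_end_time() above bounds every entry's interval by the cycle, with one exception: a pending admin schedule whose base time falls inside the cycle-time extension ends the interval early. A user-space sketch of that choice, using plain nanosecond values (names are illustrative, not the kernel's):

#include <stdint.h>

typedef int64_t ktime;

/* The interval ends at the earlier of its nominal end and the cycle end,
 * unless an admin schedule starts within the cycle-time extension, in
 * which case it ends at the admin base time.
 */
ktime interval_end(ktime intv_start, ktime intv_len, ktime cycle_end,
		   ktime cycle_ext, int have_admin, ktime admin_base)
{
	ktime intv_end = intv_start + intv_len;

	if (intv_end < cycle_end)
		return intv_end;
	if (have_admin && admin_base > cycle_end &&
	    admin_base < cycle_end + cycle_ext)
		return admin_base;
	return cycle_end;
}
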
@@ -137,6 +402,15 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(!child))
return qdisc_drop(skb, sch, to_free);
+ if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) {
+ if (!is_valid_interval(skb, sch))
+ return qdisc_drop(skb, sch, to_free);
+ } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ skb->tstamp = get_packet_txtime(skb, sch);
+ if (!skb->tstamp)
+ return qdisc_drop(skb, sch, to_free);
+ }
+
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
@@ -172,6 +446,9 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
if (!skb)
continue;
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags))
+ return skb;
+
prio = skb->priority;
tc = netdev_get_prio_tc_map(dev, prio);
@@ -184,11 +461,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
return NULL;
}
-static inline int length_to_duration(struct taprio_sched *q, int len)
-{
- return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
-}
-
static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
{
atomic_set(&entry->budget,
@@ -232,6 +504,13 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
if (unlikely(!child))
continue;
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ skb = child->ops->dequeue(child);
+ if (!skb)
+ continue;
+ goto skb_found;
+ }
+
skb = child->ops->peek(child);
if (!skb)
continue;
@@ -243,7 +522,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
continue;
len = qdisc_pkt_len(skb);
- guard = ktime_add_ns(q->get_time(),
+ guard = ktime_add_ns(taprio_get_time(q),
length_to_duration(q, len));
/* In the case that there's no gate entry, there's no
@@ -262,6 +541,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
if (unlikely(!skb))
goto done;
+skb_found:
qdisc_bstats_update(sch, skb);
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
@@ -524,12 +804,22 @@ static int parse_taprio_schedule(struct nlattr **tb,
if (err < 0)
return err;
+ if (!new->cycle_time) {
+ struct sched_entry *entry;
+ ktime_t cycle = 0;
+
+ list_for_each_entry(entry, &new->entries, list)
+ cycle = ktime_add_ns(cycle, entry->interval);
+ new->cycle_time = cycle;
+ }
+
return 0;
}
static int taprio_parse_mqprio_opt(struct net_device *dev,
struct tc_mqprio_qopt *qopt,
- struct netlink_ext_ack *extack)
+ struct netlink_ext_ack *extack,
+ u32 taprio_flags)
{
int i, j;
@@ -577,6 +867,9 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
return -EINVAL;
}
+ if (TXTIME_ASSIST_IS_ENABLED(taprio_flags))
+ continue;
+
/* Verify that the offset and counts do not overlap */
for (j = i + 1; j < qopt->num_tc; j++) {
if (last > qopt->offset[j]) {
@@ -598,14 +891,14 @@ static int taprio_get_start_time(struct Qdisc *sch,
s64 n;
base = sched_base_time(sched);
- now = q->get_time();
+ now = taprio_get_time(q);
if (ktime_after(base, now)) {
*start = base;
return 0;
}
- cycle = get_cycle_time(sched);
+ cycle = sched->cycle_time;
/* The qdisc is expected to have at least one sched_entry. Moreover,
* any entry must have 'interval' > 0. Thus if the cycle time is zero,
@@ -632,7 +925,7 @@ static void setup_first_close_time(struct taprio_sched *q,
first = list_first_entry(&sched->entries,
struct sched_entry, list);
- cycle = get_cycle_time(sched);
+ cycle = sched->cycle_time;
/* FIXME: find a better place to do this */
sched->cycle_close_time = ktime_add_ns(base, cycle);
@@ -707,6 +1000,18 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
return NOTIFY_DONE;
}
+static void setup_txtime(struct taprio_sched *q,
+ struct sched_gate_list *sched, ktime_t base)
+{
+ struct sched_entry *entry;
+ u32 interval = 0;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ entry->next_txtime = ktime_add_ns(base, interval);
+ interval += entry->interval;
+ }
+}
+
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -715,6 +1020,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct tc_mqprio_qopt *mqprio = NULL;
+ u32 taprio_flags = 0;
int i, err, clockid;
unsigned long flags;
ktime_t start;
@@ -727,7 +1033,21 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
- err = taprio_parse_mqprio_opt(dev, mqprio, extack);
+ if (tb[TCA_TAPRIO_ATTR_FLAGS]) {
+ taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]);
+
+ if (q->flags != 0 && q->flags != taprio_flags) {
+ NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
+ return -EOPNOTSUPP;
+ } else if (!FLAGS_VALID(taprio_flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
+ return -EINVAL;
+ }
+
+ q->flags = taprio_flags;
+ }
+
+ err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags);
if (err < 0)
return err;
@@ -786,7 +1106,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
/* Protects against enqueue()/dequeue() */
spin_lock_bh(qdisc_lock(sch));
- if (!hrtimer_active(&q->advance_timer)) {
+ if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) {
+ if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
+ NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
+ }
+
+ if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) &&
+ !hrtimer_active(&q->advance_timer)) {
hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
q->advance_timer.function = advance_sched;
}
@@ -806,16 +1137,16 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
switch (q->clockid) {
case CLOCK_REALTIME:
- q->get_time = ktime_get_real;
+ q->tk_offset = TK_OFFS_REAL;
break;
case CLOCK_MONOTONIC:
- q->get_time = ktime_get;
+ q->tk_offset = TK_OFFS_MAX;
break;
case CLOCK_BOOTTIME:
- q->get_time = ktime_get_boottime;
+ q->tk_offset = TK_OFFS_BOOT;
break;
case CLOCK_TAI:
- q->get_time = ktime_get_clocktai;
+ q->tk_offset = TK_OFFS_TAI;
break;
default:
NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
@@ -829,20 +1160,35 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto unlock;
}
- setup_first_close_time(q, new_admin, start);
+ if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) {
+ setup_txtime(q, new_admin, start);
- /* Protects against advance_sched() */
- spin_lock_irqsave(&q->current_entry_lock, flags);
+ if (!oper) {
+ rcu_assign_pointer(q->oper_sched, new_admin);
+ err = 0;
+ new_admin = NULL;
+ goto unlock;
+ }
- taprio_start_sched(sch, start, new_admin);
+ rcu_assign_pointer(q->admin_sched, new_admin);
+ if (admin)
+ call_rcu(&admin->rcu, taprio_free_sched_cb);
+ } else {
+ setup_first_close_time(q, new_admin, start);
- rcu_assign_pointer(q->admin_sched, new_admin);
- if (admin)
- call_rcu(&admin->rcu, taprio_free_sched_cb);
- new_admin = NULL;
+ /* Protects against advance_sched() */
+ spin_lock_irqsave(&q->current_entry_lock, flags);
- spin_unlock_irqrestore(&q->current_entry_lock, flags);
+ taprio_start_sched(sch, start, new_admin);
+ rcu_assign_pointer(q->admin_sched, new_admin);
+ if (admin)
+ call_rcu(&admin->rcu, taprio_free_sched_cb);
+
+ spin_unlock_irqrestore(&q->current_entry_lock, flags);
+ }
+
+ new_admin = NULL;
err = 0;
unlock:
@@ -1080,6 +1426,13 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
goto options_error;
+ if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
+ goto options_error;
+
+ if (q->txtime_delay &&
+ nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
+ goto options_error;
+
if (oper && dump_schedule(skb, oper))
goto options_error;
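
In txtime-assist mode taprio stamps skb->tstamp itself and relies on length_to_duration() to decide whether a frame still fits before its interval closes: the duration is len * picos_per_byte / 1000 nanoseconds. A standalone example of that arithmetic (the 1 Gb/s link speed is an assumed figure, not from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t picos_per_byte = 8000;	/* assumed: 1 Gb/s, 8 ns per byte */
	int len = 1500;			/* example frame length */
	int64_t duration_ns = (int64_t)len * picos_per_byte / 1000;

	printf("%d bytes occupy the link for %lld ns\n",
	       len, (long long)duration_ns);	/* prints 12000 ns */
	return 0;
}
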