author     Linus Torvalds  2017-11-15 20:56:19 +0100
committer  Linus Torvalds  2017-11-15 20:56:19 +0100
commit     5bbcc0f595fadb4cac0eddc4401035ec0bd95b09 (patch)
tree       3b65e490cc36a6c6fecac1fa24d9e0ac9ced4455 /net/sched
parent     Merge tag 'mips_4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/jhogan... (diff)
parent     tcp: highest_sack fix (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Highlights:
1) Maintain the TCP retransmit queue using an rbtree, with 1GB
windows at 100Gb this really has become necessary. From Eric
Dumazet.
2) Multi-program support for cgroup+bpf, from Alexei Starovoitov.
3) Perform broadcast flooding in hardware in mv88e6xxx, from Andrew
Lunn.
4) Add meter action support to openvswitch, from Andy Zhou.
5) Add a data meta pointer for BPF accessible packets, from Daniel
Borkmann.
6) Namespace-ify almost all TCP sysctl knobs, from Eric Dumazet.
7) Turn on Broadcom Tags in b53 driver, from Florian Fainelli.
8) More work to move the RTNL mutex down, from Florian Westphal.
9) Add 'bpftool' utility, to help with bpf program introspection.
From Jakub Kicinski.
10) Add new 'cpumap' type for XDP_REDIRECT action, from Jesper
Dangaard Brouer.
11) Support 'blocks' of transformations in the packet scheduler which
can span multiple network devices, from Jiri Pirko.
12) TC flower offload support in cxgb4, from Kumar Sanghvi.
13) Priority based stream scheduler for SCTP, from Marcelo Ricardo
Leitner.
14) Thunderbolt networking driver, from Amir Levy and Mika Westerberg.
15) Add RED qdisc offloadability, and use it in mlxsw driver. From
Nogah Frankel.
16) eBPF based device controller for cgroup v2, from Roman Gushchin.
17) Add some fundamental tracepoints for TCP, from Song Liu.
18) Remove garbage collection from ipv6 route layer, this is a
significant accomplishment. From Wei Wang.
19) Add multicast route offload support to mlxsw, from Yotam Gigi"
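
The offload-related highlights above (shareable classifier blocks, TC flower offload in cxgb4, RED offload in mlxsw) build on a block-callback API that the net/sched diff below adds to cls_api.c. What follows is a minimal, hypothetical driver-side sketch of how that API is meant to be consumed: the tcf_block_cb_* helpers, TC_SETUP_BLOCK, TC_BLOCK_BIND/TC_BLOCK_UNBIND and struct tc_block_offload come from this series, while everything prefixed foo_ is invented here purely for illustration and is not part of the merge.

    #include <linux/netdevice.h>
    #include <net/pkt_cls.h>

    struct foo_priv {
            struct net_device *dev;         /* hypothetical driver private data */
    };

    /* Per-block callback: the core invokes this for every filter offload
     * request on a block the driver is bound to (tc_setup_cb_call() in the
     * cls_api.c hunk below).  Returning 0 counts as "offloaded"; any error
     * stops the iteration when the caller passed err_stop, i.e. the filter
     * was marked skip_sw.
     */
    static int foo_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
                                     void *cb_priv)
    {
            struct foo_priv *priv = cb_priv;

            switch (type) {
            case TC_SETUP_CLSFLOWER:
                    /* program the struct tc_cls_flower_offload in type_data
                     * into priv->dev's hardware tables here
                     */
                    return 0;
            default:
                    return -EOPNOTSUPP;
            }
    }

    static int foo_setup_tc_block(struct foo_priv *priv,
                                  struct tc_block_offload *f)
    {
            switch (f->command) {
            case TC_BLOCK_BIND:
                    return tcf_block_cb_register(f->block, foo_setup_tc_block_cb,
                                                 priv, priv);
            case TC_BLOCK_UNBIND:
                    tcf_block_cb_unregister(f->block, foo_setup_tc_block_cb, priv);
                    return 0;
            default:
                    return -EOPNOTSUPP;
            }
    }

    /* .ndo_setup_tc entry point: qdiscs bind and unbind their block through
     * TC_SETUP_BLOCK (see tcf_block_offload_cmd() in the cls_api.c hunk).
     */
    static int foo_ndo_setup_tc(struct net_device *dev, enum tc_setup_type type,
                                void *type_data)
    {
            struct foo_priv *priv = netdev_priv(dev);

            switch (type) {
            case TC_SETUP_BLOCK:
                    return foo_setup_tc_block(priv, type_data);
            default:
                    return -EOPNOTSUPP;
            }
    }

Because the callback is keyed on the block rather than on a single qdisc, the same registration serves every device and binding point that shares that block, which is what makes highlight 11's multi-device blocks offloadable.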
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2177 commits)
tcp: highest_sack fix
geneve: fix fill_info when link down
bpf: fix lockdep splat
net: cdc_ncm: GetNtbFormat endian fix
openvswitch: meter: fix NULL pointer dereference in ovs_meter_cmd_reply_start
netem: remove unnecessary 64 bit modulus
netem: use 64 bit divide by rate
tcp: Namespace-ify sysctl_tcp_default_congestion_control
net: Protect iterations over net::fib_notifier_ops in fib_seq_sum()
ipv6: set all.accept_dad to 0 by default
uapi: fix linux/tls.h userspace compilation error
usbnet: ipheth: prevent TX queue timeouts when device not ready
vhost_net: conditionally enable tx polling
uapi: fix linux/rxrpc.h userspace compilation errors
net: stmmac: fix LPI transitioning for dwmac4
atm: horizon: Fix irq release error
net-sysfs: trigger netlink notification on ifalias change via sysfs
openvswitch: Using kfree_rcu() to simplify the code
openvswitch: Make local function ovs_nsh_key_attr_size() static
openvswitch: Fix return value check in ovs_meter_cmd_features()
...
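
A recurring theme in the act_ife and act_vlan hunks of the diff below is dropping the per-packet tcf_lock spinlock in favour of an RCU-swapped parameter struct. Reduced to a generic sketch (none of the foo_* names exist in the kernel; this only mirrors the rcu_assign_pointer()/kfree_rcu() sequence visible in the patches, under stated assumptions):

    #include <linux/types.h>
    #include <linux/rcupdate.h>
    #include <linux/rtnetlink.h>
    #include <linux/slab.h>

    struct foo_params {
            u32 value;                      /* per-packet data the action needs */
            struct rcu_head rcu;
    };

    struct foo_action {
            struct foo_params __rcu *params;
    };

    /* Control path (RTNL held): build a complete replacement, publish it,
     * and free the old copy only after a grace period.
     */
    static int foo_action_update(struct foo_action *act, u32 new_value)
    {
            struct foo_params *new_p, *old_p;

            new_p = kzalloc(sizeof(*new_p), GFP_KERNEL);
            if (!new_p)
                    return -ENOMEM;
            new_p->value = new_value;

            old_p = rtnl_dereference(act->params);
            rcu_assign_pointer(act->params, new_p);
            if (old_p)
                    kfree_rcu(old_p, rcu);
            return 0;
    }

    /* Datapath: lockless read of a consistent snapshot. */
    static u32 foo_action_read(struct foo_action *act)
    {
            struct foo_params *p;
            u32 value;

            rcu_read_lock();
            p = rcu_dereference(act->params);
            value = p->value;
            rcu_read_unlock();
            return value;
    }

The control path already runs under RTNL, so rtnl_dereference() is enough to read the old pointer before publishing the replacement; readers never block and pay only rcu_read_lock()/unlock(), which is why the per-packet spinlocks can go away.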
Diffstat (limited to 'net/sched')
41 files changed, 1867 insertions, 534 deletions
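
The act_api.c hunk that opens the diff adds a per-netns registry of egress-device ("egdev") callbacks, so a driver can be offered filters whose mirred action redirects to one of its ports even when the classifier itself is attached to some other device. A hypothetical consumer might look roughly like this; tc_setup_cb_egdev_register/unregister and their locking behaviour are taken from the hunk below, while the foo_* names and the header choice are assumptions:

    #include <linux/netdevice.h>
    #include <net/act_api.h>        /* assumed location of the egdev declarations */
    #include <net/pkt_cls.h>

    struct foo_port {
            struct net_device *dev;         /* hypothetical per-port private data */
    };

    /* Reached via tc_setup_cb_egdev_call() for rules whose egress device is
     * port->dev; same signature as the block callbacks (tc_setup_cb_t).
     */
    static int foo_egdev_cb(enum tc_setup_type type, void *type_data,
                            void *cb_priv)
    {
            struct foo_port *port = cb_priv;

            if (type != TC_SETUP_CLSFLOWER)
                    return -EOPNOTSUPP;
            /* offload the struct tc_cls_flower_offload in type_data for port->dev */
            return 0;
    }

    static int foo_port_init(struct foo_port *port)
    {
            /* takes rtnl_lock internally; returns -ENOMEM if the registry
             * entry cannot be allocated
             */
            return tc_setup_cb_egdev_register(port->dev, foo_egdev_cb, port);
    }

    static void foo_port_fini(struct foo_port *port)
    {
            tc_setup_cb_egdev_unregister(port->dev, foo_egdev_cb, port);
    }

tc_setup_cb_call() in cls_api.c (also below) then fans each classifier offload request out to both the block callbacks and, via each action's get_dev() hook, to the egdev callbacks of the redirect targets.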
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index e70ed26485a2..c03d86a7775e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -172,6 +172,17 @@ config NET_SCH_TBF To compile this code as a module, choose M here: the module will be called sch_tbf. +config NET_SCH_CBS + tristate "Credit Based Shaper (CBS)" + ---help--- + Say Y here if you want to use the Credit Based Shaper (CBS) packet + scheduling algorithm. + + See the top of <file:net/sched/sch_cbs.c> for more details. + + To compile this code as a module, choose M here: the + module will be called sch_cbs. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 9e43a4721ef8..5b635447e3f8 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o +obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8f2c63514956..4d33a50a8a6d 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -21,6 +21,8 @@ #include <linux/kmod.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/rhashtable.h> +#include <linux/list.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> @@ -1251,8 +1253,227 @@ out_module_put: return skb->len; } +struct tcf_action_net { + struct rhashtable egdev_ht; +}; + +static unsigned int tcf_action_net_id; + +struct tcf_action_egdev_cb { + struct list_head list; + tc_setup_cb_t *cb; + void *cb_priv; +}; + +struct tcf_action_egdev { + struct rhash_head ht_node; + const struct net_device *dev; + unsigned int refcnt; + struct list_head cb_list; +}; + +static const struct rhashtable_params tcf_action_egdev_ht_params = { + .key_offset = offsetof(struct tcf_action_egdev, dev), + .head_offset = offsetof(struct tcf_action_egdev, ht_node), + .key_len = sizeof(const struct net_device *), +}; + +static struct tcf_action_egdev * +tcf_action_egdev_lookup(const struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + return rhashtable_lookup_fast(&tan->egdev_ht, &dev, + tcf_action_egdev_ht_params); +} + +static struct tcf_action_egdev * +tcf_action_egdev_get(const struct net_device *dev) +{ + struct tcf_action_egdev *egdev; + struct tcf_action_net *tan; + + egdev = tcf_action_egdev_lookup(dev); + if (egdev) + goto inc_ref; + + egdev = kzalloc(sizeof(*egdev), GFP_KERNEL); + if (!egdev) + return NULL; + INIT_LIST_HEAD(&egdev->cb_list); + egdev->dev = dev; + tan = net_generic(dev_net(dev), tcf_action_net_id); + rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node, + tcf_action_egdev_ht_params); + +inc_ref: + egdev->refcnt++; + return egdev; +} + +static void tcf_action_egdev_put(struct tcf_action_egdev *egdev) +{ + struct tcf_action_net *tan; + + if (--egdev->refcnt) + return; + tan = net_generic(dev_net(egdev->dev), tcf_action_net_id); + rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node, + tcf_action_egdev_ht_params); + kfree(egdev); +} + +static struct tcf_action_egdev_cb * +tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) + if (egdev_cb->cb == cb && 
egdev_cb->cb_priv == cb_priv) + return egdev_cb; + return NULL; +} + +static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev, + enum tc_setup_type type, + void *type_data, bool err_stop) +{ + struct tcf_action_egdev_cb *egdev_cb; + int ok_count = 0; + int err; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) { + err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } + } + return ok_count; +} + +static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(egdev_cb)) + return -EEXIST; + egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL); + if (!egdev_cb) + return -ENOMEM; + egdev_cb->cb = cb; + egdev_cb->cb_priv = cb_priv; + list_add(&egdev_cb->list, &egdev->cb_list); + return 0; +} + +static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(!egdev_cb)) + return; + list_del(&egdev_cb->list); + kfree(egdev_cb); +} + +static int __tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev); + int err; + + if (!egdev) + return -ENOMEM; + err = tcf_action_egdev_cb_add(egdev, cb, cb_priv); + if (err) + goto err_cb_add; + return 0; + +err_cb_add: + tcf_action_egdev_put(egdev); + return err; +} +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + int err; + + rtnl_lock(); + err = __tc_setup_cb_egdev_register(dev, cb, cb_priv); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register); + +static void __tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (WARN_ON(!egdev)) + return; + tcf_action_egdev_cb_del(egdev, cb, cb_priv); + tcf_action_egdev_put(egdev); +} +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + rtnl_lock(); + __tc_setup_cb_egdev_unregister(dev, cb, cb_priv); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister); + +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (!egdev) + return 0; + return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call); + +static __net_init int tcf_action_net_init(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params); +} + +static void __net_exit tcf_action_net_exit(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + rhashtable_destroy(&tan->egdev_ht); +} + +static struct pernet_operations tcf_action_net_ops = { + .init = tcf_action_net_init, + .exit = tcf_action_net_exit, + .id = &tcf_action_net_id, + .size = sizeof(struct tcf_action_net), +}; + static int __init tc_action_init(void) { + int err; + + err = register_pernet_subsys(&tcf_action_net_ops); + if (err) + return err; + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); 
rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c0c707eb2c96..5ef8ce8c83d4 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 8ccd35825b6b..3007cb1310ea 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -248,6 +248,22 @@ static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) return ret; } +#ifdef CONFIG_MODULES +static const char *ife_meta_id2name(u32 metaid) +{ + switch (metaid) { + case IFE_META_SKBMARK: + return "skbmark"; + case IFE_META_PRIO: + return "skbprio"; + case IFE_META_TCINDEX: + return "tcindex"; + default: + return "unknown"; + } +} +#endif + /* called when adding new meta information * under ife->tcf_lock for existing action */ @@ -263,7 +279,7 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, if (exists) spin_unlock_bh(&ife->tcf_lock); rtnl_unlock(); - request_module("ifemeta%u", metaid); + request_module("ife-meta-%s", ife_meta_id2name(metaid)); rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); @@ -392,10 +408,14 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) static void tcf_ife_cleanup(struct tc_action *a, int bind) { struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a, bind); spin_unlock_bh(&ife->tcf_lock); + + p = rcu_dereference_protected(ife->params, 1); + kfree_rcu(p, rcu); } /* under ife->tcf_lock for existing action */ @@ -432,6 +452,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; + struct tcf_ife_params *p, *p_old; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; @@ -450,24 +471,41 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_IFE_PARMS]); + /* IFE_DECODE is 0 and indicates the opposite of IFE_ENCODE because + * they cannot run as the same time. Check on all other values which + * are not supported right now. 
+ */ + if (parm->flags & ~IFE_ENCODE) + return -EINVAL; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + exists = tcf_idr_check(tn, parm->index, a, bind); - if (exists && bind) + if (exists && bind) { + kfree(p); return 0; + } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, - bind, false); - if (ret) + bind, true); + if (ret) { + kfree(p); return ret; + } ret = ACT_P_CREATED; } else { tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + kfree(p); return -EEXIST; + } } ife = to_ife(*a); - ife->flags = parm->flags; + p->flags = parm->flags; if (parm->flags & IFE_ENCODE) { if (tb[TCA_IFE_TYPE]) @@ -478,24 +516,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, saddr = nla_data(tb[TCA_IFE_SMAC]); } - if (exists) - spin_lock_bh(&ife->tcf_lock); ife->tcf_action = parm->action; if (parm->flags & IFE_ENCODE) { if (daddr) - ether_addr_copy(ife->eth_dst, daddr); + ether_addr_copy(p->eth_dst, daddr); else - eth_zero_addr(ife->eth_dst); + eth_zero_addr(p->eth_dst); if (saddr) - ether_addr_copy(ife->eth_src, saddr); + ether_addr_copy(p->eth_src, saddr); else - eth_zero_addr(ife->eth_src); + eth_zero_addr(p->eth_src); - ife->eth_type = ife_type; + p->eth_type = ife_type; } + if (exists) + spin_lock_bh(&ife->tcf_lock); + if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&ife->metalist); @@ -511,6 +550,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } @@ -531,6 +571,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } } @@ -538,6 +579,11 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + p_old = rtnl_dereference(ife->params); + rcu_assign_pointer(ife->params, p); + if (p_old) + kfree_rcu(p_old, rcu); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -549,12 +595,13 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, .refcnt = ife->tcf_refcnt - ref, .bindcnt = ife->tcf_bindcnt - bind, .action = ife->tcf_action, - .flags = ife->flags, + .flags = p->flags, }; struct tcf_t t; @@ -565,17 +612,17 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; - if (!is_zero_ether_addr(ife->eth_dst)) { - if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst)) + if (!is_zero_ether_addr(p->eth_dst)) { + if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst)) goto nla_put_failure; } - if (!is_zero_ether_addr(ife->eth_src)) { - if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src)) + if (!is_zero_ether_addr(p->eth_src)) { + if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src)) goto nla_put_failure; } - if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type)) + if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; if (dump_metalist(skb, ife)) { @@ -617,19 +664,15 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, u8 *tlv_data; u16 metalen; - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); - spin_unlock(&ife->tcf_lock); if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); tlv_data = ife_decode(skb, &metalen); if (unlikely(!tlv_data)) { - spin_lock(&ife->tcf_lock); - 
ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -647,14 +690,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, */ pr_info_ratelimited("Unknown metaid %d dlen %d\n", mtype, dlen); - ife->tcf_qstats.overlimits++; + qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); } } if (WARN_ON(tlv_data != ifehdr_end)) { - spin_lock(&ife->tcf_lock); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -683,7 +724,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) } static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) + struct tcf_result *res, struct tcf_ife_params *p) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; @@ -706,23 +747,20 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, exceed_mtu = true; } - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ /* abuse overlimits to count when we allow packet * with no metadata */ - ife->tcf_qstats.overlimits++; - spin_unlock(&ife->tcf_lock); + qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); return action; } /* could be stupid policy setup or mtu config * so lets be conservative.. */ if ((action == TC_ACT_SHOT) || exceed_mtu) { - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -731,6 +769,8 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, ife_meta = ife_encode(skb, metalen); + spin_lock(&ife->tcf_lock); + /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... 
@@ -742,25 +782,24 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, } if (err < 0) { /* too corrupt to keep around if overwritten */ - ife->tcf_qstats.drops++; spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skboff += err; } + spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; - if (!is_zero_ether_addr(ife->eth_src)) - ether_addr_copy(oethh->h_source, ife->eth_src); - if (!is_zero_ether_addr(ife->eth_dst)) - ether_addr_copy(oethh->h_dest, ife->eth_dst); - oethh->h_proto = htons(ife->eth_type); + if (!is_zero_ether_addr(p->eth_src)) + ether_addr_copy(oethh->h_source, p->eth_src); + if (!is_zero_ether_addr(p->eth_dst)) + ether_addr_copy(oethh->h_dest, p->eth_dst); + oethh->h_proto = htons(p->eth_type); if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); - spin_unlock(&ife->tcf_lock); - return action; } @@ -768,21 +807,19 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p; + int ret; + + rcu_read_lock(); + p = rcu_dereference(ife->params); + if (p->flags & IFE_ENCODE) { + ret = tcf_ife_encode(skb, a, res, p); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); - if (ife->flags & IFE_ENCODE) - return tcf_ife_encode(skb, a, res); - - if (!(ife->flags & IFE_ENCODE)) - return tcf_ife_decode(skb, a, res); - - pr_info_ratelimited("unknown failure(policy neither de/encode\n"); - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); - tcf_lastuse_update(&ife->tcf_tm); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); - - return TC_ACT_SHOT; + return tcf_ife_decode(skb, a, res); } static int tcf_ife_walker(struct net *net, struct sk_buff *skb, diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c index 82892170ce4f..1e3f10e5da99 100644 --- a/net/sched/act_meta_mark.c +++ b/net/sched/act_meta_mark.c @@ -76,4 +76,4 @@ module_exit(ifemark_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb mark metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBMARK); +MODULE_ALIAS_IFE_META("skbmark"); diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c index 26bf4d86030b..4033f9fc4d4a 100644 --- a/net/sched/act_meta_skbprio.c +++ b/net/sched/act_meta_skbprio.c @@ -73,4 +73,4 @@ module_exit(ifeprio_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb prio metadata action"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_PRIO); +MODULE_ALIAS_IFE_META("skbprio"); diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c index 3b35774ce890..2ea1f26c9e96 100644 --- a/net/sched/act_meta_skbtcindex.c +++ b/net/sched/act_meta_skbtcindex.c @@ -76,4 +76,4 @@ module_exit(ifetc_index_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2016)"); MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); +MODULE_ALIAS_IFE_META("tcindex"); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 416627c66f08..8b3e59388480 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -140,6 +140,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, m->tcfm_eaction = parm->eaction; if (dev != NULL) { m->tcfm_ifindex = parm->ifindex; + m->net = net; if (ret != ACT_P_CREATED) dev_put(rcu_dereference_protected(m->tcfm_dev, 
1)); dev_hold(dev); @@ -313,15 +314,11 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; -static int tcf_mirred_device(const struct tc_action *a, struct net *net, - struct net_device **mirred_dev) +static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { - int ifindex = tcf_mirred_ifindex(a); + struct tcf_mirred *m = to_mirred(a); - *mirred_dev = __dev_get_by_index(net, ifindex); - if (!*mirred_dev) - return -EINVAL; - return 0; + return __dev_get_by_index(m->net, m->tcfm_ifindex); } static struct tc_action_ops act_mirred_ops = { @@ -336,7 +333,7 @@ static struct tc_action_ops act_mirred_ops = { .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), - .get_dev = tcf_mirred_device, + .get_dev = tcf_mirred_get_dev, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 16eb067a8d8f..97f717a13ad5 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -26,14 +26,13 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p; int action; int err; u16 tci; - spin_lock(&v->tcf_lock); tcf_lastuse_update(&v->tcf_tm); - bstats_update(&v->tcf_bstats, skb); - action = v->tcf_action; + bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb); /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. @@ -41,15 +40,21 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - switch (v->tcfv_action) { + rcu_read_lock(); + + action = READ_ONCE(v->tcf_action); + + p = rcu_dereference(v->vlan_p); + + switch (p->tcfv_action) { case TCA_VLAN_ACT_POP: err = skb_vlan_pop(skb); if (err) goto drop; break; case TCA_VLAN_ACT_PUSH: - err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid | - (v->tcfv_push_prio << VLAN_PRIO_SHIFT)); + err = skb_vlan_push(skb, p->tcfv_push_proto, p->tcfv_push_vid | + (p->tcfv_push_prio << VLAN_PRIO_SHIFT)); if (err) goto drop; break; @@ -68,14 +73,14 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, goto drop; } /* replace the vid */ - tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid; + tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid; /* replace prio bits, if tcfv_push_prio specified */ - if (v->tcfv_push_prio) { + if (p->tcfv_push_prio) { tci &= ~VLAN_PRIO_MASK; - tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT; + tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT; } /* put updated tci as hwaccel tag */ - __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci); + __vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci); break; default: BUG(); @@ -85,12 +90,13 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, drop: action = TC_ACT_SHOT; - v->tcf_qstats.drops++; + qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + unlock: + rcu_read_unlock(); if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); - spin_unlock(&v->tcf_lock); return action; } @@ -107,6 +113,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; + struct tcf_vlan_params *p, *p_old; struct tc_vlan *parm; struct tcf_vlan *v; int action; @@ -172,7 +179,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_vlan_ops, bind, 
false); + &act_vlan_ops, bind, true); if (ret) return ret; @@ -185,46 +192,67 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v = to_vlan(*a); - spin_lock_bh(&v->tcf_lock); - - v->tcfv_action = action; - v->tcfv_push_vid = push_vid; - v->tcfv_push_prio = push_prio; - v->tcfv_push_proto = push_proto; + ASSERT_RTNL(); + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + if (ovr) + tcf_idr_release(*a, bind); + return -ENOMEM; + } v->tcf_action = parm->action; - spin_unlock_bh(&v->tcf_lock); + p_old = rtnl_dereference(v->vlan_p); + + p->tcfv_action = action; + p->tcfv_push_vid = push_vid; + p->tcfv_push_prio = push_prio; + p->tcfv_push_proto = push_proto; + + rcu_assign_pointer(v->vlan_p, p); + + if (p_old) + kfree_rcu(p_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; } +static void tcf_vlan_cleanup(struct tc_action *a, int bind) +{ + struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p; + + p = rcu_dereference_protected(v->vlan_p, 1); + kfree_rcu(p, rcu); +} + static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); + struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); struct tc_vlan opt = { .index = v->tcf_index, .refcnt = v->tcf_refcnt - ref, .bindcnt = v->tcf_bindcnt - bind, .action = v->tcf_action, - .v_action = v->tcfv_action, + .v_action = p->tcfv_action, }; struct tcf_t t; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if ((v->tcfv_action == TCA_VLAN_ACT_PUSH || - v->tcfv_action == TCA_VLAN_ACT_MODIFY) && - (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || + if ((p->tcfv_action == TCA_VLAN_ACT_PUSH || + p->tcfv_action == TCA_VLAN_ACT_MODIFY) && + (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, - v->tcfv_push_proto) || + p->tcfv_push_proto) || (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, - v->tcfv_push_prio)))) + p->tcfv_push_prio)))) goto nla_put_failure; tcf_tm_dump(&t, &v->tcf_tm); @@ -260,6 +288,7 @@ static struct tc_action_ops act_vlan_ops = { .act = tcf_vlan, .dump = tcf_vlan_dump, .init = tcf_vlan_init, + .cleanup = tcf_vlan_cleanup, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, .size = sizeof(struct tcf_vlan), diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ecbb019efcbd..ab255b421781 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -195,12 +195,19 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, return chain; } +static void tcf_chain_head_change(struct tcf_chain *chain, + struct tcf_proto *tp_head) +{ + if (chain->chain_head_change) + chain->chain_head_change(tp_head, + chain->chain_head_change_priv); +} + static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp; - if (chain->p_filter_chain) - RCU_INIT_POINTER(*chain->p_filter_chain, NULL); + tcf_chain_head_change(chain, NULL); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_chain_put(chain); @@ -242,15 +249,35 @@ void tcf_chain_put(struct tcf_chain *chain) } EXPORT_SYMBOL(tcf_chain_put); -static void -tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, - struct tcf_proto __rcu **p_filter_chain) +static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei, + enum tc_block_command command) { - chain->p_filter_chain = p_filter_chain; + struct net_device *dev = q->dev_queue->dev; + 
struct tc_block_offload bo = {}; + + if (!dev->netdev_ops->ndo_setup_tc) + return; + bo.command = command; + bo.binder_type = ei->binder_type; + bo.block = block; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); } -int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain) +static void tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + tcf_block_offload_cmd(block, q, ei, TC_BLOCK_BIND); +} + +static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND); +} + +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, + struct tcf_block_ext_info *ei) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); struct tcf_chain *chain; @@ -259,13 +286,20 @@ int tcf_block_get(struct tcf_block **p_block, if (!block) return -ENOMEM; INIT_LIST_HEAD(&block->chain_list); + INIT_LIST_HEAD(&block->cb_list); + /* Create chain 0 by default, it has to be always present. */ chain = tcf_chain_create(block, 0); if (!chain) { err = -ENOMEM; goto err_chain_create; } - tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); + WARN_ON(!ei->chain_head_change); + chain->chain_head_change = ei->chain_head_change; + chain->chain_head_change_priv = ei->chain_head_change_priv; + block->net = qdisc_net(q); + block->q = q; + tcf_block_offload_bind(block, q, ei); *p_block = block; return 0; @@ -273,6 +307,26 @@ err_chain_create: kfree(block); return err; } +EXPORT_SYMBOL(tcf_block_get_ext); + +static void tcf_chain_head_change_dflt(struct tcf_proto *tp_head, void *priv) +{ + struct tcf_proto __rcu **p_filter_chain = priv; + + rcu_assign_pointer(*p_filter_chain, tp_head); +} + +int tcf_block_get(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) +{ + struct tcf_block_ext_info ei = { + .chain_head_change = tcf_chain_head_change_dflt, + .chain_head_change_priv = p_filter_chain, + }; + + WARN_ON(!p_filter_chain); + return tcf_block_get_ext(p_block, q, &ei); +} EXPORT_SYMBOL(tcf_block_get); static void tcf_block_put_final(struct work_struct *work) @@ -292,25 +346,140 @@ static void tcf_block_put_final(struct work_struct *work) * actions should be all removed after flushing. However, filters are now * destroyed in tc filter workqueue with RTNL lock, they can not race here. */ -void tcf_block_put(struct tcf_block *block) +void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, + struct tcf_block_ext_info *ei) { struct tcf_chain *chain, *tmp; - if (!block) - return; - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_flush(chain); + tcf_block_offload_unbind(block, q, ei); + INIT_WORK(&block->work, tcf_block_put_final); - /* Wait for RCU callbacks to release the reference count and make - * sure their works have been queued before this. + /* Wait for existing RCU callbacks to cool down, make sure their works + * have been queued before this. We can not flush pending works here + * because we are holding the RTNL lock. 
*/ rcu_barrier(); tcf_queue_work(&block->work); } +EXPORT_SYMBOL(tcf_block_put_ext); + +void tcf_block_put(struct tcf_block *block) +{ + struct tcf_block_ext_info ei = {0, }; + + if (!block) + return; + tcf_block_put_ext(block, block->q, &ei); +} + EXPORT_SYMBOL(tcf_block_put); +struct tcf_block_cb { + struct list_head list; + tc_setup_cb_t *cb; + void *cb_ident; + void *cb_priv; + unsigned int refcnt; +}; + +void *tcf_block_cb_priv(struct tcf_block_cb *block_cb) +{ + return block_cb->cb_priv; +} +EXPORT_SYMBOL(tcf_block_cb_priv); + +struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ struct tcf_block_cb *block_cb; + + list_for_each_entry(block_cb, &block->cb_list, list) + if (block_cb->cb == cb && block_cb->cb_ident == cb_ident) + return block_cb; + return NULL; +} +EXPORT_SYMBOL(tcf_block_cb_lookup); + +void tcf_block_cb_incref(struct tcf_block_cb *block_cb) +{ + block_cb->refcnt++; +} +EXPORT_SYMBOL(tcf_block_cb_incref); + +unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) +{ + return --block_cb->refcnt; +} +EXPORT_SYMBOL(tcf_block_cb_decref); + +struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + struct tcf_block_cb *block_cb; + + block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); + if (!block_cb) + return NULL; + block_cb->cb = cb; + block_cb->cb_ident = cb_ident; + block_cb->cb_priv = cb_priv; + list_add(&block_cb->list, &block->cb_list); + return block_cb; +} +EXPORT_SYMBOL(__tcf_block_cb_register); + +int tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + struct tcf_block_cb *block_cb; + + block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); + return block_cb ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(tcf_block_cb_register); + +void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +{ + list_del(&block_cb->list); + kfree(block_cb); +} +EXPORT_SYMBOL(__tcf_block_cb_unregister); + +void tcf_block_cb_unregister(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ + struct tcf_block_cb *block_cb; + + block_cb = tcf_block_cb_lookup(block, cb, cb_ident); + if (!block_cb) + return; + __tcf_block_cb_unregister(block_cb); +} +EXPORT_SYMBOL(tcf_block_cb_unregister); + +static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type, + void *type_data, bool err_stop) +{ + struct tcf_block_cb *block_cb; + int ok_count = 0; + int err; + + list_for_each_entry(block_cb, &block->cb_list, list) { + err = block_cb->cb(type, type_data, block_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } + } + return ok_count; +} + /* Main classifier routine: scans classifier chain attached * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. 
@@ -379,9 +548,8 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, struct tcf_chain_info *chain_info, struct tcf_proto *tp) { - if (chain->p_filter_chain && - *chain_info->pprev == chain->filter_chain) - rcu_assign_pointer(*chain->p_filter_chain, tp); + if (*chain_info->pprev == chain->filter_chain) + tcf_chain_head_change(chain, tp); RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); tcf_chain_hold(chain); @@ -393,8 +561,8 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, { struct tcf_proto *next = rtnl_dereference(chain_info->next); - if (chain->p_filter_chain && tp == chain->filter_chain) - RCU_INIT_POINTER(*chain->p_filter_chain, next); + if (tp == chain->filter_chain) + tcf_chain_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); tcf_chain_put(chain); } @@ -427,8 +595,8 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, } static int tcf_fill_node(struct net *net, struct sk_buff *skb, - struct tcf_proto *tp, void *fh, u32 portid, - u32 seq, u16 flags, int event) + struct tcf_proto *tp, struct Qdisc *q, u32 parent, + void *fh, u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -441,8 +609,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; - tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex; - tcm->tcm_parent = tp->classid; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; + tcm->tcm_parent = parent; tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; @@ -465,6 +633,7 @@ nla_put_failure: static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, + struct Qdisc *q, u32 parent, void *fh, int event, bool unicast) { struct sk_buff *skb; @@ -474,7 +643,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, + if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event) <= 0) { kfree_skb(skb); return -EINVAL; @@ -489,6 +658,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, + struct Qdisc *q, u32 parent, void *fh, bool unicast, bool *last) { struct sk_buff *skb; @@ -499,7 +669,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, + if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) { kfree_skb(skb); return -EINVAL; @@ -519,6 +689,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, } static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, + struct Qdisc *q, u32 parent, struct nlmsghdr *n, struct tcf_chain *chain, int event) { @@ -526,7 +697,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, for (tp = rtnl_dereference(chain->filter_chain); tp; tp = rtnl_dereference(tp->next)) - tfilter_notify(net, oskb, n, tp, 0, event, false); + tfilter_notify(net, oskb, n, tp, q, parent, 0, event, false); } /* Add/change/delete/get a filter node */ @@ -645,7 +816,8 @@ replay: } if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { - tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); + 
tfilter_notify_chain(net, skb, q, parent, n, + chain, RTM_DELTFILTER); tcf_chain_flush(chain); err = 0; goto errout; @@ -692,7 +864,7 @@ replay: if (!fh) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { tcf_chain_tp_remove(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, fh, + tfilter_notify(net, skb, n, tp, q, parent, fh, RTM_DELTFILTER, false); tcf_proto_destroy(tp); err = 0; @@ -717,8 +889,8 @@ replay: } break; case RTM_DELTFILTER: - err = tfilter_del_notify(net, skb, n, tp, fh, false, - &last); + err = tfilter_del_notify(net, skb, n, tp, q, parent, + fh, false, &last); if (err) goto errout; if (last) { @@ -727,7 +899,7 @@ replay: } goto errout; case RTM_GETTFILTER: - err = tfilter_notify(net, skb, n, tp, fh, + err = tfilter_notify(net, skb, n, tp, q, parent, fh, RTM_NEWTFILTER, true); goto errout; default: @@ -741,7 +913,8 @@ replay: if (err == 0) { if (tp_created) tcf_chain_tp_insert(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); + tfilter_notify(net, skb, n, tp, q, parent, fh, + RTM_NEWTFILTER, false); } else { if (tp_created) tcf_proto_destroy(tp); @@ -760,6 +933,8 @@ struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; + struct Qdisc *q; + u32 parent; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -767,13 +942,14 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct tcf_dump_args *a = (void *)arg; struct net *net = sock_net(a->skb->sk); - return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, + return tcf_fill_node(net, a->skb, tp, a->q, a->parent, + n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); } -static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, - struct netlink_callback *cb, +static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, + struct sk_buff *skb, struct netlink_callback *cb, long index_start, long *p_index) { struct net *net = sock_net(skb->sk); @@ -795,7 +971,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, 0, + if (tcf_fill_node(net, skb, tp, q, parent, 0, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) @@ -808,6 +984,8 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, arg.w.fn = tcf_node_dump; arg.skb = skb; arg.cb = cb; + arg.q = q; + arg.parent = parent; arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; @@ -833,6 +1011,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) const struct Qdisc_class_ops *cops; long index_start; long index; + u32 parent; int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) @@ -846,10 +1025,13 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (!dev) return skb->len; - if (!tcm->tcm_parent) + parent = tcm->tcm_parent; + if (!parent) { q = dev->qdisc; - else + parent = q->handle; + } else { q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + } if (!q) goto out; cops = q->ops->cl_ops; @@ -873,7 +1055,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; - if (!tcf_chain_dump(chain, skb, cb, index_start, &index)) + if (!tcf_chain_dump(chain, q, parent, skb, cb, + index_start, 
&index)) break; } @@ -1015,29 +1198,56 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); -int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, - struct net_device **hw_dev) +static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, + enum tc_setup_type type, + void *type_data, bool err_stop) { + int ok_count = 0; #ifdef CONFIG_NET_CLS_ACT const struct tc_action *a; - LIST_HEAD(actions); + struct net_device *dev; + int i, ret; if (!tcf_exts_has_actions(exts)) - return -EINVAL; + return 0; - tcf_exts_to_list(exts, &actions); - list_for_each_entry(a, &actions, list) { - if (a->ops->get_dev) { - a->ops->get_dev(a, dev_net(dev), hw_dev); - break; - } + for (i = 0; i < exts->nr_actions; i++) { + a = exts->actions[i]; + if (!a->ops->get_dev) + continue; + dev = a->ops->get_dev(a); + if (!dev) + continue; + ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count += ret; } - if (*hw_dev) - return 0; #endif - return -EOPNOTSUPP; + return ok_count; +} + +int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, + enum tc_setup_type type, void *type_data, bool err_stop) +{ + int ok_count; + int ret; + + ret = tcf_block_cb_call(block, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count = ret; + + if (!exts) + return ok_count; + ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count += ret; + + return ok_count; } -EXPORT_SYMBOL(tcf_exts_get_dev); +EXPORT_SYMBOL(tc_setup_cb_call); static int __init tc_filter_init(void) { diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index e43c56d5b96a..5f169ded347e 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -17,13 +17,14 @@ #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/idr.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> struct basic_head { - u32 hgenerator; struct list_head flist; + struct idr handle_idr; struct rcu_head rcu; }; @@ -81,6 +82,7 @@ static int basic_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->flist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; } @@ -118,11 +120,13 @@ static void basic_destroy(struct tcf_proto *tp) list_for_each_entry_safe(f, n, &head->flist, link) { list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); if (tcf_exts_get_net(&f->exts)) call_rcu(&f->rcu, basic_delete_filter); else __basic_delete_filter(f); } + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -133,6 +137,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last) list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); tcf_exts_get_net(&f->exts); call_rcu(&f->rcu, basic_delete_filter); *last = list_empty(&head->flist); @@ -177,6 +182,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *fold = (struct basic_filter *) *arg; struct basic_filter *fnew; + unsigned long idr_index; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; @@ -199,33 +205,33 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = -EINVAL; if (handle) { fnew->handle = handle; - } else if (fold) { - fnew->handle = fold->handle; + if (!fold) { + err = idr_alloc_ext(&head->handle_idr, fnew, 
&idr_index, + handle, handle + 1, GFP_KERNEL); + if (err) + goto errout; + } } else { - unsigned int i = 0x80000000; - do { - if (++head->hgenerator == 0x7FFFFFFF) - head->hgenerator = 1; - } while (--i > 0 && basic_get(tp, head->hgenerator)); - - if (i <= 0) { - pr_err("Insufficient number of handles\n"); + err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (err) goto errout; - } - - fnew->handle = head->hgenerator; + fnew->handle = idr_index; } err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr); - if (err < 0) + if (err < 0) { + if (!fold) + idr_remove_ext(&head->handle_idr, fnew->handle); goto errout; + } *arg = fnew; if (fold) { + idr_replace_ext(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->link, &fnew->link); tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 990eb4d91d54..fb680dafac5a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -17,6 +17,7 @@ #include <linux/skbuff.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/idr.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> @@ -32,7 +33,7 @@ MODULE_DESCRIPTION("TC BPF based classifier"); struct cls_bpf_head { struct list_head plist; - u32 hgen; + struct idr handle_idr; struct rcu_head rcu; }; @@ -102,11 +103,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } @@ -149,7 +150,9 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, enum tc_clsbpf_command cmd) { - struct net_device *dev = tp->q->dev_queue->dev; + bool addorrep = cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE; + struct tcf_block *block = tp->chain->block; + bool skip_sw = tc_skip_sw(prog->gen_flags); struct tc_cls_bpf_offload cls_bpf = {}; int err; @@ -161,17 +164,25 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, cls_bpf.exts_integrated = prog->exts_integrated; cls_bpf.gen_flags = prog->gen_flags; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, &cls_bpf); - if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE)) - prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; + err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw); + if (addorrep) { + if (err < 0) { + cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + return err; + } else if (err > 0) { + prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; + } + } + + if (addorrep && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; - return err; + return 0; } static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog) { - struct net_device *dev = tp->q->dev_queue->dev; struct cls_bpf_prog *obj = prog; enum tc_clsbpf_command cmd; bool skip_sw; @@ -181,7 +192,7 @@ static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, (oldprog && tc_skip_sw(oldprog->gen_flags)); if (oldprog && oldprog->offloaded) { - if (tc_should_offload(dev, prog->gen_flags)) { + if (!tc_skip_hw(prog->gen_flags)) { cmd = TC_CLSBPF_REPLACE; } else if 
(!tc_skip_sw(prog->gen_flags)) { obj = oldprog; @@ -190,14 +201,14 @@ static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, return -EINVAL; } } else { - if (!tc_should_offload(dev, prog->gen_flags)) + if (tc_skip_hw(prog->gen_flags)) return skip_sw ? -EINVAL : 0; cmd = TC_CLSBPF_ADD; } ret = cls_bpf_offload_cmd(tp, obj, cmd); if (ret) - return skip_sw ? ret : 0; + return ret; obj->offloaded = true; if (oldprog) @@ -241,6 +252,7 @@ static int cls_bpf_init(struct tcf_proto *tp) return -ENOBUFS; INIT_LIST_HEAD_RCU(&head->plist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; @@ -280,6 +292,9 @@ static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) { + struct cls_bpf_head *head = rtnl_dereference(tp->root); + + idr_remove_ext(&head->handle_idr, prog->handle); cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); @@ -306,6 +321,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp) list_for_each_entry_safe(prog, tmp, &head->plist, link) __cls_bpf_delete(tp, prog); + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -362,7 +378,7 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) } static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, - const struct tcf_proto *tp) + u32 gen_flags, const struct tcf_proto *tp) { struct bpf_prog *fp; char *name = NULL; @@ -370,7 +386,11 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); - fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); + if (gen_flags & TCA_CLS_FLAGS_SKIP_SW) + fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, + qdisc_dev(tp->q)); + else + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -428,7 +448,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, prog->gen_flags = gen_flags; ret = is_bpf ? 
cls_bpf_prog_from_ops(tb, prog) : - cls_bpf_prog_from_efd(tb, prog, tp); + cls_bpf_prog_from_efd(tb, prog, gen_flags, tp); if (ret < 0) return ret; @@ -440,27 +460,6 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, return 0; } -static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, - struct cls_bpf_head *head) -{ - unsigned int i = 0x80000000; - u32 handle; - - do { - if (++head->hgen == 0x7FFFFFFF) - head->hgen = 1; - } while (--i > 0 && cls_bpf_get(tp, head->hgen)); - - if (unlikely(i == 0)) { - pr_err("Insufficient number of handles\n"); - handle = 0; - } else { - handle = head->hgen; - } - - return handle; -} - static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -470,6 +469,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct cls_bpf_prog *oldprog = *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; struct cls_bpf_prog *prog; + unsigned long idr_index; int ret; if (tca[TCA_OPTIONS] == NULL) @@ -495,21 +495,30 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, } } - if (handle == 0) - prog->handle = cls_bpf_grab_new_handle(tp, head); - else + if (handle == 0) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (ret) + goto errout; + prog->handle = idr_index; + } else { + if (!oldprog) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + handle, handle + 1, GFP_KERNEL); + if (ret) + goto errout; + } prog->handle = handle; - if (prog->handle == 0) { - ret = -EINVAL; - goto errout; } ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr); if (ret < 0) - goto errout; + goto errout_idr; ret = cls_bpf_offload(tp, prog, oldprog); if (ret) { + if (!oldprog) + idr_remove_ext(&head->handle_idr, prog->handle); __cls_bpf_delete_prog(prog); return ret; } @@ -518,6 +527,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; if (oldprog) { + idr_replace_ext(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); tcf_exts_get_net(&oldprog->exts); @@ -529,6 +539,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = prog; return 0; +errout_idr: + if (!oldprog) + idr_remove_ext(&head->handle_idr, prog->handle); errout: tcf_exts_destroy(&prog->exts); kfree(prog); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 85f765cff697..25c2a888e1f0 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -348,9 +348,9 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; } -static void flow_perturbation(unsigned long arg) +static void flow_perturbation(struct timer_list *t) { - struct flow_filter *f = (struct flow_filter *)arg; + struct flow_filter *f = from_timer(f, t, perturb_timer); get_random_bytes(&f->hashrnd, 4); if (f->perturb_period) @@ -510,8 +510,11 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } - if (TC_H_MAJ(baseclass) == 0) - baseclass = TC_H_MAKE(tp->q->handle, baseclass); + if (TC_H_MAJ(baseclass) == 0) { + struct Qdisc *q = tcf_block_q(tp->chain->block); + + baseclass = TC_H_MAKE(q->handle, baseclass); + } if (TC_H_MIN(baseclass) == 0) baseclass = TC_H_MAKE(baseclass, 1); @@ -521,8 +524,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, get_random_bytes(&fnew->hashrnd, 4); } - 
setup_deferrable_timer(&fnew->perturb_timer, flow_perturbation, - (unsigned long)fnew); + timer_setup(&fnew->perturb_timer, flow_perturbation, TIMER_DEFERRABLE); netif_keep_dst(qdisc_dev(tp->q)); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 7a838d1c1c00..543a3e875d05 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -155,37 +155,12 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; - struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return -1; fl_clear_masked_range(&skb_key, &head->mask); - info = skb_tunnel_info(skb); - if (info) { - struct ip_tunnel_key *key = &info->key; - - switch (ip_tunnel_info_af(info)) { - case AF_INET: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV4_ADDRS; - skb_key.enc_ipv4.src = key->u.ipv4.src; - skb_key.enc_ipv4.dst = key->u.ipv4.dst; - break; - case AF_INET6: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV6_ADDRS; - skb_key.enc_ipv6.src = key->u.ipv6.src; - skb_key.enc_ipv6.dst = key->u.ipv6.dst; - break; - } - - skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); - skb_key.enc_tp.src = key->tp_src; - skb_key.enc_tp.dst = key->tp_dst; - } - skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. @@ -245,17 +220,14 @@ static void fl_destroy_filter(struct rcu_head *head) static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; - cls_flower.egress_dev = f->hw_dev != tp->q->dev_queue->dev; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); + tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static int fl_hw_replace_filter(struct tcf_proto *tp, @@ -263,22 +235,11 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, struct fl_flow_key *mask, struct cls_fl_filter *f) { - struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = tp->chain->block; + bool skip_sw = tc_skip_sw(f->flags); int err; - if (!tc_can_offload(dev)) { - if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) || - (f->hw_dev && !tc_can_offload(f->hw_dev))) { - f->hw_dev = dev; - return tc_skip_sw(f->flags) ? 
-EINVAL : 0; - } - dev = f->hw_dev; - cls_flower.egress_dev = true; - } else { - f->hw_dev = dev; - } - tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_REPLACE; cls_flower.cookie = (unsigned long) f; @@ -286,33 +247,36 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.mask = mask; cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); - if (!err) + err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, skip_sw); + if (err < 0) { + fl_hw_destroy_filter(tp, f); + return err; + } else if (err > 0) { f->flags |= TCA_CLS_FLAGS_IN_HW; + } + + if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; - if (tc_skip_sw(f->flags)) - return err; return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct tcf_block *block = tp->chain->block; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; - cls_flower.egress_dev = f->hw_dev != tp->q->dev_queue->dev; + cls_flower.classid = f->res.classid; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); + tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 7f45e5ab8afc..20f0de1a960a 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -28,6 +28,7 @@ #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <net/sch_generic.h> #define HTSIZE 256 @@ -86,9 +87,11 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, } } } else { + struct Qdisc *q = tcf_block_q(tp->chain->block); + /* Old method: classify the packet using its skb mark. 
*/ if (id && (TC_H_MAJ(id) == 0 || - !(TC_H_MAJ(id ^ tp->q->handle)))) { + !(TC_H_MAJ(id ^ q->handle)))) { res->classid = id; res->class = 0; return 0; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 3684153cd8a9..66d4e0099158 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -69,12 +69,27 @@ static void mall_destroy_rcu(struct rcu_head *rcu) tcf_queue_work(&head->work); } +static void mall_destroy_hw_filter(struct tcf_proto *tp, + struct cls_mall_head *head, + unsigned long cookie) +{ + struct tc_cls_matchall_offload cls_mall = {}; + struct tcf_block *block = tp->chain->block; + + tc_cls_common_offload_init(&cls_mall.common, tp); + cls_mall.command = TC_CLSMATCHALL_DESTROY; + cls_mall.cookie = cookie; + + tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false); +} + static int mall_replace_hw_filter(struct tcf_proto *tp, struct cls_mall_head *head, unsigned long cookie) { - struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_matchall_offload cls_mall = {}; + struct tcf_block *block = tp->chain->block; + bool skip_sw = tc_skip_sw(head->flags); int err; tc_cls_common_offload_init(&cls_mall.common, tp); @@ -82,37 +97,29 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, cls_mall.exts = &head->exts; cls_mall.cookie = cookie; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, - &cls_mall); - if (!err) + err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, + &cls_mall, skip_sw); + if (err < 0) { + mall_destroy_hw_filter(tp, head, cookie); + return err; + } else if (err > 0) { head->flags |= TCA_CLS_FLAGS_IN_HW; + } - return err; -} - -static void mall_destroy_hw_filter(struct tcf_proto *tp, - struct cls_mall_head *head, - unsigned long cookie) -{ - struct net_device *dev = tp->q->dev_queue->dev; - struct tc_cls_matchall_offload cls_mall = {}; - - tc_cls_common_offload_init(&cls_mall.common, tp); - cls_mall.command = TC_CLSMATCHALL_DESTROY; - cls_mall.cookie = cookie; + if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, &cls_mall); + return 0; } static void mall_destroy(struct tcf_proto *tp) { struct cls_mall_head *head = rtnl_dereference(tp->root); - struct net_device *dev = tp->q->dev_queue->dev; if (!head) return; - if (tc_should_offload(dev, head->flags)) + if (!tc_skip_hw(head->flags)) mall_destroy_hw_filter(tp, head, (unsigned long) head); if (tcf_exts_get_net(&head->exts)) @@ -155,7 +162,6 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, void **arg, bool ovr) { struct cls_mall_head *head = rtnl_dereference(tp->root); - struct net_device *dev = tp->q->dev_queue->dev; struct nlattr *tb[TCA_MATCHALL_MAX + 1]; struct cls_mall_head *new; u32 flags = 0; @@ -195,14 +201,10 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (err) goto err_set_parms; - if (tc_should_offload(dev, flags)) { + if (!tc_skip_hw(new->flags)) { err = mall_replace_hw_filter(tp, new, (unsigned long) new); - if (err) { - if (tc_skip_sw(flags)) - goto err_replace_hw_filter; - else - err = 0; - } + if (err) + goto err_replace_hw_filter; } if (!tc_in_hw(new->flags)) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index a76937ee0b2d..67467ae24c97 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -13,6 +13,7 @@ #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> +#include <net/sch_generic.h> /* * Passing parameters to the root seems to be done more awkwardly than 
really @@ -96,9 +97,11 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = tcindex_lookup(p, key); if (!f) { + struct Qdisc *q = tcf_block_q(tp->chain->block); + if (!p->fall_through) return -1; - res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); + res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key); res->class = 0; pr_debug("alg 0x%x\n", res->classid); return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index b58eccb21f03..ac152b4f4247 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -46,6 +46,7 @@ #include <net/act_api.h> #include <net/pkt_cls.h> #include <linux/netdevice.h> +#include <linux/idr.h> struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -85,6 +86,7 @@ struct tc_u_hnode { struct tc_u_common *tp_c; int refcnt; unsigned int divisor; + struct idr handle_idr; struct rcu_head rcu; /* The 'ht' field MUST be the last field in structure to allow for * more entries allocated at end of structure. @@ -94,9 +96,9 @@ struct tc_u_hnode { struct tc_u_common { struct tc_u_hnode __rcu *hlist; - struct Qdisc *q; + struct tcf_block *block; int refcnt; - u32 hgenerator; + struct idr handle_idr; struct hlist_node hnode; struct rcu_head rcu; }; @@ -314,19 +316,19 @@ static void *u32_get(struct tcf_proto *tp, u32 handle) return u32_lookup_key(ht, handle); } -static u32 gen_new_htid(struct tc_u_common *tp_c) +static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) { - int i = 0x800; + unsigned long idr_index; + int err; - /* hgenerator only used inside rtnl lock it is safe to increment + /* This is only used inside rtnl lock it is safe to increment * without read _copy_ update semantics */ - do { - if (++tp_c->hgenerator == 0x7FF) - tp_c->hgenerator = 1; - } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); - - return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; + err = idr_alloc_ext(&tp_c->handle_idr, ptr, &idr_index, + 1, 0x7FF, GFP_KERNEL); + if (err) + return 0; + return (u32)(idr_index | 0x800) << 20; } static struct hlist_head *tc_u_common_hash; @@ -336,11 +338,7 @@ static struct hlist_head *tc_u_common_hash; static unsigned int tc_u_hash(const struct tcf_proto *tp) { - struct net_device *dev = tp->q->dev_queue->dev; - u32 qhandle = tp->q->handle; - int ifindex = dev->ifindex; - - return hash_64((u64)ifindex << 32 | qhandle, U32_HASH_SHIFT); + return hash_ptr(tp->chain->block, U32_HASH_SHIFT); } static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) @@ -350,7 +348,7 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) h = tc_u_hash(tp); hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) { - if (tc->q == tp->q) + if (tc->block == tp->chain->block) return tc; } return NULL; @@ -369,8 +367,9 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; root_ht->refcnt++; - root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; + root_ht->handle = tp_c ? 
gen_new_htid(tp_c, root_ht) : 0x80000000; root_ht->prio = tp->prio; + idr_init(&root_ht->handle_idr); if (tp_c == NULL) { tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); @@ -378,8 +377,9 @@ static int u32_init(struct tcf_proto *tp) kfree(root_ht); return -ENOBUFS; } - tp_c->q = tp->q; + tp_c->block = tp->chain->block; INIT_HLIST_NODE(&tp_c->hnode); + idr_init(&tp_c->handle_idr); h = tc_u_hash(tp); hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]); @@ -487,71 +487,69 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) return 0; } -static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) +static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) { - struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; - if (!tc_should_offload(dev, 0)) - return; - tc_cls_common_offload_init(&cls_u32.common, tp); - cls_u32.command = TC_CLSU32_DELETE_KNODE; - cls_u32.knode.handle = handle; + cls_u32.command = TC_CLSU32_DELETE_HNODE; + cls_u32.hnode.divisor = h->divisor; + cls_u32.hnode.handle = h->handle; + cls_u32.hnode.prio = h->prio; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); + tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, u32 flags) { - struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; + bool skip_sw = tc_skip_sw(flags); + bool offloaded = false; int err; - if (!tc_should_offload(dev, flags)) - return tc_skip_sw(flags) ? -EINVAL : 0; - tc_cls_common_offload_init(&cls_u32.common, tp); cls_u32.command = TC_CLSU32_NEW_HNODE; cls_u32.hnode.divisor = h->divisor; cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); - if (tc_skip_sw(flags)) + err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); + if (err < 0) { + u32_clear_hw_hnode(tp, h); return err; + } else if (err > 0) { + offloaded = true; + } + + if (skip_sw && !offloaded) + return -EINVAL; return 0; } -static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) +static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) { - struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; - if (!tc_should_offload(dev, 0)) - return; - tc_cls_common_offload_init(&cls_u32.common, tp); - cls_u32.command = TC_CLSU32_DELETE_HNODE; - cls_u32.hnode.divisor = h->divisor; - cls_u32.hnode.handle = h->handle; - cls_u32.hnode.prio = h->prio; + cls_u32.command = TC_CLSU32_DELETE_KNODE; + cls_u32.knode.handle = handle; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); + tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false); } static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32 flags) { - struct net_device *dev = tp->q->dev_queue->dev; + struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; + bool skip_sw = tc_skip_sw(flags); int err; - if (!tc_should_offload(dev, flags)) - return tc_skip_sw(flags) ? 
-EINVAL : 0; - tc_cls_common_offload_init(&cls_u32.common, tp); cls_u32.command = TC_CLSU32_REPLACE_KNODE; cls_u32.knode.handle = n->handle; @@ -568,13 +566,16 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, if (n->ht_down) cls_u32.knode.link_handle = n->ht_down->handle; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); - - if (!err) + err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw); + if (err < 0) { + u32_remove_hw_knode(tp, n->handle); + return err; + } else if (err > 0) { n->flags |= TCA_CLS_FLAGS_IN_HW; + } - if (tc_skip_sw(flags)) - return err; + if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; return 0; } @@ -590,6 +591,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n->handle); + idr_remove_ext(&ht->handle_idr, n->handle); if (tcf_exts_get_net(&n->exts)) call_rcu(&n->rcu, u32_delete_key_freepf_rcu); else @@ -614,6 +616,8 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { u32_clear_hw_hnode(tp, ht); + idr_destroy(&ht->handle_idr); + idr_remove_ext(&tp_c->handle_idr, ht->handle); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -661,6 +665,7 @@ static void u32_destroy(struct tcf_proto *tp) kfree_rcu(ht, rcu); } + idr_destroy(&tp_c->handle_idr); kfree(tp_c); } @@ -729,27 +734,21 @@ ret: return ret; } -#define NR_U32_NODE (1<<12) -static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid) { - struct tc_u_knode *n; - unsigned long i; - unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), - GFP_KERNEL); - if (!bitmap) - return handle | 0xFFF; - - for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]); - n; - n = rtnl_dereference(n->next)) - set_bit(TC_U32_NODE(n->handle), bitmap); - - i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); - if (i >= NR_U32_NODE) - i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); + unsigned long idr_index; + u32 start = htid | 0x800; + u32 max = htid | 0xFFF; + u32 min = htid; + + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + start, max + 1, GFP_KERNEL)) { + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + min + 1, max + 1, GFP_KERNEL)) + return max; + } - kfree(bitmap); - return handle | (i >= NR_U32_NODE ? 
0xFFF : i); + return (u32)idr_index; } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -834,6 +833,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, if (pins->handle == n->handle) break; + idr_replace_ext(&ht->handle_idr, n, n->handle); RCU_INIT_POINTER(n->next, pins->next); rcu_assign_pointer(*ins, n); } @@ -966,22 +966,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; if (TC_U32_KEY(handle)) return -EINVAL; - if (handle == 0) { - handle = gen_new_htid(tp->data); - if (handle == 0) - return -ENOMEM; - } ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; + if (handle == 0) { + handle = gen_new_htid(tp->data, ht); + if (handle == 0) { + kfree(ht); + return -ENOMEM; + } + } else { + err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL, + handle, handle + 1, GFP_KERNEL); + if (err) { + kfree(ht); + return err; + } + } ht->tp_c = tp_c; ht->refcnt = 1; ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; + idr_init(&ht->handle_idr); err = u32_replace_hw_hnode(tp, ht, flags); if (err) { + idr_remove_ext(&tp_c->handle_idr, handle); kfree(ht); return err; } @@ -1015,24 +1026,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) return -EINVAL; handle = htid | TC_U32_NODE(handle); + err = idr_alloc_ext(&ht->handle_idr, NULL, NULL, + handle, handle + 1, + GFP_KERNEL); + if (err) + return err; } else handle = gen_new_kid(ht, htid); - if (tb[TCA_U32_SEL] == NULL) - return -EINVAL; + if (tb[TCA_U32_SEL] == NULL) { + err = -EINVAL; + goto erridr; + } s = nla_data(tb[TCA_U32_SEL]); n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); - if (n == NULL) - return -ENOBUFS; + if (n == NULL) { + err = -ENOBUFS; + goto erridr; + } #ifdef CONFIG_CLS_U32_PERF size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64); n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt)); if (!n->pf) { - kfree(n); - return -ENOBUFS; + err = -ENOBUFS; + goto errfree; } #endif @@ -1095,9 +1115,12 @@ errhw: errout: tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF +errfree: free_percpu(n->pf); #endif kfree(n); +erridr: + idr_remove_ext(&ht->handle_idr, handle); return err; } diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 03b677bc0700..1331a4c2d8ff 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -178,7 +178,7 @@ static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); - struct net *net = dev_net(qdisc_dev(tp->q)); + struct net *net = tp->chain->block->net; if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 22bc6fc48311..b6c4f536876b 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1502,7 +1502,6 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) int s_idx, s_q_idx; struct net_device *dev; const struct nlmsghdr *nlh = cb->nlh; - struct tcmsg *tcm = nlmsg_data(nlh); struct nlattr *tca[TCA_MAX + 1]; int err; @@ -1512,7 +1511,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL); if (err < 0) return err; @@ -1664,9 +1663,11 @@ static int tcf_node_bind(struct 
tcf_proto *tp, void *n, struct tcf_walker *arg) struct tcf_bind_args *a = (void *)arg; if (tp->ops->bind_class) { - tcf_tree_lock(tp); + struct Qdisc *q = tcf_block_q(tp->chain->block); + + sch_tree_lock(q); tp->ops->bind_class(n, a->classid, a->cl); - tcf_tree_unlock(tp); + sch_tree_unlock(q); } return 0; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index c5fcdf1a58a0..2dbd249c0b2f 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -281,7 +281,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, goto err_out; } - error = tcf_block_get(&flow->block, &flow->filter_list); + error = tcf_block_get(&flow->block, &flow->filter_list, sch); if (error) { kfree(flow); goto err_out; @@ -546,7 +546,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - err = tcf_block_get(&p->link.block, &p->link.filter_list); + err = tcf_block_get(&p->link.block, &p->link.filter_list, sch); if (err) return err; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index dcef97fa8047..6361be7881f1 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -255,6 +255,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; case TC_ACT_RECLASSIFY: @@ -1566,7 +1567,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); return err; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c new file mode 100644 index 000000000000..7a72980c1509 --- /dev/null +++ b/net/sched/sch_cbs.c @@ -0,0 +1,373 @@ +/* + * net/sched/sch_cbs.c Credit Based Shaper + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> + * + */ + +/* Credit Based Shaper (CBS) + * ========================= + * + * This is a simple rate-limiting shaper aimed at TSN applications on + * systems with known traffic workloads. + * + * Its algorithm is defined by the IEEE 802.1Q-2014 Specification, + * Section 8.6.8.2, and explained in more detail in the Annex L of the + * same specification. + * + * There are four tunables to be considered: + * + * 'idleslope': Idleslope is the rate of credits that is + * accumulated (in kilobits per second) when there is at least + * one packet waiting for transmission. Packets are transmitted + * when the current value of credits is equal or greater than + * zero. When there is no packet to be transmitted the amount of + * credits is set to zero. This is the main tunable of the CBS + * algorithm. + * + * 'sendslope': + * Sendslope is the rate of credits that is depleted (it should be a + * negative number of kilobits per second) when a transmission is + * ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section + * 8.6.8.2 item g): + * + * sendslope = idleslope - port_transmit_rate + * + * 'hicredit': Hicredit defines the maximum amount of credits (in + * bytes) that can be accumulated. 
Hicredit depends on the + * characteristics of interfering traffic, + * 'max_interference_size' is the maximum size of any burst of + * traffic that can delay the transmission of a frame that is + * available for transmission for this traffic class, (IEEE + * 802.1Q-2014 Annex L, Equation L-3): + * + * hicredit = max_interference_size * (idleslope / port_transmit_rate) + * + * 'locredit': Locredit is the minimum amount of credits that can + * be reached. It is a function of the traffic flowing through + * this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2): + * + * locredit = max_frame_size * (sendslope / port_transmit_rate) + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> + +#define BYTES_PER_KBIT (1000LL / 8) + +struct cbs_sched_data { + bool offload; + int queue; + s64 port_rate; /* in bytes/s */ + s64 last; /* timestamp in ns */ + s64 credits; /* in bytes */ + s32 locredit; /* in bytes */ + s32 hicredit; /* in bytes */ + s64 sendslope; /* in bytes/s */ + s64 idleslope; /* in bytes/s */ + struct qdisc_watchdog watchdog; + int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch); + struct sk_buff *(*dequeue)(struct Qdisc *sch); +}; + +static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch) +{ + return qdisc_enqueue_tail(skb, sch); +} + +static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (sch->q.qlen == 0 && q->credits > 0) { + /* We need to stop accumulating credits when there's + * no enqueued packets and q->credits is positive. + */ + q->credits = 0; + q->last = ktime_get_ns(); + } + + return qdisc_enqueue_tail(skb, sch); +} + +static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + return q->enqueue(skb, sch); +} + +/* timediff is in ns, slope is in bytes/s */ +static s64 timediff_to_credits(s64 timediff, s64 slope) +{ + return div64_s64(timediff * slope, NSEC_PER_SEC); +} + +static s64 delay_from_credits(s64 credits, s64 slope) +{ + if (unlikely(slope == 0)) + return S64_MAX; + + return div64_s64(-credits * NSEC_PER_SEC, slope); +} + +static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate) +{ + if (unlikely(port_rate == 0)) + return S64_MAX; + + return div64_s64(len * slope, port_rate); +} + +static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + s64 now = ktime_get_ns(); + struct sk_buff *skb; + s64 credits; + int len; + + if (q->credits < 0) { + credits = timediff_to_credits(now - q->last, q->idleslope); + + credits = q->credits + credits; + q->credits = min_t(s64, credits, q->hicredit); + + if (q->credits < 0) { + s64 delay; + + delay = delay_from_credits(q->credits, q->idleslope); + qdisc_watchdog_schedule_ns(&q->watchdog, now + delay); + + q->last = now; + + return NULL; + } + } + + skb = qdisc_dequeue_head(sch); + if (!skb) + return NULL; + + len = qdisc_pkt_len(skb); + + /* As sendslope is a negative number, this will decrease the + * amount of q->credits. 
+ */ + credits = credits_from_len(len, q->sendslope, q->port_rate); + credits += q->credits; + + q->credits = max_t(s64, credits, q->locredit); + q->last = now; + + return skb; +} + +static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) +{ + return qdisc_dequeue_head(sch); +} + +static struct sk_buff *cbs_dequeue(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + return q->dequeue(sch); +} + +static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { + [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, +}; + +static void cbs_disable_offload(struct net_device *dev, + struct cbs_sched_data *q) +{ + struct tc_cbs_qopt_offload cbs = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + cbs.queue = q->queue; + cbs.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); + if (err < 0) + pr_warn("Couldn't disable CBS offload for queue %d\n", + cbs.queue); +} + +static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, + const struct tc_cbs_qopt *opt) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_cbs_qopt_offload cbs = { }; + int err; + + if (!ops->ndo_setup_tc) + return -EOPNOTSUPP; + + cbs.queue = q->queue; + + cbs.enable = 1; + cbs.hicredit = opt->hicredit; + cbs.locredit = opt->locredit; + cbs.idleslope = opt->idleslope; + cbs.sendslope = opt->sendslope; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); + if (err < 0) + return err; + + q->enqueue = cbs_enqueue_offload; + q->dequeue = cbs_dequeue_offload; + + return 0; +} + +static int cbs_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_CBS_MAX + 1]; + struct tc_cbs_qopt *qopt; + int err; + + err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, NULL); + if (err < 0) + return err; + + if (!tb[TCA_CBS_PARMS]) + return -EINVAL; + + qopt = nla_data(tb[TCA_CBS_PARMS]); + + if (!qopt->offload) { + struct ethtool_link_ksettings ecmd; + s64 link_speed; + + if (!__ethtool_get_link_ksettings(dev, &ecmd)) + link_speed = ecmd.base.speed; + else + link_speed = SPEED_1000; + + q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; + + cbs_disable_offload(dev, q); + } else { + err = cbs_enable_offload(dev, q, qopt); + if (err < 0) + return err; + } + + /* Everything went OK, save the parameters used. 
*/ + q->hicredit = qopt->hicredit; + q->locredit = qopt->locredit; + q->idleslope = qopt->idleslope * BYTES_PER_KBIT; + q->sendslope = qopt->sendslope * BYTES_PER_KBIT; + q->offload = qopt->offload; + + return 0; +} + +static int cbs_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + if (!opt) + return -EINVAL; + + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + q->enqueue = cbs_enqueue_soft; + q->dequeue = cbs_dequeue_soft; + + qdisc_watchdog_init(&q->watchdog, sch); + + return cbs_change(sch, opt); +} + +static void cbs_destroy(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + qdisc_watchdog_cancel(&q->watchdog); + + cbs_disable_offload(dev, q); +} + +static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct tc_cbs_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.hicredit = q->hicredit; + opt.locredit = q->locredit; + opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT); + opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT); + opt.offload = q->offload; + + if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { + .id = "cbs", + .priv_size = sizeof(struct cbs_sched_data), + .enqueue = cbs_enqueue, + .dequeue = cbs_dequeue, + .peek = qdisc_peek_dequeued, + .init = cbs_init, + .reset = qdisc_reset_queue, + .destroy = cbs_destroy, + .change = cbs_change, + .dump = cbs_dump, + .owner = THIS_MODULE, +}; + +static int __init cbs_module_init(void) +{ + return register_qdisc(&cbs_qdisc_ops); +} + +static void __exit cbs_module_exit(void) +{ + unregister_qdisc(&cbs_qdisc_ops); +} +module_init(cbs_module_init) +module_exit(cbs_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 2d0e8d4bdc29..5bbcef3dcd8c 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -321,6 +321,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } @@ -412,7 +413,7 @@ static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct drr_sched *q = qdisc_priv(sch); int err; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; err = qdisc_class_hash_init(&q->clhash); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 2836c80c7aa5..fb4fb71c68cf 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -344,7 +344,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) goto errout; - err = tcf_block_get(&p->block, &p->filter_list); + err = tcf_block_get(&p->block, &p->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index de3b57ceca7b..0305d791ea94 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -105,6 +105,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return 0; } 
@@ -481,7 +482,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) return err; } - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index bf8c81e07c70..3839cbbdc32b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -288,9 +288,9 @@ unsigned long dev_trans_start(struct net_device *dev) } EXPORT_SYMBOL(dev_trans_start); -static void dev_watchdog(unsigned long arg) +static void dev_watchdog(struct timer_list *t) { - struct net_device *dev = (struct net_device *)arg; + struct net_device *dev = from_timer(dev, t, watchdog_timer); netif_tx_lock(dev); if (!qdisc_tx_is_noop(dev)) { @@ -603,8 +603,14 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, struct Qdisc *sch; unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size; int err = -ENOBUFS; - struct net_device *dev = dev_queue->dev; + struct net_device *dev; + + if (!dev_queue) { + err = -EINVAL; + goto errout; + } + dev = dev_queue->dev; p = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); @@ -689,10 +695,8 @@ void qdisc_reset(struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_reset); -static void qdisc_rcu_free(struct rcu_head *head) +static void qdisc_free(struct Qdisc *qdisc) { - struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); - if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); @@ -725,11 +729,7 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(qdisc->gso_skb); kfree_skb(qdisc->skb_bad_txq); - /* - * gen_estimator est_timer() might access qdisc->q.lock, - * wait a RCU grace period before freeing qdisc. - */ - call_rcu(&qdisc->rcu_head, qdisc_rcu_free); + qdisc_free(qdisc); } EXPORT_SYMBOL(qdisc_destroy); @@ -960,7 +960,7 @@ void dev_init_scheduler(struct net_device *dev) if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); + timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } static void shutdown_scheduler_queue(struct net_device *dev, @@ -1024,3 +1024,49 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, } } EXPORT_SYMBOL(psched_ratecfg_precompute); + +static void mini_qdisc_rcu_func(struct rcu_head *head) +{ +} + +void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, + struct tcf_proto *tp_head) +{ + struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq); + struct mini_Qdisc *miniq; + + if (!tp_head) { + RCU_INIT_POINTER(*miniqp->p_miniq, NULL); + return; + } + + miniq = !miniq_old || miniq_old == &miniqp->miniq2 ? + &miniqp->miniq1 : &miniqp->miniq2; + + /* We need to make sure that readers won't see the miniq + * we are about to modify. So wait until previous call_rcu_bh callback + * is done. + */ + rcu_barrier_bh(); + miniq->filter_list = tp_head; + rcu_assign_pointer(*miniqp->p_miniq, miniq); + + if (miniq_old) + /* This is counterpart of the rcu barrier above. We need to + * block potential new user of miniq_old until all readers + * are not seeing it. 
+ */ + call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func); +} +EXPORT_SYMBOL(mini_qdisc_pair_swap); + +void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, + struct mini_Qdisc __rcu **p_miniq) +{ + miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; + miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; + miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; + miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; + miniqp->p_miniq = p_miniq; +} +EXPORT_SYMBOL(mini_qdisc_pair_init); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3f88b75488b0..d04068a97d81 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1033,7 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); return err; @@ -1144,6 +1144,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } @@ -1405,7 +1406,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) return err; q->eligible = RB_ROOT; - err = tcf_block_get(&q->root.block, &q->root.filter_list); + err = tcf_block_get(&q->root.block, &q->root.filter_list, sch); if (err) return err; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 7e148376ba52..fa0380730ff0 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -142,6 +142,7 @@ struct htb_class { struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ unsigned int drops ____cacheline_aligned_in_smp; + unsigned int overlimits; }; struct htb_level { @@ -243,6 +244,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } @@ -533,6 +535,9 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) if (new_mode == cl->cmode) return; + if (new_mode == HTB_CANT_SEND) + cl->overlimits++; + if (cl->prio_activity) { /* not necessary: speed optimization */ if (cl->cmode != HTB_CANT_SEND) htb_deactivate_prios(q, cl); @@ -1026,7 +1031,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; @@ -1143,6 +1148,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) struct htb_class *cl = (struct htb_class *)arg; struct gnet_stats_queue qs = { .drops = cl->drops, + .overlimits = cl->overlimits, }; __u32 qlen = 0; @@ -1388,7 +1394,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); goto failure; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 44de4ee51ce9..5ecc38f35d47 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -20,6 +20,8 @@ struct ingress_sched_data { struct tcf_block *block; + struct tcf_block_ext_info block_info; + struct mini_Qdisc_pair miniqp; }; static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) @@ -53,13 +55,26 @@ static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl) return q->block; } +static void 
clsact_chain_head_change(struct tcf_proto *tp_head, void *priv) +{ + struct mini_Qdisc_pair *miniqp = priv; + + mini_qdisc_pair_swap(miniqp, tp_head); +} + static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->block, &dev->ingress_cl_list); + mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); + + q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + q->block_info.chain_head_change = clsact_chain_head_change; + q->block_info.chain_head_change_priv = &q->miniqp; + + err = tcf_block_get_ext(&q->block, sch, &q->block_info); if (err) return err; @@ -73,7 +88,7 @@ static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); - tcf_block_put(q->block); + tcf_block_put_ext(q->block, sch, &q->block_info); net_dec_ingress_queue(); } @@ -114,6 +129,10 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { struct clsact_sched_data { struct tcf_block *ingress_block; struct tcf_block *egress_block; + struct tcf_block_ext_info ingress_block_info; + struct tcf_block_ext_info egress_block_info; + struct mini_Qdisc_pair miniqp_ingress; + struct mini_Qdisc_pair miniqp_egress; }; static unsigned long clsact_find(struct Qdisc *sch, u32 classid) @@ -153,13 +172,25 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list); + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); + + q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + q->ingress_block_info.chain_head_change = clsact_chain_head_change; + q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress; + + err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info); if (err) return err; - err = tcf_block_get(&q->egress_block, &dev->egress_cl_list); + mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); + + q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS; + q->egress_block_info.chain_head_change = clsact_chain_head_change; + q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; + + err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); if (err) - return err; + goto err_egress_block_get; net_inc_ingress_queue(); net_inc_egress_queue(); @@ -167,14 +198,18 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) sch->flags |= TCQ_F_CPUSTATS; return 0; + +err_egress_block_get: + tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); + return err; } static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); - tcf_block_put(q->egress_block); - tcf_block_put(q->ingress_block); + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); + tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); net_dec_ingress_queue(); net_dec_egress_queue(); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index f3a3e507422b..213b586a06a0 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -130,15 +130,7 @@ static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) static struct netdev_queue *mq_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { - unsigned int ntx = TC_H_MIN(tcm->tcm_parent); - struct netdev_queue *dev_queue = mq_queue_get(sch, ntx); - - if (!dev_queue) { - struct net_device *dev = qdisc_dev(sch); - - return 
netdev_get_tx_queue(dev, 0); - } - return dev_queue; + return mq_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 6bcdfe6e7b63..b85885a9d8a1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -18,10 +18,16 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> +#include <net/pkt_cls.h> struct mqprio_sched { struct Qdisc **qdiscs; + u16 mode; + u16 shaper; int hw_offload; + u32 flags; + u64 min_rate[TC_QOPT_MAX_QUEUE]; + u64 max_rate[TC_QOPT_MAX_QUEUE]; }; static void mqprio_destroy(struct Qdisc *sch) @@ -39,9 +45,18 @@ static void mqprio_destroy(struct Qdisc *sch) } if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { - struct tc_mqprio_qopt mqprio = {}; - - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &mqprio); + struct tc_mqprio_qopt_offload mqprio = { { 0 } }; + + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + case TC_MQPRIO_MODE_CHANNEL: + dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_MQPRIO, + &mqprio); + break; + default: + return; + } } else { netdev_set_num_tc(dev, 0); } @@ -97,6 +112,26 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) return 0; } +static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = { + [TCA_MQPRIO_MODE] = { .len = sizeof(u16) }, + [TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) }, + [TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED }, + [TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED }, +}; + +static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy, int len) +{ + int nested_len = nla_len(nla) - NLA_ALIGN(len); + + if (nested_len >= nla_attr_size(0)) + return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), + nested_len, policy, NULL); + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + return 0; +} + static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) { struct net_device *dev = qdisc_dev(sch); @@ -105,6 +140,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; + struct nlattr *tb[TCA_MQPRIO_MAX + 1]; + struct nlattr *attr; + int rem; + int len; BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); @@ -115,6 +154,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; + /* make certain can allocate enough classids to handle queues */ + if (dev->num_tx_queues >= TC_H_MIN_PRIORITY) + return -ENOMEM; + if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; @@ -122,6 +165,59 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (mqprio_parse_opt(dev, qopt)) return -EINVAL; + len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); + if (len > 0) { + err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, + sizeof(*qopt)); + if (err < 0) + return err; + + if (!qopt->hw) + return -EINVAL; + + if (tb[TCA_MQPRIO_MODE]) { + priv->flags |= TC_MQPRIO_F_MODE; + priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); + } + + if (tb[TCA_MQPRIO_SHAPER]) { + priv->flags |= TC_MQPRIO_F_SHAPER; + priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); + } + + if (tb[TCA_MQPRIO_MIN_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], + rem) { + if (nla_type(attr) != 
TCA_MQPRIO_MIN_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->min_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MIN_RATE; + } + + if (tb[TCA_MQPRIO_MAX_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->max_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MAX_RATE; + } + } + /* pre-allocate qdisc, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); @@ -146,14 +242,36 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { - struct tc_mqprio_qopt mqprio = *qopt; + struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + if (priv->shaper != TC_MQPRIO_SHAPER_DCB) + return -EINVAL; + break; + case TC_MQPRIO_MODE_CHANNEL: + mqprio.flags = priv->flags; + if (priv->flags & TC_MQPRIO_F_MODE) + mqprio.mode = priv->mode; + if (priv->flags & TC_MQPRIO_F_SHAPER) + mqprio.shaper = priv->shaper; + if (priv->flags & TC_MQPRIO_F_MIN_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.min_rate[i] = priv->min_rate[i]; + if (priv->flags & TC_MQPRIO_F_MAX_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.max_rate[i] = priv->max_rate[i]; + break; + default: + return -EINVAL; + } + err = dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_QDISC_MQPRIO, &mqprio); if (err) return err; - priv->hw_offload = mqprio.hw; + priv->hw_offload = mqprio.qopt.hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -193,7 +311,7 @@ static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); - unsigned long ntx = cl - 1 - netdev_get_num_tc(dev); + unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; @@ -223,11 +341,51 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, return 0; } +static int dump_rates(struct mqprio_sched *priv, + struct tc_mqprio_qopt *opt, struct sk_buff *skb) +{ + struct nlattr *nest; + int i; + + if (priv->flags & TC_MQPRIO_F_MIN_RATE) { + nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < opt->num_tc; i++) { + if (nla_put(skb, TCA_MQPRIO_MIN_RATE64, + sizeof(priv->min_rate[i]), + &priv->min_rate[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + + if (priv->flags & TC_MQPRIO_F_MAX_RATE) { + nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < opt->num_tc; i++) { + if (nla_put(skb, TCA_MQPRIO_MAX_RATE64, + sizeof(priv->max_rate[i]), + &priv->max_rate[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); - unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; unsigned int i; @@ -258,12 +416,25 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.offset[i] = 
dev->tc_to_txq[i].offset; } - if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_MODE) && + nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode)) goto nla_put_failure; - return skb->len; + if ((priv->flags & TC_MQPRIO_F_SHAPER) && + nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_MIN_RATE || + priv->flags & TC_MQPRIO_F_MAX_RATE) && + (dump_rates(priv, &opt, skb) != 0)) + goto nla_put_failure; + + return nla_nest_end(skb, nla); nla_put_failure: - nlmsg_trim(skb, b); + nlmsg_trim(skb, nla); return -1; } @@ -282,38 +453,35 @@ static unsigned long mqprio_find(struct Qdisc *sch, u32 classid) struct net_device *dev = qdisc_dev(sch); unsigned int ntx = TC_H_MIN(classid); - if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev)) - return 0; - return ntx; + /* There are essentially two regions here that have valid classid + * values. The first region will have a classid value of 1 through + * num_tx_queues. All of these are backed by actual Qdiscs. + */ + if (ntx < TC_H_MIN_PRIORITY) + return (ntx <= dev->num_tx_queues) ? ntx : 0; + + /* The second region represents the hardware traffic classes. These + * are represented by classid values of TC_H_MIN_PRIORITY through + * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1 + */ + return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ? ntx : 0; } static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct net_device *dev = qdisc_dev(sch); + if (cl < TC_H_MIN_PRIORITY) { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + struct net_device *dev = qdisc_dev(sch); + int tc = netdev_txq_to_tc(dev, cl - 1); - if (cl <= netdev_get_num_tc(dev)) { + tcm->tcm_parent = (tc < 0) ? 
0 : + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(tc + TC_H_MIN_PRIORITY)); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + } else { tcm->tcm_parent = TC_H_ROOT; tcm->tcm_info = 0; - } else { - int i; - struct netdev_queue *dev_queue; - - dev_queue = mqprio_queue_get(sch, cl); - tcm->tcm_parent = 0; - for (i = 0; i < netdev_get_num_tc(dev); i++) { - struct netdev_tc_txq tc = dev->tc_to_txq[i]; - int q_idx = cl - netdev_get_num_tc(dev); - - if (q_idx > tc.offset && - q_idx <= tc.offset + tc.count) { - tcm->tcm_parent = - TC_H_MAKE(TC_H_MAJ(sch->handle), - TC_H_MIN(i + 1)); - break; - } - } - tcm->tcm_info = dev_queue->qdisc_sleeping->handle; } tcm->tcm_handle |= TC_H_MIN(cl); return 0; @@ -324,15 +492,14 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, __releases(d->lock) __acquires(d->lock) { - struct net_device *dev = qdisc_dev(sch); - - if (cl <= netdev_get_num_tc(dev)) { + if (cl >= TC_H_MIN_PRIORITY) { int i; __u32 qlen = 0; struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; - struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; + struct net_device *dev = qdisc_dev(sch); + struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; /* Drop lock here it will be reclaimed before touching * statistics this is required because the d->lock we @@ -385,17 +552,36 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) /* Walk hierarchy with a virtual class per tc */ arg->count = arg->skip; - for (ntx = arg->skip; - ntx < dev->num_tx_queues + netdev_get_num_tc(dev); - ntx++) { + for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) { + if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + + /* Pad the values and skip over unused traffic classes */ + if (ntx < TC_MAX_QUEUE) { + arg->count = TC_MAX_QUEUE; + ntx = TC_MAX_QUEUE; + } + + /* Reset offset, sort out remaining per-queue qdiscs */ + for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) { if (arg->fn(sch, ntx + 1, arg) < 0) { arg->stop = 1; - break; + return; } arg->count++; } } +static struct netdev_queue *mqprio_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) +{ + return mqprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); +} + static const struct Qdisc_class_ops mqprio_class_ops = { .graft = mqprio_graft, .leaf = mqprio_leaf, @@ -403,6 +589,7 @@ static const struct Qdisc_class_ops mqprio_class_ops = { .walk = mqprio_walk, .dump = mqprio_dump_class, .dump_stats = mqprio_dump_class_stats, + .select_queue = mqprio_select_queue, }; static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index ff4fc3e0facd..012216386c0b 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -54,6 +54,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ case TC_ACT_SHOT: return NULL; } @@ -245,7 +246,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b1266e75ca43..dd70924cbcdf 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -77,8 +77,8 @@ struct netem_sched_data { struct qdisc_watchdog watchdog; - psched_tdiff_t latency; - psched_tdiff_t jitter; 
+ s64 latency; + s64 jitter; u32 loss; u32 ecn; @@ -135,6 +135,13 @@ struct netem_sched_data { u32 a5; /* p23 used only in 4-states */ } clg; + struct tc_netem_slot slot_config; + struct slotstate { + u64 slot_next; + s32 packets_left; + s32 bytes_left; + } slot; + }; /* Time stamp put into socket buffer control block @@ -145,16 +152,9 @@ struct netem_sched_data { * we save skb->tstamp value in skb->cb[] before destroying it. */ struct netem_skb_cb { - psched_time_t time_to_send; - ktime_t tstamp_save; + u64 time_to_send; }; - -static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) -{ - return rb_entry(rb, struct sk_buff, rbnode); -} - static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { /* we assume we can use skb next/prev/tstamp as storage for rb_node */ @@ -312,11 +312,11 @@ static bool loss_event(struct netem_sched_data *q) * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. */ -static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, - struct crndstate *state, - const struct disttable *dist) +static s64 tabledist(s64 mu, s32 sigma, + struct crndstate *state, + const struct disttable *dist) { - psched_tdiff_t x; + s64 x; long t; u32 rnd; @@ -327,7 +327,7 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, /* default uniform distribution */ if (dist == NULL) - return (rnd % (2*sigma)) - sigma + mu; + return (rnd % (2 * sigma)) - sigma + mu; t = dist->table[rnd % dist->size]; x = (sigma % NETEM_DIST_SCALE) * t; @@ -339,10 +339,8 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } -static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q) +static u64 packet_time_ns(u64 len, const struct netem_sched_data *q) { - u64 ticks; - len += q->packet_overhead; if (q->cell_size) { @@ -353,21 +351,19 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche len = cells * (q->cell_size + q->cell_overhead); } - ticks = (u64)len * NSEC_PER_SEC; - - do_div(ticks, q->rate); - return PSCHED_NS2TICKS(ticks); + return div64_u64(len * NSEC_PER_SEC, q->rate); } static void tfifo_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - struct rb_node *p; + struct rb_node *p = rb_first(&q->t_root); - while ((p = rb_first(&q->t_root))) { - struct sk_buff *skb = netem_rb_to_skb(p); + while (p) { + struct sk_buff *skb = rb_to_skb(p); - rb_erase(p, &q->t_root); + p = rb_next(p); + rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } } @@ -375,14 +371,14 @@ static void tfifo_reset(struct Qdisc *sch) static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; + u64 tnext = netem_skb_cb(nskb)->time_to_send; struct rb_node **p = &q->t_root.rb_node, *parent = NULL; while (*p) { struct sk_buff *skb; parent = *p; - skb = netem_rb_to_skb(parent); + skb = rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) p = &parent->rb_right; else @@ -521,13 +517,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (q->gap == 0 || /* not doing reordering */ q->counter < q->gap - 1 || /* inside last reordering gap */ q->reorder < get_crandom(&q->reorder_cor)) { - psched_time_t now; - psched_tdiff_t delay; + u64 now; + s64 delay; delay = tabledist(q->latency, q->jitter, 
@@ -521,13 +517,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (q->gap == 0 ||		/* not doing reordering */
 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
 	    q->reorder < get_crandom(&q->reorder_cor)) {
-		psched_time_t now;
-		psched_tdiff_t delay;
+		u64 now;
+		s64 delay;
 
 		delay = tabledist(q->latency, q->jitter,
 				  &q->delay_cor, q->delay_dist);
 
-		now = psched_get_time();
+		now = ktime_get_ns();
 
 		if (q->rate) {
 			struct netem_skb_cb *last = NULL;
@@ -538,7 +534,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 				struct sk_buff *t_skb;
 				struct netem_skb_cb *t_last;
 
-				t_skb = netem_rb_to_skb(rb_last(&q->t_root));
+				t_skb = skb_rb_last(&q->t_root);
 				t_last = netem_skb_cb(t_skb);
 				if (!last ||
 				    t_last->time_to_send > last->time_to_send) {
@@ -553,15 +549,14 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 			 * from delay.
 			 */
 			delay -= last->time_to_send - now;
-			delay = max_t(psched_tdiff_t, 0, delay);
+			delay = max_t(s64, 0, delay);
 			now = last->time_to_send;
 		}
 
-		delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
+		delay += packet_time_ns(qdisc_pkt_len(skb), q);
 	}
 
 	cb->time_to_send = now + delay;
-	cb->tstamp_save = skb->tstamp;
 	++q->counter;
 	tfifo_enqueue(skb, sch);
 	} else {
@@ -569,7 +564,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		 * Do re-ordering by putting one out of N packets at the front
 		 * of the queue.
 		 */
-		cb->time_to_send = psched_get_time();
+		cb->time_to_send = ktime_get_ns();
 		q->counter = 0;
 
 		netem_enqueue_skb_head(&sch->q, skb);
@@ -600,6 +595,20 @@ finish_segs:
 	return NET_XMIT_SUCCESS;
 }
 
+/* Delay the next round with a new future slot with a
+ * correct number of bytes and packets.
+ */
+
+static void get_slot_next(struct netem_sched_data *q, u64 now)
+{
+	q->slot.slot_next = now + q->slot_config.min_delay +
+		(prandom_u32() *
+			(q->slot_config.max_delay -
+				q->slot_config.min_delay) >> 32);
+	q->slot.packets_left = q->slot_config.max_packets;
+	q->slot.bytes_left = q->slot_config.max_bytes;
+}
+
 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -616,20 +625,26 @@ deliver:
 	}
 	p = rb_first(&q->t_root);
 	if (p) {
-		psched_time_t time_to_send;
+		u64 time_to_send;
+		u64 now = ktime_get_ns();
 
-		skb = netem_rb_to_skb(p);
+		skb = rb_to_skb(p);
 
 		/* if more time remaining? */
 		time_to_send = netem_skb_cb(skb)->time_to_send;
-		if (time_to_send <= psched_get_time()) {
-			rb_erase(p, &q->t_root);
+		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
+			get_slot_next(q, now);
+		if (time_to_send <= now && q->slot.slot_next <= now) {
+			rb_erase(p, &q->t_root);
 			sch->q.qlen--;
 			qdisc_qstats_backlog_dec(sch, skb);
 			skb->next = NULL;
 			skb->prev = NULL;
-			skb->tstamp = netem_skb_cb(skb)->tstamp_save;
+			/* skb->dev shares skb->rbnode area,
+			 * we need to restore its value.
+			 */
+			skb->dev = qdisc_dev(sch);
 
 #ifdef CONFIG_NET_CLS_ACT
 			/*
@@ -640,6 +655,14 @@ deliver:
 			skb->tstamp = 0;
 #endif
 
+			if (q->slot.slot_next) {
+				q->slot.packets_left--;
+				q->slot.bytes_left -= qdisc_pkt_len(skb);
+				if (q->slot.packets_left <= 0 ||
+				    q->slot.bytes_left <= 0)
+					get_slot_next(q, now);
+			}
+
 			if (q->qdisc) {
 				unsigned int pkt_len = qdisc_pkt_len(skb);
 				struct sk_buff *to_free = NULL;
@@ -663,7 +686,10 @@ deliver:
 			if (skb)
 				goto deliver;
 		}
-		qdisc_watchdog_schedule(&q->watchdog, time_to_send);
+
+		qdisc_watchdog_schedule_ns(&q->watchdog,
+					   max(time_to_send,
+					       q->slot.slot_next));
 	}
 
 	if (q->qdisc) {
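The slotting code above only lets a packet out once both its own time_to_send and the current slot have arrived; get_slot_next() draws the next slot start uniformly from [min_delay, max_delay] and refills the per-slot packet and byte budgets. A simplified userspace model of that bookkeeping follows, with made-up names, a plain rand() standing in for prandom_u32(), and arbitrary budget values.

	/* Simplified model of netem's slot bookkeeping: a packet whose own
	 * time_to_send has passed may still be held until the slot opens;
	 * exhausting the slot budgets schedules the next slot. Illustrative only.
	 */
	#include <stdio.h>
	#include <stdint.h>
	#include <stdlib.h>

	struct slot_config {
		uint64_t min_delay, max_delay;	/* ns */
		int32_t max_packets, max_bytes;
	};

	struct slot_state {
		uint64_t slot_next;	/* absolute time the next slot opens, ns */
		int32_t packets_left;
		int32_t bytes_left;
	};

	static void get_slot_next_model(struct slot_state *s,
					const struct slot_config *c, uint64_t now)
	{
		uint64_t span = c->max_delay - c->min_delay;

		s->slot_next = now + c->min_delay +
			       (span ? (uint64_t)rand() % span : 0);
		s->packets_left = c->max_packets;
		s->bytes_left = c->max_bytes;
	}

	/* Returns 1 if a ripe packet of pkt_len bytes may go out at 'now',
	 * charging it against the slot budgets. */
	static int slot_deliver(struct slot_state *s, const struct slot_config *c,
				uint64_t now, uint32_t pkt_len)
	{
		if (s->slot_next > now)
			return 0;		/* slot not open yet */
		s->packets_left--;
		s->bytes_left -= pkt_len;
		if (s->packets_left <= 0 || s->bytes_left <= 0)
			get_slot_next_model(s, c, now);
		return 1;
	}

	int main(void)
	{
		struct slot_config cfg = { 800000, 1000000, 2, 4000 };
		struct slot_state st = { 0, cfg.max_packets, cfg.max_bytes };
		uint64_t now = 0;

		for (int i = 0; i < 5; i++) {
			now += 500000;	/* pretend 0.5 ms passes per packet */
			printf("t=%llu ns: packet %s\n", (unsigned long long)now,
			       slot_deliver(&st, &cfg, now, 1500) ? "sent" : "held");
		}
		return 0;
	}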
@@ -694,6 +720,7 @@ static void dist_free(struct disttable *d)
  * Distribution data is a variable size payload containing
  * signed 16 bit values.
  */
+
 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -724,6 +751,23 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 	return 0;
 }
 
+static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
+{
+	const struct tc_netem_slot *c = nla_data(attr);
+
+	q->slot_config = *c;
+	if (q->slot_config.max_packets == 0)
+		q->slot_config.max_packets = INT_MAX;
+	if (q->slot_config.max_bytes == 0)
+		q->slot_config.max_bytes = INT_MAX;
+	q->slot.packets_left = q->slot_config.max_packets;
+	q->slot.bytes_left = q->slot_config.max_bytes;
+	if (q->slot_config.min_delay | q->slot_config.max_delay)
+		q->slot.slot_next = ktime_get_ns();
+	else
+		q->slot.slot_next = 0;
+}
+
 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
 {
 	const struct tc_netem_corr *c = nla_data(attr);
@@ -825,6 +869,9 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
+	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
+	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
+	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
 };
 
 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
@@ -892,8 +939,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 
 	sch->limit = qopt->limit;
 
-	q->latency = qopt->latency;
-	q->jitter = qopt->jitter;
+	q->latency = PSCHED_TICKS2NS(qopt->latency);
+	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
 	q->limit = qopt->limit;
 	q->gap = qopt->gap;
 	q->counter = 0;
@@ -922,9 +969,18 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 		q->rate = max_t(u64, q->rate,
 				nla_get_u64(tb[TCA_NETEM_RATE64]));
 
+	if (tb[TCA_NETEM_LATENCY64])
+		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
+
+	if (tb[TCA_NETEM_JITTER64])
+		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
+
 	if (tb[TCA_NETEM_ECN])
 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 
+	if (tb[TCA_NETEM_SLOT])
+		get_slot(q, tb[TCA_NETEM_SLOT]);
+
 	return ret;
 }
 
@@ -1014,9 +1070,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct tc_netem_reorder reorder;
 	struct tc_netem_corrupt corrupt;
 	struct tc_netem_rate rate;
+	struct tc_netem_slot slot;
 
-	qopt.latency = q->latency;
-	qopt.jitter = q->jitter;
+	qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
+			     UINT_MAX);
+	qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
+			    UINT_MAX);
 	qopt.limit = q->limit;
 	qopt.loss = q->loss;
 	qopt.gap = q->gap;
@@ -1024,6 +1083,12 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
 		goto nla_put_failure;
 
+	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
+		goto nla_put_failure;
+
+	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
+		goto nla_put_failure;
+
 	cor.delay_corr = q->delay_cor.rho;
 	cor.loss_corr = q->loss_cor.rho;
 	cor.dup_corr = q->dup_cor.rho;
@@ -1060,6 +1125,16 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (dump_loss_model(q, skb) != 0)
 		goto nla_put_failure;
 
+	if (q->slot_config.min_delay | q->slot_config.max_delay) {
+		slot = q->slot_config;
+		if (slot.max_packets == INT_MAX)
+			slot.max_packets = 0;
+		if (slot.max_bytes == INT_MAX)
+			slot.max_bytes = 0;
+		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
+			goto nla_put_failure;
+	}
+
 	return nla_nest_end(skb, nla);
 
 nla_put_failure:
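Because latency and jitter are now kept as 64-bit nanosecond values, netem_dump() above clamps the legacy 32-bit tick fields of struct tc_netem_qopt to UINT_MAX and exports the exact values through TCA_NETEM_LATENCY64/TCA_NETEM_JITTER64. A small sketch of that clamping; it assumes the usual PSCHED_SHIFT of 6 (1 tick = 64 ns), which should be verified against include/net/pkt_sched.h.

	/* Why the 64-bit attributes exist: the legacy tick field saturates at
	 * UINT_MAX ticks. PSCHED_SHIFT = 6 is an assumption here.
	 */
	#include <stdio.h>
	#include <stdint.h>
	#include <limits.h>

	#define PSCHED_SHIFT 6	/* assumed; see include/net/pkt_sched.h */

	static uint32_t ns_to_legacy_ticks(int64_t ns)
	{
		uint64_t ticks = (uint64_t)ns >> PSCHED_SHIFT;

		return ticks > UINT_MAX ? UINT_MAX : (uint32_t)ticks;
	}

	int main(void)
	{
		/* ~5 minutes of latency no longer fits in the 32-bit tick field */
		printf("%u\n", ns_to_legacy_ticks(300LL * 1000000000));
		printf("%u\n", ns_to_legacy_ticks(100000000));	/* 100 ms still fits */
		return 0;
	}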
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 6c2791d6102d..776c694c77c7 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -74,6 +74,7 @@ struct pie_sched_data {
 	struct pie_vars vars;
 	struct pie_stats stats;
 	struct timer_list adapt_timer;
+	struct Qdisc *sch;
 };
 
 static void pie_params_init(struct pie_params *params)
@@ -422,10 +423,10 @@ static void calculate_probability(struct Qdisc *sch)
 	pie_vars_init(&q->vars);
 }
 
-static void pie_timer(unsigned long arg)
+static void pie_timer(struct timer_list *t)
 {
-	struct Qdisc *sch = (struct Qdisc *)arg;
-	struct pie_sched_data *q = qdisc_priv(sch);
+	struct pie_sched_data *q = from_timer(q, t, adapt_timer);
+	struct Qdisc *sch = q->sch;
 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 
 	spin_lock(root_lock);
@@ -446,7 +447,8 @@ static int pie_init(struct Qdisc *sch, struct nlattr *opt)
 	pie_vars_init(&q->vars);
 	sch->limit = q->params.limit;
 
-	setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch);
+	q->sch = sch;
+	timer_setup(&q->adapt_timer, pie_timer, 0);
 
 	if (opt) {
 		int err = pie_change(sch, opt);
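sch_pie above (and sch_red and sch_sfq further down) move to the timer_setup()/from_timer() API: the callback now receives the struct timer_list pointer and recovers its private data via container_of(), which is why a struct Qdisc *sch back-pointer is added to the private structs. A standalone userspace mock-up of that container_of() pattern follows; the struct and function names are invented for illustration and are not the kernel's.

	/* Userspace mock-up of the from_timer()/container_of() pattern used by
	 * the timer conversions in this series.
	 */
	#include <stdio.h>
	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct mock_timer {
		void (*func)(struct mock_timer *t);
	};

	struct mock_sched_data {
		int some_state;
		struct mock_timer adapt_timer;	/* embedded, like adapt_timer above */
	};

	static void mock_timer_cb(struct mock_timer *t)
	{
		/* from_timer(q, t, adapt_timer) expands to container_of() like this */
		struct mock_sched_data *q =
			container_of(t, struct mock_sched_data, adapt_timer);

		printf("callback sees some_state = %d\n", q->some_state);
	}

	int main(void)
	{
		struct mock_sched_data q = { .some_state = 42 };

		q.adapt_timer.func = mock_timer_cb;
		q.adapt_timer.func(&q.adapt_timer);	/* simulate timer expiry */
		return 0;
	}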
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 2dd6c68ae91e..2c79559a0d31 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -50,6 +50,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	case TC_ACT_QUEUED:
 	case TC_ACT_TRAP:
 		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		/* fall through */
 	case TC_ACT_SHOT:
 		return NULL;
 	}
@@ -212,7 +213,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
 	if (!opt)
 		return -EINVAL;
 
-	err = tcf_block_get(&q->block, &q->filter_list);
+	err = tcf_block_get(&q->block, &q->filter_list, sch);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 6ddfd4991108..6962b37a3ad3 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -709,6 +709,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 	case TC_ACT_STOLEN:
 	case TC_ACT_TRAP:
 		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		/* fall through */
 	case TC_ACT_SHOT:
 		return NULL;
 	}
@@ -1419,7 +1420,7 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
 	int i, j, err;
 	u32 max_cl_shift, maxbudg_shift, max_classes;
 
-	err = tcf_block_get(&q->block, &q->filter_list);
+	err = tcf_block_get(&q->block, &q->filter_list, sch);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 93b9d70a9b28..7f8ea9e297c3 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/inet_ecn.h>
 #include <net/red.h>
 
@@ -40,6 +41,7 @@ struct red_sched_data {
 	u32			limit;		/* HARD maximal queue length */
 	unsigned char		flags;
 	struct timer_list	adapt_timer;
+	struct Qdisc		*sch;
 	struct red_parms	parms;
 	struct red_vars		vars;
 	struct red_stats	stats;
@@ -147,11 +149,37 @@ static void red_reset(struct Qdisc *sch)
 	red_restart(&q->vars);
 }
 
+static int red_offload(struct Qdisc *sch, bool enable)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_red_qopt_offload opt = {
+		.handle = sch->handle,
+		.parent = sch->parent,
+	};
+
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return -EOPNOTSUPP;
+
+	if (enable) {
+		opt.command = TC_RED_REPLACE;
+		opt.set.min = q->parms.qth_min >> q->parms.Wlog;
+		opt.set.max = q->parms.qth_max >> q->parms.Wlog;
+		opt.set.probability = q->parms.max_P;
+		opt.set.is_ecn = red_use_ecn(q);
+	} else {
+		opt.command = TC_RED_DESTROY;
+	}
+
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
+}
+
 static void red_destroy(struct Qdisc *sch)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
 
 	del_timer_sync(&q->adapt_timer);
+	red_offload(sch, false);
 	qdisc_destroy(q->qdisc);
 }
 
@@ -218,13 +246,14 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
 		red_start_of_idle_period(&q->vars);
 
 	sch_tree_unlock(sch);
+	red_offload(sch, true);
 	return 0;
 }
 
-static inline void red_adaptative_timer(unsigned long arg)
+static inline void red_adaptative_timer(struct timer_list *t)
 {
-	struct Qdisc *sch = (struct Qdisc *)arg;
-	struct red_sched_data *q = qdisc_priv(sch);
+	struct red_sched_data *q = from_timer(q, t, adapt_timer);
+	struct Qdisc *sch = q->sch;
 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 
 	spin_lock(root_lock);
@@ -238,10 +267,40 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt)
 	struct red_sched_data *q = qdisc_priv(sch);
 
 	q->qdisc = &noop_qdisc;
-	setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch);
+	q->sch = sch;
+	timer_setup(&q->adapt_timer, red_adaptative_timer, 0);
 	return red_change(sch, opt);
 }
 
+static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_red_qopt_offload hw_stats = {
+		.command = TC_RED_STATS,
+		.handle = sch->handle,
+		.parent = sch->parent,
+		{
+			.stats.bstats = &sch->bstats,
+			.stats.qstats = &sch->qstats,
+		},
+	};
+	int err;
+
+	opt->flags &= ~TC_RED_OFFLOADED;
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return 0;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+					    &hw_stats);
+	if (err == -EOPNOTSUPP)
+		return 0;
+
+	if (!err)
+		opt->flags |= TC_RED_OFFLOADED;
+
+	return err;
+}
+
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
@@ -255,8 +314,13 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 		.Plog		= q->parms.Plog,
 		.Scell_log	= q->parms.Scell_log,
 	};
+	int err;
 
 	sch->qstats.backlog = q->qdisc->qstats.backlog;
+	err = red_dump_offload(sch, &opt);
+	if (err)
+		goto nla_put_failure;
+
 	opts = nla_nest_start(skb, TCA_OPTIONS);
 	if (opts == NULL)
 		goto nla_put_failure;
@@ -273,6 +337,7 @@ nla_put_failure:
 static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
 	struct red_sched_data *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
 	struct tc_red_xstats st = {
 		.early	= q->stats.prob_drop + q->stats.forced_drop,
 		.pdrop	= q->stats.pdrop,
@@ -280,6 +345,26 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
 	};
 
+	if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) {
+		struct red_stats hw_stats = {0};
+		struct tc_red_qopt_offload hw_stats_request = {
+			.command = TC_RED_XSTATS,
+			.handle = sch->handle,
+			.parent = sch->parent,
+			{
+				.xstats = &hw_stats,
+			},
+		};
+		if (!dev->netdev_ops->ndo_setup_tc(dev,
+						   TC_SETUP_QDISC_RED,
+						   &hw_stats_request)) {
+			st.early += hw_stats.prob_drop + hw_stats.forced_drop;
+			st.pdrop += hw_stats.pdrop;
+			st.other += hw_stats.other;
+			st.marked += hw_stats.prob_mark + hw_stats.forced_mark;
+		}
+	}
+
 	return gnet_stats_copy_app(d, &st, sizeof(st));
 }
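One detail of red_offload() above: struct red_parms keeps qth_min/qth_max pre-shifted left by Wlog for the averaged-queue comparison, so the offload request shifts them back down and hands the driver plain byte thresholds. A quick illustration with made-up numbers, not part of the patch.

	/* red_parms stores the thresholds scaled by Wlog; the offload path
	 * undoes that scaling so hardware sees byte values. Example only.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned int wlog = 9;			/* EWMA weight log, e.g. 9 */
		unsigned int qth_min_bytes = 30000;
		unsigned int qth_min_scaled = qth_min_bytes << wlog;	/* as stored */

		printf("stored: %u, handed to driver: %u bytes\n",
		       qth_min_scaled, qth_min_scaled >> wlog);
		return 0;
	}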
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index cc39e170b4aa..0678debdd856 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -268,6 +268,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
 	case TC_ACT_QUEUED:
 	case TC_ACT_TRAP:
 		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		/* fall through */
 	case TC_ACT_SHOT:
 		return false;
 	}
@@ -553,7 +554,7 @@ static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
 	struct sfb_sched_data *q = qdisc_priv(sch);
 	int err;
 
-	err = tcf_block_get(&q->block, &q->filter_list);
+	err = tcf_block_get(&q->block, &q->filter_list, sch);
 	if (err)
 		return err;
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 74ea863b8240..890f4a4564e7 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -145,6 +145,7 @@ struct sfq_sched_data {
 	int		perturb_period;
 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
 	struct timer_list perturb_timer;
+	struct Qdisc	*sch;
 };
 
 /*
@@ -189,6 +190,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 	case TC_ACT_QUEUED:
 	case TC_ACT_TRAP:
 		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		/* fall through */
 	case TC_ACT_SHOT:
 		return 0;
 	}
@@ -604,10 +606,10 @@ drop:
 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
 }
 
-static void sfq_perturbation(unsigned long arg)
+static void sfq_perturbation(struct timer_list *t)
 {
-	struct Qdisc *sch = (struct Qdisc *)arg;
-	struct sfq_sched_data *q = qdisc_priv(sch);
+	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
+	struct Qdisc *sch = q->sch;
 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 
 	spin_lock(root_lock);
@@ -722,10 +724,9 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 	int i;
 	int err;
 
-	setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
-			       (unsigned long)sch);
+	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
 
-	err = tcf_block_get(&q->block, &q->filter_list);
+	err = tcf_block_get(&q->block, &q->filter_list, sch);
 	if (err)
 		return err;