summaryrefslogtreecommitdiffstats
path: root/net/sched
diff options
context:
space:
mode:
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/Kconfig22
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/cls_u32.c12
-rw-r--r--net/sched/em_meta.c2
-rw-r--r--net/sched/sch_api.c36
-rw-r--r--net/sched/sch_cbq.c9
-rw-r--r--net/sched/sch_choke.c688
-rw-r--r--net/sched/sch_drr.c2
-rw-r--r--net/sched/sch_dsmark.c2
-rw-r--r--net/sched/sch_fifo.c18
-rw-r--r--net/sched/sch_generic.c7
-rw-r--r--net/sched/sch_hfsc.c4
-rw-r--r--net/sched/sch_htb.c14
-rw-r--r--net/sched/sch_mq.c1
-rw-r--r--net/sched/sch_mqprio.c23
-rw-r--r--net/sched/sch_multiq.c2
-rw-r--r--net/sched/sch_netem.c406
-rw-r--r--net/sched/sch_prio.c2
-rw-r--r--net/sched/sch_red.c11
-rw-r--r--net/sched/sch_sfb.c709
-rw-r--r--net/sched/sch_sfq.c54
-rw-r--r--net/sched/sch_tbf.c4
-rw-r--r--net/sched/sch_teql.c3
23 files changed, 1920 insertions, 114 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e318f458713e..a7a5583d4f68 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -126,6 +126,17 @@ config NET_SCH_RED
To compile this code as a module, choose M here: the
module will be called sch_red.
+config NET_SCH_SFB
+ tristate "Stochastic Fair Blue (SFB)"
+ ---help---
+ Say Y here if you want to use the Stochastic Fair Blue (SFB)
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_sfb.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_sfb.
+
config NET_SCH_SFQ
tristate "Stochastic Fairness Queueing (SFQ)"
---help---
@@ -217,6 +228,17 @@ config NET_SCH_MQPRIO
If unsure, say N.
+config NET_SCH_CHOKE
+ tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+ help
+ Say Y here if you want to use the CHOKe packet scheduler (CHOose
+ and Keep for responsive flows, CHOose and Kill for unresponsive
+ flows). This is a variation of RED which trys to penalize flows
+ that monopolize the queue.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_choke.
+
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 26ce681a2c60..2e77b8dba22e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
+obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
@@ -33,6 +34,8 @@ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
+
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 966920c14e7a..3b93fc0c8955 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -134,12 +134,12 @@ next_knode:
for (i = n->sel.nkeys; i > 0; i--, key++) {
int toff = off + key->off + (off2 & key->offmask);
- __be32 *data, _data;
+ __be32 *data, hdata;
if (skb_headroom(skb) + toff > INT_MAX)
goto out;
- data = skb_header_pointer(skb, toff, 4, &_data);
+ data = skb_header_pointer(skb, toff, 4, &hdata);
if (!data)
goto out;
if ((*data ^ key->val) & key->mask) {
@@ -187,10 +187,10 @@ check_terminal:
ht = n->ht_down;
sel = 0;
if (ht->divisor) {
- __be32 *data, _data;
+ __be32 *data, hdata;
data = skb_header_pointer(skb, off + n->sel.hoff, 4,
- &_data);
+ &hdata);
if (!data)
goto out;
sel = ht->divisor & u32_hash_fold(*data, &n->sel,
@@ -202,11 +202,11 @@ check_terminal:
if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
off2 = n->sel.off + 3;
if (n->sel.flags & TC_U32_VAROFFSET) {
- __be16 *data, _data;
+ __be16 *data, hdata;
data = skb_header_pointer(skb,
off + n->sel.offoff,
- 2, &_data);
+ 2, &hdata);
if (!data)
goto out;
off2 += ntohs(n->sel.offmask & *data) >>
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index a889d099320f..e5e174782677 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -401,7 +401,7 @@ META_COLLECTOR(int_sk_sndbuf)
META_COLLECTOR(int_sk_alloc)
{
SKIP_NONLOCAL(skb);
- dst->value = skb->sk->sk_allocation;
+ dst->value = (__force int) skb->sk->sk_allocation;
}
META_COLLECTOR(int_sk_route_caps)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 36ac0ec81ce0..7490f3f2db8b 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -398,6 +398,11 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
return stab;
}
+static void stab_kfree_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct qdisc_size_table, rcu));
+}
+
void qdisc_put_stab(struct qdisc_size_table *tab)
{
if (!tab)
@@ -407,7 +412,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
if (--tab->refcnt == 0) {
list_del(&tab->list);
- kfree(tab);
+ call_rcu_bh(&tab->rcu, stab_kfree_rcu);
}
spin_unlock(&qdisc_stab_lock);
@@ -430,7 +435,7 @@ nla_put_failure:
return -1;
}
-void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
+void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
int pkt_len, slot;
@@ -456,7 +461,7 @@ out:
pkt_len = 1;
qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
-EXPORT_SYMBOL(qdisc_calculate_pkt_len);
+EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
@@ -473,7 +478,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
timer);
- wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(wd->qdisc);
__netif_schedule(qdisc_root(wd->qdisc));
return HRTIMER_NORESTART;
@@ -495,7 +500,7 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
&qdisc_root_sleeping(wd->qdisc)->state))
return;
- wd->qdisc->flags |= TCQ_F_THROTTLED;
+ qdisc_throttled(wd->qdisc);
time = ktime_set(0, 0);
time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
@@ -505,7 +510,7 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
hrtimer_cancel(&wd->timer);
- wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
@@ -835,7 +840,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
err = PTR_ERR(stab);
goto err_out4;
}
- sch->stab = stab;
+ rcu_assign_pointer(sch->stab, stab);
}
if (tca[TCA_RATE]) {
spinlock_t *root_lock;
@@ -875,7 +880,7 @@ err_out4:
* Any broken qdiscs that would require a ops->reset() here?
* The qdisc was never in action so it shouldn't be necessary.
*/
- qdisc_put_stab(sch->stab);
+ qdisc_put_stab(rtnl_dereference(sch->stab));
if (ops->destroy)
ops->destroy(sch);
goto err_out3;
@@ -883,7 +888,7 @@ err_out4:
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
- struct qdisc_size_table *stab = NULL;
+ struct qdisc_size_table *ostab, *stab = NULL;
int err = 0;
if (tca[TCA_OPTIONS]) {
@@ -900,8 +905,9 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
return PTR_ERR(stab);
}
- qdisc_put_stab(sch->stab);
- sch->stab = stab;
+ ostab = rtnl_dereference(sch->stab);
+ rcu_assign_pointer(sch->stab, stab);
+ qdisc_put_stab(ostab);
if (tca[TCA_RATE]) {
/* NB: ignores errors from replace_estimator
@@ -1180,6 +1186,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
struct gnet_dump d;
+ struct qdisc_size_table *stab;
nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
tcm = NLMSG_DATA(nlh);
@@ -1195,7 +1202,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
goto nla_put_failure;
q->qstats.qlen = q->q.qlen;
- if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
+ stab = rtnl_dereference(q->stab);
+ if (stab && qdisc_dump_stab(skb, stab) < 0)
goto nla_put_failure;
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
@@ -1664,12 +1672,12 @@ int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
int err = 0;
- __be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
+ __be16 protocol;
struct tcf_proto *otp = tp;
reclassify:
-#endif
protocol = skb->protocol;
+#endif
err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 4aaf44c95c52..24d94c097b35 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -351,7 +351,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
int toplevel = q->toplevel;
- if (toplevel > cl->level && !(cl->q->flags & TCQ_F_THROTTLED)) {
+ if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
psched_time_t now;
psched_tdiff_t incr;
@@ -391,7 +391,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, cl->q);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
cbq_mark_toplevel(q, cl);
if (!cl->next_alive)
cbq_activate_class(cl);
@@ -625,7 +624,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
}
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
__netif_schedule(qdisc_root(sch));
return HRTIMER_NORESTART;
}
@@ -650,7 +649,6 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
ret = qdisc_enqueue(skb, cl->q);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
if (!cl->next_alive)
cbq_activate_class(cl);
return 0;
@@ -973,8 +971,9 @@ cbq_dequeue(struct Qdisc *sch)
skb = cbq_dequeue_1(sch);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
return skb;
}
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
new file mode 100644
index 000000000000..06afbaeb4c88
--- /dev/null
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,688 @@
+/*
+ * net/sched/sch_choke.c CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+
+/*
+ CHOKe stateless AQM for fair bandwidth allocation
+ =================================================
+
+ CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+ unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+ maintains no flow state. The difference from RED is an additional step
+ during the enqueuing process. If average queue size is over the
+ low threshold (qmin), a packet is chosen at random from the queue.
+ If both the new and chosen packet are from the same flow, both
+ are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+ needs to access packets in queue randomly. It has a minimal class
+ interface to allow overriding the builtin flow classifier with
+ filters.
+
+ Source:
+ R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+ Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+ IEEE INFOCOM, 2000.
+
+ A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+ Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table (packets) */
+#define CHOKE_MAX_QUEUE (128*1024 - 1)
+
+struct choke_sched_data {
+/* Parameters */
+ u32 limit;
+ unsigned char flags;
+
+ struct red_parms parms;
+
+/* Variables */
+ struct tcf_proto *filter_list;
+ struct {
+ u32 prob_drop; /* Early probability drops */
+ u32 prob_mark; /* Early probability marks */
+ u32 forced_drop; /* Forced drops, qavg > max_thresh */
+ u32 forced_mark; /* Forced marks, qavg > max_thresh */
+ u32 pdrop; /* Drops due to queue limits */
+ u32 other; /* Drops due to drop() calls */
+ u32 matched; /* Drops to flow match */
+ } stats;
+
+ unsigned int head;
+ unsigned int tail;
+
+ unsigned int tab_mask; /* size - 1 */
+
+ struct sk_buff **tab;
+};
+
+/* deliver a random number between 0 and N - 1 */
+static u32 random_N(unsigned int N)
+{
+ return reciprocal_divide(random32(), N);
+}
+
+/* number of elements in queue including holes */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+ return (q->tail - q->head) & q->tab_mask;
+}
+
+/* Is ECN parameter configured */
+static int use_ecn(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+ do {
+ q->head = (q->head + 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->head] == NULL);
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+ do {
+ q->tail = (q->tail - 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->tail] == NULL);
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = q->tab[idx];
+
+ q->tab[idx] = NULL;
+
+ if (idx == q->head)
+ choke_zap_head_holes(q);
+ if (idx == q->tail)
+ choke_zap_tail_holes(q);
+
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_drop(skb, sch);
+ qdisc_tree_decrease_qlen(sch, 1);
+ --sch->q.qlen;
+}
+
+/*
+ * Compare flow of two packets
+ * Returns true only if source and destination address and port match.
+ * false for special cases
+ */
+static bool choke_match_flow(struct sk_buff *skb1,
+ struct sk_buff *skb2)
+{
+ int off1, off2, poff;
+ const u32 *ports1, *ports2;
+ u8 ip_proto;
+ __u32 hash1;
+
+ if (skb1->protocol != skb2->protocol)
+ return false;
+
+ /* Use hash value as quick check
+ * Assumes that __skb_get_rxhash makes IP header and ports linear
+ */
+ hash1 = skb_get_rxhash(skb1);
+ if (!hash1 || hash1 != skb_get_rxhash(skb2))
+ return false;
+
+ /* Probably match, but be sure to avoid hash collisions */
+ off1 = skb_network_offset(skb1);
+ off2 = skb_network_offset(skb2);
+
+ switch (skb1->protocol) {
+ case __constant_htons(ETH_P_IP): {
+ const struct iphdr *ip1, *ip2;
+
+ ip1 = (const struct iphdr *) (skb1->data + off1);
+ ip2 = (const struct iphdr *) (skb2->data + off2);
+
+ ip_proto = ip1->protocol;
+ if (ip_proto != ip2->protocol ||
+ ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
+ return false;
+
+ if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET))
+ ip_proto = 0;
+ off1 += ip1->ihl * 4;
+ off2 += ip2->ihl * 4;
+ break;
+ }
+
+ case __constant_htons(ETH_P_IPV6): {
+ const struct ipv6hdr *ip1, *ip2;
+
+ ip1 = (const struct ipv6hdr *) (skb1->data + off1);
+ ip2 = (const struct ipv6hdr *) (skb2->data + off2);
+
+ ip_proto = ip1->nexthdr;
+ if (ip_proto != ip2->nexthdr ||
+ ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
+ ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
+ return false;
+ off1 += 40;
+ off2 += 40;
+ }
+
+ default: /* Maybe compare MAC header here? */
+ return false;
+ }
+
+ poff = proto_ports_offset(ip_proto);
+ if (poff < 0)
+ return true;
+
+ off1 += poff;
+ off2 += poff;
+
+ ports1 = (__force u32 *)(skb1->data + off1);
+ ports2 = (__force u32 *)(skb2->data + off2);
+ return *ports1 == *ports2;
+}
+
+struct choke_skb_cb {
+ u16 classid;
+};
+
+static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(skb->cb) <
+ sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb));
+ return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
+{
+ choke_skb_cb(skb)->classid = classid;
+}
+
+static u16 choke_get_classid(const struct sk_buff *skb)
+{
+ return choke_skb_cb(skb)->classid;
+}
+
+/*
+ * Classify flow using either:
+ * 1. pre-existing classification result in skb
+ * 2. fast internal classification
+ * 3. use TC filter based classification
+ */
+static bool choke_classify(struct sk_buff *skb,
+ struct Qdisc *sch, int *qerr)
+
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tcf_result res;
+ int result;
+
+ result = tc_classify(skb, q->filter_list, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ choke_set_classid(skb, TC_H_MIN(res.classid));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Select a packet at random from queue
+ * HACK: since queue can have holes from previous deletion; retry several
+ * times to find a random skb but then just give up and return the head
+ * Will return NULL if queue is empty (q->head == q->tail)
+ */
+static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
+ unsigned int *pidx)
+{
+ struct sk_buff *skb;
+ int retrys = 3;
+
+ do {
+ *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+ skb = q->tab[*pidx];
+ if (skb)
+ return skb;
+ } while (--retrys > 0);
+
+ return q->tab[*pidx = q->head];
+}
+
+/*
+ * Compare new packet with random packet in queue
+ * returns true if matched and sets *pidx
+ */
+static bool choke_match_random(const struct choke_sched_data *q,
+ struct sk_buff *nskb,
+ unsigned int *pidx)
+{
+ struct sk_buff *oskb;
+
+ if (q->head == q->tail)
+ return false;
+
+ oskb = choke_peek_random(q, pidx);
+ if (q->filter_list)
+ return choke_get_classid(nskb) == choke_get_classid(oskb);
+
+ return choke_match_flow(oskb, nskb);
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct red_parms *p = &q->parms;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+ if (q->filter_list) {
+ /* If using external classifiers, get result and record it. */
+ if (!choke_classify(skb, sch, &ret))
+ goto other_drop; /* Packet was eaten by filter */
+ }
+
+ /* Compute average queue usage (see RED) */
+ p->qavg = red_calc_qavg(p, sch->q.qlen);
+ if (red_is_idling(p))
+ red_end_of_idle_period(p);
+
+ /* Is queue small? */
+ if (p->qavg <= p->qth_min)
+ p->qcount = -1;
+ else {
+ unsigned int idx;
+
+ /* Draw a packet at random from queue and compare flow */
+ if (choke_match_random(q, skb, &idx)) {
+ q->stats.matched++;
+ choke_drop_by_idx(sch, idx);
+ goto congestion_drop;
+ }
+
+ /* Queue is large, always mark/drop */
+ if (p->qavg > p->qth_max) {
+ p->qcount = -1;
+
+ sch->qstats.overlimits++;
+ if (use_harddrop(q) || !use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ } else if (++p->qcount) {
+ if (red_mark_probability(p, p->qavg)) {
+ p->qcount = 0;
+ p->qR = red_random(p);
+
+ sch->qstats.overlimits++;
+ if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ }
+ } else
+ p->qR = red_random(p);
+ }
+
+ /* Admit new packet */
+ if (sch->q.qlen < q->limit) {
+ q->tab[q->tail] = skb;
+ q->tail = (q->tail + 1) & q->tab_mask;
+ ++sch->q.qlen;
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ q->stats.pdrop++;
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+
+ congestion_drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+
+ other_drop:
+ if (ret & __NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ if (q->head == q->tail) {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ return NULL;
+ }
+
+ skb = q->tab[q->head];
+ q->tab[q->head] = NULL;
+ choke_zap_head_holes(q);
+ --sch->q.qlen;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_bstats_update(sch, skb);
+
+ return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ unsigned int len;
+
+ len = qdisc_queue_drop(sch);
+ if (len > 0)
+ q->stats.other++;
+ else {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ }
+
+ return len;
+}
+
+static void choke_reset(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
+ [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
+ [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
+};
+
+
+static void choke_free(void *addr)
+{
+ if (addr) {
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+ }
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CHOKE_MAX + 1];
+ const struct tc_red_qopt *ctl;
+ int err;
+ struct sk_buff **old = NULL;
+ unsigned int mask;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CHOKE_PARMS] == NULL ||
+ tb[TCA_CHOKE_STAB] == NULL)
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+
+ if (ctl->limit > CHOKE_MAX_QUEUE)
+ return -EINVAL;
+
+ mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+ if (mask != q->tab_mask) {
+ struct sk_buff **ntab;
+
+ ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
+ if (!ntab)
+ ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+ if (!ntab)
+ return -ENOMEM;
+
+ sch_tree_lock(sch);
+ old = q->tab;
+ if (old) {
+ unsigned int oqlen = sch->q.qlen, tail = 0;
+
+ while (q->head != q->tail) {
+ struct sk_buff *skb = q->tab[q->head];
+
+ q->head = (q->head + 1) & q->tab_mask;
+ if (!skb)
+ continue;
+ if (tail < mask) {
+ ntab[tail++] = skb;
+ continue;
+ }
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ --sch->q.qlen;
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+ q->head = 0;
+ q->tail = tail;
+ }
+
+ q->tab_mask = mask;
+ q->tab = ntab;
+ } else
+ sch_tree_lock(sch);
+
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+
+ red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_CHOKE_STAB]));
+
+ if (q->head == q->tail)
+ red_end_of_idle_period(&q->parms);
+
+ sch_tree_unlock(sch);
+ choke_free(old);
+ return 0;
+}
+
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tc_choke_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop,
+ .marked = q->stats.prob_mark + q->stats.forced_mark,
+ .pdrop = q->stats.pdrop,
+ .other = q->stats.other,
+ .matched = q->stats.matched,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ if (!arg->stop) {
+ if (arg->fn(sch, 1, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+ .leaf = choke_leaf,
+ .get = choke_get,
+ .put = choke_put,
+ .tcf_chain = choke_find_tcf,
+ .bind_tcf = choke_bind,
+ .unbind_tcf = choke_put,
+ .dump = choke_dump_class,
+ .walk = choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+ .id = "choke",
+ .priv_size = sizeof(struct choke_sched_data),
+
+ .enqueue = choke_enqueue,
+ .dequeue = choke_dequeue,
+ .peek = choke_peek_head,
+ .drop = choke_drop,
+ .init = choke_init,
+ .destroy = choke_destroy,
+ .reset = choke_reset,
+ .change = choke_change,
+ .dump = choke_dump,
+ .dump_stats = choke_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+ return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+ unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index de55e642eafc..6b7fe4a84f13 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -376,7 +376,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
bstats_update(&cl->bstats, skb);
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return err;
@@ -403,6 +402,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
skb = qdisc_dequeue_peeked(cl->qdisc);
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 4970d56b4aa7..2c790204d042 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -260,7 +260,6 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return err;
}
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -283,6 +282,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
if (skb == NULL)
return NULL;
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
index = skb->tc_index & (p->indices - 1);
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index b3075f8a196b..be33f9ddf9dd 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -45,17 +45,14 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- struct sk_buff *skb_head;
struct fifo_sched_data *q = qdisc_priv(sch);
if (likely(skb_queue_len(&sch->q) < q->limit))
return qdisc_enqueue_tail(skb, sch);
/* queue full, remove one skb to fulfill the limit */
- skb_head = qdisc_dequeue_head(sch);
+ __qdisc_queue_drop_head(sch, &sch->q);
sch->qstats.drops++;
- kfree_skb(skb_head);
-
qdisc_enqueue_tail(skb, sch);
return NET_XMIT_CN;
@@ -64,11 +61,13 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
{
struct fifo_sched_data *q = qdisc_priv(sch);
+ bool bypass;
+ bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
if (opt == NULL) {
u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
- if (sch->ops == &bfifo_qdisc_ops)
+ if (is_bfifo)
limit *= psched_mtu(qdisc_dev(sch));
q->limit = limit;
@@ -81,6 +80,15 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
q->limit = ctl->limit;
}
+ if (is_bfifo)
+ bypass = q->limit >= psched_mtu(qdisc_dev(sch));
+ else
+ bypass = q->limit >= 1;
+
+ if (bypass)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2f1cb62130da..0da09d508737 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -527,6 +527,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
skb_queue_head_init(band2list(priv, prio));
+ /* Can by-pass the queue discipline */
+ qdisc->flags |= TCQ_F_CAN_BYPASS;
return 0;
}
@@ -632,7 +634,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
#ifdef CONFIG_NET_SCHED
qdisc_list_del(qdisc);
- qdisc_put_stab(qdisc->stab);
+ qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
if (ops->reset)
@@ -691,9 +693,6 @@ static void attach_one_default_qdisc(struct net_device *dev,
netdev_info(dev, "activation failed\n");
return;
}
-
- /* Can by-pass the queue discipline for default qdisc */
- qdisc->flags |= TCQ_F_CAN_BYPASS;
}
dev_queue->qdisc_sleeping = qdisc;
}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index dea4009615f9..6488e6425652 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1598,7 +1598,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
set_active(cl, qdisc_pkt_len(skb));
bstats_update(&cl->bstats, skb);
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -1664,7 +1663,8 @@ hfsc_dequeue(struct Qdisc *sch)
set_passive(cl);
}
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 3e86fd3a1b78..e1429a85091f 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -581,7 +581,6 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
return NET_XMIT_SUCCESS;
}
@@ -856,7 +855,7 @@ next:
static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
- struct sk_buff *skb = NULL;
+ struct sk_buff *skb;
struct htb_sched *q = qdisc_priv(sch);
int level;
psched_time_t next_event;
@@ -865,7 +864,9 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
/* try to dequeue direct packets as high prio (!) to minimize cpu work */
skb = __skb_dequeue(&q->direct_queue);
if (skb != NULL) {
- sch->flags &= ~TCQ_F_THROTTLED;
+ok:
+ qdisc_bstats_update(sch, skb);
+ qdisc_unthrottled(sch);
sch->q.qlen--;
return skb;
}
@@ -899,11 +900,8 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
m |= 1 << prio;
skb = htb_dequeue_tree(q, prio, level);
- if (likely(skb != NULL)) {
- sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
- goto fin;
- }
+ if (likely(skb != NULL))
+ goto ok;
}
}
sch->qstats.overlimits++;
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index ecc302f4d2a1..ec5cbc848963 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -61,7 +61,6 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
goto err;
- qdisc->flags |= TCQ_F_CAN_BYPASS;
priv->qdiscs[ntx] = qdisc;
}
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 8620c65f480a..ea17cbed29ef 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -29,18 +29,18 @@ static void mqprio_destroy(struct Qdisc *sch)
struct mqprio_sched *priv = qdisc_priv(sch);
unsigned int ntx;
- if (!priv->qdiscs)
- return;
-
- for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
- qdisc_destroy(priv->qdiscs[ntx]);
+ if (priv->qdiscs) {
+ for (ntx = 0;
+ ntx < dev->num_tx_queues && priv->qdiscs[ntx];
+ ntx++)
+ qdisc_destroy(priv->qdiscs[ntx]);
+ kfree(priv->qdiscs);
+ }
if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
dev->netdev_ops->ndo_setup_tc(dev, 0);
else
netdev_set_num_tc(dev, 0);
-
- kfree(priv->qdiscs);
}
static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
@@ -130,7 +130,6 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
err = -ENOMEM;
goto err;
}
- qdisc->flags |= TCQ_F_CAN_BYPASS;
priv->qdiscs[i] = qdisc;
}
@@ -216,7 +215,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct net_device *dev = qdisc_dev(sch);
struct mqprio_sched *priv = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
- struct tc_mqprio_qopt opt;
+ struct tc_mqprio_qopt opt = { 0 };
struct Qdisc *qdisc;
unsigned int i;
@@ -312,7 +311,9 @@ static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
}
static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
- struct gnet_dump *d)
+ struct gnet_dump *d)
+ __releases(d->lock)
+ __acquires(d->lock)
{
struct net_device *dev = qdisc_dev(sch);
@@ -390,7 +391,7 @@ static const struct Qdisc_class_ops mqprio_class_ops = {
.dump_stats = mqprio_dump_class_stats,
};
-struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
+static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
.cl_ops = &mqprio_class_ops,
.id = "mqprio",
.priv_size = sizeof(struct mqprio_sched),
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 820f2a7ca14d..edc1950e0e77 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -83,7 +83,6 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -112,6 +111,7 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
qdisc = q->queues[q->curband];
skb = qdisc->dequeue(qdisc);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index c2bbbe60d544..edbbf7ad6623 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -19,12 +19,13 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-#define VERSION "1.2"
+#define VERSION "1.3"
/* Network Emulation Queuing algorithm.
====================================
@@ -47,6 +48,20 @@
layering other disciplines. It does not need to do bandwidth
control either since that can be handled by using token
bucket or other rate control.
+
+ Correlated Loss Generator models
+
+ Added generation of correlated loss according to the
+ "Gilbert-Elliot" model, a 4-state markov model.
+
+ References:
+ [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
+ [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
+ and intuitive loss model for packet networks and its implementation
+ in the Netem module in the Linux kernel", available in [1]
+
+ Authors: Stefano Salsano <stefano.salsano at uniroma2.it
+ Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
struct netem_sched_data {
@@ -73,6 +88,26 @@ struct netem_sched_data {
u32 size;
s16 table[0];
} *delay_dist;
+
+ enum {
+ CLG_RANDOM,
+ CLG_4_STATES,
+ CLG_GILB_ELL,
+ } loss_model;
+
+ /* Correlated Loss Generation models */
+ struct clgstate {
+ /* state of the Markov chain */
+ u8 state;
+
+ /* 4-states and Gilbert-Elliot models */
+ u32 a1; /* p13 for 4-states or p for GE */
+ u32 a2; /* p31 for 4-states or r for GE */
+ u32 a3; /* p32 for 4-states or h for GE */
+ u32 a4; /* p14 for 4-states or 1-k for GE */
+ u32 a5; /* p23 used only in 4-states */
+ } clg;
+
};
/* Time stamp put into socket buffer control block */
@@ -115,6 +150,122 @@ static u32 get_crandom(struct crndstate *state)
return answer;
}
+/* loss_4state - 4-state model loss generator
+ * Generates losses according to the 4-state Markov chain adopted in
+ * the GI (General and Intuitive) loss model.
+ */
+static bool loss_4state(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+ u32 rnd = net_random();
+
+ /*
+ * Makes a comparision between rnd and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state and if the next packet has to be transmitted or lost.
+ * The four states correspond to:
+ * 1 => successfully transmitted packets within a gap period
+ * 4 => isolated losses within a gap period
+ * 3 => lost packets within a burst period
+ * 2 => successfully transmitted packets within a burst period
+ */
+ switch (clg->state) {
+ case 1:
+ if (rnd < clg->a4) {
+ clg->state = 4;
+ return true;
+ } else if (clg->a4 < rnd && rnd < clg->a1) {
+ clg->state = 3;
+ return true;
+ } else if (clg->a1 < rnd)
+ clg->state = 1;
+
+ break;
+ case 2:
+ if (rnd < clg->a5) {
+ clg->state = 3;
+ return true;
+ } else
+ clg->state = 2;
+
+ break;
+ case 3:
+ if (rnd < clg->a3)
+ clg->state = 2;
+ else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+ clg->state = 1;
+ return true;
+ } else if (clg->a2 + clg->a3 < rnd) {
+ clg->state = 3;
+ return true;
+ }
+ break;
+ case 4:
+ clg->state = 1;
+ break;
+ }
+
+ return false;
+}
+
+/* loss_gilb_ell - Gilbert-Elliot model loss generator
+ * Generates losses according to the Gilbert-Elliot loss model or
+ * its special cases (Gilbert or Simple Gilbert)
+ *
+ * Makes a comparision between random number and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state. A second random number is extracted and the comparision
+ * with the loss probability of the current state decides if the next
+ * packet will be transmitted or lost.
+ */
+static bool loss_gilb_ell(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+
+ switch (clg->state) {
+ case 1:
+ if (net_random() < clg->a1)
+ clg->state = 2;
+ if (net_random() < clg->a4)
+ return true;
+ case 2:
+ if (net_random() < clg->a2)
+ clg->state = 1;
+ if (clg->a3 > net_random())
+ return true;
+ }
+
+ return false;
+}
+
+static bool loss_event(struct netem_sched_data *q)
+{
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* Random packet drop 0 => none, ~0 => all */
+ return q->loss && q->loss >= get_crandom(&q->loss_cor);
+
+ case CLG_4_STATES:
+ /* 4state loss model algorithm (used also for GI model)
+ * Extracts a value from the markov 4 state loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_4state(q);
+
+ case CLG_GILB_ELL:
+ /* Gilbert-Elliot loss model algorithm
+ * Extracts a value from the Gilbert-Elliot loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_gilb_ell(q);
+ }
+
+ return false; /* not reached */
+}
+
+
/* tabledist - return a pseudo-randomly distributed value with mean mu and
* std deviation sigma. Uses table lookup to approximate the desired
* distribution, and a uniformly-distributed pseudo-random source.
@@ -161,14 +312,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
int ret;
int count = 1;
- pr_debug("netem_enqueue skb=%p\n", skb);
-
/* Random duplication */
if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
++count;
- /* Random packet drop 0 => none, ~0 => all */
- if (q->loss && q->loss >= get_crandom(&q->loss_cor))
+ /* Drop packet? */
+ if (loss_event(q))
--count;
if (count == 0) {
@@ -238,15 +387,15 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = NET_XMIT_SUCCESS;
}
- if (likely(ret == NET_XMIT_SUCCESS)) {
- sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
- } else if (net_xmit_drop_count(ret)) {
- sch->qstats.drops++;
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret)) {
+ sch->qstats.drops++;
+ return ret;
+ }
}
- pr_debug("netem: enqueue ret %d\n", ret);
- return ret;
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
}
static unsigned int netem_drop(struct Qdisc *sch)
@@ -266,7 +415,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- if (sch->flags & TCQ_F_THROTTLED)
+ if (qdisc_is_throttled(sch))
return NULL;
skb = q->qdisc->ops->peek(q->qdisc);
@@ -288,8 +437,10 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
skb->tstamp.tv64 = 0;
#endif
- pr_debug("netem_dequeue: return skb=%p\n", skb);
+
sch->q.qlen--;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
return skb;
}
@@ -308,6 +459,16 @@ static void netem_reset(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
}
+static void dist_free(struct disttable *d)
+{
+ if (d) {
+ if (is_vmalloc_addr(d))
+ vfree(d);
+ else
+ kfree(d);
+ }
+}
+
/*
* Distribution data is a variable size payload containing
* signed 16 bit values.
@@ -315,16 +476,20 @@ static void netem_reset(struct Qdisc *sch)
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
struct netem_sched_data *q = qdisc_priv(sch);
- unsigned long n = nla_len(attr)/sizeof(__s16);
+ size_t n = nla_len(attr)/sizeof(__s16);
const __s16 *data = nla_data(attr);
spinlock_t *root_lock;
struct disttable *d;
int i;
+ size_t s;
- if (n > 65536)
+ if (n > NETEM_DIST_MAX)
return -EINVAL;
- d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
+ s = sizeof(struct disttable) + n * sizeof(s16);
+ d = kmalloc(s, GFP_KERNEL);
+ if (!d)
+ d = vmalloc(s);
if (!d)
return -ENOMEM;
@@ -335,7 +500,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
root_lock = qdisc_root_sleeping_lock(sch);
spin_lock_bh(root_lock);
- kfree(q->delay_dist);
+ dist_free(q->delay_dist);
q->delay_dist = d;
spin_unlock_bh(root_lock);
return 0;
@@ -369,10 +534,66 @@ static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
init_crandom(&q->corrupt_cor, r->correlation);
}
+static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ const struct nlattr *la;
+ int rem;
+
+ nla_for_each_nested(la, attr, rem) {
+ u16 type = nla_type(la);
+
+ switch(type) {
+ case NETEM_LOSS_GI: {
+ const struct tc_netem_gimodel *gi = nla_data(la);
+
+ if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
+ pr_info("netem: incorrect gi model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_4_STATES;
+
+ q->clg.state = 1;
+ q->clg.a1 = gi->p13;
+ q->clg.a2 = gi->p31;
+ q->clg.a3 = gi->p32;
+ q->clg.a4 = gi->p14;
+ q->clg.a5 = gi->p23;
+ break;
+ }
+
+ case NETEM_LOSS_GE: {
+ const struct tc_netem_gemodel *ge = nla_data(la);
+
+ if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
+ pr_info("netem: incorrect gi model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_GILB_ELL;
+ q->clg.state = 1;
+ q->clg.a1 = ge->p;
+ q->clg.a2 = ge->r;
+ q->clg.a3 = ge->h;
+ q->clg.a4 = ge->k1;
+ break;
+ }
+
+ default:
+ pr_info("netem: unknown loss type %u\n", type);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
+ [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
};
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
@@ -380,11 +601,15 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
{
int nested_len = nla_len(nla) - NLA_ALIGN(len);
- if (nested_len < 0)
+ if (nested_len < 0) {
+ pr_info("netem: invalid attributes len %d\n", nested_len);
return -EINVAL;
+ }
+
if (nested_len >= nla_attr_size(0))
return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
nested_len, policy);
+
memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
}
@@ -407,7 +632,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
ret = fifo_set_limit(q->qdisc, qopt->limit);
if (ret) {
- pr_debug("netem: can't set fifo limit\n");
+ pr_info("netem: can't set fifo limit\n");
return ret;
}
@@ -440,7 +665,11 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_NETEM_CORRUPT])
get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
- return 0;
+ q->loss_model = CLG_RANDOM;
+ if (tb[TCA_NETEM_LOSS])
+ ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
+
+ return ret;
}
/*
@@ -476,7 +705,6 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
__skb_queue_after(list, skb, nskb);
sch->qstats.backlog += qdisc_pkt_len(nskb);
- qdisc_bstats_update(sch, nskb);
return NET_XMIT_SUCCESS;
}
@@ -536,16 +764,17 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt)
qdisc_watchdog_init(&q->watchdog, sch);
+ q->loss_model = CLG_RANDOM;
q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
TC_H_MAKE(sch->handle, 1));
if (!q->qdisc) {
- pr_debug("netem: qdisc create failed\n");
+ pr_notice("netem: qdisc create tfifo qdisc failed\n");
return -ENOMEM;
}
ret = netem_change(sch, opt);
if (ret) {
- pr_debug("netem: change failed\n");
+ pr_info("netem: change failed\n");
qdisc_destroy(q->qdisc);
}
return ret;
@@ -557,14 +786,61 @@ static void netem_destroy(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
qdisc_destroy(q->qdisc);
- kfree(q->delay_dist);
+ dist_free(q->delay_dist);
+}
+
+static int dump_loss_model(const struct netem_sched_data *q,
+ struct sk_buff *skb)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_NETEM_LOSS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* legacy loss model */
+ nla_nest_cancel(skb, nest);
+ return 0; /* no data */
+
+ case CLG_4_STATES: {
+ struct tc_netem_gimodel gi = {
+ .p13 = q->clg.a1,
+ .p31 = q->clg.a2,
+ .p32 = q->clg.a3,
+ .p14 = q->clg.a4,
+ .p23 = q->clg.a5,
+ };
+
+ NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
+ break;
+ }
+ case CLG_GILB_ELL: {
+ struct tc_netem_gemodel ge = {
+ .p = q->clg.a1,
+ .r = q->clg.a2,
+ .h = q->clg.a3,
+ .k1 = q->clg.a4,
+ };
+
+ NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
+ break;
+ }
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
}
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
const struct netem_sched_data *q = qdisc_priv(sch);
- unsigned char *b = skb_tail_pointer(skb);
- struct nlattr *nla = (struct nlattr *) b;
+ struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
struct tc_netem_qopt qopt;
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
@@ -591,17 +867,87 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
corrupt.correlation = q->corrupt_cor.rho;
NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
- nla->nla_len = skb_tail_pointer(skb) - b;
+ if (dump_loss_model(q, skb) != 0)
+ goto nla_put_failure;
- return skb->len;
+ return nla_nest_end(skb, nla);
nla_put_failure:
- nlmsg_trim(skb, b);
+ nlmsg_trim(skb, nla);
return -1;
}
+static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ if (cl != 1) /* only one class */
+ return -ENOENT;
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ return q->qdisc;
+}
+
+static unsigned long netem_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void netem_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops netem_class_ops = {
+ .graft = netem_graft,
+ .leaf = netem_leaf,
+ .get = netem_get,
+ .put = netem_put,
+ .walk = netem_walk,
+ .dump = netem_dump_class,
+};
+
static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
.id = "netem",
+ .cl_ops = &netem_class_ops,
.priv_size = sizeof(struct netem_sched_data),
.enqueue = netem_enqueue,
.dequeue = netem_dequeue,
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 3bea31e101b5..2a318f2dc3e5 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -83,7 +83,6 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -115,6 +114,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch)
struct Qdisc *qdisc = q->queues[prio];
struct sk_buff *skb = qdisc->dequeue(qdisc);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 689157555fa4..6649463da1b6 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -93,7 +93,6 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
@@ -113,11 +112,13 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch)
struct Qdisc *child = q->qdisc;
skb = child->dequeue(child);
- if (skb)
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
- else if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
-
+ } else {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ }
return skb;
}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
new file mode 100644
index 000000000000..0a833d0c1f61
--- /dev/null
+++ b/net/sched/sch_sfb.c
@@ -0,0 +1,709 @@
+/*
+ * net/sched/sch_sfb.c Stochastic Fair Blue
+ *
+ * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
+ * A New Class of Active Queue Management Algorithms.
+ * U. Michigan CSE-TR-387-99, April 1999.
+ *
+ * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <net/ip.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+/*
+ * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
+ * This implementation uses L = 8 and N = 16
+ * This permits us to split one 32bit hash (provided per packet by rxhash or
+ * external classifier) into 8 subhashes of 4 bits.
+ */
+#define SFB_BUCKET_SHIFT 4
+#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */
+#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
+#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */
+
+/* SFB algo uses a virtual queue, named "bin" */
+struct sfb_bucket {
+ u16 qlen; /* length of virtual queue */
+ u16 p_mark; /* marking probability */
+};
+
+/* We use a double buffering right before hash change
+ * (Section 4.4 of SFB reference : moving hash functions)
+ */
+struct sfb_bins {
+ u32 perturbation; /* jhash perturbation */
+ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
+};
+
+struct sfb_sched_data {
+ struct Qdisc *qdisc;
+ struct tcf_proto *filter_list;
+ unsigned long rehash_interval;
+ unsigned long warmup_time; /* double buffering warmup time in jiffies */
+ u32 max;
+ u32 bin_size; /* maximum queue length per bin */
+ u32 increment; /* d1 */
+ u32 decrement; /* d2 */
+ u32 limit; /* HARD maximal queue length */
+ u32 penalty_rate;
+ u32 penalty_burst;
+ u32 tokens_avail;
+ unsigned long rehash_time;
+ unsigned long token_time;
+
+ u8 slot; /* current active bins (0 or 1) */
+ bool double_buffering;
+ struct sfb_bins bins[2];
+
+ struct {
+ u32 earlydrop;
+ u32 penaltydrop;
+ u32 bucketdrop;
+ u32 queuedrop;
+ u32 childdrop; /* drops in child qdisc */
+ u32 marked; /* ECN mark */
+ } stats;
+};
+
+/*
+ * Each queued skb might be hashed on one or two bins
+ * We store in skb_cb the two hash values.
+ * (A zero value means double buffering was not used)
+ */
+struct sfb_skb_cb {
+ u32 hashes[2];
+};
+
+static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(skb->cb) <
+ sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb));
+ return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/*
+ * If using 'internal' SFB flow classifier, hash comes from skb rxhash
+ * If using external classifier, hash comes from the classid.
+ */
+static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
+{
+ return sfb_skb_cb(skb)->hashes[slot];
+}
+
+/* Probabilities are coded as Q0.16 fixed-point values,
+ * with 0xFFFF representing 65535/65536 (almost 1.0)
+ * Addition and subtraction are saturating in [0, 65535]
+ */
+static u32 prob_plus(u32 p1, u32 p2)
+{
+ u32 res = p1 + p2;
+
+ return min_t(u32, res, SFB_MAX_PROB);
+}
+
+static u32 prob_minus(u32 p1, u32 p2)
+{
+ return p1 > p2 ? p1 - p2 : 0;
+}
+
+static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen < 0xFFFF)
+ b[hash].qlen++;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+
+static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 1, q);
+}
+
+static void decrement_one_qlen(u32 sfbhash, u32 slot,
+ struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen > 0)
+ b[hash].qlen--;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+
+static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 1, q);
+}
+
+static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_minus(b->p_mark, q->decrement);
+}
+
+static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_plus(b->p_mark, q->increment);
+}
+
+static void sfb_zero_all_buckets(struct sfb_sched_data *q)
+{
+ memset(&q->bins, 0, sizeof(q->bins));
+}
+
+/*
+ * compute max qlen, max p_mark, and avg p_mark
+ */
+static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
+{
+ int i;
+ u32 qlen = 0, prob = 0, totalpm = 0;
+ const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
+ if (qlen < b->qlen)
+ qlen = b->qlen;
+ totalpm += b->p_mark;
+ if (prob < b->p_mark)
+ prob = b->p_mark;
+ b++;
+ }
+ *prob_r = prob;
+ *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS);
+ return qlen;
+}
+
+
+static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
+{
+ q->bins[slot].perturbation = net_random();
+}
+
+static void sfb_swap_slot(struct sfb_sched_data *q)
+{
+ sfb_init_perturbation(q->slot, q);
+ q->slot ^= 1;
+ q->double_buffering = false;
+}
+
+/* Non elastic flows are allowed to use part of the bandwidth, expressed
+ * in "penalty_rate" packets per second, with "penalty_burst" burst
+ */
+static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ if (q->penalty_rate == 0 || q->penalty_burst == 0)
+ return true;
+
+ if (q->tokens_avail < 1) {
+ unsigned long age = min(10UL * HZ, jiffies - q->token_time);
+
+ q->tokens_avail = (age * q->penalty_rate) / HZ;
+ if (q->tokens_avail > q->penalty_burst)
+ q->tokens_avail = q->penalty_burst;
+ q->token_time = jiffies;
+ if (q->tokens_avail < 1)
+ return true;
+ }
+
+ q->tokens_avail--;
+ return false;
+}
+
+static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q,
+ int *qerr, u32 *salt)
+{
+ struct tcf_result res;
+ int result;
+
+ result = tc_classify(skb, q->filter_list, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ *salt = TC_H_MIN(res.classid);
+ return true;
+ }
+ return false;
+}
+
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ int i;
+ u32 p_min = ~0;
+ u32 minqlen = ~0;
+ u32 r, slot, salt, sfbhash;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+ if (q->rehash_interval > 0) {
+ unsigned long limit = q->rehash_time + q->rehash_interval;
+
+ if (unlikely(time_after(jiffies, limit))) {
+ sfb_swap_slot(q);
+ q->rehash_time = jiffies;
+ } else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
+ time_after(jiffies, limit - q->warmup_time))) {
+ q->double_buffering = true;
+ }
+ }
+
+ if (q->filter_list) {
+ /* If using external classifiers, get result and record it. */
+ if (!sfb_classify(skb, q, &ret, &salt))
+ goto other_drop;
+ } else {
+ salt = skb_get_rxhash(skb);
+ }
+
+ slot = q->slot;
+
+ sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ if (!sfbhash)
+ sfbhash = 1;
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ if (minqlen > b->qlen)
+ minqlen = b->qlen;
+ if (p_min > b->p_mark)
+ p_min = b->p_mark;
+ }
+
+ slot ^= 1;
+ sfb_skb_cb(skb)->hashes[slot] = 0;
+
+ if (unlikely(minqlen >= q->max || sch->q.qlen >= q->limit)) {
+ sch->qstats.overlimits++;
+ if (minqlen >= q->max)
+ q->stats.bucketdrop++;
+ else
+ q->stats.queuedrop++;
+ goto drop;
+ }
+
+ if (unlikely(p_min >= SFB_MAX_PROB)) {
+ /* Inelastic flow */
+ if (q->double_buffering) {
+ sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ if (!sfbhash)
+ sfbhash = 1;
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ }
+ }
+ if (sfb_rate_limit(skb, q)) {
+ sch->qstats.overlimits++;
+ q->stats.penaltydrop++;
+ goto drop;
+ }
+ goto enqueue;
+ }
+
+ r = net_random() & SFB_MAX_PROB;
+
+ if (unlikely(r < p_min)) {
+ if (unlikely(p_min > SFB_MAX_PROB / 2)) {
+ /* If we're marking that many packets, then either
+ * this flow is unresponsive, or we're badly congested.
+ * In either case, we want to start dropping packets.
+ */
+ if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.marked++;
+ } else {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+
+enqueue:
+ ret = qdisc_enqueue(skb, child);
+ if (likely(ret == NET_XMIT_SUCCESS)) {
+ sch->q.qlen++;
+ increment_qlen(skb, q);
+ } else if (net_xmit_drop_count(ret)) {
+ q->stats.childdrop++;
+ sch->qstats.drops++;
+ }
+ return ret;
+
+drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+other_drop:
+ if (ret & __NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ struct sk_buff *skb;
+
+ skb = child->dequeue(q->qdisc);
+
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ decrement_qlen(skb, q);
+ }
+
+ return skb;
+}
+
+static struct sk_buff *sfb_peek(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ return child->ops->peek(child);
+}
+
+/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
+
+static void sfb_reset(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset(q->qdisc);
+ sch->q.qlen = 0;
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+}
+
+static void sfb_destroy(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ qdisc_destroy(q->qdisc);
+}
+
+static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
+ [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) },
+};
+
+static const struct tc_sfb_qopt sfb_default_ops = {
+ .rehash_interval = 600 * MSEC_PER_SEC,
+ .warmup_time = 60 * MSEC_PER_SEC,
+ .limit = 0,
+ .max = 25,
+ .bin_size = 20,
+ .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
+ .decrement = (SFB_MAX_PROB + 3000) / 6000,
+ .penalty_rate = 10,
+ .penalty_burst = 20,
+};
+
+static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child;
+ struct nlattr *tb[TCA_SFB_MAX + 1];
+ const struct tc_sfb_qopt *ctl = &sfb_default_ops;
+ u32 limit;
+ int err;
+
+ if (opt) {
+ err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
+ if (err < 0)
+ return -EINVAL;
+
+ if (tb[TCA_SFB_PARMS] == NULL)
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_SFB_PARMS]);
+ }
+
+ limit = ctl->limit;
+ if (limit == 0)
+ limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
+
+ child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+
+ sch_tree_lock(sch);
+
+ qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+ qdisc_destroy(q->qdisc);
+ q->qdisc = child;
+
+ q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
+ q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
+ q->rehash_time = jiffies;
+ q->limit = limit;
+ q->increment = ctl->increment;
+ q->decrement = ctl->decrement;
+ q->max = ctl->max;
+ q->bin_size = ctl->bin_size;
+ q->penalty_rate = ctl->penalty_rate;
+ q->penalty_burst = ctl->penalty_burst;
+ q->tokens_avail = ctl->penalty_burst;
+ q->token_time = jiffies;
+
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+ sfb_init_perturbation(1, q);
+
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ q->qdisc = &noop_qdisc;
+ return sfb_change(sch, opt);
+}
+
+static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ struct tc_sfb_qopt opt = {
+ .rehash_interval = jiffies_to_msecs(q->rehash_interval),
+ .warmup_time = jiffies_to_msecs(q->warmup_time),
+ .limit = q->limit,
+ .max = q->max,
+ .bin_size = q->bin_size,
+ .increment = q->increment,
+ .decrement = q->decrement,
+ .penalty_rate = q->penalty_rate,
+ .penalty_burst = q->penalty_burst,
+ };
+
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt);
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct tc_sfb_xstats st = {
+ .earlydrop = q->stats.earlydrop,
+ .penaltydrop = q->stats.penaltydrop,
+ .bucketdrop = q->stats.bucketdrop,
+ .queuedrop = q->stats.queuedrop,
+ .childdrop = q->stats.childdrop,
+ .marked = q->stats.marked,
+ };
+
+ st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ return -ENOSYS;
+}
+
+static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ return q->qdisc;
+}
+
+static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void sfb_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg)
+{
+ return -ENOSYS;
+}
+
+static int sfb_delete(struct Qdisc *sch, unsigned long cl)
+{
+ return -ENOSYS;
+}
+
+static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+
+static const struct Qdisc_class_ops sfb_class_ops = {
+ .graft = sfb_graft,
+ .leaf = sfb_leaf,
+ .get = sfb_get,
+ .put = sfb_put,
+ .change = sfb_change_class,
+ .delete = sfb_delete,
+ .walk = sfb_walk,
+ .tcf_chain = sfb_find_tcf,
+ .bind_tcf = sfb_bind,
+ .unbind_tcf = sfb_put,
+ .dump = sfb_dump_class,
+};
+
+static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
+ .id = "sfb",
+ .priv_size = sizeof(struct sfb_sched_data),
+ .cl_ops = &sfb_class_ops,
+ .enqueue = sfb_enqueue,
+ .dequeue = sfb_dequeue,
+ .peek = sfb_peek,
+ .init = sfb_init,
+ .reset = sfb_reset,
+ .destroy = sfb_destroy,
+ .change = sfb_change,
+ .dump = sfb_dump,
+ .dump_stats = sfb_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init sfb_module_init(void)
+{
+ return register_qdisc(&sfb_qdisc_ops);
+}
+
+static void __exit sfb_module_exit(void)
+{
+ unregister_qdisc(&sfb_qdisc_ops);
+}
+
+module_init(sfb_module_init)
+module_exit(sfb_module_exit)
+
+MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
+MODULE_AUTHOR("Juliusz Chroboczek");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 54a36f43a1f1..c2e628dfaacc 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -21,6 +21,7 @@
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
@@ -76,7 +77,8 @@
#define SFQ_DEPTH 128 /* max number of packets per flow */
#define SFQ_SLOTS 128 /* max number of flows */
#define SFQ_EMPTY_SLOT 255
-#define SFQ_HASH_DIVISOR 1024
+#define SFQ_DEFAULT_HASH_DIVISOR 1024
+
/* We use 16 bits to store allot, and want to handle packets up to 64K
* Scale allot by 8 (1<<3) so that no overflow occurs.
*/
@@ -112,7 +114,7 @@ struct sfq_sched_data {
int perturb_period;
unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
int limit;
-
+ unsigned int divisor; /* number of slots in hash table */
/* Variables */
struct tcf_proto *filter_list;
struct timer_list perturb_timer;
@@ -120,7 +122,7 @@ struct sfq_sched_data {
sfq_index cur_depth; /* depth of longest slot */
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
struct sfq_slot *tail; /* current slot in round */
- sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
+ sfq_index *ht; /* Hash table (divisor slots) */
struct sfq_slot slots[SFQ_SLOTS];
struct sfq_head dep[SFQ_DEPTH]; /* Linked list of slots, indexed by depth */
};
@@ -137,7 +139,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
- return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
+ return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1);
}
static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
@@ -201,7 +203,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
if (TC_H_MAJ(skb->priority) == sch->handle &&
TC_H_MIN(skb->priority) > 0 &&
- TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
+ TC_H_MIN(skb->priority) <= q->divisor)
return TC_H_MIN(skb->priority);
if (!q->filter_list)
@@ -219,7 +221,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
return 0;
}
#endif
- if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
+ if (TC_H_MIN(res.classid) <= q->divisor)
return TC_H_MIN(res.classid);
}
return 0;
@@ -400,10 +402,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->tail = slot;
slot->allot = q->scaled_quantum;
}
- if (++sch->q.qlen <= q->limit) {
- qdisc_bstats_update(sch, skb);
+ if (++sch->q.qlen <= q->limit)
return NET_XMIT_SUCCESS;
- }
sfq_drop(sch);
return NET_XMIT_CN;
@@ -443,6 +443,7 @@ next_slot:
}
skb = slot_dequeue_head(slot);
sfq_dec(q, a);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -490,13 +491,18 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
+ if (ctl->divisor &&
+ (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
+ return -EINVAL;
+
sch_tree_lock(sch);
q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
q->perturb_period = ctl->perturb_period * HZ;
if (ctl->limit)
q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
-
+ if (ctl->divisor)
+ q->divisor = ctl->divisor;
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit)
sfq_drop(sch);
@@ -514,15 +520,13 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct sfq_sched_data *q = qdisc_priv(sch);
+ size_t sz;
int i;
q->perturb_timer.function = sfq_perturbation;
q->perturb_timer.data = (unsigned long)sch;
init_timer_deferrable(&q->perturb_timer);
- for (i = 0; i < SFQ_HASH_DIVISOR; i++)
- q->ht[i] = SFQ_EMPTY_SLOT;
-
for (i = 0; i < SFQ_DEPTH; i++) {
q->dep[i].next = i + SFQ_SLOTS;
q->dep[i].prev = i + SFQ_SLOTS;
@@ -531,6 +535,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
q->limit = SFQ_DEPTH - 1;
q->cur_depth = 0;
q->tail = NULL;
+ q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
if (opt == NULL) {
q->quantum = psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
@@ -542,10 +547,23 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
return err;
}
+ sz = sizeof(q->ht[0]) * q->divisor;
+ q->ht = kmalloc(sz, GFP_KERNEL);
+ if (!q->ht && sz > PAGE_SIZE)
+ q->ht = vmalloc(sz);
+ if (!q->ht)
+ return -ENOMEM;
+ for (i = 0; i < q->divisor; i++)
+ q->ht[i] = SFQ_EMPTY_SLOT;
+
for (i = 0; i < SFQ_SLOTS; i++) {
slot_queue_init(&q->slots[i]);
sfq_link(q, i);
}
+ if (q->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -556,6 +574,10 @@ static void sfq_destroy(struct Qdisc *sch)
tcf_destroy_chain(&q->filter_list);
q->perturb_period = 0;
del_timer_sync(&q->perturb_timer);
+ if (is_vmalloc_addr(q->ht))
+ vfree(q->ht);
+ else
+ kfree(q->ht);
}
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -568,7 +590,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.perturb_period = q->perturb_period / HZ;
opt.limit = q->limit;
- opt.divisor = SFQ_HASH_DIVISOR;
+ opt.divisor = q->divisor;
opt.flows = q->limit;
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
@@ -593,6 +615,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
+ /* we cannot bypass queue discipline anymore */
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -646,7 +670,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
if (arg->stop)
return;
- for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
+ for (i = 0; i < q->divisor; i++) {
if (q->ht[i] == SFQ_EMPTY_SLOT ||
arg->count < arg->skip) {
arg->count++;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 475edfb69c22..1dcfb5223a86 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -133,7 +133,6 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
return NET_XMIT_SUCCESS;
}
@@ -185,7 +184,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
q->tokens = toks;
q->ptokens = ptoks;
sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
return skb;
}
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 64c071ded0f4..45cd30098e34 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -85,7 +85,6 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (q->q.qlen < dev->tx_queue_len) {
__skb_queue_tail(&q->q, skb);
- qdisc_bstats_update(sch, skb);
return NET_XMIT_SUCCESS;
}
@@ -109,6 +108,8 @@ teql_dequeue(struct Qdisc *sch)
dat->m->slaves = sch;
netif_wake_queue(m);
}
+ } else {
+ qdisc_bstats_update(sch, skb);
}
sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
return skb;