From 32f675bbc9be14a40d972820e190ac56341ce198 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 2 Jul 2015 15:57:19 +0200
Subject: net_sched: gen_estimator: extend pps limit

rate estimators are limited to 4 Mpps, which was fine years ago, but
too small with current hardware generation.

Lets use 2^5 scaling instead of 2^10 to get 128 Mpps new limit.

On 64bit arch, use an "unsigned long" for temp storage and remove limit.
(We do not expect 32bit arches to be able to reach this point)

Tested:

tc -s -d filter sh dev eth0 parent ffff:

filter protocol ip pref 1 u32
filter protocol ip pref 1 u32 fh 800: ht divisor 1
filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:15
  match 07000000/ff000000 at 12
	action order 1: gact action drop
	 random type none pass val 0
	 index 1 ref 1 bind 1 installed 166 sec
 	Action statistics:
	Sent 39734251496 bytes 863788076 pkt (dropped 863788117, overlimits 0 requeues 0)
	rate 4067Mbit 11053596pps backlog 0b 0p requeues 0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/gen_estimator.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9dfb88a933e7..92d886f4adcb 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -66,7 +66,7 @@
 
    NOTES.
 
-   * avbps is scaled by 2^5, avpps is scaled by 2^10.
+   * avbps and avpps are scaled by 2^5.
    * both values are reported as 32 bit unsigned values. bps can
      overflow for fast links : max speed being 34360Mbit/sec
    * Minimal interval is HZ/4=250msec (it is the greatest common divisor
@@ -85,10 +85,10 @@ struct gen_estimator
 	struct gnet_stats_rate_est64	*rate_est;
 	spinlock_t		*stats_lock;
 	int			ewma_log;
+	u32			last_packets;
+	unsigned long		avpps;
 	u64			last_bytes;
 	u64			avbps;
-	u32			last_packets;
-	u32			avpps;
 	struct rcu_head		e_rcu;
 	struct rb_node		node;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
@@ -118,8 +118,8 @@ static void est_timer(unsigned long arg)
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, &elist[idx].list, list) {
 		struct gnet_stats_basic_packed b = {0};
+		unsigned long rate;
 		u64 brate;
-		u32 rate;
 
 		spin_lock(e->stats_lock);
 		read_lock(&est_lock);
@@ -133,10 +133,11 @@ static void est_timer(unsigned long arg)
 		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
 		e->rate_est->bps = (e->avbps+0xF)>>5;
 
-		rate = (b.packets - e->last_packets)<<(12 - idx);
+		rate = b.packets - e->last_packets;
+		rate <<= (7 - idx);
 		e->last_packets = b.packets;
 		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-		e->rate_est->pps = (e->avpps+0x1FF)>>10;
+		e->rate_est->pps = (e->avpps + 0xF) >> 5;
 skip:
 		read_unlock(&est_lock);
 		spin_unlock(e->stats_lock);
-- 
cgit v1.2.3-55-g7522


From d339727c2b1a10f25e6636670ab6e1841170e328 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 6 Jul 2015 17:13:26 +0200
Subject: net: graceful exit from netif_alloc_netdev_queues()

User space can crash kernel with

ip link add ifb10 numtxqueues 100000 type ifb

We must replace a BUG_ON() by proper test and return -EINVAL for
crazy values.

Fixes: 60877a32bce00 ("net: allow large number of tx queues")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6778a9999d52..0ad626214332 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6409,7 +6409,8 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 	struct netdev_queue *tx;
 	size_t sz = count * sizeof(*tx);
 
-	BUG_ON(count < 1 || count > 0xffff);
+	if (count < 1 || count > 0xffff)
+		return -EINVAL;
 
 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
 	if (!tx) {
-- 
cgit v1.2.3-55-g7522


From 95ec655bc465ccb2a3329d4aff9a45e3c8188db5 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel
Date: Mon, 6 Jul 2015 17:25:10 +0200
Subject: Revert "dev: set iflink to 0 for virtual interfaces"

This reverts commit e1622baf54df8cc958bf29d71de5ad545ea7d93c.

The side effect of this commit is to add a '@NONE' after each virtual
interface name with a 'ip link'. It may break existing scripts.

Reported-by: Olivier Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Tested-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 0ad626214332..1e57efda055b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -677,10 +677,6 @@ int dev_get_iflink(const struct net_device *dev)
 	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 		return dev->netdev_ops->ndo_get_iflink(dev);
 
-	/* If dev->rtnl_link_ops is set, it's a virtual interface. */
-	if (dev->rtnl_link_ops)
-		return 0;
-
 	return dev->ifindex;
 }
 EXPORT_SYMBOL(dev_get_iflink);
-- 
cgit v1.2.3-55-g7522


From 4f7d2cdfdde71ffe962399b7020c674050329423 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 7 Jul 2015 00:07:52 +0200
Subject: rtnetlink: verify IFLA_VF_INFO attributes before passing them to
 driver

Jason Gunthorpe reported that since commit c02db8c6290b ("rtnetlink: make
SR-IOV VF interface symmetric"), we don't verify IFLA_VF_INFO attributes
anymore with respect to their policy, that is, ifla_vfinfo_policy[].

Before, they were part of ifla_policy[], but they have been nested since
placed under IFLA_VFINFO_LIST, that contains the attribute IFLA_VF_INFO,
which is another nested attribute for the actual VF attributes such as
IFLA_VF_MAC, IFLA_VF_VLAN, etc.

Despite the policy being split out from ifla_policy[] in this commit,
it's never applied anywhere. nla_for_each_nested() only does basic nla_ok()
testing for struct nlattr, but it doesn't know about the data context and
their requirements.

Fix, on top of Jason's initial work, does 1) parsing of the attributes
with the right policy, and 2) using the resulting parsed attribute table
from 1) instead of the nla_for_each_nested() loop (just like we used to
do when still part of ifla_policy[]).

Reference: http://thread.gmane.org/gmane.linux.network/368913
Fixes: c02db8c6290b ("rtnetlink: make SR-IOV VF interface symmetric")
Reported-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Sucheta Chakraborty <sucheta.chakraborty@qlogic.com>
Cc: Greg Rose <gregory.v.rose@intel.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Rony Efraim <ronye@mellanox.com>
Cc: Vlad Zolotarov <vladz@cloudius-systems.com>
Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 187 ++++++++++++++++++++++++++-------------------------
 1 file changed, 96 insertions(+), 91 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 01ced4a889e0..9e433d58d265 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1328,10 +1328,6 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
 	[IFLA_INFO_SLAVE_DATA]	= { .type = NLA_NESTED },
 };
 
-static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
-	[IFLA_VF_INFO]		= { .type = NLA_NESTED },
-};
-
 static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
 	[IFLA_VF_MAC]		= { .len = sizeof(struct ifla_vf_mac) },
 	[IFLA_VF_VLAN]		= { .len = sizeof(struct ifla_vf_vlan) },
@@ -1488,96 +1484,98 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 	return 0;
 }
 
-static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
+static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
 {
-	int rem, err = -EINVAL;
-	struct nlattr *vf;
 	const struct net_device_ops *ops = dev->netdev_ops;
+	int err = -EINVAL;
 
-	nla_for_each_nested(vf, attr, rem) {
-		switch (nla_type(vf)) {
-		case IFLA_VF_MAC: {
-			struct ifla_vf_mac *ivm;
-			ivm = nla_data(vf);
-			err = -EOPNOTSUPP;
-			if (ops->ndo_set_vf_mac)
-				err = ops->ndo_set_vf_mac(dev, ivm->vf,
-							  ivm->mac);
-			break;
-		}
-		case IFLA_VF_VLAN: {
-			struct ifla_vf_vlan *ivv;
-			ivv = nla_data(vf);
-			err = -EOPNOTSUPP;
-			if (ops->ndo_set_vf_vlan)
-				err = ops->ndo_set_vf_vlan(dev, ivv->vf,
-							   ivv->vlan,
-							   ivv->qos);
-			break;
-		}
-		case IFLA_VF_TX_RATE: {
-			struct ifla_vf_tx_rate *ivt;
-			struct ifla_vf_info ivf;
-			ivt = nla_data(vf);
-			err = -EOPNOTSUPP;
-			if (ops->ndo_get_vf_config)
-				err = ops->ndo_get_vf_config(dev, ivt->vf,
-							     &ivf);
-			if (err)
-				break;
-			err = -EOPNOTSUPP;
-			if (ops->ndo_set_vf_rate)
-				err = ops->ndo_set_vf_rate(dev, ivt->vf,
-							   ivf.min_tx_rate,
-							   ivt->rate);
-			break;
-		}
-		case IFLA_VF_RATE: {
-			struct ifla_vf_rate *ivt;
-			ivt = nla_data(vf);
-			err = -EOPNOTSUPP;
-			if (ops->ndo_set_vf_rate)
-				err = ops->ndo_set_vf_rate(dev, ivt->vf,
-							   ivt->min_tx_rate,
-							   ivt->max_tx_rate);
-			break;
-		}
-		case IFLA_VF_SPOOFCHK: {
-			struct ifla_vf_spoofchk *ivs;
-			ivs = nla_data(vf);
-			err = -EOPNOTSUPP;
-			if (ops->ndo_set_vf_spoofchk)
-				err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
-							       ivs->setting);
-			break;
-		}
-		case IFLA_VF_LINK_STATE: {
-			struct ifla_vf_link_state *ivl;
-			ivl = nla_data(vf);
-			err = -EOPNOTSUPP;
-			if (ops->ndo_set_vf_link_state)
-				err = ops->ndo_set_vf_link_state(dev, ivl->vf,
-								 ivl->link_state);
-			break;
-		}
-		case IFLA_VF_RSS_QUERY_EN: {
-			struct ifla_vf_rss_query_en *ivrssq_en;
+	if (tb[IFLA_VF_MAC]) {
+		struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);
 
-			ivrssq_en = nla_data(vf);
-			err = -EOPNOTSUPP;
-			if (ops->ndo_set_vf_rss_query_en)
-				err = ops->ndo_set_vf_rss_query_en(dev,
-							    ivrssq_en->vf,
-							    ivrssq_en->setting);
-			break;
-		}
-		default:
-			err = -EINVAL;
-			break;
-		}
-		if (err)
-			break;
+		err = -EOPNOTSUPP;
+		if (ops->ndo_set_vf_mac)
+			err = ops->ndo_set_vf_mac(dev, ivm->vf,
+						  ivm->mac);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[IFLA_VF_VLAN]) {
+		struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);
+
+		err = -EOPNOTSUPP;
+		if (ops->ndo_set_vf_vlan)
+			err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
+						   ivv->qos);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[IFLA_VF_TX_RATE]) {
+		struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
+		struct ifla_vf_info ivf;
+
+		err = -EOPNOTSUPP;
+		if (ops->ndo_get_vf_config)
+			err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
+		if (err < 0)
+			return err;
+
+		err = -EOPNOTSUPP;
+		if (ops->ndo_set_vf_rate)
+			err = ops->ndo_set_vf_rate(dev, ivt->vf,
+						   ivf.min_tx_rate,
+						   ivt->rate);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[IFLA_VF_RATE]) {
+		struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);
+
+		err = -EOPNOTSUPP;
+		if (ops->ndo_set_vf_rate)
+			err = ops->ndo_set_vf_rate(dev, ivt->vf,
+						   ivt->min_tx_rate,
+						   ivt->max_tx_rate);
+		if (err < 0)
+			return err;
 	}
+
+	if (tb[IFLA_VF_SPOOFCHK]) {
+		struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);
+
+		err = -EOPNOTSUPP;
+		if (ops->ndo_set_vf_spoofchk)
+			err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
+						       ivs->setting);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[IFLA_VF_LINK_STATE]) {
+		struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);
+
+		err = -EOPNOTSUPP;
+		if (ops->ndo_set_vf_link_state)
+			err = ops->ndo_set_vf_link_state(dev, ivl->vf,
+							 ivl->link_state);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[IFLA_VF_RSS_QUERY_EN]) {
+		struct ifla_vf_rss_query_en *ivrssq_en;
+
+		err = -EOPNOTSUPP;
+		ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
+		if (ops->ndo_set_vf_rss_query_en)
+			err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
+							   ivrssq_en->setting);
+		if (err < 0)
+			return err;
+	}
+
 	return err;
 }
 
@@ -1773,14 +1771,21 @@ static int do_setlink(const struct sk_buff *skb,
 	}
 
 	if (tb[IFLA_VFINFO_LIST]) {
+		struct nlattr *vfinfo[IFLA_VF_MAX + 1];
 		struct nlattr *attr;
 		int rem;
+
 		nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
-			if (nla_type(attr) != IFLA_VF_INFO) {
+			if (nla_type(attr) != IFLA_VF_INFO ||
+			    nla_len(attr) < NLA_HDRLEN) {
 				err = -EINVAL;
 				goto errout;
 			}
-			err = do_setvfinfo(dev, attr);
+			err = nla_parse_nested(vfinfo, IFLA_VF_MAX, attr,
+					       ifla_vf_policy);
+			if (err < 0)
+				goto errout;
+			err = do_setvfinfo(dev, vfinfo);
 			if (err < 0)
 				goto errout;
 			status |= DO_SETLINK_NOTIFY;
-- 
cgit v1.2.3-55-g7522


From fecdf8be2d91e04b0a9a4f79ff06499a36f5d14f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 8 Jul 2015 21:42:11 +0200
Subject: net: pktgen: fix race between pktgen_thread_worker() and
 kthread_stop()

pktgen_thread_worker() is obviously racy, kthread_stop() can come
between the kthread_should_stop() check and set_current_state().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Reported-by: Marcelo Leitner <mleitner@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 05badbb58865..41489e3c69af 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3572,8 +3572,10 @@ static int pktgen_thread_worker(void *arg)
 	pktgen_rem_thread(t);
 
 	/* Wait for kthread_stop */
-	while (!kthread_should_stop()) {
+	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
 		schedule();
 	}
 	__set_current_state(TASK_RUNNING);
-- 
cgit v1.2.3-55-g7522


From 1fbe4b46caca5b01b070af93d513031ffbcc480c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 8 Jul 2015 21:42:13 +0200
Subject: net: pktgen: kill the "Wait for kthread_stop" code in
 pktgen_thread_worker()

pktgen_thread_worker() doesn't need to wait for kthread_stop(), it
can simply exit. Just pktgen_create_thread() and pg_net_exit() should
do get_task_struct()/put_task_struct(). kthread_stop(dead_thread) is
fine.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'net/core')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 41489e3c69af..1ebdf1c0d118 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3571,15 +3571,6 @@ static int pktgen_thread_worker(void *arg)
 	pr_debug("%s removing thread\n", t->tsk->comm);
 	pktgen_rem_thread(t);
 
-	/* Wait for kthread_stop */
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (kthread_should_stop())
-			break;
-		schedule();
-	}
-	__set_current_state(TASK_RUNNING);
-
 	return 0;
 }
 
@@ -3771,6 +3762,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
 	}
 
 	t->net = pn;
+	get_task_struct(p);
 	wake_up_process(p);
 	wait_for_completion(&t->start_done);
 
@@ -3893,6 +3885,7 @@ static void __net_exit pg_net_exit(struct net *net)
 		t = list_entry(q, struct pktgen_thread, th_list);
 		list_del(&t->th_list);
 		kthread_stop(t->tsk);
+		put_task_struct(t->tsk);
 		kfree(t);
 	}
 
-- 
cgit v1.2.3-55-g7522


From e9e4dd3267d0c5234c5c0f47440456b10875dec9 Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Thu, 9 Jul 2015 09:59:09 +0300
Subject: net: do not process device backlog during unregistration

commit 381c759d9916 ("ipv4: Avoid crashing in ip_error")
fixes a problem where processed packet comes from device
with destroyed inetdev (dev->ip_ptr). This is not expected
because inetdev_destroy is called in NETDEV_UNREGISTER
phase and packets should not be processed after
dev_close_many() and synchronize_net(). Above fix is still
required because inetdev_destroy can be called for other
reasons. But it shows the real problem: backlog can keep
packets for long time and they do not hold reference to
device. Such packets are then delivered to upper levels
at the same time when device is unregistered.
Calling flush_backlog after NETDEV_UNREGISTER_FINAL still
accounts all packets from backlog but before that some packets
continue to be delivered to upper levels long after the
synchronize_net call which is supposed to wait the last
ones. Also, as Eric pointed out, processed packets, mostly
from other devices, can continue to add new packets to backlog.

Fix the problem by moving flush_backlog early, after the
device driver is stopped and before the synchronize_net() call.
Then use netif_running check to make sure we do not add more
packets to backlog. We have to do it in enqueue_to_backlog
context when the local IRQ is disabled. As result, after the
flush_backlog and synchronize_net sequence all packets
should be accounted.

Thanks to Eric W. Biederman for the test script and his
valuable feedback!

Reported-by: Vittorio Gambaletta <linuxbugs@vittgam.net>
Fixes: 6e583ce5242f ("net: eliminate refcounting in backlog queue")
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1e57efda055b..6dd126a69aa0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3448,6 +3448,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	local_irq_save(flags);
 
 	rps_lock(sd);
+	if (!netif_running(skb->dev))
+		goto drop;
 	qlen = skb_queue_len(&sd->input_pkt_queue);
 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (qlen) {
@@ -3469,6 +3471,7 @@ enqueue:
 		goto enqueue;
 	}
 
+drop:
 	sd->dropped++;
 	rps_unlock(sd);
 
@@ -6135,6 +6138,7 @@ static void rollback_registered_many(struct list_head *head)
 		unlist_netdevice(dev);
 
 		dev->reg_state = NETREG_UNREGISTERING;
+		on_each_cpu(flush_backlog, dev, 1);
 	}
 
 	synchronize_net();
@@ -6770,8 +6774,6 @@ void netdev_run_todo(void)
 
 		dev->reg_state = NETREG_UNREGISTERED;
 
-		on_each_cpu(flush_backlog, dev, 1);
-
 		netdev_wait_allrefs(dev);
 
 		/* paranoia */
-- 
cgit v1.2.3-55-g7522


From 2c17d27c36dcce2b6bf689f41a46b9e909877c21 Mon Sep 17 00:00:00 2001
From: Julian Anastasov
Date: Thu, 9 Jul 2015 09:59:10 +0300
Subject: net: call rcu_read_lock early in process_backlog

Incoming packet should be either in backlog queue or
in RCU read-side section. Otherwise, the final sequence of
flush_backlog() and synchronize_net() may miss packets
that can run without device reference:

CPU 1                  CPU 2
                       skb->dev: no reference
                       process_backlog:__skb_dequeue
                       process_backlog:local_irq_enable

on_each_cpu for
flush_backlog =>       IPI(hardirq): flush_backlog
                       - packet not found in backlog

                       CPU delayed ...
synchronize_net
- no ongoing RCU
read-side sections

netdev_run_todo,
rcu_barrier: no
ongoing callbacks
                       __netif_receive_skb_core:rcu_read_lock
                       - too late
free dev
                       process packet for freed dev

Fixes: 6e583ce5242f ("net: eliminate refcounting in backlog queue")
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6dd126a69aa0..a8e4dd430285 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3774,8 +3774,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 
 	pt_prev = NULL;
 
-	rcu_read_lock();
-
 another_round:
 	skb->skb_iif = skb->dev->ifindex;
 
@@ -3785,7 +3783,7 @@ another_round:
 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 		skb = skb_vlan_untag(skb);
 		if (unlikely(!skb))
-			goto unlock;
+			goto out;
 	}
 
 #ifdef CONFIG_NET_CLS_ACT
@@ -3815,10 +3813,10 @@ skip_taps:
 	if (static_key_false(&ingress_needed)) {
 		skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
 		if (!skb)
-			goto unlock;
+			goto out;
 
 		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
-			goto unlock;
+			goto out;
 	}
 #endif
 #ifdef CONFIG_NET_CLS_ACT
@@ -3836,7 +3834,7 @@ ncls:
 		if (vlan_do_receive(&skb))
 			goto another_round;
 		else if (unlikely(!skb))
-			goto unlock;
+			goto out;
 	}
 
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
@@ -3848,7 +3846,7 @@ ncls:
 		switch (rx_handler(&skb)) {
 		case RX_HANDLER_CONSUMED:
 			ret = NET_RX_SUCCESS;
-			goto unlock;
+			goto out;
 		case RX_HANDLER_ANOTHER:
 			goto another_round;
 		case RX_HANDLER_EXACT:
@@ -3902,8 +3900,7 @@ drop:
 		ret = NET_RX_DROP;
 	}
 
-unlock:
-	rcu_read_unlock();
+out:
 	return ret;
 }
 
@@ -3934,29 +3931,30 @@ static int __netif_receive_skb(struct sk_buff *skb)
 
 static int netif_receive_skb_internal(struct sk_buff *skb)
 {
+	int ret;
+
 	net_timestamp_check(netdev_tstamp_prequeue, skb);
 
 	if (skb_defer_rx_timestamp(skb))
 		return NET_RX_SUCCESS;
 
+	rcu_read_lock();
+
 #ifdef CONFIG_RPS
 	if (static_key_false(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
-		int cpu, ret;
-
-		rcu_read_lock();
-
-		cpu = get_rps_cpu(skb->dev, skb, &rflow);
+		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 
 		if (cpu >= 0) {
 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 			rcu_read_unlock();
 			return ret;
 		}
-		rcu_read_unlock();
 	}
 #endif
-	return __netif_receive_skb(skb);
+	ret = __netif_receive_skb(skb);
+	rcu_read_unlock();
+	return ret;
 }
 
 /**
@@ -4501,8 +4499,10 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		struct sk_buff *skb;
 
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
+			rcu_read_lock();
 			local_irq_enable();
 			__netif_receive_skb(skb);
+			rcu_read_unlock();
 			local_irq_disable();
 			input_queue_head_incr(sd);
 			if (++work >= quota) {
-- 
cgit v1.2.3-55-g7522