summaryrefslogtreecommitdiffstats
path: root/net/core/dst.c
diff options
context:
space:
mode:
authorLinus Torvalds2017-07-05 21:31:59 +0200
committerLinus Torvalds2017-07-05 21:31:59 +0200
commit5518b69b76680a4f2df96b1deca260059db0c2de (patch)
treef33cd1519c8efb4590500f2f9617400be233238c /net/core/dst.c
parentMerge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert... (diff)
parentMerge branch 'phy-dp83867-workaround-incorrect-RX_CTRL-pin-strap' (diff)
downloadkernel-qcow2-linux-5518b69b76680a4f2df96b1deca260059db0c2de.tar.gz
kernel-qcow2-linux-5518b69b76680a4f2df96b1deca260059db0c2de.tar.xz
kernel-qcow2-linux-5518b69b76680a4f2df96b1deca260059db0c2de.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Reasonably busy this cycle, but perhaps not as busy as in the 4.12 merge window: 1) Several optimizations for UDP processing under high load from Paolo Abeni. 2) Support pacing internally in TCP when using the sch_fq packet scheduler for this is not practical. From Eric Dumazet. 3) Support mutliple filter chains per qdisc, from Jiri Pirko. 4) Move to 1ms TCP timestamp clock, from Eric Dumazet. 5) Add batch dequeueing to vhost_net, from Jason Wang. 6) Flesh out more completely SCTP checksum offload support, from Davide Caratti. 7) More plumbing of extended netlink ACKs, from David Ahern, Pablo Neira Ayuso, and Matthias Schiffer. 8) Add devlink support to nfp driver, from Simon Horman. 9) Add RTM_F_FIB_MATCH flag to RTM_GETROUTE queries, from Roopa Prabhu. 10) Add stack depth tracking to BPF verifier and use this information in the various eBPF JITs. From Alexei Starovoitov. 11) Support XDP on qed device VFs, from Yuval Mintz. 12) Introduce BPF PROG ID for better introspection of installed BPF programs. From Martin KaFai Lau. 13) Add bpf_set_hash helper for TC bpf programs, from Daniel Borkmann. 14) For loads, allow narrower accesses in bpf verifier checking, from Yonghong Song. 15) Support MIPS in the BPF selftests and samples infrastructure, the MIPS eBPF JIT will be merged in via the MIPS GIT tree. From David Daney. 16) Support kernel based TLS, from Dave Watson and others. 17) Remove completely DST garbage collection, from Wei Wang. 18) Allow installing TCP MD5 rules using prefixes, from Ivan Delalande. 19) Add XDP support to Intel i40e driver, from Björn Töpel 20) Add support for TC flower offload in nfp driver, from Simon Horman, Pieter Jansen van Vuuren, Benjamin LaHaise, Jakub Kicinski, and Bert van Leeuwen. 21) IPSEC offloading support in mlx5, from Ilan Tayari. 22) Add HW PTP support to macb driver, from Rafal Ozieblo. 23) Networking refcount_t conversions, From Elena Reshetova. 24) Add sock_ops support to BPF, from Lawrence Brako. This is useful for tuning the TCP sockopt settings of a group of applications, currently via CGROUPs" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1899 commits) net: phy: dp83867: add workaround for incorrect RX_CTRL pin strap dt-bindings: phy: dp83867: provide a workaround for incorrect RX_CTRL pin strap cxgb4: Support for get_ts_info ethtool method cxgb4: Add PTP Hardware Clock (PHC) support cxgb4: time stamping interface for PTP nfp: default to chained metadata prepend format nfp: remove legacy MAC address lookup nfp: improve order of interfaces in breakout mode net: macb: remove extraneous return when MACB_EXT_DESC is defined bpf: add missing break in for the TCP_BPF_SNDCWND_CLAMP case bpf: fix return in load_bpf_file mpls: fix rtm policy in mpls_getroute net, ax25: convert ax25_cb.refcount from atomic_t to refcount_t net, ax25: convert ax25_route.refcount from atomic_t to refcount_t net, ax25: convert ax25_uid_assoc.refcount from atomic_t to refcount_t net, sctp: convert sctp_ep_common.refcnt from atomic_t to refcount_t net, sctp: convert sctp_transport.refcnt from atomic_t to refcount_t net, sctp: convert sctp_chunk.refcnt from atomic_t to refcount_t net, sctp: convert sctp_datamsg.refcnt from atomic_t to refcount_t net, sctp: convert sctp_auth_bytes.refcnt from atomic_t to refcount_t ...
Diffstat (limited to 'net/core/dst.c')
-rw-r--r--net/core/dst.c291
1 files changed, 53 insertions, 238 deletions
diff --git a/net/core/dst.c b/net/core/dst.c
index 13ba4a090c41..00aa972ad1a1 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -42,108 +42,6 @@
* to dirty as few cache lines as possible in __dst_free().
* As this is not a very strong hint, we dont force an alignment on SMP.
*/
-static struct {
- spinlock_t lock;
- struct dst_entry *list;
- unsigned long timer_inc;
- unsigned long timer_expires;
-} dst_garbage = {
- .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
- .timer_inc = DST_GC_MAX,
-};
-static void dst_gc_task(struct work_struct *work);
-static void ___dst_free(struct dst_entry *dst);
-
-static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
-
-static DEFINE_MUTEX(dst_gc_mutex);
-/*
- * long lived entries are maintained in this list, guarded by dst_gc_mutex
- */
-static struct dst_entry *dst_busy_list;
-
-static void dst_gc_task(struct work_struct *work)
-{
- int delayed = 0;
- int work_performed = 0;
- unsigned long expires = ~0L;
- struct dst_entry *dst, *next, head;
- struct dst_entry *last = &head;
-
- mutex_lock(&dst_gc_mutex);
- next = dst_busy_list;
-
-loop:
- while ((dst = next) != NULL) {
- next = dst->next;
- prefetch(&next->next);
- cond_resched();
- if (likely(atomic_read(&dst->__refcnt))) {
- last->next = dst;
- last = dst;
- delayed++;
- continue;
- }
- work_performed++;
-
- dst = dst_destroy(dst);
- if (dst) {
- /* NOHASH and still referenced. Unless it is already
- * on gc list, invalidate it and add to gc list.
- *
- * Note: this is temporary. Actually, NOHASH dst's
- * must be obsoleted when parent is obsoleted.
- * But we do not have state "obsoleted, but
- * referenced by parent", so it is right.
- */
- if (dst->obsolete > 0)
- continue;
-
- ___dst_free(dst);
- dst->next = next;
- next = dst;
- }
- }
-
- spin_lock_bh(&dst_garbage.lock);
- next = dst_garbage.list;
- if (next) {
- dst_garbage.list = NULL;
- spin_unlock_bh(&dst_garbage.lock);
- goto loop;
- }
- last->next = NULL;
- dst_busy_list = head.next;
- if (!dst_busy_list)
- dst_garbage.timer_inc = DST_GC_MAX;
- else {
- /*
- * if we freed less than 1/10 of delayed entries,
- * we can sleep longer.
- */
- if (work_performed <= delayed/10) {
- dst_garbage.timer_expires += dst_garbage.timer_inc;
- if (dst_garbage.timer_expires > DST_GC_MAX)
- dst_garbage.timer_expires = DST_GC_MAX;
- dst_garbage.timer_inc += DST_GC_INC;
- } else {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- }
- expires = dst_garbage.timer_expires;
- /*
- * if the next desired timer is more than 4 seconds in the
- * future then round the timer to whole seconds
- */
- if (expires > 4*HZ)
- expires = round_jiffies_relative(expires);
- schedule_delayed_work(&dst_gc_work, expires);
- }
-
- spin_unlock_bh(&dst_garbage.lock);
- mutex_unlock(&dst_gc_mutex);
-}
-
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
@@ -216,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
}
EXPORT_SYMBOL(dst_alloc);
-static void ___dst_free(struct dst_entry *dst)
-{
- /* The first case (dev==NULL) is required, when
- protocol module is unloaded.
- */
- if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
- dst->input = dst_discard;
- dst->output = dst_discard_out;
- }
- dst->obsolete = DST_OBSOLETE_DEAD;
-}
-
-void __dst_free(struct dst_entry *dst)
-{
- spin_lock_bh(&dst_garbage.lock);
- ___dst_free(dst);
- dst->next = dst_garbage.list;
- dst_garbage.list = dst;
- if (dst_garbage.timer_inc > DST_GC_INC) {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- mod_delayed_work(system_wq, &dst_gc_work,
- dst_garbage.timer_expires);
- }
- spin_unlock_bh(&dst_garbage.lock);
-}
-EXPORT_SYMBOL(__dst_free);
-
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct dst_entry *child;
smp_rmb();
-again:
child = dst->child;
if (!(dst->flags & DST_NOCOUNT))
@@ -269,20 +138,8 @@ again:
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
- if (dst) {
- int nohash = dst->flags & DST_NOHASH;
-
- if (atomic_dec_and_test(&dst->__refcnt)) {
- /* We were real parent of this dst, so kill child. */
- if (nohash)
- goto again;
- } else {
- /* Child is still referenced, return it for freeing. */
- if (nohash)
- return dst;
- /* Child is still in his hash table */
- }
- }
+ if (dst)
+ dst_release_immediate(dst);
return NULL;
}
EXPORT_SYMBOL(dst_destroy);
@@ -292,26 +149,62 @@ static void dst_destroy_rcu(struct rcu_head *head)
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
dst = dst_destroy(dst);
- if (dst)
- __dst_free(dst);
}
+/* Operations to mark dst as DEAD and clean up the net device referenced
+ * by dst:
+ * 1. put the dst under loopback interface and discard all tx/rx packets
+ * on this route.
+ * 2. release the net_device
+ * This function should be called when removing routes from the fib tree
+ * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
+ * make the next dst_ops->check() fail.
+ */
+void dst_dev_put(struct dst_entry *dst)
+{
+ struct net_device *dev = dst->dev;
+
+ dst->obsolete = DST_OBSOLETE_DEAD;
+ if (dst->ops->ifdown)
+ dst->ops->ifdown(dst, dev, true);
+ dst->input = dst_discard;
+ dst->output = dst_discard_out;
+ dst->dev = dev_net(dst->dev)->loopback_dev;
+ dev_hold(dst->dev);
+ dev_put(dev);
+}
+EXPORT_SYMBOL(dst_dev_put);
+
void dst_release(struct dst_entry *dst)
{
if (dst) {
int newrefcnt;
- unsigned short nocache = dst->flags & DST_NOCACHE;
newrefcnt = atomic_dec_return(&dst->__refcnt);
if (unlikely(newrefcnt < 0))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
- if (!newrefcnt && unlikely(nocache))
+ if (!newrefcnt)
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
+void dst_release_immediate(struct dst_entry *dst)
+{
+ if (dst) {
+ int newrefcnt;
+
+ newrefcnt = atomic_dec_return(&dst->__refcnt);
+ if (unlikely(newrefcnt < 0))
+ net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
+ __func__, dst, newrefcnt);
+ if (!newrefcnt)
+ dst_destroy(dst);
+ }
+}
+EXPORT_SYMBOL(dst_release_immediate);
+
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);
@@ -371,21 +264,25 @@ static int dst_md_discard(struct sk_buff *skb)
return 0;
}
-static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
+static void __metadata_dst_init(struct metadata_dst *md_dst,
+ enum metadata_type type, u8 optslen)
+
{
struct dst_entry *dst;
dst = &md_dst->dst;
dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
- DST_METADATA | DST_NOCACHE | DST_NOCOUNT);
+ DST_METADATA | DST_NOCOUNT);
dst->input = dst_md_discard;
dst->output = dst_md_discard_out;
memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
+ md_dst->type = type;
}
-struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
+ gfp_t flags)
{
struct metadata_dst *md_dst;
@@ -393,7 +290,7 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
if (!md_dst)
return NULL;
- __metadata_dst_init(md_dst, optslen);
+ __metadata_dst_init(md_dst, type, optslen);
return md_dst;
}
@@ -407,7 +304,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)
kfree(md_dst);
}
-struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
+struct metadata_dst __percpu *
+metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
{
int cpu;
struct metadata_dst __percpu *md_dst;
@@ -418,91 +316,8 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
return NULL;
for_each_possible_cpu(cpu)
- __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen);
+ __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);
return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
-
-/* Dirty hack. We did it in 2.2 (in __dst_free),
- * we have _very_ good reasons not to repeat
- * this mistake in 2.3, but we have no choice
- * now. _It_ _is_ _explicit_ _deliberate_
- * _race_ _condition_.
- *
- * Commented and originally written by Alexey.
- */
-static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
-{
- if (dst->ops->ifdown)
- dst->ops->ifdown(dst, dev, unregister);
-
- if (dev != dst->dev)
- return;
-
- if (!unregister) {
- dst->input = dst_discard;
- dst->output = dst_discard_out;
- } else {
- dst->dev = dev_net(dst->dev)->loopback_dev;
- dev_hold(dst->dev);
- dev_put(dev);
- }
-}
-
-static int dst_dev_event(struct notifier_block *this, unsigned long event,
- void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct dst_entry *dst, *last = NULL;
-
- switch (event) {
- case NETDEV_UNREGISTER_FINAL:
- case NETDEV_DOWN:
- mutex_lock(&dst_gc_mutex);
- for (dst = dst_busy_list; dst; dst = dst->next) {
- last = dst;
- dst_ifdown(dst, dev, event != NETDEV_DOWN);
- }
-
- spin_lock_bh(&dst_garbage.lock);
- dst = dst_garbage.list;
- dst_garbage.list = NULL;
- /* The code in dst_ifdown places a hold on the loopback device.
- * If the gc entry processing is set to expire after a lengthy
- * interval, this hold can cause netdev_wait_allrefs() to hang
- * out and wait for a long time -- until the the loopback
- * interface is released. If we're really unlucky, it'll emit
- * pr_emerg messages to console too. Reset the interval here,
- * so dst cleanups occur in a more timely fashion.
- */
- if (dst_garbage.timer_inc > DST_GC_INC) {
- dst_garbage.timer_inc = DST_GC_INC;
- dst_garbage.timer_expires = DST_GC_MIN;
- mod_delayed_work(system_wq, &dst_gc_work,
- dst_garbage.timer_expires);
- }
- spin_unlock_bh(&dst_garbage.lock);
-
- if (last)
- last->next = dst;
- else
- dst_busy_list = dst;
- for (; dst; dst = dst->next)
- dst_ifdown(dst, dev, event != NETDEV_DOWN);
- mutex_unlock(&dst_gc_mutex);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block dst_dev_notifier = {
- .notifier_call = dst_dev_event,
- .priority = -10, /* must be called after other network notifiers */
-};
-
-void __init dst_subsys_init(void)
-{
- register_netdevice_notifier(&dst_dev_notifier);
-}