summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/net/netns/xfrm.h2
-rw-r--r--include/net/xfrm.h5
-rw-r--r--net/ipv6/xfrm6_policy.c1
-rw-r--r--net/key/af_key.c2
-rw-r--r--net/xfrm/xfrm_policy.c1248
-rw-r--r--tools/testing/selftests/net/Makefile3
-rwxr-xr-xtools/testing/selftests/net/xfrm_policy.sh302
7 files changed, 1428 insertions, 135 deletions
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 9991e5ef52cc..59f45b1e9dac 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -5,6 +5,7 @@
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <linux/rhashtable-types.h>
#include <linux/xfrm.h>
#include <net/dst_ops.h>
@@ -53,6 +54,7 @@ struct netns_xfrm {
unsigned int policy_count[XFRM_POLICY_MAX * 2];
struct work_struct policy_hash_work;
struct xfrm_policy_hthresh policy_hthresh;
+ struct list_head inexact_bins;
struct sock *nlsk;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 0eb390c205af..0a8d70d16918 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -577,6 +577,7 @@ struct xfrm_policy {
/* This lock only affects elements except for entry. */
rwlock_t lock;
refcount_t refcnt;
+ u32 pos;
struct timer_list timer;
atomic_t genid;
@@ -589,6 +590,7 @@ struct xfrm_policy {
struct xfrm_lifetime_cur curlft;
struct xfrm_policy_walk_entry walk;
struct xfrm_policy_queue polq;
+ bool bydst_reinsert;
u8 type;
u8 action;
u8 flags;
@@ -596,6 +598,7 @@ struct xfrm_policy {
u16 family;
struct xfrm_sec_ctx *security;
struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH];
+ struct hlist_node bydst_inexact_list;
struct rcu_head rcu;
};
@@ -1967,7 +1970,7 @@ static inline void xfrm_dev_state_delete(struct xfrm_state *x)
static inline void xfrm_dev_state_free(struct xfrm_state *x)
{
struct xfrm_state_offload *xso = &x->xso;
- struct net_device *dev = xso->dev;
+ struct net_device *dev = xso->dev;
if (dev && dev->xfrmdev_ops) {
if (dev->xfrmdev_ops->xdo_dev_state_free)
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index d35bcf92969c..769f8f78d3b8 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -262,7 +262,6 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
if (xdst->u.rt6.rt6i_idev->dev == dev) {
struct inet6_dev *loopback_idev =
in6_dev_get(dev_net(dev)->loopback_dev);
- BUG_ON(!loopback_idev);
do {
in6_dev_put(xdst->u.rt6.rt6i_idev);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 9d61266526e7..655c787f9d54 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2020,7 +2020,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp)
{
- struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+ struct xfrm_sec_ctx *xfrm_ctx = xp->security;
if (xfrm_ctx) {
int len = sizeof(struct sadb_x_sec_ctx);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 119a427d9b2b..be04091eb7db 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -26,6 +26,7 @@
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/audit.h>
+#include <linux/rhashtable.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/xfrm.h>
@@ -45,6 +46,99 @@ struct xfrm_flo {
u8 flags;
};
+/* prefixes smaller than this are stored in lists, not trees. */
+#define INEXACT_PREFIXLEN_IPV4 16
+#define INEXACT_PREFIXLEN_IPV6 48
+
+struct xfrm_pol_inexact_node {
+ struct rb_node node;
+ union {
+ xfrm_address_t addr;
+ struct rcu_head rcu;
+ };
+ u8 prefixlen;
+
+ struct rb_root root;
+
+ /* the policies matching this node, can be empty list */
+ struct hlist_head hhead;
+};
+
+/* xfrm inexact policy search tree:
+ * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
+ * |
+ * +---- root_d: sorted by daddr:prefix
+ * | |
+ * | xfrm_pol_inexact_node
+ * | |
+ * | +- root: sorted by saddr/prefix
+ * | | |
+ * | | xfrm_pol_inexact_node
+ * | | |
+ * | | + root: unused
+ * | | |
+ * | | + hhead: saddr:daddr policies
+ * | |
+ * | +- coarse policies and all any:daddr policies
+ * |
+ * +---- root_s: sorted by saddr:prefix
+ * | |
+ * | xfrm_pol_inexact_node
+ * | |
+ * | + root: unused
+ * | |
+ * | + hhead: saddr:any policies
+ * |
+ * +---- coarse policies and all any:any policies
+ *
+ * Lookups return four candidate lists:
+ * 1. any:any list from top-level xfrm_pol_inexact_bin
+ * 2. any:daddr list from daddr tree
+ * 3. saddr:daddr list from 2nd level daddr tree
+ * 4. saddr:any list from saddr tree
+ *
+ * This result set then needs to be searched for the policy with
+ * the lowest priority. If two results have same prio, youngest one wins.
+ */
+
+struct xfrm_pol_inexact_key {
+ possible_net_t net;
+ u32 if_id;
+ u16 family;
+ u8 dir, type;
+};
+
+struct xfrm_pol_inexact_bin {
+ struct xfrm_pol_inexact_key k;
+ struct rhash_head head;
+ /* list containing '*:*' policies */
+ struct hlist_head hhead;
+
+ seqcount_t count;
+ /* tree sorted by daddr/prefix */
+ struct rb_root root_d;
+
+ /* tree sorted by saddr/prefix */
+ struct rb_root root_s;
+
+ /* slow path below */
+ struct list_head inexact_bins;
+ struct rcu_head rcu;
+};
+
+enum xfrm_pol_inexact_candidate_type {
+ XFRM_POL_CAND_BOTH,
+ XFRM_POL_CAND_SADDR,
+ XFRM_POL_CAND_DADDR,
+ XFRM_POL_CAND_ANY,
+
+ XFRM_POL_CAND_MAX,
+};
+
+struct xfrm_pol_inexact_candidates {
+ struct hlist_head *res[XFRM_POL_CAND_MAX];
+};
+
static DEFINE_SPINLOCK(xfrm_if_cb_lock);
static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
@@ -55,6 +149,9 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
static struct kmem_cache *xfrm_dst_cache __ro_after_init;
static __read_mostly seqcount_t xfrm_policy_hash_generation;
+static struct rhashtable xfrm_policy_inexact_table;
+static const struct rhashtable_params xfrm_pol_inexact_params;
+
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
@@ -64,6 +161,25 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
int dir);
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
+ u32 if_id);
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup_rcu(struct net *net,
+ u8 type, u16 family, u8 dir, u32 if_id);
+static struct xfrm_policy *
+xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
+ bool excl);
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
+ struct xfrm_policy *policy);
+
+static bool
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
+ struct xfrm_pol_inexact_bin *b,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr);
+
static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
{
return refcount_inc_not_zero(&policy->refcnt);
@@ -269,6 +385,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
if (policy) {
write_pnet(&policy->xp_net, net);
INIT_LIST_HEAD(&policy->walk.all);
+ INIT_HLIST_NODE(&policy->bydst_inexact_list);
INIT_HLIST_NODE(&policy->bydst);
INIT_HLIST_NODE(&policy->byidx);
rwlock_init(&policy->lock);
@@ -365,7 +482,7 @@ static struct hlist_head *policy_hash_bysel(struct net *net,
hash = __sel_hash(sel, family, hmask, dbits, sbits);
if (hash == hmask + 1)
- return &net->xfrm.policy_inexact[dir];
+ return NULL;
return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
@@ -563,6 +680,533 @@ static void xfrm_hash_resize(struct work_struct *work)
mutex_unlock(&hash_resize_mutex);
}
+static void xfrm_hash_reset_inexact_table(struct net *net)
+{
+ struct xfrm_pol_inexact_bin *b;
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins)
+ INIT_HLIST_HEAD(&b->hhead);
+}
+
+/* Make sure *pol can be inserted into fastbin.
+ * Useful to check that later insert requests will be sucessful
+ * (provided xfrm_policy_lock is held throughout).
+ */
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
+{
+ struct xfrm_pol_inexact_bin *bin, *prev;
+ struct xfrm_pol_inexact_key k = {
+ .family = pol->family,
+ .type = pol->type,
+ .dir = dir,
+ .if_id = pol->if_id,
+ };
+ struct net *net = xp_net(pol);
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ write_pnet(&k.net, net);
+ bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
+ xfrm_pol_inexact_params);
+ if (bin)
+ return bin;
+
+ bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
+ if (!bin)
+ return NULL;
+
+ bin->k = k;
+ INIT_HLIST_HEAD(&bin->hhead);
+ bin->root_d = RB_ROOT;
+ bin->root_s = RB_ROOT;
+ seqcount_init(&bin->count);
+
+ prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
+ &bin->k, &bin->head,
+ xfrm_pol_inexact_params);
+ if (!prev) {
+ list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
+ return bin;
+ }
+
+ kfree(bin);
+
+ return IS_ERR(prev) ? NULL : prev;
+}
+
+static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
+ int family, u8 prefixlen)
+{
+ if (xfrm_addr_any(addr, family))
+ return true;
+
+ if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6)
+ return true;
+
+ if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4)
+ return true;
+
+ return false;
+}
+
+static bool
+xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
+{
+ const xfrm_address_t *addr;
+ bool saddr_any, daddr_any;
+ u8 prefixlen;
+
+ addr = &policy->selector.saddr;
+ prefixlen = policy->selector.prefixlen_s;
+
+ saddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+ policy->family,
+ prefixlen);
+ addr = &policy->selector.daddr;
+ prefixlen = policy->selector.prefixlen_d;
+ daddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+ policy->family,
+ prefixlen);
+ return saddr_any && daddr_any;
+}
+
+static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
+ const xfrm_address_t *addr, u8 prefixlen)
+{
+ node->addr = *addr;
+ node->prefixlen = prefixlen;
+}
+
+static struct xfrm_pol_inexact_node *
+xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
+{
+ struct xfrm_pol_inexact_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_ATOMIC);
+ if (node)
+ xfrm_pol_inexact_node_init(node, addr, prefixlen);
+
+ return node;
+}
+
+static int xfrm_policy_addr_delta(const xfrm_address_t *a,
+ const xfrm_address_t *b,
+ u8 prefixlen, u16 family)
+{
+ unsigned int pdw, pbi;
+ int delta = 0;
+
+ switch (family) {
+ case AF_INET:
+ if (sizeof(long) == 4 && prefixlen == 0)
+ return ntohl(a->a4) - ntohl(b->a4);
+ return (ntohl(a->a4) & ((~0UL << (32 - prefixlen)))) -
+ (ntohl(b->a4) & ((~0UL << (32 - prefixlen))));
+ case AF_INET6:
+ pdw = prefixlen >> 5;
+ pbi = prefixlen & 0x1f;
+
+ if (pdw) {
+ delta = memcmp(a->a6, b->a6, pdw << 2);
+ if (delta)
+ return delta;
+ }
+ if (pbi) {
+ u32 mask = ~0u << (32 - pbi);
+
+ delta = (ntohl(a->a6[pdw]) & mask) -
+ (ntohl(b->a6[pdw]) & mask);
+ }
+ break;
+ default:
+ break;
+ }
+
+ return delta;
+}
+
+static void xfrm_policy_inexact_list_reinsert(struct net *net,
+ struct xfrm_pol_inexact_node *n,
+ u16 family)
+{
+ unsigned int matched_s, matched_d;
+ struct hlist_node *newpos = NULL;
+ struct xfrm_policy *policy, *p;
+
+ matched_s = 0;
+ matched_d = 0;
+
+ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+ bool matches_s, matches_d;
+
+ if (!policy->bydst_reinsert)
+ continue;
+
+ WARN_ON_ONCE(policy->family != family);
+
+ policy->bydst_reinsert = false;
+ hlist_for_each_entry(p, &n->hhead, bydst) {
+ if (policy->priority >= p->priority)
+ newpos = &p->bydst;
+ else
+ break;
+ }
+
+ if (newpos)
+ hlist_add_behind(&policy->bydst, newpos);
+ else
+ hlist_add_head(&policy->bydst, &n->hhead);
+
+ /* paranoia checks follow.
+ * Check that the reinserted policy matches at least
+ * saddr or daddr for current node prefix.
+ *
+ * Matching both is fine, matching saddr in one policy
+ * (but not daddr) and then matching only daddr in another
+ * is a bug.
+ */
+ matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
+ &n->addr,
+ n->prefixlen,
+ family) == 0;
+ matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
+ &n->addr,
+ n->prefixlen,
+ family) == 0;
+ if (matches_s && matches_d)
+ continue;
+
+ WARN_ON_ONCE(!matches_s && !matches_d);
+ if (matches_s)
+ matched_s++;
+ if (matches_d)
+ matched_d++;
+ WARN_ON_ONCE(matched_s && matched_d);
+ }
+}
+
+static void xfrm_policy_inexact_node_reinsert(struct net *net,
+ struct xfrm_pol_inexact_node *n,
+ struct rb_root *new,
+ u16 family)
+{
+ struct rb_node **p, *parent = NULL;
+ struct xfrm_pol_inexact_node *node;
+
+ /* we should not have another subtree here */
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
+
+ p = &new->rb_node;
+ while (*p) {
+ u8 prefixlen;
+ int delta;
+
+ parent = *p;
+ node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
+
+ prefixlen = min(node->prefixlen, n->prefixlen);
+
+ delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
+ prefixlen, family);
+ if (delta < 0) {
+ p = &parent->rb_left;
+ } else if (delta > 0) {
+ p = &parent->rb_right;
+ } else {
+ struct xfrm_policy *tmp;
+
+ hlist_for_each_entry(tmp, &node->hhead, bydst)
+ tmp->bydst_reinsert = true;
+ hlist_for_each_entry(tmp, &n->hhead, bydst)
+ tmp->bydst_reinsert = true;
+
+ INIT_HLIST_HEAD(&node->hhead);
+ xfrm_policy_inexact_list_reinsert(net, node, family);
+
+ if (node->prefixlen == n->prefixlen) {
+ kfree_rcu(n, rcu);
+ return;
+ }
+
+ rb_erase(*p, new);
+ kfree_rcu(n, rcu);
+ n = node;
+ n->prefixlen = prefixlen;
+ *p = new->rb_node;
+ parent = NULL;
+ }
+ }
+
+ rb_link_node_rcu(&n->node, parent, p);
+ rb_insert_color(&n->node, new);
+}
+
+/* merge nodes v and n */
+static void xfrm_policy_inexact_node_merge(struct net *net,
+ struct xfrm_pol_inexact_node *v,
+ struct xfrm_pol_inexact_node *n,
+ u16 family)
+{
+ struct xfrm_pol_inexact_node *node;
+ struct xfrm_policy *tmp;
+ struct rb_node *rnode;
+
+ /* To-be-merged node v has a subtree.
+ *
+ * Dismantle it and insert its nodes to n->root.
+ */
+ while ((rnode = rb_first(&v->root)) != NULL) {
+ node = rb_entry(rnode, struct xfrm_pol_inexact_node, node);
+ rb_erase(&node->node, &v->root);
+ xfrm_policy_inexact_node_reinsert(net, node, &n->root,
+ family);
+ }
+
+ hlist_for_each_entry(tmp, &v->hhead, bydst)
+ tmp->bydst_reinsert = true;
+ hlist_for_each_entry(tmp, &n->hhead, bydst)
+ tmp->bydst_reinsert = true;
+
+ INIT_HLIST_HEAD(&n->hhead);
+ xfrm_policy_inexact_list_reinsert(net, n, family);
+}
+
+static struct xfrm_pol_inexact_node *
+xfrm_policy_inexact_insert_node(struct net *net,
+ struct rb_root *root,
+ xfrm_address_t *addr,
+ u16 family, u8 prefixlen, u8 dir)
+{
+ struct xfrm_pol_inexact_node *cached = NULL;
+ struct rb_node **p, *parent = NULL;
+ struct xfrm_pol_inexact_node *node;
+
+ p = &root->rb_node;
+ while (*p) {
+ int delta;
+
+ parent = *p;
+ node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
+
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
+ node->prefixlen,
+ family);
+ if (delta == 0 && prefixlen >= node->prefixlen) {
+ WARN_ON_ONCE(cached); /* ipsec policies got lost */
+ return node;
+ }
+
+ if (delta < 0)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+
+ if (prefixlen < node->prefixlen) {
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
+ prefixlen,
+ family);
+ if (delta)
+ continue;
+
+ /* This node is a subnet of the new prefix. It needs
+ * to be removed and re-inserted with the smaller
+ * prefix and all nodes that are now also covered
+ * by the reduced prefixlen.
+ */
+ rb_erase(&node->node, root);
+
+ if (!cached) {
+ xfrm_pol_inexact_node_init(node, addr,
+ prefixlen);
+ cached = node;
+ } else {
+ /* This node also falls within the new
+ * prefixlen. Merge the to-be-reinserted
+ * node and this one.
+ */
+ xfrm_policy_inexact_node_merge(net, node,
+ cached, family);
+ kfree_rcu(node, rcu);
+ }
+
+ /* restart */
+ p = &root->rb_node;
+ parent = NULL;
+ }
+ }
+
+ node = cached;
+ if (!node) {
+ node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
+ if (!node)
+ return NULL;
+ }
+
+ rb_link_node_rcu(&node->node, parent, p);
+ rb_insert_color(&node->node, root);
+
+ return node;
+}
+
+static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
+{
+ struct xfrm_pol_inexact_node *node;
+ struct rb_node *rn = rb_first(r);
+
+ while (rn) {
+ node = rb_entry(rn, struct xfrm_pol_inexact_node, node);
+
+ xfrm_policy_inexact_gc_tree(&node->root, rm);
+ rn = rb_next(rn);
+
+ if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) {
+ WARN_ON_ONCE(rm);
+ continue;
+ }
+
+ rb_erase(&node->node, r);
+ kfree_rcu(node, rcu);
+ }
+}
+
+static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
+{
+ write_seqcount_begin(&b->count);
+ xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
+ xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
+ write_seqcount_end(&b->count);
+
+ if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) ||
+ !hlist_empty(&b->hhead)) {
+ WARN_ON_ONCE(net_exit);
+ return;
+ }
+
+ if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
+ xfrm_pol_inexact_params) == 0) {
+ list_del(&b->inexact_bins);
+ kfree_rcu(b, rcu);
+ }
+}
+
+static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
+{
+ struct net *net = read_pnet(&b->k.net);
+
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ __xfrm_policy_inexact_prune_bin(b, false);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+}
+
+static void __xfrm_policy_inexact_flush(struct net *net)
+{
+ struct xfrm_pol_inexact_bin *bin, *t;
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
+ __xfrm_policy_inexact_prune_bin(bin, false);
+}
+
+static struct hlist_head *
+xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
+ struct xfrm_policy *policy, u8 dir)
+{
+ struct xfrm_pol_inexact_node *n;
+ struct net *net;
+
+ net = xp_net(policy);
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ if (xfrm_policy_inexact_insert_use_any_list(policy))
+ return &bin->hhead;
+
+ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
+ policy->family,
+ policy->selector.prefixlen_d)) {
+ write_seqcount_begin(&bin->count);
+ n = xfrm_policy_inexact_insert_node(net,
+ &bin->root_s,
+ &policy->selector.saddr,
+ policy->family,
+ policy->selector.prefixlen_s,
+ dir);
+ write_seqcount_end(&bin->count);
+ if (!n)
+ return NULL;
+
+ return &n->hhead;
+ }
+
+ /* daddr is fixed */
+ write_seqcount_begin(&bin->count);
+ n = xfrm_policy_inexact_insert_node(net,
+ &bin->root_d,
+ &policy->selector.daddr,
+ policy->family,
+ policy->selector.prefixlen_d, dir);
+ write_seqcount_end(&bin->count);
+ if (!n)
+ return NULL;
+
+ /* saddr is wildcard */
+ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
+ policy->family,
+ policy->selector.prefixlen_s))
+ return &n->hhead;
+
+ write_seqcount_begin(&bin->count);
+ n = xfrm_policy_inexact_insert_node(net,
+ &n->root,
+ &policy->selector.saddr,
+ policy->family,
+ policy->selector.prefixlen_s, dir);
+ write_seqcount_end(&bin->count);
+ if (!n)
+ return NULL;
+
+ return &n->hhead;
+}
+
+static struct xfrm_policy *
+xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
+{
+ struct xfrm_pol_inexact_bin *bin;
+ struct xfrm_policy *delpol;
+ struct hlist_head *chain;
+ struct net *net;
+
+ bin = xfrm_policy_inexact_alloc_bin(policy, dir);
+ if (!bin)
+ return ERR_PTR(-ENOMEM);
+
+ net = xp_net(policy);
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
+ if (!chain) {
+ __xfrm_policy_inexact_prune_bin(bin, false);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ delpol = xfrm_policy_insert_list(chain, policy, excl);
+ if (delpol && excl) {
+ __xfrm_policy_inexact_prune_bin(bin, false);
+ return ERR_PTR(-EEXIST);
+ }
+
+ chain = &net->xfrm.policy_inexact[dir];
+ xfrm_policy_insert_inexact_list(chain, policy);
+
+ if (delpol)
+ __xfrm_policy_inexact_prune_bin(bin, false);
+
+ return delpol;
+}
+
static void xfrm_hash_rebuild(struct work_struct *work)
{
struct net *net = container_of(work, struct net,
@@ -592,7 +1236,50 @@ static void xfrm_hash_rebuild(struct work_struct *work)
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ /* make sure that we can insert the indirect policies again before
+ * we start with destructive action.
+ */
+ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
+ struct xfrm_pol_inexact_bin *bin;
+ u8 dbits, sbits;
+
+ dir = xfrm_policy_id2dir(policy->index);
+ if (policy->walk.dead || dir >= XFRM_POLICY_MAX)
+ continue;
+
+ if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
+ if (policy->family == AF_INET) {
+ dbits = rbits4;
+ sbits = lbits4;
+ } else {
+ dbits = rbits6;
+ sbits = lbits6;
+ }
+ } else {
+ if (policy->family == AF_INET) {
+ dbits = lbits4;
+ sbits = rbits4;
+ } else {
+ dbits = lbits6;
+ sbits = rbits6;
+ }
+ }
+
+ if (policy->selector.prefixlen_d < dbits ||
+ policy->selector.prefixlen_s < sbits)
+ continue;
+
+ bin = xfrm_policy_inexact_alloc_bin(policy, dir);
+ if (!bin)
+ goto out_unlock;
+
+ if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
+ goto out_unlock;
+ }
+
/* reset the bydst and inexact table in all directions */
+ xfrm_hash_reset_inexact_table(net);
+
for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
hmask = net->xfrm.policy_bydst[dir].hmask;
@@ -616,15 +1303,23 @@ static void xfrm_hash_rebuild(struct work_struct *work)
/* re-insert all policies by order of creation */
list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
- if (policy->walk.dead ||
- xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) {
+ if (policy->walk.dead)
+ continue;
+ dir = xfrm_policy_id2dir(policy->index);
+ if (dir >= XFRM_POLICY_MAX) {
/* skip socket policies */
continue;
}
newpos = NULL;
chain = policy_hash_bysel(net, &policy->selector,
- policy->family,
- xfrm_policy_id2dir(policy->index));
+ policy->family, dir);
+ if (!chain) {
+ void *p = xfrm_policy_inexact_insert(policy, dir, 0);
+
+ WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
+ continue;
+ }
+
hlist_for_each_entry(pol, chain, bydst) {
if (policy->priority >= pol->priority)
newpos = &pol->bydst;
@@ -637,6 +1332,8 @@ static void xfrm_hash_rebuild(struct work_struct *work)
hlist_add_head_rcu(&policy->bydst, chain);
}
+out_unlock:
+ __xfrm_policy_inexact_flush(net);
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
mutex_unlock(&hash_resize_mutex);
@@ -740,18 +1437,97 @@ static bool xfrm_policy_mark_match(struct xfrm_policy *policy,
return false;
}
-int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
{
- struct net *net = xp_net(policy);
- struct xfrm_policy *pol;
- struct xfrm_policy *delpol;
- struct hlist_head *chain;
- struct hlist_node *newpos;
+ const struct xfrm_pol_inexact_key *k = data;
+ u32 a = k->type << 24 | k->dir << 16 | k->family;
+
+ return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)),
+ seed);
+}
+
+static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
+{
+ const struct xfrm_pol_inexact_bin *b = data;
+
+ return xfrm_pol_bin_key(&b->k, 0, seed);
+}
+
+static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct xfrm_pol_inexact_key *key = arg->key;
+ const struct xfrm_pol_inexact_bin *b = ptr;
+ int ret;
+
+ if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net)))
+ return -1;
+
+ ret = b->k.dir ^ key->dir;
+ if (ret)
+ return ret;
+
+ ret = b->k.type ^ key->type;
+ if (ret)
+ return ret;
+
+ ret = b->k.family ^ key->family;
+ if (ret)
+ return ret;
+
+ return b->k.if_id ^ key->if_id;
+}
+
+static const struct rhashtable_params xfrm_pol_inexact_params = {
+ .head_offset = offsetof(struct xfrm_pol_inexact_bin, head),
+ .hashfn = xfrm_pol_bin_key,
+ .obj_hashfn = xfrm_pol_bin_obj,
+ .obj_cmpfn = xfrm_pol_bin_cmp,
+ .automatic_shrinking = true,
+};
+
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
+ struct xfrm_policy *policy)
+{
+ struct xfrm_policy *pol, *delpol = NULL;
+ struct hlist_node *newpos = NULL;
+ int i = 0;
+
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
+ if (pol->type == policy->type &&
+ pol->if_id == policy->if_id &&
+ !selector_cmp(&pol->selector, &policy->selector) &&
+ xfrm_policy_mark_match(policy, pol) &&
+ xfrm_sec_ctx_match(pol->security, policy->security) &&
+ !WARN_ON(delpol)) {
+ delpol = pol;
+ if (policy->priority > pol->priority)
+ continue;
+ } else if (policy->priority >= pol->priority) {
+ newpos = &pol->bydst_inexact_list;
+ continue;
+ }
+ if (delpol)
+ break;
+ }
+
+ if (newpos)
+ hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
+ else
+ hlist_add_head_rcu(&policy->bydst_inexact_list, chain);
+
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
+ pol->pos = i;
+ i++;
+ }
+}
+
+static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
+ struct xfrm_policy *policy,
+ bool excl)
+{
+ struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
- delpol = NULL;
- newpos = NULL;
hlist_for_each_entry(pol, chain, bydst) {
if (pol->type == policy->type &&
pol->if_id == policy->if_id &&
@@ -759,24 +1535,45 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
xfrm_policy_mark_match(policy, pol) &&
xfrm_sec_ctx_match(pol->security, policy->security) &&
!WARN_ON(delpol)) {
- if (excl) {
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- return -EEXIST;
- }
+ if (excl)
+ return ERR_PTR(-EEXIST);
delpol = pol;
if (policy->priority > pol->priority)
continue;
} else if (policy->priority >= pol->priority) {
- newpos = &pol->bydst;
+ newpos = pol;
continue;
}
if (delpol)
break;
}
+
if (newpos)
- hlist_add_behind_rcu(&policy->bydst, newpos);
+ hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
else
hlist_add_head_rcu(&policy->bydst, chain);
+
+ return delpol;
+}
+
+int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+{
+ struct net *net = xp_net(policy);
+ struct xfrm_policy *delpol;
+ struct hlist_head *chain;
+
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
+ if (chain)
+ delpol = xfrm_policy_insert_list(chain, policy, excl);
+ else
+ delpol = xfrm_policy_inexact_insert(policy, dir, excl);
+
+ if (IS_ERR(delpol)) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return PTR_ERR(delpol);
+ }
+
__xfrm_policy_link(policy, dir);
/* After previous checking, family can either be AF_INET or AF_INET6 */
@@ -806,43 +1603,96 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
}
EXPORT_SYMBOL(xfrm_policy_insert);
+static struct xfrm_policy *
+__xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id,
+ u8 type, int dir,
+ struct xfrm_selector *sel,
+ struct xfrm_sec_ctx *ctx)
+{
+ struct xfrm_policy *pol;
+
+ if (!chain)
+ return NULL;
+
+ hlist_for_each_entry(pol, chain, bydst) {
+ if (pol->type == type &&
+ pol->if_id == if_id &&
+ (mark & pol->mark.m) == pol->mark.v &&
+ !selector_cmp(sel, &pol->selector) &&
+ xfrm_sec_ctx_match(ctx, pol->security))
+ return pol;
+ }
+
+ return NULL;
+}
+
struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
u8 type, int dir,
struct xfrm_selector *sel,
struct xfrm_sec_ctx *ctx, int delete,
int *err)
{
- struct xfrm_policy *pol, *ret;
+ struct xfrm_pol_inexact_bin *bin = NULL;
+ struct xfrm_policy *pol, *ret = NULL;
struct hlist_head *chain;
*err = 0;
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, sel, sel->family, dir);
- ret = NULL;
- hlist_for_each_entry(pol, chain, bydst) {
- if (pol->type == type &&
- pol->if_id == if_id &&
- (mark & pol->mark.m) == pol->mark.v &&
- !selector_cmp(sel, &pol->selector) &&
- xfrm_sec_ctx_match(ctx, pol->security)) {
- xfrm_pol_hold(pol);
- if (delete) {
- *err = security_xfrm_policy_delete(
- pol->security);
- if (*err) {
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- return pol;
- }
- __xfrm_policy_unlink(pol, dir);
+ if (!chain) {
+ struct xfrm_pol_inexact_candidates cand;
+ int i;
+
+ bin = xfrm_policy_inexact_lookup(net, type,
+ sel->family, dir, if_id);
+ if (!bin) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return NULL;
+ }
+
+ if (!xfrm_policy_find_inexact_candidates(&cand, bin,
+ &sel->saddr,
+ &sel->daddr)) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return NULL;
+ }
+
+ pol = NULL;
+ for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
+ struct xfrm_policy *tmp;
+
+ tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
+ if_id, type, dir,
+ sel, ctx);
+ if (!tmp)
+ continue;
+
+ if (!pol || tmp->pos < pol->pos)
+ pol = tmp;
+ }
+ } else {
+ pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
+ sel, ctx);
+ }
+
+ if (pol) {
+ xfrm_pol_hold(pol);
+ if (delete) {
+ *err = security_xfrm_policy_delete(pol->security);
+ if (*err) {
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return pol;
}
- ret = pol;
- break;
+ __xfrm_policy_unlink(pol, dir);
}
+ ret = pol;
}
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
if (ret && delete)
xfrm_policy_kill(ret);
+ if (bin && delete)
+ xfrm_policy_inexact_prune_bin(bin);
return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -892,36 +1742,19 @@ EXPORT_SYMBOL(xfrm_policy_byid);
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
- int dir, err = 0;
+ struct xfrm_policy *pol;
+ int err = 0;
- for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
- struct xfrm_policy *pol;
- int i;
+ list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+ if (pol->walk.dead ||
+ xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
+ pol->type != type)
+ continue;
- hlist_for_each_entry(pol,
- &net->xfrm.policy_inexact[dir], bydst) {
- if (pol->type != type)
- continue;
- err = security_xfrm_policy_delete(pol->security);
- if (err) {
- xfrm_audit_policy_delete(pol, 0, task_valid);
- return err;
- }
- }
- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
- hlist_for_each_entry(pol,
- net->xfrm.policy_bydst[dir].table + i,
- bydst) {
- if (pol->type != type)
- continue;
- err = security_xfrm_policy_delete(
- pol->security);
- if (err) {
- xfrm_audit_policy_delete(pol, 0,
- task_valid);
- return err;
- }
- }
+ err = security_xfrm_policy_delete(pol->security);
+ if (err) {
+ xfrm_audit_policy_delete(pol, 0, task_valid);
+ return err;
}
}
return err;
@@ -937,6 +1770,7 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
int dir, err = 0, cnt = 0;
+ struct xfrm_policy *pol;
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
@@ -944,48 +1778,25 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
if (err)
goto out;
- for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
- struct xfrm_policy *pol;
- int i;
-
- again1:
- hlist_for_each_entry(pol,
- &net->xfrm.policy_inexact[dir], bydst) {
- if (pol->type != type)
- continue;
- __xfrm_policy_unlink(pol, dir);
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- cnt++;
-
- xfrm_audit_policy_delete(pol, 1, task_valid);
-
- xfrm_policy_kill(pol);
-
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- goto again1;
- }
-
- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
- again2:
- hlist_for_each_entry(pol,
- net->xfrm.policy_bydst[dir].table + i,
- bydst) {
- if (pol->type != type)
- continue;
- __xfrm_policy_unlink(pol, dir);
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
- cnt++;
-
- xfrm_audit_policy_delete(pol, 1, task_valid);
- xfrm_policy_kill(pol);
-
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- goto again2;
- }
- }
+again:
+ list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
+ dir = xfrm_policy_id2dir(pol->index);
+ if (pol->walk.dead ||
+ dir >= XFRM_POLICY_MAX ||
+ pol->type != type)
+ continue;
+ __xfrm_policy_unlink(pol, dir);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ cnt++;
+ xfrm_audit_policy_delete(pol, 1, task_valid);
+ xfrm_policy_kill(pol);
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ goto again;
}
- if (!cnt)
+ if (cnt)
+ __xfrm_policy_inexact_flush(net);
+ else
err = -ESRCH;
out:
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
@@ -1084,21 +1895,189 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
if (match)
ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
dir);
-
return ret;
}
+static struct xfrm_pol_inexact_node *
+xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
+ seqcount_t *count,
+ const xfrm_address_t *addr, u16 family)
+{
+ const struct rb_node *parent;
+ int seq;
+
+again:
+ seq = read_seqcount_begin(count);
+
+ parent = rcu_dereference_raw(r->rb_node);
+ while (parent) {
+ struct xfrm_pol_inexact_node *node;
+ int delta;
+
+ node = rb_entry(parent, struct xfrm_pol_inexact_node, node);
+
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
+ node->prefixlen, family);
+ if (delta < 0) {
+ parent = rcu_dereference_raw(parent->rb_left);
+ continue;
+ } else if (delta > 0) {
+ parent = rcu_dereference_raw(parent->rb_right);
+ continue;
+ }
+
+ return node;
+ }
+
+ if (read_seqcount_retry(count, seq))
+ goto again;
+
+ return NULL;
+}
+
+static bool
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
+ struct xfrm_pol_inexact_bin *b,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
+{
+ struct xfrm_pol_inexact_node *n;
+ u16 family;
+
+ if (!b)
+ return false;
+
+ family = b->k.family;
+ memset(cand, 0, sizeof(*cand));
+ cand->res[XFRM_POL_CAND_ANY] = &b->hhead;
+
+ n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
+ family);
+ if (n) {
+ cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
+ n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr,
+ family);
+ if (n)
+ cand->res[XFRM_POL_CAND_BOTH] = &n->hhead;
+ }
+
+ n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
+ family);
+ if (n)
+ cand->res[XFRM_POL_CAND_SADDR] = &n->hhead;
+
+ return true;
+}
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
+ u8 dir, u32 if_id)
+{
+ struct xfrm_pol_inexact_key k = {
+ .family = family,
+ .type = type,
+ .dir = dir,
+ .if_id = if_id,
+ };
+
+ write_pnet(&k.net, net);
+
+ return rhashtable_lookup(&xfrm_policy_inexact_table, &k,
+ xfrm_pol_inexact_params);
+}
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
+ u8 dir, u32 if_id)
+{
+ struct xfrm_pol_inexact_bin *bin;
+
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+ rcu_read_lock();
+ bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
+ rcu_read_unlock();
+
+ return bin;
+}
+
+static struct xfrm_policy *
+__xfrm_policy_eval_candidates(struct hlist_head *chain,
+ struct xfrm_policy *prefer,
+ const struct flowi *fl,
+ u8 type, u16 family, int dir, u32 if_id)
+{
+ u32 priority = prefer ? prefer->priority : ~0u;
+ struct xfrm_policy *pol;
+
+ if (!chain)
+ return NULL;
+
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
+ int err;
+
+ if (pol->priority > priority)
+ break;
+
+ err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
+ if (err) {
+ if (err != -ESRCH)
+ return ERR_PTR(err);
+
+ continue;
+ }
+
+ if (prefer) {
+ /* matches. Is it older than *prefer? */
+ if (pol->priority == priority &&
+ prefer->pos < pol->pos)
+ return prefer;
+ }
+
+ return pol;
+ }
+
+ return NULL;
+}
+
+static struct xfrm_policy *
+xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
+ struct xfrm_policy *prefer,
+ const struct flowi *fl,
+ u8 type, u16 family, int dir, u32 if_id)
+{
+ struct xfrm_policy *tmp;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(cand->res); i++) {
+ tmp = __xfrm_policy_eval_candidates(cand->res[i],
+ prefer,
+ fl, type, family, dir,
+ if_id);
+ if (!tmp)
+ continue;
+
+ if (IS_ERR(tmp))
+ return tmp;
+ prefer = tmp;
+ }
+
+ return prefer;
+}
+
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
const struct flowi *fl,
u16 family, u8 dir,
u32 if_id)
{
- int err;
- struct xfrm_policy *pol, *ret;
+ struct xfrm_pol_inexact_candidates cand;
const xfrm_address_t *daddr, *saddr;
+ struct xfrm_pol_inexact_bin *bin;
+ struct xfrm_policy *pol, *ret;
struct hlist_head *chain;
unsigned int sequence;
u32 priority;
+ int err;
daddr = xfrm_flowi_daddr(fl, family);
saddr = xfrm_flowi_saddr(fl, family);
@@ -1129,25 +2108,20 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
break;
}
}
- chain = &net->xfrm.policy_inexact[dir];
- hlist_for_each_entry_rcu(pol, chain, bydst) {
- if ((pol->priority >= priority) && ret)
- break;
+ bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
+ if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
+ daddr))
+ goto skip_inexact;
- err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
- if (err) {
- if (err == -ESRCH)
- continue;
- else {
- ret = ERR_PTR(err);
- goto fail;
- }
- } else {
- ret = pol;
- break;
- }
+ pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
+ family, dir, if_id);
+ if (pol) {
+ ret = pol;
+ if (IS_ERR(pol))
+ goto fail;
}
+skip_inexact:
if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
goto retry;
@@ -1239,6 +2213,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
/* Socket policies are not hashed. */
if (!hlist_unhashed(&pol->bydst)) {
hlist_del_rcu(&pol->bydst);
+ hlist_del_init(&pol->bydst_inexact_list);
hlist_del(&pol->byidx);
}
@@ -1811,7 +2786,7 @@ static void xfrm_policy_queue_process(struct timer_list *t)
pq->timeout = pq->timeout << 1;
if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
xfrm_pol_hold(pol);
- goto out;
+ goto out;
}
dst_release(dst);
@@ -2816,13 +3791,17 @@ static void xfrm_statistics_fini(struct net *net)
static int __net_init xfrm_policy_init(struct net *net)
{
unsigned int hmask, sz;
- int dir;
+ int dir, err;
- if (net_eq(net, &init_net))
+ if (net_eq(net, &init_net)) {
xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
sizeof(struct xfrm_dst),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+ err = rhashtable_init(&xfrm_policy_inexact_table,
+ &xfrm_pol_inexact_params);
+ BUG_ON(err);
+ }
hmask = 8 - 1;
sz = (hmask+1) * sizeof(struct hlist_head);
@@ -2857,6 +3836,7 @@ static int __net_init xfrm_policy_init(struct net *net)
seqlock_init(&net->xfrm.policy_hthresh.lock);
INIT_LIST_HEAD(&net->xfrm.policy_all);
+ INIT_LIST_HEAD(&net->xfrm.inexact_bins);
INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
return 0;
@@ -2875,6 +3855,7 @@ out_byidx:
static void xfrm_policy_fini(struct net *net)
{
+ struct xfrm_pol_inexact_bin *b, *t;
unsigned int sz;
int dir;
@@ -2900,6 +3881,11 @@ static void xfrm_policy_fini(struct net *net)
sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
xfrm_hash_free(net->xfrm.policy_byidx, sz);
+
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+ list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins)
+ __xfrm_policy_inexact_prune_bin(b, true);
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}
static int __net_init xfrm_net_init(struct net *net)
@@ -3065,7 +4051,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
}
}
chain = &net->xfrm.policy_inexact[dir];
- hlist_for_each_entry(pol, chain, bydst) {
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
if ((pol->priority >= priority) && ret)
break;
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index aeecc3ef53d0..9543a4c2f9be 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -4,7 +4,8 @@
CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
CFLAGS += -I../../../../usr/include/
-TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
+TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \
+ rtnetlink.sh xfrm_policy.sh
TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh
diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh
new file mode 100755
index 000000000000..8db35b99457c
--- /dev/null
+++ b/tools/testing/selftests/net/xfrm_policy.sh
@@ -0,0 +1,302 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check xfrm policy resolution. Topology:
+#
+# 1.2 1.1 3.1 3.10 2.1 2.2
+# eth1 eth1 veth0 veth0 eth1 eth1
+# ns1 ---- ns3 ----- ns4 ---- ns2
+#
+# ns3 and ns4 are connected via ipsec tunnel.
+# pings from ns1 to ns2 (and vice versa) are supposed to work like this:
+# ns1: ping 10.0.2.2: passes via ipsec tunnel.
+# ns2: ping 10.0.1.2: passes via ipsec tunnel.
+
+# ns1: ping 10.0.1.253: passes via ipsec tunnel (direct policy)
+# ns2: ping 10.0.2.253: passes via ipsec tunnel (direct policy)
+#
+# ns1: ping 10.0.2.254: does NOT pass via ipsec tunnel (exception)
+# ns2: ping 10.0.1.254: does NOT pass via ipsec tunnel (exception)
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+policy_checks_ok=1
+
+KEY_SHA=0xdeadbeef1234567890abcdefabcdefabcdefabcd
+KEY_AES=0x0123456789abcdef0123456789012345
+SPI1=0x1
+SPI2=0x2
+
+do_esp() {
+ local ns=$1
+ local me=$2
+ local remote=$3
+ local lnet=$4
+ local rnet=$5
+ local spi_out=$6
+ local spi_in=$7
+
+ ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet
+ ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet
+
+ # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
+ ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 100 action allow
+ # to fwd decrypted packets after esp processing:
+ ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 100 action allow
+}
+
+do_esp_policy_get_check() {
+ local ns=$1
+ local lnet=$2
+ local rnet=$3
+
+ ip -net $ns xfrm policy get src $lnet dst $rnet dir out > /dev/null
+ if [ $? -ne 0 ] && [ $policy_checks_ok -eq 1 ] ;then
+ policy_checks_ok=0
+ echo "FAIL: ip -net $ns xfrm policy get src $lnet dst $rnet dir out"
+ ret=1
+ fi
+
+ ip -net $ns xfrm policy get src $rnet dst $lnet dir fwd > /dev/null
+ if [ $? -ne 0 ] && [ $policy_checks_ok -eq 1 ] ;then
+ policy_checks_ok=0
+ echo "FAIL: ip -net $ns xfrm policy get src $rnet dst $lnet dir fwd"
+ ret=1
+ fi
+}
+
+do_exception() {
+ local ns=$1
+ local me=$2
+ local remote=$3
+ local encryptip=$4
+ local plain=$5
+
+ # network $plain passes without tunnel
+ ip -net $ns xfrm policy add dst $plain dir out priority 10 action allow
+
+ # direct policy for $encryptip, use tunnel, higher prio takes precedence
+ ip -net $ns xfrm policy add dst $encryptip dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow
+}
+
+# policies that are not supposed to match any packets generated in this test.
+do_dummies4() {
+ local ns=$1
+
+ for i in $(seq 10 16);do
+ # dummy policy with wildcard src/dst.
+ echo netns exec $ns ip xfrm policy add src 0.0.0.0/0 dst 10.$i.99.0/30 dir out action block
+ echo netns exec $ns ip xfrm policy add src 10.$i.99.0/30 dst 0.0.0.0/0 dir out action block
+ for j in $(seq 32 64);do
+ echo netns exec $ns ip xfrm policy add src 10.$i.1.0/30 dst 10.$i.$j.0/30 dir out action block
+ # silly, as it encompasses the one above too, but its allowed:
+ echo netns exec $ns ip xfrm policy add src 10.$i.1.0/29 dst 10.$i.$j.0/29 dir out action block
+ # and yet again, even more broad one.
+ echo netns exec $ns ip xfrm policy add src 10.$i.1.0/24 dst 10.$i.$j.0/24 dir out action block
+ echo netns exec $ns ip xfrm policy add src 10.$i.$j.0/24 dst 10.$i.1.0/24 dir fwd action block
+ done
+ done | ip -batch /dev/stdin
+}
+
+do_dummies6() {
+ local ns=$1
+
+ for i in $(seq 10 16);do
+ for j in $(seq 32 64);do
+ echo netns exec $ns ip xfrm policy add src dead:$i::/64 dst dead:$i:$j::/64 dir out action block
+ echo netns exec $ns ip xfrm policy add src dead:$i:$j::/64 dst dead:$i::/24 dir fwd action block
+ done
+ done | ip -batch /dev/stdin
+}
+
+check_ipt_policy_count()
+{
+ ns=$1
+
+ ip netns exec $ns iptables-save -c |grep policy | ( read c rest
+ ip netns exec $ns iptables -Z
+ if [ x"$c" = x'[0:0]' ]; then
+ exit 0
+ elif [ x"$c" = x ]; then
+ echo "ERROR: No counters"
+ ret=1
+ exit 111
+ else
+ exit 1
+ fi
+ )
+}
+
+check_xfrm() {
+ # 0: iptables -m policy rule count == 0
+ # 1: iptables -m policy rule count != 0
+ rval=$1
+ ip=$2
+ lret=0
+
+ ip netns exec ns1 ping -q -c 1 10.0.2.$ip > /dev/null
+
+ check_ipt_policy_count ns3
+ if [ $? -ne $rval ] ; then
+ lret=1
+ fi
+ check_ipt_policy_count ns4
+ if [ $? -ne $rval ] ; then
+ lret=1
+ fi
+
+ ip netns exec ns2 ping -q -c 1 10.0.1.$ip > /dev/null
+
+ check_ipt_policy_count ns3
+ if [ $? -ne $rval ] ; then
+ lret=1
+ fi
+ check_ipt_policy_count ns4
+ if [ $? -ne $rval ] ; then
+ lret=1
+ fi
+
+ return $lret
+}
+
+#check for needed privileges
+if [ "$(id -u)" -ne 0 ];then
+ echo "SKIP: Need root privileges"
+ exit $ksft_skip
+fi
+
+ip -Version 2>/dev/null >/dev/null
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without the ip tool"
+ exit $ksft_skip
+fi
+
+# needed to check if policy lookup got valid ipsec result
+iptables --version 2>/dev/null >/dev/null
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without iptables tool"
+ exit $ksft_skip
+fi
+
+for i in 1 2 3 4; do
+ ip netns add ns$i
+ ip -net ns$i link set lo up
+done
+
+DEV=veth0
+ip link add $DEV netns ns1 type veth peer name eth1 netns ns3
+ip link add $DEV netns ns2 type veth peer name eth1 netns ns4
+
+ip link add $DEV netns ns3 type veth peer name veth0 netns ns4
+
+DEV=veth0
+for i in 1 2; do
+ ip -net ns$i link set $DEV up
+ ip -net ns$i addr add 10.0.$i.2/24 dev $DEV
+ ip -net ns$i addr add dead:$i::2/64 dev $DEV
+
+ ip -net ns$i addr add 10.0.$i.253 dev $DEV
+ ip -net ns$i addr add 10.0.$i.254 dev $DEV
+ ip -net ns$i addr add dead:$i::fd dev $DEV
+ ip -net ns$i addr add dead:$i::fe dev $DEV
+done
+
+for i in 3 4; do
+ip -net ns$i link set eth1 up
+ip -net ns$i link set veth0 up
+done
+
+ip -net ns1 route add default via 10.0.1.1
+ip -net ns2 route add default via 10.0.2.1
+
+ip -net ns3 addr add 10.0.1.1/24 dev eth1
+ip -net ns3 addr add 10.0.3.1/24 dev veth0
+ip -net ns3 addr add 2001:1::1/64 dev eth1
+ip -net ns3 addr add 2001:3::1/64 dev veth0
+
+ip -net ns3 route add default via 10.0.3.10
+
+ip -net ns4 addr add 10.0.2.1/24 dev eth1
+ip -net ns4 addr add 10.0.3.10/24 dev veth0
+ip -net ns4 addr add 2001:2::1/64 dev eth1
+ip -net ns4 addr add 2001:3::10/64 dev veth0
+ip -net ns4 route add default via 10.0.3.1
+
+for j in 4 6; do
+ for i in 3 4;do
+ ip netns exec ns$i sysctl net.ipv$j.conf.eth1.forwarding=1 > /dev/null
+ ip netns exec ns$i sysctl net.ipv$j.conf.veth0.forwarding=1 > /dev/null
+ done
+done
+
+# abuse iptables rule counter to check if ping matches a policy
+ip netns exec ns3 iptables -p icmp -A FORWARD -m policy --dir out --pol ipsec
+ip netns exec ns4 iptables -p icmp -A FORWARD -m policy --dir out --pol ipsec
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not insert iptables rule"
+ for i in 1 2 3 4;do ip netns del ns$i;done
+ exit $ksft_skip
+fi
+
+# localip remoteip localnet remotenet
+do_esp ns3 10.0.3.1 10.0.3.10 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2
+do_esp ns3 dead:3::1 dead:3::10 dead:1::/64 dead:2::/64 $SPI1 $SPI2
+do_esp ns4 10.0.3.10 10.0.3.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1
+do_esp ns4 dead:3::10 dead:3::1 dead:2::/64 dead:1::/64 $SPI2 $SPI1
+
+do_dummies4 ns3
+do_dummies6 ns4
+
+do_esp_policy_get_check ns3 10.0.1.0/24 10.0.2.0/24
+do_esp_policy_get_check ns4 10.0.2.0/24 10.0.1.0/24
+do_esp_policy_get_check ns3 dead:1::/64 dead:2::/64
+do_esp_policy_get_check ns4 dead:2::/64 dead:1::/64
+
+# ping to .254 should use ipsec, exception is not installed.
+check_xfrm 1 254
+if [ $? -ne 0 ]; then
+ echo "FAIL: expected ping to .254 to use ipsec tunnel"
+ ret=1
+else
+ echo "PASS: policy before exception matches"
+fi
+
+# installs exceptions
+# localip remoteip encryptdst plaindst
+do_exception ns3 10.0.3.1 10.0.3.10 10.0.2.253 10.0.2.240/28
+do_exception ns4 10.0.3.10 10.0.3.1 10.0.1.253 10.0.1.240/28
+
+do_exception ns3 dead:3::1 dead:3::10 dead:2::fd dead:2:f0::/96
+do_exception ns4 dead:3::10 dead:3::1 dead:1::fd dead:1:f0::/96
+
+# ping to .254 should now be excluded from the tunnel
+check_xfrm 0 254
+if [ $? -ne 0 ]; then
+ echo "FAIL: expected ping to .254 to fail"
+ ret=1
+else
+ echo "PASS: ping to .254 bypassed ipsec tunnel"
+fi
+
+# ping to .253 should use use ipsec due to direct policy exception.
+check_xfrm 1 253
+if [ $? -ne 0 ]; then
+ echo "FAIL: expected ping to .253 to use ipsec tunnel"
+ ret=1
+else
+ echo "PASS: direct policy matches"
+fi
+
+# ping to .2 should use ipsec.
+check_xfrm 1 2
+if [ $? -ne 0 ]; then
+ echo "FAIL: expected ping to .2 to use ipsec tunnel"
+ ret=1
+else
+ echo "PASS: policy matches"
+fi
+
+for i in 1 2 3 4;do ip netns del ns$i;done
+
+exit $ret