summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/bpf.h28
-rw-r--r--include/linux/bpf_types.h6
-rw-r--r--include/linux/filter.h16
-rw-r--r--include/net/addrconf.h1
-rw-r--r--include/net/sock_reuseport.h19
-rw-r--r--include/net/tcp.h30
-rw-r--r--include/uapi/linux/bpf.h37
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/bpf/reuseport_array.c363
-rw-r--r--kernel/bpf/syscall.c6
-rw-r--r--kernel/bpf/verifier.c9
-rw-r--r--net/core/filter.c354
-rw-r--r--net/core/sock_reuseport.c92
-rw-r--r--net/ipv4/inet_connection_sock.c9
-rw-r--r--net/ipv4/inet_hashtables.c19
-rw-r--r--net/ipv4/udp.c9
-rw-r--r--net/ipv6/inet6_hashtables.c14
-rw-r--r--net/ipv6/udp.c4
-rw-r--r--tools/include/uapi/linux/bpf.h37
-rw-r--r--tools/lib/bpf/bpf.c1
-rw-r--r--tools/lib/bpf/bpf.h1
-rw-r--r--tools/lib/bpf/libbpf.c1
-rw-r--r--tools/testing/selftests/bpf/Makefile4
-rw-r--r--tools/testing/selftests/bpf/bpf_helpers.h4
-rw-r--r--tools/testing/selftests/bpf/bpf_util.h4
-rw-r--r--tools/testing/selftests/bpf/test_align.c5
-rw-r--r--tools/testing/selftests/bpf/test_btf.c5
-rw-r--r--tools/testing/selftests/bpf/test_maps.c262
-rw-r--r--tools/testing/selftests/bpf/test_select_reuseport.c688
-rw-r--r--tools/testing/selftests/bpf/test_select_reuseport_common.h36
-rw-r--r--tools/testing/selftests/bpf/test_select_reuseport_kern.c180
-rw-r--r--tools/testing/selftests/bpf/test_sock.c5
-rw-r--r--tools/testing/selftests/bpf/test_sock_addr.c5
-rw-r--r--tools/testing/selftests/bpf/test_verifier.c5
35 files changed, 2167 insertions, 97 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd8790d2c6ed..db11662faea6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
}
struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
+int array_map_alloc_check(union bpf_attr *attr);
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
@@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map)
}
#endif
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+ void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags);
+#else
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
+{
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
+ void *key, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 map_flags)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_SYSCALL */
+#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */
+
/* verifier prototypes for helper functions called from eBPF programs */
extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index add08be53b6f..cd26c090e7c0 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
#ifdef CONFIG_BPF_LIRC_MODE2
BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
#endif
+#ifdef CONFIG_INET
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
+#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
@@ -60,4 +63,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
#if defined(CONFIG_XDP_SOCKETS)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
#endif
+#ifdef CONFIG_INET
+BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
+#endif
#endif
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2b072dab32c0..5d565c50bcb2 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -32,6 +32,7 @@ struct seccomp_data;
struct bpf_prog_aux;
struct xdp_rxq_info;
struct xdp_buff;
+struct sock_reuseport;
/* ArgX, context and stack frame pointer register positions. Note,
* Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -752,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
+void sk_reuseport_prog_free(struct bpf_prog *prog);
int sk_detach_filter(struct sock *sk);
int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
unsigned int len);
@@ -833,6 +835,20 @@ void bpf_warn_invalid_xdp_action(u32 act);
struct sock *do_sk_redirect_map(struct sk_buff *skb);
struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
+#ifdef CONFIG_INET
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ u32 hash);
+#else
+static inline struct sock *
+bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ u32 hash)
+{
+ return NULL;
+}
+#endif
+
#ifdef CONFIG_BPF_JIT
extern int bpf_jit_enable;
extern int bpf_jit_harden;
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 5f43f7a70fe6..6def0351bcc3 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
u32 banned_flags);
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
bool match_wildcard);
+bool inet_rcv_saddr_any(const struct sock *sk);
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 0054b3a9b923..8a5f70c7cdf2 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -5,25 +5,36 @@
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/types.h>
+#include <linux/spinlock.h>
#include <net/sock.h>
+extern spinlock_t reuseport_lock;
+
struct sock_reuseport {
struct rcu_head rcu;
u16 max_socks; /* length of socks */
u16 num_socks; /* elements in socks */
+ /* The last synq overflow event timestamp of this
+ * reuse->socks[] group.
+ */
+ unsigned int synq_overflow_ts;
+ /* ID stays the same even after the size of socks[] grows. */
+ unsigned int reuseport_id;
+ bool bind_inany;
struct bpf_prog __rcu *prog; /* optional BPF sock selector */
struct sock *socks[0]; /* array of sock pointers */
};
-extern int reuseport_alloc(struct sock *sk);
-extern int reuseport_add_sock(struct sock *sk, struct sock *sk2);
+extern int reuseport_alloc(struct sock *sk, bool bind_inany);
+extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
+ bool bind_inany);
extern void reuseport_detach_sock(struct sock *sk);
extern struct sock *reuseport_select_sock(struct sock *sk,
u32 hash,
struct sk_buff *skb,
int hdr_len);
-extern struct bpf_prog *reuseport_attach_prog(struct sock *sk,
- struct bpf_prog *prog);
+extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
+int reuseport_get_id(struct sock_reuseport *reuse);
#endif /* _SOCK_REUSEPORT_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d769dc20359b..d196901c9dba 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -36,6 +36,7 @@
#include <net/inet_hashtables.h>
#include <net/checksum.h>
#include <net/request_sock.h>
+#include <net/sock_reuseport.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
@@ -473,9 +474,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
*/
static inline void tcp_synq_overflow(const struct sock *sk)
{
- unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+ unsigned int last_overflow;
unsigned int now = jiffies;
+ if (sk->sk_reuseport) {
+ struct sock_reuseport *reuse;
+
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+ if (likely(reuse)) {
+ last_overflow = READ_ONCE(reuse->synq_overflow_ts);
+ if (time_after32(now, last_overflow + HZ))
+ WRITE_ONCE(reuse->synq_overflow_ts, now);
+ return;
+ }
+ }
+
+ last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
if (time_after32(now, last_overflow + HZ))
tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
}
@@ -483,9 +497,21 @@ static inline void tcp_synq_overflow(const struct sock *sk)
/* syncookies: no recent synqueue overflow on this listening socket? */
static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
{
- unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+ unsigned int last_overflow;
unsigned int now = jiffies;
+ if (sk->sk_reuseport) {
+ struct sock_reuseport *reuse;
+
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+ if (likely(reuse)) {
+ last_overflow = READ_ONCE(reuse->synq_overflow_ts);
+ return time_after32(now, last_overflow +
+ TCP_SYNCOOKIE_VALID);
+ }
+ }
+
+ last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID);
}
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dd5758dc35d3..3102a2a23c31 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -126,6 +126,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
+ BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
};
enum bpf_prog_type {
@@ -150,6 +151,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2,
+ BPF_PROG_TYPE_SK_REUSEPORT,
};
enum bpf_attach_type {
@@ -2113,6 +2115,14 @@ union bpf_attr {
* the shared data.
* Return
* Pointer to the local storage area.
+ *
+ * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map
+ * It checks the selected sk is matching the incoming
+ * request in the skb.
+ * Return
+ * 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -2196,7 +2206,8 @@ union bpf_attr {
FN(rc_keydown), \
FN(skb_cgroup_id), \
FN(get_current_cgroup_id), \
- FN(get_local_storage),
+ FN(get_local_storage), \
+ FN(sk_select_reuseport),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -2413,6 +2424,30 @@ struct sk_msg_md {
__u32 local_port; /* stored in host byte order */
};
+struct sk_reuseport_md {
+ /*
+ * Start of directly accessible data. It begins from
+ * the tcp/udp header.
+ */
+ void *data;
+ void *data_end; /* End of directly accessible data */
+ /*
+ * Total length of packet (starting from the tcp/udp header).
+ * Note that the directly accessible bytes (data_end - data)
+ * could be less than this "len". Those bytes could be
+ * indirectly read by a helper "bpf_skb_load_bytes()".
+ */
+ __u32 len;
+ /*
+ * Eth protocol in the mac header (network byte order). e.g.
+ * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
+ */
+ __u32 eth_protocol;
+ __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+ __u32 bind_inany; /* Is sock bound to an INANY address? */
+ __u32 hash; /* A hash of the packet 4 tuples */
+};
+
#define BPF_TAG_SIZE 8
struct bpf_prog_info {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e8906cbad81f..0488b8258321 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2aa55d030c77..f6ca3e712831 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
}
/* Called from syscall */
-static int array_map_alloc_check(union bpf_attr *attr)
+int array_map_alloc_check(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
new file mode 100644
index 000000000000..18e225de80ff
--- /dev/null
+++ b/kernel/bpf/reuseport_array.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Facebook
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/sock_diag.h>
+#include <net/sock_reuseport.h>
+
+struct reuseport_array {
+ struct bpf_map map;
+ struct sock __rcu *ptrs[];
+};
+
+static struct reuseport_array *reuseport_array(struct bpf_map *map)
+{
+ return (struct reuseport_array *)map;
+}
+
+/* The caller must hold the reuseport_lock */
+void bpf_sk_reuseport_detach(struct sock *sk)
+{
+ struct sock __rcu **socks;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ socks = sk->sk_user_data;
+ if (socks) {
+ WRITE_ONCE(sk->sk_user_data, NULL);
+ /*
+ * Do not move this NULL assignment outside of
+ * sk->sk_callback_lock because there is
+ * a race with reuseport_array_free()
+ * which does not hold the reuseport_lock.
+ */
+ RCU_INIT_POINTER(*socks, NULL);
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int reuseport_array_alloc_check(union bpf_attr *attr)
+{
+ if (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64))
+ return -EINVAL;
+
+ return array_map_alloc_check(attr);
+}
+
+static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return rcu_dereference(array->ptrs[index]);
+}
+
+/* Called from syscall only */
+static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ u32 index = *(u32 *)key;
+ struct sock *sk;
+ int err;
+
+ if (index >= map->max_entries)
+ return -E2BIG;
+
+ if (!rcu_access_pointer(array->ptrs[index]))
+ return -ENOENT;
+
+ spin_lock_bh(&reuseport_lock);
+
+ sk = rcu_dereference_protected(array->ptrs[index],
+ lockdep_is_held(&reuseport_lock));
+ if (sk) {
+ write_lock_bh(&sk->sk_callback_lock);
+ WRITE_ONCE(sk->sk_user_data, NULL);
+ RCU_INIT_POINTER(array->ptrs[index], NULL);
+ write_unlock_bh(&sk->sk_callback_lock);
+ err = 0;
+ } else {
+ err = -ENOENT;
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+
+ return err;
+}
+
+static void reuseport_array_free(struct bpf_map *map)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ struct sock *sk;
+ u32 i;
+
+ synchronize_rcu();
+
+ /*
+ * ops->map_*_elem() will not be able to access this
+ * array now. Hence, this function only races with
+ * bpf_sk_reuseport_detach() which was triggerred by
+ * close() or disconnect().
+ *
+ * This function and bpf_sk_reuseport_detach() are
+ * both removing sk from "array". Who removes it
+ * first does not matter.
+ *
+ * The only concern here is bpf_sk_reuseport_detach()
+ * may access "array" which is being freed here.
+ * bpf_sk_reuseport_detach() access this "array"
+ * through sk->sk_user_data _and_ with sk->sk_callback_lock
+ * held which is enough because this "array" is not freed
+ * until all sk->sk_user_data has stopped referencing this "array".
+ *
+ * Hence, due to the above, taking "reuseport_lock" is not
+ * needed here.
+ */
+
+ /*
+ * Since reuseport_lock is not taken, sk is accessed under
+ * rcu_read_lock()
+ */
+ rcu_read_lock();
+ for (i = 0; i < map->max_entries; i++) {
+ sk = rcu_dereference(array->ptrs[i]);
+ if (sk) {
+ write_lock_bh(&sk->sk_callback_lock);
+ /*
+ * No need for WRITE_ONCE(). At this point,
+ * no one is reading it without taking the
+ * sk->sk_callback_lock.
+ */
+ sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
+ RCU_INIT_POINTER(array->ptrs[i], NULL);
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Once reaching here, all sk->sk_user_data is not
+ * referenceing this "array". "array" can be freed now.
+ */
+ bpf_map_area_free(array);
+}
+
+static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
+{
+ int err, numa_node = bpf_map_attr_numa_node(attr);
+ struct reuseport_array *array;
+ u64 cost, array_size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ array_size = sizeof(*array);
+ array_size += (u64)attr->max_entries * sizeof(struct sock *);
+
+ /* make sure there is no u32 overflow later in round_up() */
+ cost = array_size;
+ if (cost >= U32_MAX - PAGE_SIZE)
+ return ERR_PTR(-ENOMEM);
+ cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ err = bpf_map_precharge_memlock(cost);
+ if (err)
+ return ERR_PTR(err);
+
+ /* allocate all map elements and zero-initialize them */
+ array = bpf_map_area_alloc(array_size, numa_node);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+
+ /* copy mandatory map attributes */
+ bpf_map_init_from_attr(&array->map, attr);
+ array->map.pages = cost;
+
+ return &array->map;
+}
+
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+ void *value)
+{
+ struct sock *sk;
+ int err;
+
+ if (map->value_size != sizeof(u64))
+ return -ENOSPC;
+
+ rcu_read_lock();
+ sk = reuseport_array_lookup_elem(map, key);
+ if (sk) {
+ *(u64 *)value = sock_gen_cookie(sk);
+ err = 0;
+ } else {
+ err = -ENOENT;
+ }
+ rcu_read_unlock();
+
+ return err;
+}
+
+static int
+reuseport_array_update_check(const struct reuseport_array *array,
+ const struct sock *nsk,
+ const struct sock *osk,
+ const struct sock_reuseport *nsk_reuse,
+ u32 map_flags)
+{
+ if (osk && map_flags == BPF_NOEXIST)
+ return -EEXIST;
+
+ if (!osk && map_flags == BPF_EXIST)
+ return -ENOENT;
+
+ if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP)
+ return -ENOTSUPP;
+
+ if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6)
+ return -ENOTSUPP;
+
+ if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM)
+ return -ENOTSUPP;
+
+ /*
+ * sk must be hashed (i.e. listening in the TCP case or binded
+ * in the UDP case) and
+ * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL).
+ *
+ * Also, sk will be used in bpf helper that is protected by
+ * rcu_read_lock().
+ */
+ if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse)
+ return -EINVAL;
+
+ /* READ_ONCE because the sk->sk_callback_lock may not be held here */
+ if (READ_ONCE(nsk->sk_user_data))
+ return -EBUSY;
+
+ return 0;
+}
+
+/*
+ * Called from syscall only.
+ * The "nsk" in the fd refcnt.
+ * The "osk" and "reuse" are protected by reuseport_lock.
+ */
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ struct sock *free_osk = NULL, *osk, *nsk;
+ struct sock_reuseport *reuse;
+ u32 index = *(u32 *)key;
+ struct socket *socket;
+ int err, fd;
+
+ if (map_flags > BPF_EXIST)
+ return -EINVAL;
+
+ if (index >= map->max_entries)
+ return -E2BIG;
+
+ if (map->value_size == sizeof(u64)) {
+ u64 fd64 = *(u64 *)value;
+
+ if (fd64 > S32_MAX)
+ return -EINVAL;
+ fd = fd64;
+ } else {
+ fd = *(int *)value;
+ }
+
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ nsk = socket->sk;
+ if (!nsk) {
+ err = -EINVAL;
+ goto put_file;
+ }
+
+ /* Quick checks before taking reuseport_lock */
+ err = reuseport_array_update_check(array, nsk,
+ rcu_access_pointer(array->ptrs[index]),
+ rcu_access_pointer(nsk->sk_reuseport_cb),
+ map_flags);
+ if (err)
+ goto put_file;
+
+ spin_lock_bh(&reuseport_lock);
+ /*
+ * Some of the checks only need reuseport_lock
+ * but it is done under sk_callback_lock also
+ * for simplicity reason.
+ */
+ write_lock_bh(&nsk->sk_callback_lock);
+
+ osk = rcu_dereference_protected(array->ptrs[index],
+ lockdep_is_held(&reuseport_lock));
+ reuse = rcu_dereference_protected(nsk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags);
+ if (err)
+ goto put_file_unlock;
+
+ /* Ensure reuse->reuseport_id is set */
+ err = reuseport_get_id(reuse);
+ if (err < 0)
+ goto put_file_unlock;
+
+ WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]);
+ rcu_assign_pointer(array->ptrs[index], nsk);
+ free_osk = osk;
+ err = 0;
+
+put_file_unlock:
+ write_unlock_bh(&nsk->sk_callback_lock);
+
+ if (free_osk) {
+ write_lock_bh(&free_osk->sk_callback_lock);
+ WRITE_ONCE(free_osk->sk_user_data, NULL);
+ write_unlock_bh(&free_osk->sk_callback_lock);
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+put_file:
+ fput(socket->file);
+ return err;
+}
+
+/* Called from syscall */
+static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+const struct bpf_map_ops reuseport_array_ops = {
+ .map_alloc_check = reuseport_array_alloc_check,
+ .map_alloc = reuseport_array_alloc,
+ .map_free = reuseport_array_free,
+ .map_lookup_elem = reuseport_array_lookup_elem,
+ .map_get_next_key = reuseport_array_get_next_key,
+ .map_delete_elem = reuseport_array_delete_elem,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5af4e9e2722d..57f4d076141b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -684,6 +684,8 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
err = bpf_fd_htab_map_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
@@ -790,6 +792,10 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
attr->flags);
rcu_read_unlock();
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ /* rcu_read_lock() is not needed */
+ err = bpf_fd_reuseport_array_update_elem(map, key, value,
+ attr->flags);
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 587468a9c37d..ca90679a7fe5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ case BPF_PROG_TYPE_SK_REUSEPORT:
/* dst_input() and dst_output() can't write for now */
if (t == BPF_WRITE)
return false;
@@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_msg_redirect_hash)
goto error;
break;
+ case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+ if (func_id != BPF_FUNC_sk_select_reuseport)
+ goto error;
+ break;
default:
break;
}
@@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
goto error;
break;
+ case BPF_FUNC_sk_select_reuseport:
+ if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
+ goto error;
+ break;
default:
break;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 2de7dd9f2a57..22906b31d43f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1453,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
return 0;
}
-static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
-{
- struct bpf_prog *old_prog;
- int err;
-
- if (bpf_prog_size(prog->len) > sysctl_optmem_max)
- return -ENOMEM;
-
- if (sk_unhashed(sk) && sk->sk_reuseport) {
- err = reuseport_alloc(sk);
- if (err)
- return err;
- } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
- /* The socket wasn't bound with SO_REUSEPORT */
- return -EINVAL;
- }
-
- old_prog = reuseport_attach_prog(sk, prog);
- if (old_prog)
- bpf_prog_destroy(old_prog);
-
- return 0;
-}
-
static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
@@ -1550,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = __reuseport_attach_prog(prog, sk);
- if (err < 0) {
+ if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+ err = -ENOMEM;
+ else
+ err = reuseport_attach_prog(sk, prog);
+
+ if (err)
__bpf_prog_release(prog);
- return err;
- }
- return 0;
+ return err;
}
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
@@ -1586,19 +1564,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
- struct bpf_prog *prog = __get_bpf(ufd, sk);
+ struct bpf_prog *prog;
int err;
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL)
+ prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = __reuseport_attach_prog(prog, sk);
- if (err < 0) {
- bpf_prog_put(prog);
- return err;
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
+ /* Like other non BPF_PROG_TYPE_SOCKET_FILTER
+ * bpf prog (e.g. sockmap). It depends on the
+ * limitation imposed by bpf_prog_load().
+ * Hence, sysctl_optmem_max is not checked.
+ */
+ if ((sk->sk_type != SOCK_STREAM &&
+ sk->sk_type != SOCK_DGRAM) ||
+ (sk->sk_protocol != IPPROTO_UDP &&
+ sk->sk_protocol != IPPROTO_TCP) ||
+ (sk->sk_family != AF_INET &&
+ sk->sk_family != AF_INET6)) {
+ err = -ENOTSUPP;
+ goto err_prog_put;
+ }
+ } else {
+ /* BPF_PROG_TYPE_SOCKET_FILTER */
+ if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
+ err = -ENOMEM;
+ goto err_prog_put;
+ }
}
- return 0;
+ err = reuseport_attach_prog(sk, prog);
+err_prog_put:
+ if (err)
+ bpf_prog_put(prog);
+
+ return err;
+}
+
+void sk_reuseport_prog_free(struct bpf_prog *prog)
+{
+ if (!prog)
+ return;
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ bpf_prog_put(prog);
+ else
+ bpf_prog_destroy(prog);
}
struct bpf_scratchpad {
@@ -7013,3 +7030,270 @@ out:
release_sock(sk);
return ret;
}
+
+#ifdef CONFIG_INET
+struct sk_reuseport_kern {
+ struct sk_buff *skb;
+ struct sock *sk;
+ struct sock *selected_sk;
+ void *data_end;
+ u32 hash;
+ u32 reuseport_id;
+ bool bind_inany;
+};
+
+static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
+ struct sock_reuseport *reuse,
+ struct sock *sk, struct sk_buff *skb,
+ u32 hash)
+{
+ reuse_kern->skb = skb;
+ reuse_kern->sk = sk;
+ reuse_kern->selected_sk = NULL;
+ reuse_kern->data_end = skb->data + skb_headlen(skb);
+ reuse_kern->hash = hash;
+ reuse_kern->reuseport_id = reuse->reuseport_id;
+ reuse_kern->bind_inany = reuse->bind_inany;
+}
+
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ u32 hash)
+{
+ struct sk_reuseport_kern reuse_kern;
+ enum sk_action action;
+
+ bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+ action = BPF_PROG_RUN(prog, &reuse_kern);
+
+ if (action == SK_PASS)
+ return reuse_kern.selected_sk;
+ else
+ return ERR_PTR(-ECONNREFUSED);
+}
+
+BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
+ struct bpf_map *, map, void *, key, u32, flags)
+{
+ struct sock_reuseport *reuse;
+ struct sock *selected_sk;
+
+ selected_sk = map->ops->map_lookup_elem(map, key);
+ if (!selected_sk)
+ return -ENOENT;
+
+ reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
+ if (!reuse)
+ /* selected_sk is unhashed (e.g. by close()) after the
+ * above map_lookup_elem(). Treat selected_sk has already
+ * been removed from the map.
+ */
+ return -ENOENT;
+
+ if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
+ struct sock *sk;
+
+ if (unlikely(!reuse_kern->reuseport_id))
+ /* There is a small race between adding the
+ * sk to the map and setting the
+ * reuse_kern->reuseport_id.
+ * Treat it as the sk has not been added to
+ * the bpf map yet.
+ */
+ return -ENOENT;
+
+ sk = reuse_kern->sk;
+ if (sk->sk_protocol != selected_sk->sk_protocol)
+ return -EPROTOTYPE;
+ else if (sk->sk_family != selected_sk->sk_family)
+ return -EAFNOSUPPORT;
+
+ /* Catch all. Likely bound to a different sockaddr. */
+ return -EBADFD;
+ }
+
+ reuse_kern->selected_sk = selected_sk;
+
+ return 0;
+}
+
+static const struct bpf_func_proto sk_select_reuseport_proto = {
+ .func = sk_select_reuseport,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(sk_reuseport_load_bytes,
+ const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+ void *, to, u32, len)
+{
+ return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
+ .func = sk_reuseport_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(sk_reuseport_load_bytes_relative,
+ const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+ void *, to, u32, len, u32, start_header)
+{
+ return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
+ len, start_header);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
+ .func = sk_reuseport_load_bytes_relative,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_reuseport_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_sk_select_reuseport:
+ return &sk_select_reuseport_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &sk_reuseport_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &sk_reuseport_load_bytes_relative_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static bool
+sk_reuseport_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const u32 size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
+ off % size || type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case offsetof(struct sk_reuseport_md, data):
+ info->reg_type = PTR_TO_PACKET;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, hash):
+ return size == size_default;
+
+ /* Fields that allow narrowing */
+ case offsetof(struct sk_reuseport_md, eth_protocol):
+ if (size < FIELD_SIZEOF(struct sk_buff, protocol))
+ return false;
+ case offsetof(struct sk_reuseport_md, ip_protocol):
+ case offsetof(struct sk_reuseport_md, bind_inany):
+ case offsetof(struct sk_reuseport_md, len):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+
+ default:
+ return false;
+ }
+}
+
+#define SK_REUSEPORT_LOAD_FIELD(F) ({ \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+ si->dst_reg, si->src_reg, \
+ bpf_target_off(struct sk_reuseport_kern, F, \
+ FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+ target_size)); \
+ })
+
+#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
+ struct sk_buff, \
+ skb, \
+ SKB_FIELD)
+
+#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \
+ struct sock, \
+ sk, \
+ SK_FIELD, BPF_SIZE, EXTRA_OFF)
+
+static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct sk_reuseport_md, data):
+ SK_REUSEPORT_LOAD_SKB_FIELD(data);
+ break;
+
+ case offsetof(struct sk_reuseport_md, len):
+ SK_REUSEPORT_LOAD_SKB_FIELD(len);
+ break;
+
+ case offsetof(struct sk_reuseport_md, eth_protocol):
+ SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
+ break;
+
+ case offsetof(struct sk_reuseport_md, ip_protocol):
+ BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
+ SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset,
+ BPF_W, 0);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
+ SK_FL_PROTO_SHIFT);
+ /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian
+ * aware. No further narrowing or masking is needed.
+ */
+ *target_size = 1;
+ break;
+
+ case offsetof(struct sk_reuseport_md, data_end):
+ SK_REUSEPORT_LOAD_FIELD(data_end);
+ break;
+
+ case offsetof(struct sk_reuseport_md, hash):
+ SK_REUSEPORT_LOAD_FIELD(hash);
+ break;
+
+ case offsetof(struct sk_reuseport_md, bind_inany):
+ SK_REUSEPORT_LOAD_FIELD(bind_inany);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
+ .get_func_proto = sk_reuseport_func_proto,
+ .is_valid_access = sk_reuseport_is_valid_access,
+ .convert_ctx_access = sk_reuseport_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_reuseport_prog_ops = {
+};
+#endif /* CONFIG_INET */
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 064acb04be0f..ba5cba56f574 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -8,11 +8,34 @@
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
+#include <linux/idr.h>
+#include <linux/filter.h>
#include <linux/rcupdate.h>
#define INIT_SOCKS 128
-static DEFINE_SPINLOCK(reuseport_lock);
+DEFINE_SPINLOCK(reuseport_lock);
+
+#define REUSEPORT_MIN_ID 1
+static DEFINE_IDA(reuseport_ida);
+
+int reuseport_get_id(struct sock_reuseport *reuse)
+{
+ int id;
+
+ if (reuse->reuseport_id)
+ return reuse->reuseport_id;
+
+ id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
+ /* Called under reuseport_lock */
+ GFP_ATOMIC);
+ if (id < 0)
+ return id;
+
+ reuse->reuseport_id = id;
+
+ return reuse->reuseport_id;
+}
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
@@ -29,7 +52,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
return reuse;
}
-int reuseport_alloc(struct sock *sk)
+int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
@@ -41,9 +64,17 @@ int reuseport_alloc(struct sock *sk)
/* Allocation attempts can occur concurrently via the setsockopt path
* and the bind/hash path. Nothing to do when we lose the race.
*/
- if (rcu_dereference_protected(sk->sk_reuseport_cb,
- lockdep_is_held(&reuseport_lock)))
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (reuse) {
+ /* Only set reuse->bind_inany if the bind_inany is true.
+ * Otherwise, it will overwrite the reuse->bind_inany
+ * which was set by the bind/hash path.
+ */
+ if (bind_inany)
+ reuse->bind_inany = bind_inany;
goto out;
+ }
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) {
@@ -53,6 +84,7 @@ int reuseport_alloc(struct sock *sk)
reuse->socks[0] = sk;
reuse->num_socks = 1;
+ reuse->bind_inany = bind_inany;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
@@ -78,9 +110,12 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
more_reuse->prog = reuse->prog;
+ more_reuse->reuseport_id = reuse->reuseport_id;
+ more_reuse->bind_inany = reuse->bind_inany;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
+ more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
for (i = 0; i < reuse->num_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
@@ -99,8 +134,9 @@ static void reuseport_free_rcu(struct rcu_head *head)
struct sock_reuseport *reuse;
reuse = container_of(head, struct sock_reuseport, rcu);
- if (reuse->prog)
- bpf_prog_destroy(reuse->prog);
+ sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
+ if (reuse->reuseport_id)
+ ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
kfree(reuse);
}
@@ -110,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head)
* @sk2: Socket belonging to the existing reuseport group.
* May return ENOMEM and not add socket to group under memory pressure.
*/
-int reuseport_add_sock(struct sock *sk, struct sock *sk2)
+int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
struct sock_reuseport *old_reuse, *reuse;
if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
- int err = reuseport_alloc(sk2);
+ int err = reuseport_alloc(sk2, bind_inany);
if (err)
return err;
@@ -160,6 +196,14 @@ void reuseport_detach_sock(struct sock *sk)
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
+
+ /* At least one of the sk in this reuseport group is added to
+ * a bpf map. Notify the bpf side. The bpf map logic will
+ * remove the sk if it is indeed added to a bpf map.
+ */
+ if (reuse->reuseport_id)
+ bpf_sk_reuseport_detach(sk);
+
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
for (i = 0; i < reuse->num_socks; i++) {
@@ -175,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk)
}
EXPORT_SYMBOL(reuseport_detach_sock);
-static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
- struct bpf_prog *prog, struct sk_buff *skb,
- int hdr_len)
+static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ int hdr_len)
{
struct sk_buff *nskb = NULL;
u32 index;
@@ -238,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk,
/* paired with smp_wmb() in reuseport_add_sock() */
smp_rmb();
- if (prog && skb)
- sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+ if (!prog || !skb)
+ goto select_by_hash;
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+ else
+ sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
+select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
if (!sk2)
sk2 = reuse->socks[reciprocal_scale(hash, socks)];
@@ -252,12 +302,21 @@ out:
}
EXPORT_SYMBOL(reuseport_select_sock);
-struct bpf_prog *
-reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
+ if (sk_unhashed(sk) && sk->sk_reuseport) {
+ int err = reuseport_alloc(sk, false);
+
+ if (err)
+ return err;
+ } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+ /* The socket wasn't bound with SO_REUSEPORT */
+ return -EINVAL;
+ }
+
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
@@ -266,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
rcu_assign_pointer(reuse->prog, prog);
spin_unlock_bh(&reuseport_lock);
- return old_prog;
+ sk_reuseport_prog_free(old_prog);
+ return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 33a88e045efd..dfd5009f96ef 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
}
EXPORT_SYMBOL(inet_rcv_saddr_equal);
+bool inet_rcv_saddr_any(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#endif
+ return !sk->sk_rcv_saddr;
+}
+
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3647167c8fa3..f5c9ef2586de 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net,
saddr, sport, daddr, hnum,
dif, sdif);
if (result)
- return result;
+ goto done;
/* Lookup lhash2 with INADDR_ANY */
@@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net,
if (ilb2->count > ilb->count)
goto port_lookup;
- return inet_lhash2_lookup(net, ilb2, skb, doff,
- saddr, sport, daddr, hnum,
- dif, sdif);
+ result = inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ goto done;
port_lookup:
sk_for_each_rcu(sk, &ilb->head) {
@@ -352,12 +353,15 @@ port_lookup:
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
- return result;
+ goto done;
}
result = sk;
hiscore = score;
}
}
+done:
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -567,10 +571,11 @@ static int inet_reuseport_add_sock(struct sock *sk,
inet_csk(sk2)->icsk_bind_hash == tb &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false))
- return reuseport_add_sock(sk, sk2);
+ return reuseport_add_sock(sk, sk2,
+ inet_rcv_saddr_any(sk));
}
- return reuseport_alloc(sk);
+ return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
int __inet_hash(struct sock *sk, struct sock *osk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 060e841dde40..f4e35b2ff8b8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false)) {
- return reuseport_add_sock(sk, sk2);
+ return reuseport_add_sock(sk, sk2,
+ inet_rcv_saddr_any(sk));
}
}
- return reuseport_alloc(sk);
+ return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
/**
@@ -498,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
}
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
begin:
@@ -512,6 +515,8 @@ begin:
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
+ if (unlikely(IS_ERR(result)))
+ return NULL;
if (result)
return result;
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 595ad408dba0..3d7c7460a0c5 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net,
saddr, sport, daddr, hnum,
dif, sdif);
if (result)
- return result;
+ goto done;
/* Lookup lhash2 with in6addr_any */
@@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net,
if (ilb2->count > ilb->count)
goto port_lookup;
- return inet6_lhash2_lookup(net, ilb2, skb, doff,
- saddr, sport, daddr, hnum,
- dif, sdif);
+ result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ goto done;
port_lookup:
sk_for_each(sk, &ilb->head) {
@@ -214,12 +215,15 @@ port_lookup:
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
- return result;
+ goto done;
}
result = sk;
hiscore = score;
}
}
+done:
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f6b96956a8ed..83f4c77c79d8 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
exact_dif, hslot2,
skb);
}
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
begin:
@@ -249,6 +251,8 @@ begin:
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
+ if (unlikely(IS_ERR(result)))
+ return NULL;
if (result)
return result;
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index dd5758dc35d3..3102a2a23c31 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -126,6 +126,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
+ BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
};
enum bpf_prog_type {
@@ -150,6 +151,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2,
+ BPF_PROG_TYPE_SK_REUSEPORT,
};
enum bpf_attach_type {
@@ -2113,6 +2115,14 @@ union bpf_attr {
* the shared data.
* Return
* Pointer to the local storage area.
+ *
+ * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map
+ * It checks the selected sk is matching the incoming
+ * request in the skb.
+ * Return
+ * 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -2196,7 +2206,8 @@ union bpf_attr {
FN(rc_keydown), \
FN(skb_cgroup_id), \
FN(get_current_cgroup_id), \
- FN(get_local_storage),
+ FN(get_local_storage), \
+ FN(sk_select_reuseport),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -2413,6 +2424,30 @@ struct sk_msg_md {
__u32 local_port; /* stored in host byte order */
};
+struct sk_reuseport_md {
+ /*
+ * Start of directly accessible data. It begins from
+ * the tcp/udp header.
+ */
+ void *data;
+ void *data_end; /* End of directly accessible data */
+ /*
+ * Total length of packet (starting from the tcp/udp header).
+ * Note that the directly accessible bytes (data_end - data)
+ * could be less than this "len". Those bytes could be
+ * indirectly read by a helper "bpf_skb_load_bytes()".
+ */
+ __u32 len;
+ /*
+ * Eth protocol in the mac header (network byte order). e.g.
+ * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
+ */
+ __u32 eth_protocol;
+ __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+ __u32 bind_inany; /* Is sock bound to an INANY address? */
+ __u32 hash; /* A hash of the packet 4 tuples */
+};
+
#define BPF_TAG_SIZE 8
struct bpf_prog_info {
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 9ddc89dae962..60aa4ca8b2c5 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -92,6 +92,7 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
attr.btf_key_type_id = create_attr->btf_key_type_id;
attr.btf_value_type_id = create_attr->btf_value_type_id;
attr.map_ifindex = create_attr->map_ifindex;
+ attr.inner_map_fd = create_attr->inner_map_fd;
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 0639a30a457d..6f38164b2618 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -39,6 +39,7 @@ struct bpf_create_map_attr {
__u32 btf_key_type_id;
__u32 btf_value_type_id;
__u32 map_ifindex;
+ __u32 inner_map_fd;
};
int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 40211b51427a..2abd0f112627 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1501,6 +1501,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_LIRC_MODE2:
+ case BPF_PROG_TYPE_SK_REUSEPORT:
return false;
case BPF_PROG_TYPE_UNSPEC:
case BPF_PROG_TYPE_KPROBE:
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 17a7a5818ee1..daed162043c2 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,7 +23,7 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \
- test_socket_cookie test_cgroup_storage
+ test_socket_cookie test_cgroup_storage test_select_reuseport
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \
@@ -34,7 +34,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
- get_cgroup_id_kern.o socket_cookie_prog.o
+ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 9ba1c72d7cf5..5c32266c2c38 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -111,6 +111,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
int size, int flags) =
(void *) BPF_FUNC_skb_get_xfrm_state;
+static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 flags) =
+ (void *) BPF_FUNC_sk_select_reuseport;
static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
(void *) BPF_FUNC_get_stack;
static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
@@ -173,6 +175,8 @@ struct bpf_map_def {
static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) =
(void *) BPF_FUNC_skb_load_bytes;
+static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) =
+ (void *) BPF_FUNC_skb_load_bytes_relative;
static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) =
(void *) BPF_FUNC_skb_store_bytes;
static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) =
diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h
index d0811b3d6a6f..315a44fa32af 100644
--- a/tools/testing/selftests/bpf/bpf_util.h
+++ b/tools/testing/selftests/bpf/bpf_util.h
@@ -44,4 +44,8 @@ static inline unsigned int bpf_num_possible_cpus(void)
name[bpf_num_possible_cpus()]
#define bpf_percpu(name, cpu) name[(cpu)].v
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
#endif /* __BPF_UTIL__ */
diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
index 6b1b302310fe..5f377ec53f2f 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -18,10 +18,7 @@
#include "../../../include/linux/filter.h"
#include "bpf_rlimit.h"
-
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
+#include "bpf_util.h"
#define MAX_INSNS 512
#define MAX_MATCHES 16
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index 7fa8c800c540..6b5cfeb7a9cc 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -19,6 +19,7 @@
#include <bpf/btf.h>
#include "bpf_rlimit.h"
+#include "bpf_util.h"
static uint32_t pass_cnt;
static uint32_t error_cnt;
@@ -93,10 +94,6 @@ static int __base_pr(const char *format, ...)
#define MAX_NR_RAW_TYPES 1024
#define BTF_LOG_BUF_SIZE 65535
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
static struct args {
unsigned int raw_test_num;
unsigned int file_test_num;
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 6c253343a6f9..4b7c74f5faa7 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -17,7 +17,8 @@
#include <stdlib.h>
#include <sys/wait.h>
-
+#include <sys/socket.h>
+#include <netinet/in.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
@@ -26,8 +27,21 @@
#include "bpf_util.h"
#include "bpf_rlimit.h"
+#ifndef ENOTSUPP
+#define ENOTSUPP 524
+#endif
+
static int map_flags;
+#define CHECK(condition, tag, format...) ({ \
+ int __ret = !!(condition); \
+ if (__ret) { \
+ printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \
+ printf(format); \
+ exit(-1); \
+ } \
+})
+
static void test_hashmap(int task, void *data)
{
long long key, next_key, first_key, value;
@@ -1150,6 +1164,250 @@ static void test_map_wronly(void)
assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM);
}
+static void prepare_reuseport_grp(int type, int map_fd,
+ __s64 *fds64, __u64 *sk_cookies,
+ unsigned int n)
+{
+ socklen_t optlen, addrlen;
+ struct sockaddr_in6 s6;
+ const __u32 index0 = 0;
+ const int optval = 1;
+ unsigned int i;
+ u64 sk_cookie;
+ __s64 fd64;
+ int err;
+
+ s6.sin6_family = AF_INET6;
+ s6.sin6_addr = in6addr_any;
+ s6.sin6_port = 0;
+ addrlen = sizeof(s6);
+ optlen = sizeof(sk_cookie);
+
+ for (i = 0; i < n; i++) {
+ fd64 = socket(AF_INET6, type, 0);
+ CHECK(fd64 == -1, "socket()",
+ "sock_type:%d fd64:%lld errno:%d\n",
+ type, fd64, errno);
+
+ err = setsockopt(fd64, SOL_SOCKET, SO_REUSEPORT,
+ &optval, sizeof(optval));
+ CHECK(err == -1, "setsockopt(SO_REUSEEPORT)",
+ "err:%d errno:%d\n", err, errno);
+
+ /* reuseport_array does not allow unbound sk */
+ err = bpf_map_update_elem(map_fd, &index0, &fd64,
+ BPF_ANY);
+ CHECK(err != -1 || errno != EINVAL,
+ "reuseport array update unbound sk",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+
+ err = bind(fd64, (struct sockaddr *)&s6, sizeof(s6));
+ CHECK(err == -1, "bind()",
+ "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+ if (i == 0) {
+ err = getsockname(fd64, (struct sockaddr *)&s6,
+ &addrlen);
+ CHECK(err == -1, "getsockname()",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+ }
+
+ err = getsockopt(fd64, SOL_SOCKET, SO_COOKIE, &sk_cookie,
+ &optlen);
+ CHECK(err == -1, "getsockopt(SO_COOKIE)",
+ "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+ if (type == SOCK_STREAM) {
+ /*
+ * reuseport_array does not allow
+ * non-listening tcp sk.
+ */
+ err = bpf_map_update_elem(map_fd, &index0, &fd64,
+ BPF_ANY);
+ CHECK(err != -1 || errno != EINVAL,
+ "reuseport array update non-listening sk",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+ err = listen(fd64, 0);
+ CHECK(err == -1, "listen()",
+ "sock_type:%d, err:%d errno:%d\n",
+ type, err, errno);
+ }
+
+ fds64[i] = fd64;
+ sk_cookies[i] = sk_cookie;
+ }
+}
+
+static void test_reuseport_array(void)
+{
+#define REUSEPORT_FD_IDX(err, last) ({ (err) ? last : !last; })
+
+ const __u32 array_size = 4, index0 = 0, index3 = 3;
+ int types[2] = { SOCK_STREAM, SOCK_DGRAM }, type;
+ __u64 grpa_cookies[2], sk_cookie, map_cookie;
+ __s64 grpa_fds64[2] = { -1, -1 }, fd64 = -1;
+ const __u32 bad_index = array_size;
+ int map_fd, err, t, f;
+ __u32 fds_idx = 0;
+ int fd;
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ sizeof(__u32), sizeof(__u64), array_size, 0);
+ CHECK(map_fd == -1, "reuseport array create",
+ "map_fd:%d, errno:%d\n", map_fd, errno);
+
+ /* Test lookup/update/delete with invalid index */
+ err = bpf_map_delete_elem(map_fd, &bad_index);
+ CHECK(err != -1 || errno != E2BIG, "reuseport array del >=max_entries",
+ "err:%d errno:%d\n", err, errno);
+
+ err = bpf_map_update_elem(map_fd, &bad_index, &fd64, BPF_ANY);
+ CHECK(err != -1 || errno != E2BIG,
+ "reuseport array update >=max_entries",
+ "err:%d errno:%d\n", err, errno);
+
+ err = bpf_map_lookup_elem(map_fd, &bad_index, &map_cookie);
+ CHECK(err != -1 || errno != ENOENT,
+ "reuseport array update >=max_entries",
+ "err:%d errno:%d\n", err, errno);
+
+ /* Test lookup/delete non existence elem */
+ err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
+ CHECK(err != -1 || errno != ENOENT,
+ "reuseport array lookup not-exist elem",
+ "err:%d errno:%d\n", err, errno);
+ err = bpf_map_delete_elem(map_fd, &index3);
+ CHECK(err != -1 || errno != ENOENT,
+ "reuseport array del not-exist elem",
+ "err:%d errno:%d\n", err, errno);
+
+ for (t = 0; t < ARRAY_SIZE(types); t++) {
+ type = types[t];
+
+ prepare_reuseport_grp(type, map_fd, grpa_fds64,
+ grpa_cookies, ARRAY_SIZE(grpa_fds64));
+
+ /* Test BPF_* update flags */
+ /* BPF_EXIST failure case */
+ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ BPF_EXIST);
+ CHECK(err != -1 || errno != ENOENT,
+ "reuseport array update empty elem BPF_EXIST",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+ fds_idx = REUSEPORT_FD_IDX(err, fds_idx);
+
+ /* BPF_NOEXIST success case */
+ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ BPF_NOEXIST);
+ CHECK(err == -1,
+ "reuseport array update empty elem BPF_NOEXIST",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+ fds_idx = REUSEPORT_FD_IDX(err, fds_idx);
+
+ /* BPF_EXIST success case. */
+ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ BPF_EXIST);
+ CHECK(err == -1,
+ "reuseport array update same elem BPF_EXIST",
+ "sock_type:%d err:%d errno:%d\n", type, err, errno);
+ fds_idx = REUSEPORT_FD_IDX(err, fds_idx);
+
+ /* BPF_NOEXIST failure case */
+ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ BPF_NOEXIST);
+ CHECK(err != -1 || errno != EEXIST,
+ "reuseport array update non-empty elem BPF_NOEXIST",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+ fds_idx = REUSEPORT_FD_IDX(err, fds_idx);
+
+ /* BPF_ANY case (always succeed) */
+ err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ BPF_ANY);
+ CHECK(err == -1,
+ "reuseport array update same sk with BPF_ANY",
+ "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+ fd64 = grpa_fds64[fds_idx];
+ sk_cookie = grpa_cookies[fds_idx];
+
+ /* The same sk cannot be added to reuseport_array twice */
+ err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_ANY);
+ CHECK(err != -1 || errno != EBUSY,
+ "reuseport array update same sk with same index",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+
+ err = bpf_map_update_elem(map_fd, &index0, &fd64, BPF_ANY);
+ CHECK(err != -1 || errno != EBUSY,
+ "reuseport array update same sk with different index",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+
+ /* Test delete elem */
+ err = bpf_map_delete_elem(map_fd, &index3);
+ CHECK(err == -1, "reuseport array delete sk",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+
+ /* Add it back with BPF_NOEXIST */
+ err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST);
+ CHECK(err == -1,
+ "reuseport array re-add with BPF_NOEXIST after del",
+ "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+ /* Test cookie */
+ err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
+ CHECK(err == -1 || sk_cookie != map_cookie,
+ "reuseport array lookup re-added sk",
+ "sock_type:%d err:%d errno:%d sk_cookie:0x%llx map_cookie:0x%llxn",
+ type, err, errno, sk_cookie, map_cookie);
+
+ /* Test elem removed by close() */
+ for (f = 0; f < ARRAY_SIZE(grpa_fds64); f++)
+ close(grpa_fds64[f]);
+ err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
+ CHECK(err != -1 || errno != ENOENT,
+ "reuseport array lookup after close()",
+ "sock_type:%d err:%d errno:%d\n",
+ type, err, errno);
+ }
+
+ /* Test SOCK_RAW */
+ fd64 = socket(AF_INET6, SOCK_RAW, IPPROTO_UDP);
+ CHECK(fd64 == -1, "socket(SOCK_RAW)", "err:%d errno:%d\n",
+ err, errno);
+ err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST);
+ CHECK(err != -1 || errno != ENOTSUPP, "reuseport array update SOCK_RAW",
+ "err:%d errno:%d\n", err, errno);
+ close(fd64);
+
+ /* Close the 64 bit value map */
+ close(map_fd);
+
+ /* Test 32 bit fd */
+ map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ sizeof(__u32), sizeof(__u32), array_size, 0);
+ CHECK(map_fd == -1, "reuseport array create",
+ "map_fd:%d, errno:%d\n", map_fd, errno);
+ prepare_reuseport_grp(SOCK_STREAM, map_fd, &fd64, &sk_cookie, 1);
+ fd = fd64;
+ err = bpf_map_update_elem(map_fd, &index3, &fd, BPF_NOEXIST);
+ CHECK(err == -1, "reuseport array update 32 bit fd",
+ "err:%d errno:%d\n", err, errno);
+ err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
+ CHECK(err != -1 || errno != ENOSPC,
+ "reuseport array lookup 32 bit fd",
+ "err:%d errno:%d\n", err, errno);
+ close(fd);
+ close(map_fd);
+}
+
static void run_all_tests(void)
{
test_hashmap(0, NULL);
@@ -1170,6 +1428,8 @@ static void run_all_tests(void)
test_map_rdonly();
test_map_wronly();
+
+ test_reuseport_array();
}
int main(void)
diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c
new file mode 100644
index 000000000000..75646d9b34aa
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_select_reuseport.c
@@ -0,0 +1,688 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/types.h>
+#include <linux/if_ether.h>
+#include <sys/types.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "test_select_reuseport_common.h"
+
+#define MIN_TCPHDR_LEN 20
+#define UDPHDR_LEN 8
+
+#define TCP_SYNCOOKIE_SYSCTL "/proc/sys/net/ipv4/tcp_syncookies"
+#define TCP_FO_SYSCTL "/proc/sys/net/ipv4/tcp_fastopen"
+#define REUSEPORT_ARRAY_SIZE 32
+
+static int result_map, tmp_index_ovr_map, linum_map, data_check_map;
+static enum result expected_results[NR_RESULTS];
+static int sk_fds[REUSEPORT_ARRAY_SIZE];
+static int reuseport_array, outer_map;
+static int select_by_skb_data_prog;
+static int saved_tcp_syncookie;
+static struct bpf_object *obj;
+static int saved_tcp_fo;
+static __u32 index_zero;
+static int epfd;
+
+static union sa46 {
+ struct sockaddr_in6 v6;
+ struct sockaddr_in v4;
+ sa_family_t family;
+} srv_sa;
+
+#define CHECK(condition, tag, format...) ({ \
+ int __ret = !!(condition); \
+ if (__ret) { \
+ printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \
+ printf(format); \
+ exit(-1); \
+ } \
+})
+
+static void create_maps(void)
+{
+ struct bpf_create_map_attr attr = {};
+
+ /* Creating reuseport_array */
+ attr.name = "reuseport_array";
+ attr.map_type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
+ attr.key_size = sizeof(__u32);
+ attr.value_size = sizeof(__u32);
+ attr.max_entries = REUSEPORT_ARRAY_SIZE;
+
+ reuseport_array = bpf_create_map_xattr(&attr);
+ CHECK(reuseport_array == -1, "creating reuseport_array",
+ "reuseport_array:%d errno:%d\n", reuseport_array, errno);
+
+ /* Creating outer_map */
+ attr.name = "outer_map";
+ attr.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS;
+ attr.key_size = sizeof(__u32);
+ attr.value_size = sizeof(__u32);
+ attr.max_entries = 1;
+ attr.inner_map_fd = reuseport_array;
+ outer_map = bpf_create_map_xattr(&attr);
+ CHECK(outer_map == -1, "creating outer_map",
+ "outer_map:%d errno:%d\n", outer_map, errno);
+}
+
+static void prepare_bpf_obj(void)
+{
+ struct bpf_program *prog;
+ struct bpf_map *map;
+ int err;
+ struct bpf_object_open_attr attr = {
+ .file = "test_select_reuseport_kern.o",
+ .prog_type = BPF_PROG_TYPE_SK_REUSEPORT,
+ };
+
+ obj = bpf_object__open_xattr(&attr);
+ CHECK(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o",
+ "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj));
+
+ prog = bpf_program__next(NULL, obj);
+ CHECK(!prog, "get first bpf_program", "!prog\n");
+ bpf_program__set_type(prog, attr.prog_type);
+
+ map = bpf_object__find_map_by_name(obj, "outer_map");
+ CHECK(!map, "find outer_map", "!map\n");
+ err = bpf_map__reuse_fd(map, outer_map);
+ CHECK(err, "reuse outer_map", "err:%d\n", err);
+
+ err = bpf_object__load(obj);
+ CHECK(err, "load bpf_object", "err:%d\n", err);
+
+ select_by_skb_data_prog = bpf_program__fd(prog);
+ CHECK(select_by_skb_data_prog == -1, "get prog fd",
+ "select_by_skb_data_prog:%d\n", select_by_skb_data_prog);
+
+ map = bpf_object__find_map_by_name(obj, "result_map");
+ CHECK(!map, "find result_map", "!map\n");
+ result_map = bpf_map__fd(map);
+ CHECK(result_map == -1, "get result_map fd",
+ "result_map:%d\n", result_map);
+
+ map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map");
+ CHECK(!map, "find tmp_index_ovr_map", "!map\n");
+ tmp_index_ovr_map = bpf_map__fd(map);
+ CHECK(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd",
+ "tmp_index_ovr_map:%d\n", tmp_index_ovr_map);
+
+ map = bpf_object__find_map_by_name(obj, "linum_map");
+ CHECK(!map, "find linum_map", "!map\n");
+ linum_map = bpf_map__fd(map);
+ CHECK(linum_map == -1, "get linum_map fd",
+ "linum_map:%d\n", linum_map);
+
+ map = bpf_object__find_map_by_name(obj, "data_check_map");
+ CHECK(!map, "find data_check_map", "!map\n");
+ data_check_map = bpf_map__fd(map);
+ CHECK(data_check_map == -1, "get data_check_map fd",
+ "data_check_map:%d\n", data_check_map);
+}
+
+static void sa46_init_loopback(union sa46 *sa, sa_family_t family)
+{
+ memset(sa, 0, sizeof(*sa));
+ sa->family = family;
+ if (sa->family == AF_INET6)
+ sa->v6.sin6_addr = in6addr_loopback;
+ else
+ sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+}
+
+static void sa46_init_inany(union sa46 *sa, sa_family_t family)
+{
+ memset(sa, 0, sizeof(*sa));
+ sa->family = family;
+ if (sa->family == AF_INET6)
+ sa->v6.sin6_addr = in6addr_any;
+ else
+ sa->v4.sin_addr.s_addr = INADDR_ANY;
+}
+
+static int read_int_sysctl(const char *sysctl)
+{
+ char buf[16];
+ int fd, ret;
+
+ fd = open(sysctl, 0);
+ CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n",
+ sysctl, fd, errno);
+
+ ret = read(fd, buf, sizeof(buf));
+ CHECK(ret <= 0, "read(sysctl)", "sysctl:%s ret:%d errno:%d\n",
+ sysctl, ret, errno);
+ close(fd);
+
+ return atoi(buf);
+}
+
+static void write_int_sysctl(const char *sysctl, int v)
+{
+ int fd, ret, size;
+ char buf[16];
+
+ fd = open(sysctl, O_RDWR);
+ CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n",
+ sysctl, fd, errno);
+
+ size = snprintf(buf, sizeof(buf), "%d", v);
+ ret = write(fd, buf, size);
+ CHECK(ret != size, "write(sysctl)",
+ "sysctl:%s ret:%d size:%d errno:%d\n", sysctl, ret, size, errno);
+ close(fd);
+}
+
+static void restore_sysctls(void)
+{
+ write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo);
+ write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie);
+}
+
+static void enable_fastopen(void)
+{
+ int fo;
+
+ fo = read_int_sysctl(TCP_FO_SYSCTL);
+ write_int_sysctl(TCP_FO_SYSCTL, fo | 7);
+}
+
+static void enable_syncookie(void)
+{
+ write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 2);
+}
+
+static void disable_syncookie(void)
+{
+ write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0);
+}
+
+static __u32 get_linum(void)
+{
+ __u32 linum;
+ int err;
+
+ err = bpf_map_lookup_elem(linum_map, &index_zero, &linum);
+ CHECK(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n",
+ err, errno);
+
+ return linum;
+}
+
+static void check_data(int type, sa_family_t family, const struct cmd *cmd,
+ int cli_fd)
+{
+ struct data_check expected = {}, result;
+ union sa46 cli_sa;
+ socklen_t addrlen;
+ int err;
+
+ addrlen = sizeof(cli_sa);
+ err = getsockname(cli_fd, (struct sockaddr *)&cli_sa,
+ &addrlen);
+ CHECK(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n",
+ err, errno);
+
+ err = bpf_map_lookup_elem(data_check_map, &index_zero, &result);
+ CHECK(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n",
+ err, errno);
+
+ if (type == SOCK_STREAM) {
+ expected.len = MIN_TCPHDR_LEN;
+ expected.ip_protocol = IPPROTO_TCP;
+ } else {
+ expected.len = UDPHDR_LEN;
+ expected.ip_protocol = IPPROTO_UDP;
+ }
+
+ if (family == AF_INET6) {
+ expected.eth_protocol = htons(ETH_P_IPV6);
+ expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] &&
+ !srv_sa.v6.sin6_addr.s6_addr32[2] &&
+ !srv_sa.v6.sin6_addr.s6_addr32[1] &&
+ !srv_sa.v6.sin6_addr.s6_addr32[0];
+
+ memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32,
+ sizeof(cli_sa.v6.sin6_addr));
+ memcpy(&expected.skb_addrs[4], &in6addr_loopback,
+ sizeof(in6addr_loopback));
+ expected.skb_ports[0] = cli_sa.v6.sin6_port;
+ expected.skb_ports[1] = srv_sa.v6.sin6_port;
+ } else {
+ expected.eth_protocol = htons(ETH_P_IP);
+ expected.bind_inany = !srv_sa.v4.sin_addr.s_addr;
+
+ expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr;
+ expected.skb_addrs[1] = htonl(INADDR_LOOPBACK);
+ expected.skb_ports[0] = cli_sa.v4.sin_port;
+ expected.skb_ports[1] = srv_sa.v4.sin_port;
+ }
+
+ if (memcmp(&result, &expected, offsetof(struct data_check,
+ equal_check_end))) {
+ printf("unexpected data_check\n");
+ printf(" result: (0x%x, %u, %u)\n",
+ result.eth_protocol, result.ip_protocol,
+ result.bind_inany);
+ printf("expected: (0x%x, %u, %u)\n",
+ expected.eth_protocol, expected.ip_protocol,
+ expected.bind_inany);
+ CHECK(1, "data_check result != expected",
+ "bpf_prog_linum:%u\n", get_linum());
+ }
+
+ CHECK(!result.hash, "data_check result.hash empty",
+ "result.hash:%u", result.hash);
+
+ expected.len += cmd ? sizeof(*cmd) : 0;
+ if (type == SOCK_STREAM)
+ CHECK(expected.len > result.len, "expected.len > result.len",
+ "expected.len:%u result.len:%u bpf_prog_linum:%u\n",
+ expected.len, result.len, get_linum());
+ else
+ CHECK(expected.len != result.len, "expected.len != result.len",
+ "expected.len:%u result.len:%u bpf_prog_linum:%u\n",
+ expected.len, result.len, get_linum());
+}
+
+static void check_results(void)
+{
+ __u32 results[NR_RESULTS];
+ __u32 i, broken = 0;
+ int err;
+
+ for (i = 0; i < NR_RESULTS; i++) {
+ err = bpf_map_lookup_elem(result_map, &i, &results[i]);
+ CHECK(err == -1, "lookup_elem(result_map)",
+ "i:%u err:%d errno:%d\n", i, err, errno);
+ }
+
+ for (i = 0; i < NR_RESULTS; i++) {
+ if (results[i] != expected_results[i]) {
+ broken = i;
+ break;
+ }
+ }
+
+ if (i == NR_RESULTS)
+ return;
+
+ printf("unexpected result\n");
+ printf(" result: [");
+ printf("%u", results[0]);
+ for (i = 1; i < NR_RESULTS; i++)
+ printf(", %u", results[i]);
+ printf("]\n");
+
+ printf("expected: [");
+ printf("%u", expected_results[0]);
+ for (i = 1; i < NR_RESULTS; i++)
+ printf(", %u", expected_results[i]);
+ printf("]\n");
+
+ CHECK(expected_results[broken] != results[broken],
+ "unexpected result",
+ "expected_results[%u] != results[%u] bpf_prog_linum:%u\n",
+ broken, broken, get_linum());
+}
+
+static int send_data(int type, sa_family_t family, void *data, size_t len,
+ enum result expected)
+{
+ union sa46 cli_sa;
+ int fd, err;
+
+ fd = socket(family, type, 0);
+ CHECK(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno);
+
+ sa46_init_loopback(&cli_sa, family);
+ err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa));
+ CHECK(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno);
+
+ err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa,
+ sizeof(srv_sa));
+ CHECK(err != len && expected >= PASS,
+ "sendto()", "family:%u err:%d errno:%d expected:%d\n",
+ family, err, errno, expected);
+
+ return fd;
+}
+
+static void do_test(int type, sa_family_t family, struct cmd *cmd,
+ enum result expected)
+{
+ int nev, srv_fd, cli_fd;
+ struct epoll_event ev;
+ struct cmd rcv_cmd;
+ ssize_t nread;
+
+ cli_fd = send_data(type, family, cmd, cmd ? sizeof(*cmd) : 0,
+ expected);
+ nev = epoll_wait(epfd, &ev, 1, expected >= PASS ? 5 : 0);
+ CHECK((nev <= 0 && expected >= PASS) ||
+ (nev > 0 && expected < PASS),
+ "nev <> expected",
+ "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n",
+ nev, expected, type, family,
+ cmd ? cmd->reuseport_index : -1,
+ cmd ? cmd->pass_on_failure : -1);
+ check_results();
+ check_data(type, family, cmd, cli_fd);
+
+ if (expected < PASS)
+ return;
+
+ CHECK(expected != PASS_ERR_SK_SELECT_REUSEPORT &&
+ cmd->reuseport_index != ev.data.u32,
+ "check cmd->reuseport_index",
+ "cmd:(%u, %u) ev.data.u32:%u\n",
+ cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32);
+
+ srv_fd = sk_fds[ev.data.u32];
+ if (type == SOCK_STREAM) {
+ int new_fd = accept(srv_fd, NULL, 0);
+
+ CHECK(new_fd == -1, "accept(srv_fd)",
+ "ev.data.u32:%u new_fd:%d errno:%d\n",
+ ev.data.u32, new_fd, errno);
+
+ nread = recv(new_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT);
+ CHECK(nread != sizeof(rcv_cmd),
+ "recv(new_fd)",
+ "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n",
+ ev.data.u32, nread, sizeof(rcv_cmd), errno);
+
+ close(new_fd);
+ } else {
+ nread = recv(srv_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT);
+ CHECK(nread != sizeof(rcv_cmd),
+ "recv(sk_fds)",
+ "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n",
+ ev.data.u32, nread, sizeof(rcv_cmd), errno);
+ }
+
+ close(cli_fd);
+}
+
+static void test_err_inner_map(int type, sa_family_t family)
+{
+ struct cmd cmd = {
+ .reuseport_index = 0,
+ .pass_on_failure = 0,
+ };
+
+ printf("%s: ", __func__);
+ expected_results[DROP_ERR_INNER_MAP]++;
+ do_test(type, family, &cmd, DROP_ERR_INNER_MAP);
+ printf("OK\n");
+}
+
+static void test_err_skb_data(int type, sa_family_t family)
+{
+ printf("%s: ", __func__);
+ expected_results[DROP_ERR_SKB_DATA]++;
+ do_test(type, family, NULL, DROP_ERR_SKB_DATA);
+ printf("OK\n");
+}
+
+static void test_err_sk_select_port(int type, sa_family_t family)
+{
+ struct cmd cmd = {
+ .reuseport_index = REUSEPORT_ARRAY_SIZE,
+ .pass_on_failure = 0,
+ };
+
+ printf("%s: ", __func__);
+ expected_results[DROP_ERR_SK_SELECT_REUSEPORT]++;
+ do_test(type, family, &cmd, DROP_ERR_SK_SELECT_REUSEPORT);
+ printf("OK\n");
+}
+
+static void test_pass(int type, sa_family_t family)
+{
+ struct cmd cmd;
+ int i;
+
+ printf("%s: ", __func__);
+ cmd.pass_on_failure = 0;
+ for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) {
+ expected_results[PASS]++;
+ cmd.reuseport_index = i;
+ do_test(type, family, &cmd, PASS);
+ }
+ printf("OK\n");
+}
+
+static void test_syncookie(int type, sa_family_t family)
+{
+ int err, tmp_index = 1;
+ struct cmd cmd = {
+ .reuseport_index = 0,
+ .pass_on_failure = 0,
+ };
+
+ if (type != SOCK_STREAM)
+ return;
+
+ printf("%s: ", __func__);
+ /*
+ * +1 for TCP-SYN and
+ * +1 for the TCP-ACK (ack the syncookie)
+ */
+ expected_results[PASS] += 2;
+ enable_syncookie();
+ /*
+ * Simulate TCP-SYN and TCP-ACK are handled by two different sk:
+ * TCP-SYN: select sk_fds[tmp_index = 1] tmp_index is from the
+ * tmp_index_ovr_map
+ * TCP-ACK: select sk_fds[reuseport_index = 0] reuseport_index
+ * is from the cmd.reuseport_index
+ */
+ err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero,
+ &tmp_index, BPF_ANY);
+ CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)",
+ "err:%d errno:%d\n", err, errno);
+ do_test(type, family, &cmd, PASS);
+ err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero,
+ &tmp_index);
+ CHECK(err == -1 || tmp_index != -1,
+ "lookup_elem(tmp_index_ovr_map)",
+ "err:%d errno:%d tmp_index:%d\n",
+ err, errno, tmp_index);
+ disable_syncookie();
+ printf("OK\n");
+}
+
+static void test_pass_on_err(int type, sa_family_t family)
+{
+ struct cmd cmd = {
+ .reuseport_index = REUSEPORT_ARRAY_SIZE,
+ .pass_on_failure = 1,
+ };
+
+ printf("%s: ", __func__);
+ expected_results[PASS_ERR_SK_SELECT_REUSEPORT] += 1;
+ do_test(type, family, &cmd, PASS_ERR_SK_SELECT_REUSEPORT);
+ printf("OK\n");
+}
+
+static void prepare_sk_fds(int type, sa_family_t family, bool inany)
+{
+ const int first = REUSEPORT_ARRAY_SIZE - 1;
+ int i, err, optval = 1;
+ struct epoll_event ev;
+ socklen_t addrlen;
+
+ if (inany)
+ sa46_init_inany(&srv_sa, family);
+ else
+ sa46_init_loopback(&srv_sa, family);
+ addrlen = sizeof(srv_sa);
+
+ /*
+ * The sk_fds[] is filled from the back such that the order
+ * is exactly opposite to the (struct sock_reuseport *)reuse->socks[].
+ */
+ for (i = first; i >= 0; i--) {
+ sk_fds[i] = socket(family, type, 0);
+ CHECK(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n",
+ i, sk_fds[i], errno);
+ err = setsockopt(sk_fds[i], SOL_SOCKET, SO_REUSEPORT,
+ &optval, sizeof(optval));
+ CHECK(err == -1, "setsockopt(SO_REUSEPORT)",
+ "sk_fds[%d] err:%d errno:%d\n",
+ i, err, errno);
+
+ if (i == first) {
+ err = setsockopt(sk_fds[i], SOL_SOCKET,
+ SO_ATTACH_REUSEPORT_EBPF,
+ &select_by_skb_data_prog,
+ sizeof(select_by_skb_data_prog));
+ CHECK(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)",
+ "err:%d errno:%d\n", err, errno);
+ }
+
+ err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen);
+ CHECK(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n",
+ i, err, errno);
+
+ if (type == SOCK_STREAM) {
+ err = listen(sk_fds[i], 10);
+ CHECK(err == -1, "listen()",
+ "sk_fds[%d] err:%d errno:%d\n",
+ i, err, errno);
+ }
+
+ err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i],
+ BPF_NOEXIST);
+ CHECK(err == -1, "update_elem(reuseport_array)",
+ "sk_fds[%d] err:%d errno:%d\n", i, err, errno);
+
+ if (i == first) {
+ socklen_t addrlen = sizeof(srv_sa);
+
+ err = getsockname(sk_fds[i], (struct sockaddr *)&srv_sa,
+ &addrlen);
+ CHECK(err == -1, "getsockname()",
+ "sk_fds[%d] err:%d errno:%d\n", i, err, errno);
+ }
+ }
+
+ epfd = epoll_create(1);
+ CHECK(epfd == -1, "epoll_create(1)",
+ "epfd:%d errno:%d\n", epfd, errno);
+
+ ev.events = EPOLLIN;
+ for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) {
+ ev.data.u32 = i;
+ err = epoll_ctl(epfd, EPOLL_CTL_ADD, sk_fds[i], &ev);
+ CHECK(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i);
+ }
+}
+
+static void setup_per_test(int type, unsigned short family, bool inany)
+{
+ int ovr = -1, err;
+
+ prepare_sk_fds(type, family, inany);
+ err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &ovr,
+ BPF_ANY);
+ CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)",
+ "err:%d errno:%d\n", err, errno);
+}
+
+static void cleanup_per_test(void)
+{
+ int i, err;
+
+ for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++)
+ close(sk_fds[i]);
+ close(epfd);
+
+ err = bpf_map_delete_elem(outer_map, &index_zero);
+ CHECK(err == -1, "delete_elem(outer_map)",
+ "err:%d errno:%d\n", err, errno);
+}
+
+static void cleanup(void)
+{
+ close(outer_map);
+ close(reuseport_array);
+ bpf_object__close(obj);
+}
+
+static void test_all(void)
+{
+ /* Extra SOCK_STREAM to test bind_inany==true */
+ const int types[] = { SOCK_STREAM, SOCK_DGRAM, SOCK_STREAM };
+ const char * const type_strings[] = { "TCP", "UDP", "TCP" };
+ const char * const family_strings[] = { "IPv6", "IPv4" };
+ const unsigned short families[] = { AF_INET6, AF_INET };
+ const bool bind_inany[] = { false, false, true };
+ int t, f, err;
+
+ for (f = 0; f < ARRAY_SIZE(families); f++) {
+ unsigned short family = families[f];
+
+ for (t = 0; t < ARRAY_SIZE(types); t++) {
+ bool inany = bind_inany[t];
+ int type = types[t];
+
+ printf("######## %s/%s %s ########\n",
+ family_strings[f], type_strings[t],
+ inany ? " INANY " : "LOOPBACK");
+
+ setup_per_test(type, family, inany);
+
+ test_err_inner_map(type, family);
+
+ /* Install reuseport_array to the outer_map */
+ err = bpf_map_update_elem(outer_map, &index_zero,
+ &reuseport_array, BPF_ANY);
+ CHECK(err == -1, "update_elem(outer_map)",
+ "err:%d errno:%d\n", err, errno);
+
+ test_err_skb_data(type, family);
+ test_err_sk_select_port(type, family);
+ test_pass(type, family);
+ test_syncookie(type, family);
+ test_pass_on_err(type, family);
+
+ cleanup_per_test();
+ printf("\n");
+ }
+ }
+}
+
+int main(int argc, const char **argv)
+{
+ create_maps();
+ prepare_bpf_obj();
+ saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL);
+ saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL);
+ enable_fastopen();
+ disable_syncookie();
+ atexit(restore_sysctls);
+
+ test_all();
+
+ cleanup();
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_select_reuseport_common.h b/tools/testing/selftests/bpf/test_select_reuseport_common.h
new file mode 100644
index 000000000000..08eb2a9f145f
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_select_reuseport_common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#ifndef __TEST_SELECT_REUSEPORT_COMMON_H
+#define __TEST_SELECT_REUSEPORT_COMMON_H
+
+#include <linux/types.h>
+
+enum result {
+ DROP_ERR_INNER_MAP,
+ DROP_ERR_SKB_DATA,
+ DROP_ERR_SK_SELECT_REUSEPORT,
+ DROP_MISC,
+ PASS,
+ PASS_ERR_SK_SELECT_REUSEPORT,
+ NR_RESULTS,
+};
+
+struct cmd {
+ __u32 reuseport_index;
+ __u32 pass_on_failure;
+};
+
+struct data_check {
+ __u32 ip_protocol;
+ __u32 skb_addrs[8];
+ __u16 skb_ports[2];
+ __u16 eth_protocol;
+ __u8 bind_inany;
+ __u8 equal_check_end[0];
+
+ __u32 len;
+ __u32 hash;
+};
+
+#endif
diff --git a/tools/testing/selftests/bpf/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/test_select_reuseport_kern.c
new file mode 100644
index 000000000000..5b54ec637ada
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_select_reuseport_kern.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook */
+
+#include <stdlib.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#include "bpf_endian.h"
+#include "bpf_helpers.h"
+#include "test_select_reuseport_common.h"
+
+int _version SEC("version") = 1;
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+struct bpf_map_def SEC("maps") outer_map = {
+ .type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u32),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") result_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u32),
+ .max_entries = NR_RESULTS,
+};
+
+struct bpf_map_def SEC("maps") tmp_index_ovr_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(int),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") linum_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u32),
+ .max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") data_check_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct data_check),
+ .max_entries = 1,
+};
+
+#define GOTO_DONE(_result) ({ \
+ result = (_result); \
+ linum = __LINE__; \
+ goto done; \
+})
+
+SEC("select_by_skb_data")
+int _select_by_skb_data(struct sk_reuseport_md *reuse_md)
+{
+ __u32 linum, index = 0, flags = 0, index_zero = 0;
+ __u32 *result_cnt, *linum_value;
+ struct data_check data_check = {};
+ struct cmd *cmd, cmd_copy;
+ void *data, *data_end;
+ void *reuseport_array;
+ enum result result;
+ int *index_ovr;
+ int err;
+
+ data = reuse_md->data;
+ data_end = reuse_md->data_end;
+ data_check.len = reuse_md->len;
+ data_check.eth_protocol = reuse_md->eth_protocol;
+ data_check.ip_protocol = reuse_md->ip_protocol;
+ data_check.hash = reuse_md->hash;
+ data_check.bind_inany = reuse_md->bind_inany;
+ if (data_check.eth_protocol == bpf_htons(ETH_P_IP)) {
+ if (bpf_skb_load_bytes_relative(reuse_md,
+ offsetof(struct iphdr, saddr),
+ data_check.skb_addrs, 8,
+ BPF_HDR_START_NET))
+ GOTO_DONE(DROP_MISC);
+ } else {
+ if (bpf_skb_load_bytes_relative(reuse_md,
+ offsetof(struct ipv6hdr, saddr),
+ data_check.skb_addrs, 32,
+ BPF_HDR_START_NET))
+ GOTO_DONE(DROP_MISC);
+ }
+
+ /*
+ * The ip_protocol could be a compile time decision
+ * if the bpf_prog.o is dedicated to either TCP or
+ * UDP.
+ *
+ * Otherwise, reuse_md->ip_protocol or
+ * the protocol field in the iphdr can be used.
+ */
+ if (data_check.ip_protocol == IPPROTO_TCP) {
+ struct tcphdr *th = data;
+
+ if (th + 1 > data_end)
+ GOTO_DONE(DROP_MISC);
+
+ data_check.skb_ports[0] = th->source;
+ data_check.skb_ports[1] = th->dest;
+
+ if ((th->doff << 2) + sizeof(*cmd) > data_check.len)
+ GOTO_DONE(DROP_ERR_SKB_DATA);
+ if (bpf_skb_load_bytes(reuse_md, th->doff << 2, &cmd_copy,
+ sizeof(cmd_copy)))
+ GOTO_DONE(DROP_MISC);
+ cmd = &cmd_copy;
+ } else if (data_check.ip_protocol == IPPROTO_UDP) {
+ struct udphdr *uh = data;
+
+ if (uh + 1 > data_end)
+ GOTO_DONE(DROP_MISC);
+
+ data_check.skb_ports[0] = uh->source;
+ data_check.skb_ports[1] = uh->dest;
+
+ if (sizeof(struct udphdr) + sizeof(*cmd) > data_check.len)
+ GOTO_DONE(DROP_ERR_SKB_DATA);
+ if (data + sizeof(struct udphdr) + sizeof(*cmd) > data_end) {
+ if (bpf_skb_load_bytes(reuse_md, sizeof(struct udphdr),
+ &cmd_copy, sizeof(cmd_copy)))
+ GOTO_DONE(DROP_MISC);
+ cmd = &cmd_copy;
+ } else {
+ cmd = data + sizeof(struct udphdr);
+ }
+ } else {
+ GOTO_DONE(DROP_MISC);
+ }
+
+ reuseport_array = bpf_map_lookup_elem(&outer_map, &index_zero);
+ if (!reuseport_array)
+ GOTO_DONE(DROP_ERR_INNER_MAP);
+
+ index = cmd->reuseport_index;
+ index_ovr = bpf_map_lookup_elem(&tmp_index_ovr_map, &index_zero);
+ if (!index_ovr)
+ GOTO_DONE(DROP_MISC);
+
+ if (*index_ovr != -1) {
+ index = *index_ovr;
+ *index_ovr = -1;
+ }
+ err = bpf_sk_select_reuseport(reuse_md, reuseport_array, &index,
+ flags);
+ if (!err)
+ GOTO_DONE(PASS);
+
+ if (cmd->pass_on_failure)
+ GOTO_DONE(PASS_ERR_SK_SELECT_REUSEPORT);
+ else
+ GOTO_DONE(DROP_ERR_SK_SELECT_REUSEPORT);
+
+done:
+ result_cnt = bpf_map_lookup_elem(&result_map, &result);
+ if (!result_cnt)
+ return SK_DROP;
+
+ bpf_map_update_elem(&linum_map, &index_zero, &linum, BPF_ANY);
+ bpf_map_update_elem(&data_check_map, &index_zero, &data_check, BPF_ANY);
+
+ (*result_cnt)++;
+ return result < PASS ? SK_DROP : SK_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index f4d99fabc56d..b8ebe2f58074 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -14,10 +14,7 @@
#include "cgroup_helpers.h"
#include "bpf_rlimit.h"
-
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
+#include "bpf_util.h"
#define CG_PATH "/foo"
#define MAX_INSNS 512
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index 2e45c92d1111..aeeb76a54d63 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -20,15 +20,12 @@
#include "cgroup_helpers.h"
#include "bpf_rlimit.h"
+#include "bpf_util.h"
#ifndef ENOTSUPP
# define ENOTSUPP 524
#endif
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
#define CG_PATH "/foo"
#define CONNECT4_PROG_PATH "./connect4_prog.o"
#define CONNECT6_PROG_PATH "./connect6_prog.o"
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 452cf5c6c784..67c412d19c09 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -42,12 +42,9 @@
#endif
#include "bpf_rlimit.h"
#include "bpf_rand.h"
+#include "bpf_util.h"
#include "../../../include/linux/filter.h"
-#ifndef ARRAY_SIZE
-# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
#define MAX_INSNS BPF_MAXINSNS
#define MAX_FIXUPS 8
#define MAX_NR_MAPS 8