summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xtools/ebpf/Makefile.ebpf21
-rw-r--r--tools/ebpf/rss.bpf.c571
2 files changed, 592 insertions, 0 deletions
diff --git a/tools/ebpf/Makefile.ebpf b/tools/ebpf/Makefile.ebpf
new file mode 100755
index 0000000000..8f327ae3b8
--- /dev/null
+++ b/tools/ebpf/Makefile.ebpf
@@ -0,0 +1,21 @@
+OBJS = rss.bpf.o
+# Toolchain and flags; the "?=" variables can be overridden by the caller.
+LLC ?= llc
+CLANG ?= clang
+INC_FLAGS = `$(CLANG) -print-file-name=include`
+EXTRA_CFLAGS ?= -O2 -emit-llvm -fno-stack-protector
+# Default target: build every object listed in OBJS.
+all: $(OBJS)
+
+.PHONY: clean
+# Removes the built objects only; the generated skeleton header is left alone.
+clean:
+ rm -f $(OBJS)
+# clang output is piped to llc to emit the BPF object; bpftool then generates the skeleton.
+$(OBJS): %.o:%.c
+ $(CLANG) $(INC_FLAGS) \
+ -D__KERNEL__ -D__ASM_SYSREG_H \
+ -I../include $(LINUXINCLUDE) \
+ $(EXTRA_CFLAGS) -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
+ bpftool gen skeleton rss.bpf.o > rss.bpf.skeleton.h
+ cp rss.bpf.skeleton.h ../../ebpf/
diff --git a/tools/ebpf/rss.bpf.c b/tools/ebpf/rss.bpf.c
new file mode 100644
index 0000000000..e85ec55f9b
--- /dev/null
+++ b/tools/ebpf/rss.bpf.c
@@ -0,0 +1,571 @@
+/*
+ * eBPF RSS program
+ *
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Andrew Melnychenko <andrew@daynix.com>
+ * Yuri Benditovich <yuri.benditovich@daynix.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Prepare:
+ * Requires llvm, clang, bpftool, linux kernel tree
+ *
+ * Build rss.bpf.skeleton.h:
+ * make -f Makefile.ebpf clean all
+ */
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <linux/udp.h>
+#include <linux/tcp.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <linux/virtio_net.h>
+
+#define INDIRECTION_TABLE_SIZE 128 /* max_entries of the indirection table map */
+#define HASH_CALCULATION_BUFFER_SIZE 36 /* two 16-byte addresses + two 2-byte ports */
+
+struct rss_config_t { /* value type of tap_rss_map_configurations */
+ __u8 redirect; /* 0: the program always returns default_queue */
+ __u8 populate_hash; /* not read by the code visible in this file */
+ __u32 hash_types; /* bit mask tested against VIRTIO_NET_RSS_HASH_TYPE_* */
+ __u16 indirections_len; /* modulus applied to the hash to pick a table index */
+ __u16 default_queue; /* returned when no hash or table entry is available */
+} __attribute__((packed));
+
+struct toeplitz_key_data_t { /* value type of tap_rss_map_toeplitz_key */
+ __u32 leftmost_32_bits; /* initial 32-bit key window used by net_toeplitz_add() */
+ __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE]; /* bits shifted into the window */
+};
+
+struct packet_hash_info_t { /* fields parse_packet() extracts from one packet */
+ __u8 is_ipv4;
+ __u8 is_ipv6;
+ __u8 is_udp;
+ __u8 is_tcp;
+ __u8 is_ipv6_ext_src; /* in6_ext_src was filled from an IPV6_TLV_HAO option */
+ __u8 is_ipv6_ext_dst; /* in6_ext_dst was filled from a type 2 routing header */
+ __u8 is_fragmented; /* when set, parse_packet() does not read the ports */
+ /* Ports are copied as-is from the TCP/UDP header (no byte swap). */
+ __u16 src_port;
+ __u16 dst_port;
+ /* IPv4 and IPv6 addresses share storage; is_ipv4/is_ipv6 tell which is set. */
+ union {
+ struct {
+ __be32 in_src;
+ __be32 in_dst;
+ };
+
+ struct {
+ struct in6_addr in6_src;
+ struct in6_addr in6_dst;
+ struct in6_addr in6_ext_src;
+ struct in6_addr in6_ext_dst;
+ };
+ };
+};
+
+struct bpf_map_def SEC("maps")
+tap_rss_map_configurations = { /* one struct rss_config_t, looked up with key 0 */
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct rss_config_t),
+ .max_entries = 1,
+};
+/* One struct toeplitz_key_data_t, looked up with key 0. */
+struct bpf_map_def SEC("maps")
+tap_rss_map_toeplitz_key = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct toeplitz_key_data_t),
+ .max_entries = 1,
+};
+/* __u16 queue numbers, indexed by hash % indirections_len. */
+struct bpf_map_def SEC("maps")
+tap_rss_map_indirection_table = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u16),
+ .max_entries = INDIRECTION_TABLE_SIZE,
+};
+
+static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written,
+ const void *ptr, size_t size) { /* append one field to the hash input */
+ __builtin_memcpy(&rss_input[*bytes_written], ptr, size); /* no bounds check here */
+ *bytes_written += size; /* the next chunk lands right after this one */
+}
+
+/*
+ * Fold @input into the Toeplitz hash accumulated in *@result.
+ *
+ * The loop always walks HASH_CALCULATION_BUFFER_SIZE bytes; @len is not
+ * consulted, so the caller is expected to pass a zero-padded buffer (zero
+ * bits never change the hash).  For every input bit, most significant
+ * first, the current 32-bit key window is XORed in when the bit is set,
+ * then the window slides left by one bit taken from @key->next_byte.
+ */
+static inline void net_toeplitz_add(__u32 *result, __u8 *input, __u32 len,
+                                    struct toeplitz_key_data_t *key)
+{
+    __u32 hash = *result;
+    __u32 window = key->leftmost_32_bits;
+    __u32 i;
+
+    for (i = 0; i < HASH_CALCULATION_BUFFER_SIZE; i++) {
+        __u8 data = input[i];
+        __u8 feed = key->next_byte[i];
+        __u8 n;
+
+        for (n = 0; n < 8; n++, data <<= 1, feed <<= 1) {
+            if (data & 0x80) {
+                hash ^= window;
+            }
+            window = (window << 1) | ((feed >> 7) & 1);
+        }
+    }
+    *result = hash;
+}
+
+
+/*
+ * Return 1 if @hdr_type is one of the next-header values that
+ * parse_ipv6_ext() treats as an extension header and steps over,
+ * 0 for anything else.
+ */
+static inline int ip6_extension_header_type(__u8 hdr_type)
+{
+    return hdr_type == IPPROTO_HOPOPTS ||
+           hdr_type == IPPROTO_ROUTING ||
+           hdr_type == IPPROTO_FRAGMENT ||
+           hdr_type == IPPROTO_ICMPV6 ||
+           hdr_type == IPPROTO_NONE ||
+           hdr_type == IPPROTO_DSTOPTS ||
+           hdr_type == IPPROTO_MH;
+}
+/*
+ * According to
+ * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml
+ * we expect no more than 11 extension headers in an IPv6 packet, and there
+ * are 27 TLV options for the Destination and Hop-by-hop extensions.
+ * We need to choose a reasonable maximum number of extensions/options to
+ * check when looking for the ext src/dst addresses.
+ */
+#define IP6_EXTENSIONS_COUNT 11
+#define IP6_OPTIONS_COUNT 30
+
+static inline int parse_ipv6_ext(struct __sk_buff *skb,
+ struct packet_hash_info_t *info,
+ __u8 *l4_protocol, size_t *l4_offset)
+{ /* Walks IPv6 extension headers; returns 0 or a bpf_skb_load_bytes_relative() error. */
+ int err = 0;
+ /* Nothing to do when the next header is not an extension header. */
+ if (!ip6_extension_header_type(*l4_protocol)) {
+ return 0;
+ }
+
+ struct ipv6_opt_hdr ext_hdr = {};
+ /* Bounded walk: at most IP6_EXTENSIONS_COUNT headers are examined. */
+ for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) {
+ /* Generic view (nexthdr, hdrlen) of the header found at *l4_offset. */
+ err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr,
+ sizeof(ext_hdr), BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+
+ if (*l4_protocol == IPPROTO_ROUTING) {
+ struct ipv6_rt_hdr ext_rt = {};
+
+ err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt,
+ sizeof(ext_rt), BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+
+ if ((ext_rt.type == IPV6_SRCRT_TYPE_2) &&
+ (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) &&
+ (ext_rt.segments_left == 1)) {
+ /* Type 2 routing header holding one address: record it as ext dst. */
+ err = bpf_skb_load_bytes_relative(skb,
+ *l4_offset + offsetof(struct rt2_hdr, addr),
+ &info->in6_ext_dst, sizeof(info->in6_ext_dst),
+ BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+
+ info->is_ipv6_ext_dst = 1;
+ }
+
+ } else if (*l4_protocol == IPPROTO_DSTOPTS) {
+ struct ipv6_opt_t {
+ __u8 type;
+ __u8 length;
+ } __attribute__((packed)) opt = {};
+ /* Options start right after the (nexthdr, hdrlen) pair read above. */
+ size_t opt_offset = sizeof(ext_hdr);
+
+ for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) {
+ err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset,
+ &opt, sizeof(opt), BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+ /* Home Address option: its address is recorded as ext src. */
+ if (opt.type == IPV6_TLV_HAO) {
+ err = bpf_skb_load_bytes_relative(skb,
+ *l4_offset + opt_offset
+ + offsetof(struct ipv6_destopt_hao, addr),
+ &info->in6_ext_src, sizeof(info->in6_ext_src),
+ BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+
+ info->is_ipv6_ext_src = 1;
+ break;
+ }
+ /* Pad1 advances by one byte; other options by type + length + data. */
+ opt_offset += (opt.type == IPV6_TLV_PAD1) ?
+ 1 : opt.length + sizeof(opt);
+ /* NOTE(review): the header size used below is (hdrlen + 1) * 8 - confirm this bound. */
+ if (opt_offset + 1 >= ext_hdr.hdrlen * 8) {
+ break;
+ }
+ }
+ } else if (*l4_protocol == IPPROTO_FRAGMENT) {
+ info->is_fragmented = true;
+ }
+ /* Step to the next header: this one is taken to span (hdrlen + 1) * 8 bytes. */
+ *l4_protocol = ext_hdr.nexthdr;
+ *l4_offset += (ext_hdr.hdrlen + 1) * 8;
+
+ if (!ip6_extension_header_type(ext_hdr.nexthdr)) {
+ return 0;
+ }
+ }
+ /* Header limit reached: report success with what was parsed so far. */
+ return 0;
+error:
+ return err;
+}
+
+/*
+ * Return the EtherType of @skb in network byte order, re-reading it 8 bytes
+ * further on after an 802.1ad TPID or 4 bytes further on after an 802.1Q
+ * TPID.  Returns 0 when a read from the packet fails.
+ */
+static __be16 parse_eth_type(struct __sk_buff *skb)
+{
+    unsigned int pos = 12; /* offset of the first type field in the frame */
+    __be16 proto = 0;
+
+    if (bpf_skb_load_bytes_relative(skb, pos, &proto, sizeof(proto),
+                                    BPF_HDR_START_MAC)) {
+        return 0;
+    }
+
+    /* Anything but a VLAN TPID is already the final answer. */
+    if (bpf_ntohs(proto) == ETH_P_8021AD) {
+        pos += 8;
+    } else if (bpf_ntohs(proto) == ETH_P_8021Q) {
+        pos += 4;
+    } else {
+        return proto;
+    }
+
+    if (bpf_skb_load_bytes_relative(skb, pos, &proto, sizeof(proto),
+                                    BPF_HDR_START_MAC)) {
+        return 0;
+    }
+    return proto;
+}
+
+/*
+ * Fill @info with the L3 addresses and, for unfragmented TCP/UDP packets,
+ * the L4 ports found in @skb.  Returns 0 on success (@info stays untouched
+ * for EtherTypes other than IPv4/IPv6), -1 if an argument is NULL or the
+ * EtherType cannot be read, else the bpf_skb_load_bytes_relative() error.
+ */
+static inline int parse_packet(struct __sk_buff *skb,
+                               struct packet_hash_info_t *info)
+{
+    int err = 0;
+
+    if (!info || !skb) {
+        return -1;
+    }
+
+    size_t l4_offset = 0;
+    __u8 l4_protocol = 0;
+    __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb));
+    if (l3_protocol == 0) {
+        return -1;
+    }
+
+    if (l3_protocol == ETH_P_IP) {
+        struct iphdr ip = {};
+        info->is_ipv4 = 1;
+        err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip),
+                                          BPF_HDR_START_NET);
+        if (err) {
+            return err;
+        }
+        info->in_src = ip.saddr;
+        info->in_dst = ip.daddr;
+        /*
+         * frag_off also holds the "don't fragment" flag, so testing the whole
+         * field would mark every DF packet as a fragment and skip its ports.
+         * Only "more fragments" (0x2000) or a non-zero offset (0x1fff) count.
+         */
+        info->is_fragmented = !!(bpf_ntohs(ip.frag_off) & (0x2000 | 0x1fff));
+
+        l4_protocol = ip.protocol;
+        l4_offset = ip.ihl * 4;
+    } else if (l3_protocol == ETH_P_IPV6) {
+        struct ipv6hdr ip6 = {};
+        info->is_ipv6 = 1;
+        err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6),
+                                          BPF_HDR_START_NET);
+        if (err) {
+            return err;
+        }
+        info->in6_src = ip6.saddr;
+        info->in6_dst = ip6.daddr;
+        l4_protocol = ip6.nexthdr;
+        l4_offset = sizeof(ip6);
+        err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset);
+        if (err) {
+            return err;
+        }
+    }
+
+    /* Ports are only read for unfragmented packets. */
+    if (l4_protocol == 0 || info->is_fragmented) {
+        return 0;
+    }
+
+    if (l4_protocol == IPPROTO_TCP) {
+        struct tcphdr tcp = {};
+        info->is_tcp = 1;
+        err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp),
+                                          BPF_HDR_START_NET);
+        if (err) {
+            return err;
+        }
+        info->src_port = tcp.source;
+        info->dst_port = tcp.dest;
+    } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */
+        struct udphdr udp = {};
+        info->is_udp = 1;
+        err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp),
+                                          BPF_HDR_START_NET);
+        if (err) {
+            return err;
+        }
+        info->src_port = udp.source;
+        info->dst_port = udp.dest;
+    }
+
+    return 0;
+}
+
+static inline __u32 calculate_rss_hash(struct __sk_buff *skb,
+ struct rss_config_t *config, struct toeplitz_key_data_t *toe)
+{ /* Returns the Toeplitz hash of the enabled packet fields, or 0 if none apply. */
+ __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {}; /* zero-filled hash input */
+ size_t bytes_written = 0;
+ __u32 result = 0;
+ int err = 0;
+ struct packet_hash_info_t packet_info = {};
+
+ err = parse_packet(skb, &packet_info);
+ if (err) {
+ return 0;
+ }
+ /* The first matching enabled hash type wins: TCP, then UDP, then plain IP. */
+ if (packet_info.is_ipv4) {
+ if (packet_info.is_tcp &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
+
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in_src,
+ sizeof(packet_info.in_src));
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in_dst,
+ sizeof(packet_info.in_dst));
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.src_port,
+ sizeof(packet_info.src_port));
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.dst_port,
+ sizeof(packet_info.dst_port));
+ } else if (packet_info.is_udp &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
+
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in_src,
+ sizeof(packet_info.in_src));
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in_dst,
+ sizeof(packet_info.in_dst));
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.src_port,
+ sizeof(packet_info.src_port));
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.dst_port,
+ sizeof(packet_info.dst_port));
+ } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in_src,
+ sizeof(packet_info.in_src));
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in_dst,
+ sizeof(packet_info.in_dst));
+ }
+ } else if (packet_info.is_ipv6) { /* *_EX types swap in the ext src/dst addresses */
+ if (packet_info.is_tcp &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
+
+ if (packet_info.is_ipv6_ext_src &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
+
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_ext_src,
+ sizeof(packet_info.in6_ext_src));
+ } else {
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_src,
+ sizeof(packet_info.in6_src));
+ }
+ if (packet_info.is_ipv6_ext_dst &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
+
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_ext_dst,
+ sizeof(packet_info.in6_ext_dst));
+ } else {
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_dst,
+ sizeof(packet_info.in6_dst));
+ }
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.src_port,
+ sizeof(packet_info.src_port));
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.dst_port,
+ sizeof(packet_info.dst_port));
+ } else if (packet_info.is_udp &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
+
+ if (packet_info.is_ipv6_ext_src &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
+
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_ext_src,
+ sizeof(packet_info.in6_ext_src));
+ } else {
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_src,
+ sizeof(packet_info.in6_src));
+ }
+ if (packet_info.is_ipv6_ext_dst &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
+
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_ext_dst,
+ sizeof(packet_info.in6_ext_dst));
+ } else {
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_dst,
+ sizeof(packet_info.in6_dst));
+ }
+
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.src_port,
+ sizeof(packet_info.src_port));
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.dst_port,
+ sizeof(packet_info.dst_port));
+
+ } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
+ if (packet_info.is_ipv6_ext_src &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
+
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_ext_src,
+ sizeof(packet_info.in6_ext_src));
+ } else {
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_src,
+ sizeof(packet_info.in6_src));
+ }
+ if (packet_info.is_ipv6_ext_dst &&
+ config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
+
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_ext_dst,
+ sizeof(packet_info.in6_ext_dst));
+ } else {
+ net_rx_rss_add_chunk(rss_input, &bytes_written,
+ &packet_info.in6_dst,
+ sizeof(packet_info.in6_dst));
+ }
+ }
+ }
+ /* Nothing was appended (no enabled type matched): result stays 0. */
+ if (bytes_written) {
+ net_toeplitz_add(&result, rss_input, bytes_written, toe);
+ }
+
+ return result;
+}
+
+/*
+ * Steering program entry point.  Returns the queue for @skb: the indirection
+ * table entry selected by the RSS hash, or config->default_queue when
+ * redirection is off, the table is empty, the hash is 0 or the lookup fails.
+ * Returns -1 when the configuration or Toeplitz key map entry is missing.
+ */
+SEC("tun_rss_steering")
+int tun_rss_steering_prog(struct __sk_buff *skb)
+{
+    struct rss_config_t *config;
+    struct toeplitz_key_data_t *toe;
+    __u32 key = 0;
+    __u32 hash;
+    __u32 table_idx;
+    __u16 *queue;
+
+    config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key);
+    toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key);
+    if (!config || !toe) {
+        return -1;
+    }
+
+    /* indirections_len == 0 would make the modulo below divide by zero. */
+    if (!config->redirect || !config->indirections_len) {
+        return config->default_queue;
+    }
+
+    hash = calculate_rss_hash(skb, config, toe);
+    if (!hash) {
+        return config->default_queue;
+    }
+
+    table_idx = hash % config->indirections_len;
+    queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table, &table_idx);
+    return queue ? *queue : config->default_queue;
+}
+
+char _license[] SEC("license") = "GPL v2"; /* license string emitted into the "license" section */